{ "best_metric": null, "best_model_checkpoint": null, "episode": 38400, "epoch": 0.6902254017327534, "eval_steps": 500, "global_step": 2400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "episode": 16, "epoch": 0.00028759391738864725, "loss/policy_avg": -0.014177359640598297, "lr": 3e-06, "objective/entropy": 119.65733337402344, "objective/kl": 15.623376846313477, "objective/non_score_reward": -1.5623377561569214, "objective/rlhf_reward": -3.325632084847662, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 472.72821044921875, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7515315413475037, "step": 0, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0000829696655273 }, { "episode": 32, "epoch": 0.0005751878347772945, "loss/policy_avg": 0.05164449289441109, "lr": 2.999808282208589e-06, "objective/entropy": -117.60435485839844, "objective/kl": 11.686213493347168, "objective/non_score_reward": -1.168621301651001, "objective/rlhf_reward": -4.274485094845295, "objective/scores": 0.1, "policy/approxkl_avg": 236.72177124023438, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6307989358901978, "step": 1, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9973212480545044 }, { "episode": 48, "epoch": 0.0008627817521659417, "loss/policy_avg": 0.6165977120399475, "lr": 2.999616564417178e-06, "objective/entropy": -116.07769775390625, "objective/kl": 10.806825637817383, "objective/non_score_reward": -1.080682635307312, "objective/rlhf_reward": -3.922730395942926, "objective/scores": 0.1, "policy/approxkl_avg": 211.7506103515625, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.726571798324585, "step": 2, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0005264282226562 }, { "episode": 64, "epoch": 0.001150375669554589, "loss/policy_avg": 0.39946672320365906, "lr": 2.999424846625767e-06, "objective/entropy": -284.77886962890625, "objective/kl": 9.179925918579102, "objective/non_score_reward": -0.9179927110671997, "objective/rlhf_reward": -3.2719709336757656, "objective/scores": 0.1, "policy/approxkl_avg": 172.39312744140625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7219442129135132, "step": 3, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9991655349731445 }, { "episode": 80, "epoch": 0.001437969586943236, "loss/policy_avg": 0.18221884965896606, "lr": 2.999233128834356e-06, "objective/entropy": -326.7154541015625, "objective/kl": 10.727872848510742, "objective/non_score_reward": -1.0727872848510742, "objective/rlhf_reward": -1.3674301027667253, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 252.6199188232422, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5611602067947388, "step": 4, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9977035522460938 }, { "episode": 96, "epoch": 0.0017255635043318834, "loss/policy_avg": 0.37348473072052, "lr": 2.999041411042945e-06, "objective/entropy": -172.4725341796875, "objective/kl": 9.580272674560547, "objective/non_score_reward": -0.958027184009552, "objective/rlhf_reward": -3.43210876584053, "objective/scores": 0.1, "policy/approxkl_avg": 233.60519409179688, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6069347858428955, "step": 5, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.004929304122925 }, { "episode": 112, "epoch": 0.0020131574217205307, "loss/policy_avg": 0.5359442234039307, "lr": 2.9988496932515338e-06, "objective/entropy": 37.751182556152344, "objective/kl": 8.995965957641602, "objective/non_score_reward": -0.8995967507362366, "objective/rlhf_reward": -1.6509756696986513, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 157.80946350097656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.45883142948150635, "step": 6, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9981789588928223 }, { "episode": 128, "epoch": 0.002300751339109178, "loss/policy_avg": 0.07628901302814484, "lr": 2.998657975460123e-06, "objective/entropy": -271.4947509765625, "objective/kl": 9.241050720214844, "objective/non_score_reward": -0.9241052865982056, "objective/rlhf_reward": 0.7035788685083393, "objective/scores": 1.1, "policy/approxkl_avg": 179.53875732421875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6910897493362427, "step": 7, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9963037967681885 }, { "episode": 144, "epoch": 0.002588345256497825, "loss/policy_avg": 0.0354180671274662, "lr": 2.998466257668712e-06, "objective/entropy": 209.80404663085938, "objective/kl": 11.208139419555664, "objective/non_score_reward": -1.1208139657974243, "objective/rlhf_reward": -2.8213962368374927, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 217.8009033203125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6648087501525879, "step": 8, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9964534044265747 }, { "episode": 160, "epoch": 0.002875939173886472, "loss/policy_avg": 0.24756430089473724, "lr": 2.9982745398773006e-06, "objective/entropy": -5.9293365478515625, "objective/kl": 1.9302005767822266, "objective/non_score_reward": -0.1930200457572937, "objective/rlhf_reward": 1.3506260157367849, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 22.118091583251953, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6500340700149536, "step": 9, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.002328395843506 }, { "episode": 176, "epoch": 0.0031635330912751195, "loss/policy_avg": 0.22338274121284485, "lr": 2.99808282208589e-06, "objective/entropy": -51.18250274658203, "objective/kl": 4.893694877624512, "objective/non_score_reward": -0.48936957120895386, "objective/rlhf_reward": -3.9574780464172363, "objective/scores": -0.5, "policy/approxkl_avg": 56.685855865478516, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6514552235603333, "step": 10, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0002238750457764 }, { "episode": 192, "epoch": 0.0034511270086637668, "loss/policy_avg": 0.07259142398834229, "lr": 2.9978911042944787e-06, "objective/entropy": -35.05317306518555, "objective/kl": 7.698199272155762, "objective/non_score_reward": -0.769819974899292, "objective/rlhf_reward": 1.3207200556993488, "objective/scores": 1.1, "policy/approxkl_avg": 163.5728759765625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4998992681503296, "step": 11, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9982101917266846 }, { "episode": 208, "epoch": 0.003738720926052414, "loss/policy_avg": 0.30875226855278015, "lr": 2.9976993865030675e-06, "objective/entropy": 128.93115234375, "objective/kl": 8.55907154083252, "objective/non_score_reward": -0.8559072017669678, "objective/rlhf_reward": -3.0236288517713543, "objective/scores": 0.1, "policy/approxkl_avg": 115.09192657470703, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6001245975494385, "step": 12, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.99881911277771 }, { "episode": 224, "epoch": 0.004026314843441061, "loss/policy_avg": 0.5440771579742432, "lr": 2.9975076687116563e-06, "objective/entropy": 194.59161376953125, "objective/kl": 14.834866523742676, "objective/non_score_reward": -1.4834866523742676, "objective/rlhf_reward": -1.5339468330144879, "objective/scores": 1.1, "policy/approxkl_avg": 319.5052185058594, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.46385329961776733, "step": 13, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9979116916656494 }, { "episode": 240, "epoch": 0.004313908760829708, "loss/policy_avg": 0.12125951051712036, "lr": 2.9973159509202455e-06, "objective/entropy": 110.74070739746094, "objective/kl": 6.117404937744141, "objective/non_score_reward": -0.6117404699325562, "objective/rlhf_reward": -4.446961879730225, "objective/scores": -0.5, "policy/approxkl_avg": 54.780738830566406, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6084516048431396, "step": 14, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9986605644226074 }, { "episode": 256, "epoch": 0.004601502678218356, "loss/policy_avg": 0.2855263352394104, "lr": 2.9971242331288343e-06, "objective/entropy": -80.00950622558594, "objective/kl": 8.695795059204102, "objective/non_score_reward": -0.8695796728134155, "objective/rlhf_reward": -1.3556124068060258, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 134.0033416748047, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4744781255722046, "step": 15, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.99568772315979 }, { "episode": 272, "epoch": 0.004889096595607003, "loss/policy_avg": 0.34731030464172363, "lr": 2.996932515337423e-06, "objective/entropy": -193.7414093017578, "objective/kl": 6.8158183097839355, "objective/non_score_reward": -0.6815819144248962, "objective/rlhf_reward": -0.7789164585637408, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 94.75167083740234, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5870293378829956, "step": 16, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999891996383667 }, { "episode": 288, "epoch": 0.00517669051299565, "loss/policy_avg": 0.09466144442558289, "lr": 2.9967407975460124e-06, "objective/entropy": 13.335285186767578, "objective/kl": 7.405551910400391, "objective/non_score_reward": -0.7405551671981812, "objective/rlhf_reward": -4.962221145629883, "objective/scores": -0.5, "policy/approxkl_avg": 131.025146484375, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.4989420771598816, "step": 17, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9998366832733154 }, { "episode": 304, "epoch": 0.0054642844303842975, "loss/policy_avg": 0.05795682966709137, "lr": 2.996549079754601e-06, "objective/entropy": 121.52836608886719, "objective/kl": 9.26551628112793, "objective/non_score_reward": -0.9265516996383667, "objective/rlhf_reward": -3.3062067613005635, "objective/scores": 0.1, "policy/approxkl_avg": 123.83927917480469, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7211639285087585, "step": 18, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0007009506225586 }, { "episode": 320, "epoch": 0.005751878347772944, "loss/policy_avg": 0.33338093757629395, "lr": 2.9963573619631904e-06, "objective/entropy": -9.356884002685547, "objective/kl": 4.64314079284668, "objective/non_score_reward": -0.46431419253349304, "objective/rlhf_reward": -1.4572567533701657, "objective/scores": 0.1, "policy/approxkl_avg": 62.679962158203125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.545394778251648, "step": 19, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998852252960205 }, { "episode": 336, "epoch": 0.006039472265161592, "loss/policy_avg": 0.1268569827079773, "lr": 2.9961656441717792e-06, "objective/entropy": 150.50843811035156, "objective/kl": 4.864284515380859, "objective/non_score_reward": -0.48642849922180176, "objective/rlhf_reward": -0.3894547065168168, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 35.721397399902344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5442019701004028, "step": 20, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.002377986907959 }, { "episode": 352, "epoch": 0.006327066182550239, "loss/policy_avg": 0.21250608563423157, "lr": 2.995973926380368e-06, "objective/entropy": 80.20552062988281, "objective/kl": 9.005538940429688, "objective/non_score_reward": -0.9005540013313293, "objective/rlhf_reward": -1.8688826049367586, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 145.47787475585938, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5761544704437256, "step": 21, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9991693496704102 }, { "episode": 368, "epoch": 0.006614660099938887, "loss/policy_avg": 0.049590617418289185, "lr": 2.9957822085889573e-06, "objective/entropy": -100.090576171875, "objective/kl": 12.812070846557617, "objective/non_score_reward": -1.2812069654464722, "objective/rlhf_reward": -3.1774167818593337, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 271.628173828125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.636669397354126, "step": 22, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9962739944458008 }, { "episode": 384, "epoch": 0.0069022540173275335, "loss/policy_avg": 0.09350229799747467, "lr": 2.995590490797546e-06, "objective/entropy": -59.7061653137207, "objective/kl": 12.184288024902344, "objective/non_score_reward": -1.2184288501739502, "objective/rlhf_reward": -2.751008885280166, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 219.70611572265625, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6429183483123779, "step": 23, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0023770332336426 }, { "episode": 400, "epoch": 0.00718984793471618, "loss/policy_avg": 0.05978470295667648, "lr": 2.995398773006135e-06, "objective/entropy": 102.0956802368164, "objective/kl": 4.846743106842041, "objective/non_score_reward": -0.484674334526062, "objective/rlhf_reward": 0.9850217357862268, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 55.893699645996094, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7100609540939331, "step": 24, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.99696946144104 }, { "episode": 416, "epoch": 0.007477441852104828, "loss/policy_avg": 0.047114282846450806, "lr": 2.995207055214724e-06, "objective/entropy": -127.81246185302734, "objective/kl": 9.111923217773438, "objective/non_score_reward": -0.9111922979354858, "objective/rlhf_reward": -3.244769042730331, "objective/scores": 0.1, "policy/approxkl_avg": 122.56946563720703, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6466059684753418, "step": 25, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0014986991882324 }, { "episode": 432, "epoch": 0.007765035769493475, "loss/policy_avg": 0.7473582029342651, "lr": 2.995015337423313e-06, "objective/entropy": 150.059814453125, "objective/kl": 10.861345291137695, "objective/non_score_reward": -1.0861345529556274, "objective/rlhf_reward": -6.34453821182251, "objective/scores": -0.5, "policy/approxkl_avg": 136.23809814453125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.46276742219924927, "step": 26, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0018882751464844 }, { "episode": 448, "epoch": 0.008052629686882123, "loss/policy_avg": 0.444513201713562, "lr": 2.994823619631902e-06, "objective/entropy": -46.63388442993164, "objective/kl": 8.348082542419434, "objective/non_score_reward": -0.834808349609375, "objective/rlhf_reward": -2.939233502745628, "objective/scores": 0.1, "policy/approxkl_avg": 137.52908325195312, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7123738527297974, "step": 27, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9967291355133057 }, { "episode": 464, "epoch": 0.00834022360427077, "loss/policy_avg": 0.11199073493480682, "lr": 2.994631901840491e-06, "objective/entropy": -52.04168701171875, "objective/kl": 7.455352783203125, "objective/non_score_reward": -0.7455353736877441, "objective/rlhf_reward": -0.5821412935853005, "objective/scores": 0.6, "policy/approxkl_avg": 92.71267700195312, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.48107001185417175, "step": 28, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9976396560668945 }, { "episode": 480, "epoch": 0.008627817521659416, "loss/policy_avg": 0.11989644169807434, "lr": 2.9944401840490798e-06, "objective/entropy": -23.16903305053711, "objective/kl": 6.455074310302734, "objective/non_score_reward": -0.6455073356628418, "objective/rlhf_reward": 1.8179705530405048, "objective/scores": 1.1, "policy/approxkl_avg": 83.73835754394531, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.43547505140304565, "step": 29, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9984638690948486 }, { "episode": 496, "epoch": 0.008915411439048063, "loss/policy_avg": -0.034143999218940735, "lr": 2.994248466257669e-06, "objective/entropy": -80.7169418334961, "objective/kl": 6.607659339904785, "objective/non_score_reward": -0.6607659459114075, "objective/rlhf_reward": -0.2430639252066611, "objective/scores": 0.6, "policy/approxkl_avg": 48.386138916015625, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6621455550193787, "step": 30, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0041260719299316 }, { "episode": 512, "epoch": 0.009203005356436712, "loss/policy_avg": 0.3250073790550232, "lr": 2.994056748466258e-06, "objective/entropy": 30.010330200195312, "objective/kl": 8.299867630004883, "objective/non_score_reward": -0.8299866914749146, "objective/rlhf_reward": -0.39622787082311783, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 136.398681640625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5553452372550964, "step": 31, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9970982074737549 }, { "episode": 528, "epoch": 0.009490599273825359, "loss/policy_avg": 0.128182053565979, "lr": 2.9938650306748466e-06, "objective/entropy": -206.34194946289062, "objective/kl": 2.580972671508789, "objective/non_score_reward": -0.25809726119041443, "objective/rlhf_reward": 1.89132993972185, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 25.516483306884766, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4866250157356262, "step": 32, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0006601810455322 }, { "episode": 544, "epoch": 0.009778193191214006, "loss/policy_avg": 0.14243654906749725, "lr": 2.993673312883436e-06, "objective/entropy": 26.605953216552734, "objective/kl": 4.795662879943848, "objective/non_score_reward": -0.4795662462711334, "objective/rlhf_reward": 0.029146325810019302, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 41.724029541015625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.3814541697502136, "step": 33, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9984464645385742 }, { "episode": 560, "epoch": 0.010065787108602653, "loss/policy_avg": 0.0956430584192276, "lr": 2.9934815950920243e-06, "objective/entropy": -151.531982421875, "objective/kl": 8.481374740600586, "objective/non_score_reward": -0.8481374979019165, "objective/rlhf_reward": -2.9925499673932787, "objective/scores": 0.1, "policy/approxkl_avg": 161.83607482910156, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7393007278442383, "step": 34, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999760627746582 }, { "episode": 576, "epoch": 0.0103533810259913, "loss/policy_avg": 0.20686647295951843, "lr": 2.9932898773006135e-06, "objective/entropy": -46.547210693359375, "objective/kl": 15.390876770019531, "objective/non_score_reward": -1.5390875339508057, "objective/rlhf_reward": -3.232631032110426, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 324.61724853515625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.49993449449539185, "step": 35, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9998364448547363 }, { "episode": 592, "epoch": 0.010640974943379948, "loss/policy_avg": 0.14229583740234375, "lr": 2.9930981595092023e-06, "objective/entropy": 53.75727462768555, "objective/kl": 9.625295639038086, "objective/non_score_reward": -0.9625297784805298, "objective/rlhf_reward": 0.5498811393976215, "objective/scores": 1.1, "policy/approxkl_avg": 131.6404266357422, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7562763690948486, "step": 36, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9986224174499512 }, { "episode": 608, "epoch": 0.010928568860768595, "loss/policy_avg": 0.09236406534910202, "lr": 2.9929064417177915e-06, "objective/entropy": 13.225410461425781, "objective/kl": 6.162755966186523, "objective/non_score_reward": -0.6162755489349365, "objective/rlhf_reward": -2.065102344751358, "objective/scores": 0.1, "policy/approxkl_avg": 30.094545364379883, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6474248170852661, "step": 37, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9988819360733032 }, { "episode": 624, "epoch": 0.011216162778157242, "loss/policy_avg": 0.3340108394622803, "lr": 2.9927147239263803e-06, "objective/entropy": 62.37703323364258, "objective/kl": 4.8724799156188965, "objective/non_score_reward": -0.4872480034828186, "objective/rlhf_reward": -1.5489919766783713, "objective/scores": 0.1, "policy/approxkl_avg": 43.97528076171875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.3959454894065857, "step": 38, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9985570907592773 }, { "episode": 640, "epoch": 0.011503756695545889, "loss/policy_avg": 0.028094250708818436, "lr": 2.992523006134969e-06, "objective/entropy": -5.561492919921875, "objective/kl": 11.000988006591797, "objective/non_score_reward": -1.100098967552185, "objective/rlhf_reward": -4.000396034121513, "objective/scores": 0.1, "policy/approxkl_avg": 196.45921325683594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7062462568283081, "step": 39, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0003294944763184 }, { "episode": 656, "epoch": 0.011791350612934537, "loss/policy_avg": 0.08169247210025787, "lr": 2.9923312883435584e-06, "objective/entropy": -130.83839416503906, "objective/kl": 8.569768905639648, "objective/non_score_reward": -0.8569770455360413, "objective/rlhf_reward": 0.9720918476581577, "objective/scores": 1.1, "policy/approxkl_avg": 74.26432037353516, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7602853775024414, "step": 40, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.99922776222229 }, { "episode": 672, "epoch": 0.012078944530323184, "loss/policy_avg": 0.0993257462978363, "lr": 2.992139570552147e-06, "objective/entropy": 204.11431884765625, "objective/kl": 5.267461776733398, "objective/non_score_reward": -0.5267462134361267, "objective/rlhf_reward": -1.7069848686456681, "objective/scores": 0.1, "policy/approxkl_avg": 72.98307800292969, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5099983215332031, "step": 41, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9977500438690186 }, { "episode": 688, "epoch": 0.012366538447711831, "loss/policy_avg": 0.14465492963790894, "lr": 2.9919478527607364e-06, "objective/entropy": 65.64169311523438, "objective/kl": 8.029642105102539, "objective/non_score_reward": -0.8029642701148987, "objective/rlhf_reward": 1.1881429269909862, "objective/scores": 1.1, "policy/approxkl_avg": 77.06425476074219, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4883229732513428, "step": 42, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9979116916656494 }, { "episode": 704, "epoch": 0.012654132365100478, "loss/policy_avg": 0.3832010328769684, "lr": 2.9917561349693252e-06, "objective/entropy": -72.26643371582031, "objective/kl": 8.761590957641602, "objective/non_score_reward": -0.876159131526947, "objective/rlhf_reward": -5.504636764526367, "objective/scores": -0.5, "policy/approxkl_avg": 114.79124450683594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7940360903739929, "step": 43, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9983117580413818 }, { "episode": 720, "epoch": 0.012941726282489125, "loss/policy_avg": 0.1457168310880661, "lr": 2.991564417177914e-06, "objective/entropy": 215.26284790039062, "objective/kl": 8.928382873535156, "objective/non_score_reward": -0.8928384184837341, "objective/rlhf_reward": 0.8286463558673862, "objective/scores": 1.1, "policy/approxkl_avg": 73.42234802246094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5377756953239441, "step": 44, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9967031478881836 }, { "episode": 736, "epoch": 0.013229320199877773, "loss/policy_avg": 0.5347464680671692, "lr": 2.9913726993865033e-06, "objective/entropy": -0.6218109130859375, "objective/kl": 10.952564239501953, "objective/non_score_reward": -1.0952564477920532, "objective/rlhf_reward": -1.98102588057518, "objective/scores": 0.6, "policy/approxkl_avg": 208.24761962890625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8326528072357178, "step": 45, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9986271858215332 }, { "episode": 752, "epoch": 0.01351691411726642, "loss/policy_avg": 0.08795450627803802, "lr": 2.991180981595092e-06, "objective/entropy": 53.97735595703125, "objective/kl": 9.161317825317383, "objective/non_score_reward": -0.9161317348480225, "objective/rlhf_reward": -3.2645270287990567, "objective/scores": 0.1, "policy/approxkl_avg": 139.39212036132812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.3978268504142761, "step": 46, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9986640214920044 }, { "episode": 768, "epoch": 0.013804508034655067, "loss/policy_avg": 0.22336724400520325, "lr": 2.990989263803681e-06, "objective/entropy": 95.49320983886719, "objective/kl": 6.099149703979492, "objective/non_score_reward": -0.6099148988723755, "objective/rlhf_reward": 0.4840593293893609, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 63.69355773925781, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6312753558158875, "step": 47, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9990160465240479 }, { "episode": 784, "epoch": 0.014092101952043714, "loss/policy_avg": -0.4834544062614441, "lr": 2.99079754601227e-06, "objective/entropy": 103.409912109375, "objective/kl": 8.192754745483398, "objective/non_score_reward": -0.8192753195762634, "objective/rlhf_reward": -0.8771014422178267, "objective/scores": 0.6, "policy/approxkl_avg": 209.68890380859375, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6371721029281616, "step": 48, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.002427101135254 }, { "episode": 800, "epoch": 0.01437969586943236, "loss/policy_avg": 0.0012040697038173676, "lr": 2.990605828220859e-06, "objective/entropy": 278.3375244140625, "objective/kl": 15.085844039916992, "objective/non_score_reward": -1.5085842609405518, "objective/rlhf_reward": -5.6343372169882056, "objective/scores": 0.1, "policy/approxkl_avg": 569.4739379882812, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8680420517921448, "step": 49, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9974499940872192 }, { "episode": 816, "epoch": 0.01466728978682101, "loss/policy_avg": 0.08105640113353729, "lr": 2.990414110429448e-06, "objective/entropy": 82.5201416015625, "objective/kl": 14.61972713470459, "objective/non_score_reward": -1.461972713470459, "objective/rlhf_reward": -7.847890853881836, "objective/scores": -0.5, "policy/approxkl_avg": 473.65753173828125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8534562587738037, "step": 50, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0009145736694336 }, { "episode": 832, "epoch": 0.014954883704209656, "loss/policy_avg": 0.40666699409484863, "lr": 2.990222392638037e-06, "objective/entropy": -77.4280014038086, "objective/kl": 7.943630218505859, "objective/non_score_reward": -0.7943630814552307, "objective/rlhf_reward": -0.2537333711397376, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 91.87364196777344, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6725068092346191, "step": 51, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9985764026641846 }, { "episode": 848, "epoch": 0.015242477621598303, "loss/policy_avg": 0.37404656410217285, "lr": 2.990030674846626e-06, "objective/entropy": 48.54077911376953, "objective/kl": 11.823626518249512, "objective/non_score_reward": -1.1823625564575195, "objective/rlhf_reward": -6.729450225830078, "objective/scores": -0.5, "policy/approxkl_avg": 223.62557983398438, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7328237295150757, "step": 52, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9987246990203857 }, { "episode": 864, "epoch": 0.01553007153898695, "loss/policy_avg": 0.5382946729660034, "lr": 2.989838957055215e-06, "objective/entropy": -190.2376708984375, "objective/kl": 9.007357597351074, "objective/non_score_reward": -0.9007357954978943, "objective/rlhf_reward": -0.6792241677057472, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 168.06661987304688, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4519304931163788, "step": 53, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998849868774414 }, { "episode": 880, "epoch": 0.0158176654563756, "loss/policy_avg": 0.4903010427951813, "lr": 2.989647239263804e-06, "objective/entropy": 7.207241058349609, "objective/kl": 4.414880275726318, "objective/non_score_reward": -0.44148799777030945, "objective/rlhf_reward": 2.634048038721085, "objective/scores": 1.1, "policy/approxkl_avg": 44.266204833984375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.747165858745575, "step": 54, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998467206954956 }, { "episode": 896, "epoch": 0.016105259373764245, "loss/policy_avg": 0.009914087131619453, "lr": 2.989455521472393e-06, "objective/entropy": -35.93499755859375, "objective/kl": 8.610208511352539, "objective/non_score_reward": -0.8610208630561829, "objective/rlhf_reward": -1.6192545398798694, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 146.18605041503906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6474767923355103, "step": 55, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0002448558807373 }, { "episode": 912, "epoch": 0.016392853291152892, "loss/policy_avg": 0.009470928460359573, "lr": 2.9892638036809815e-06, "objective/entropy": -125.8683853149414, "objective/kl": 9.624561309814453, "objective/non_score_reward": -0.9624560475349426, "objective/rlhf_reward": -0.9261052950632301, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 156.87704467773438, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4992007613182068, "step": 56, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9993857145309448 }, { "episode": 928, "epoch": 0.01668044720854154, "loss/policy_avg": 0.30396610498428345, "lr": 2.9890720858895707e-06, "objective/entropy": 47.94700622558594, "objective/kl": 8.891968727111816, "objective/non_score_reward": -0.8891968727111816, "objective/rlhf_reward": -5.556787490844727, "objective/scores": -0.5, "policy/approxkl_avg": 125.919189453125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7451653480529785, "step": 57, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9991803169250488 }, { "episode": 944, "epoch": 0.016968041125930186, "loss/policy_avg": -0.1904684454202652, "lr": 2.9888803680981595e-06, "objective/entropy": 228.5303192138672, "objective/kl": 4.137008190155029, "objective/non_score_reward": -0.4137008786201477, "objective/rlhf_reward": -1.25480350703001, "objective/scores": 0.1, "policy/approxkl_avg": 51.00769805908203, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6238487958908081, "step": 58, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0140719413757324 }, { "episode": 960, "epoch": 0.017255635043318833, "loss/policy_avg": 0.8186465501785278, "lr": 2.9886886503067483e-06, "objective/entropy": -20.18294906616211, "objective/kl": 10.374330520629883, "objective/non_score_reward": -1.03743314743042, "objective/rlhf_reward": -1.7497324258089064, "objective/scores": 0.6, "policy/approxkl_avg": 178.15145874023438, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5619127750396729, "step": 59, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0021634101867676 }, { "episode": 976, "epoch": 0.01754322896070748, "loss/policy_avg": 0.45171883702278137, "lr": 2.9884969325153375e-06, "objective/entropy": 44.993682861328125, "objective/kl": 7.884735584259033, "objective/non_score_reward": -0.78847336769104, "objective/rlhf_reward": -2.75389347076416, "objective/scores": 0.1, "policy/approxkl_avg": 131.51107788085938, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5738007426261902, "step": 60, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9988288879394531 }, { "episode": 992, "epoch": 0.017830822878096127, "loss/policy_avg": 0.697486162185669, "lr": 2.9883052147239263e-06, "objective/entropy": 26.72112274169922, "objective/kl": 7.41924524307251, "objective/non_score_reward": -0.7419244647026062, "objective/rlhf_reward": -0.5676979482173918, "objective/scores": 0.6, "policy/approxkl_avg": 87.49612426757812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.673904538154602, "step": 61, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9979908466339111 }, { "episode": 1008, "epoch": 0.018118416795484777, "loss/policy_avg": 0.08780250698328018, "lr": 2.988113496932515e-06, "objective/entropy": 144.9136962890625, "objective/kl": 7.059360504150391, "objective/non_score_reward": -0.705936074256897, "objective/rlhf_reward": 0.09997491097333766, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 81.5174560546875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7397779226303101, "step": 62, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999450922012329 }, { "episode": 1024, "epoch": 0.018406010712873424, "loss/policy_avg": 0.6007635593414307, "lr": 2.9879217791411044e-06, "objective/entropy": 116.3339614868164, "objective/kl": 7.176075458526611, "objective/non_score_reward": -0.7176075577735901, "objective/rlhf_reward": -2.470430406183004, "objective/scores": 0.1, "policy/approxkl_avg": 73.61609649658203, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4787016808986664, "step": 63, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9989898204803467 }, { "episode": 1040, "epoch": 0.01869360463026207, "loss/policy_avg": 0.14422942698001862, "lr": 2.987730061349693e-06, "objective/entropy": 124.52241516113281, "objective/kl": 9.423843383789062, "objective/non_score_reward": -0.9423844218254089, "objective/rlhf_reward": -0.8458185240041939, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 138.37496948242188, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8451459407806396, "step": 64, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998455286026001 }, { "episode": 1056, "epoch": 0.018981198547650718, "loss/policy_avg": 0.38644856214523315, "lr": 2.9875383435582824e-06, "objective/entropy": -148.96302795410156, "objective/kl": 6.528387546539307, "objective/non_score_reward": -0.6528387665748596, "objective/rlhf_reward": -0.4886488600828983, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 81.52159881591797, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.643337607383728, "step": 65, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999079942703247 }, { "episode": 1072, "epoch": 0.019268792465039365, "loss/policy_avg": -0.05883178487420082, "lr": 2.9873466257668712e-06, "objective/entropy": -121.89275360107422, "objective/kl": 7.966899871826172, "objective/non_score_reward": -0.7966899871826172, "objective/rlhf_reward": -2.786759978532791, "objective/scores": 0.1, "policy/approxkl_avg": 141.0238037109375, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6699905395507812, "step": 66, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9981606006622314 }, { "episode": 1088, "epoch": 0.01955638638242801, "loss/policy_avg": 0.21825438737869263, "lr": 2.98715490797546e-06, "objective/entropy": 17.15899658203125, "objective/kl": 11.302406311035156, "objective/non_score_reward": -1.130240559577942, "objective/rlhf_reward": -6.520962715148926, "objective/scores": -0.5, "policy/approxkl_avg": 198.47238159179688, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7949972748756409, "step": 67, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9980106353759766 }, { "episode": 1104, "epoch": 0.019843980299816658, "loss/policy_avg": 0.2142024040222168, "lr": 2.9869631901840493e-06, "objective/entropy": -47.186798095703125, "objective/kl": 10.205244064331055, "objective/non_score_reward": -1.0205243825912476, "objective/rlhf_reward": -3.682097455859184, "objective/scores": 0.1, "policy/approxkl_avg": 154.7986297607422, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6624069213867188, "step": 68, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9982914924621582 }, { "episode": 1120, "epoch": 0.020131574217205305, "loss/policy_avg": 0.38636407256126404, "lr": 2.986771472392638e-06, "objective/entropy": -56.353668212890625, "objective/kl": 9.813121795654297, "objective/non_score_reward": -0.9813121557235718, "objective/rlhf_reward": -5.925248146057129, "objective/scores": -0.5, "policy/approxkl_avg": 89.94273376464844, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.49075233936309814, "step": 69, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0006632804870605 }, { "episode": 1136, "epoch": 0.020419168134593952, "loss/policy_avg": 0.1683022379875183, "lr": 2.9865797546012273e-06, "objective/entropy": 218.84620666503906, "objective/kl": 18.28194808959961, "objective/non_score_reward": -1.8281950950622559, "objective/rlhf_reward": -4.912780082225799, "objective/scores": 0.6, "policy/approxkl_avg": 422.34417724609375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5468066930770874, "step": 70, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9993388652801514 }, { "episode": 1152, "epoch": 0.0207067620519826, "loss/policy_avg": 0.12364174425601959, "lr": 2.986388036809816e-06, "objective/entropy": -157.8475341796875, "objective/kl": 13.170989036560059, "objective/non_score_reward": -1.3170989751815796, "objective/rlhf_reward": -7.26839542388916, "objective/scores": -0.5, "policy/approxkl_avg": 184.91448974609375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8540889620780945, "step": 71, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9987365007400513 }, { "episode": 1168, "epoch": 0.02099435596937125, "loss/policy_avg": 0.03804938867688179, "lr": 2.986196319018405e-06, "objective/entropy": 12.461807250976562, "objective/kl": 7.584700107574463, "objective/non_score_reward": -0.7584700584411621, "objective/rlhf_reward": -0.11016108536836766, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 53.31656265258789, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7939921617507935, "step": 72, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0002026557922363 }, { "episode": 1184, "epoch": 0.021281949886759896, "loss/policy_avg": 0.4785844683647156, "lr": 2.986004601226994e-06, "objective/entropy": -64.63442993164062, "objective/kl": 13.00765609741211, "objective/non_score_reward": -1.3007656335830688, "objective/rlhf_reward": -4.803062631189823, "objective/scores": 0.1, "policy/approxkl_avg": 264.447998046875, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6026296615600586, "step": 73, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9995261430740356 }, { "episode": 1200, "epoch": 0.021569543804148543, "loss/policy_avg": 0.22290995717048645, "lr": 2.985812883435583e-06, "objective/entropy": -106.69702911376953, "objective/kl": 13.168065071105957, "objective/non_score_reward": -1.316806435585022, "objective/rlhf_reward": -7.267225742340088, "objective/scores": -0.5, "policy/approxkl_avg": 273.555419921875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6650924682617188, "step": 74, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9982670545578003 }, { "episode": 1216, "epoch": 0.02185713772153719, "loss/policy_avg": 0.29405301809310913, "lr": 2.985621165644172e-06, "objective/entropy": 133.62835693359375, "objective/kl": 8.014554023742676, "objective/non_score_reward": -0.8014553189277649, "objective/rlhf_reward": -2.8058213055133816, "objective/scores": 0.1, "policy/approxkl_avg": 96.36099243164062, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7634880542755127, "step": 75, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9991488456726074 }, { "episode": 1232, "epoch": 0.022144731638925837, "loss/policy_avg": 0.45445433259010315, "lr": 2.985429447852761e-06, "objective/entropy": 82.93301391601562, "objective/kl": 8.670784950256348, "objective/non_score_reward": -0.8670786023139954, "objective/rlhf_reward": -1.0683142602443696, "objective/scores": 0.6, "policy/approxkl_avg": 148.96737670898438, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.540373682975769, "step": 76, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9980230331420898 }, { "episode": 1248, "epoch": 0.022432325556314484, "loss/policy_avg": 0.020047597587108612, "lr": 2.98523773006135e-06, "objective/entropy": -212.8873291015625, "objective/kl": 5.708805561065674, "objective/non_score_reward": -0.5708805918693542, "objective/rlhf_reward": 0.6401966915118966, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 42.32015609741211, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7310128211975098, "step": 77, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9995827674865723 }, { "episode": 1264, "epoch": 0.02271991947370313, "loss/policy_avg": -0.00048611266538500786, "lr": 2.9850460122699387e-06, "objective/entropy": -48.7366943359375, "objective/kl": 6.82136344909668, "objective/non_score_reward": -0.6821364164352417, "objective/rlhf_reward": -2.3285456061363217, "objective/scores": 0.1, "policy/approxkl_avg": 66.36034393310547, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4771096706390381, "step": 78, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9971214532852173 }, { "episode": 1280, "epoch": 0.023007513391091777, "loss/policy_avg": 0.4775531589984894, "lr": 2.9848542944785275e-06, "objective/entropy": -153.65670776367188, "objective/kl": 11.706863403320312, "objective/non_score_reward": -1.1706863641738892, "objective/rlhf_reward": -3.078625496391373, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 175.5146484375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5806171894073486, "step": 79, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999029278755188 }, { "episode": 1296, "epoch": 0.023295107308480424, "loss/policy_avg": 0.46404796838760376, "lr": 2.9846625766871167e-06, "objective/entropy": -0.7676467895507812, "objective/kl": 9.653882026672363, "objective/non_score_reward": -0.9653880596160889, "objective/rlhf_reward": -3.461552521586418, "objective/scores": 0.1, "policy/approxkl_avg": 124.8095703125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6914379596710205, "step": 80, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9988126754760742 }, { "episode": 1312, "epoch": 0.023582701225869074, "loss/policy_avg": 0.1949668675661087, "lr": 2.9844708588957055e-06, "objective/entropy": -137.8944549560547, "objective/kl": 9.551393508911133, "objective/non_score_reward": -0.9551393985748291, "objective/rlhf_reward": 0.5794423833489422, "objective/scores": 1.1, "policy/approxkl_avg": 131.54342651367188, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7368413209915161, "step": 81, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9991984367370605 }, { "episode": 1328, "epoch": 0.02387029514325772, "loss/policy_avg": 0.08234795928001404, "lr": 2.9842791411042943e-06, "objective/entropy": -301.7047119140625, "objective/kl": 11.51591682434082, "objective/non_score_reward": -1.1515917778015137, "objective/rlhf_reward": -4.20636705160141, "objective/scores": 0.1, "policy/approxkl_avg": 113.93853759765625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.785065770149231, "step": 82, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9992117881774902 }, { "episode": 1344, "epoch": 0.024157889060646368, "loss/policy_avg": 0.11510531604290009, "lr": 2.9840874233128835e-06, "objective/entropy": -97.20633697509766, "objective/kl": 10.416614532470703, "objective/non_score_reward": -1.0416613817214966, "objective/rlhf_reward": -2.3418168380585422, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 169.93270874023438, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6234397888183594, "step": 83, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0019311904907227 }, { "episode": 1360, "epoch": 0.024445482978035015, "loss/policy_avg": -0.0069921668618917465, "lr": 2.9838957055214724e-06, "objective/entropy": 30.303848266601562, "objective/kl": 8.926748275756836, "objective/non_score_reward": -0.8926749229431152, "objective/rlhf_reward": -5.570699691772461, "objective/scores": -0.5, "policy/approxkl_avg": 76.47716522216797, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5039442181587219, "step": 84, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9998093843460083 }, { "episode": 1376, "epoch": 0.024733076895423662, "loss/policy_avg": 0.27738839387893677, "lr": 2.983703987730061e-06, "objective/entropy": -166.7930450439453, "objective/kl": 6.530454635620117, "objective/non_score_reward": -0.6530454754829407, "objective/rlhf_reward": -2.2121819466352464, "objective/scores": 0.1, "policy/approxkl_avg": 59.98471450805664, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.90630704164505, "step": 85, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998368263244629 }, { "episode": 1392, "epoch": 0.02502067081281231, "loss/policy_avg": 0.22002673149108887, "lr": 2.9835122699386504e-06, "objective/entropy": -67.69169616699219, "objective/kl": 11.10516357421875, "objective/non_score_reward": -1.1105163097381592, "objective/rlhf_reward": -2.6172365203228702, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 100.68114471435547, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6418424844741821, "step": 86, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9970009326934814 }, { "episode": 1408, "epoch": 0.025308264730200956, "loss/policy_avg": 0.41238439083099365, "lr": 2.983320552147239e-06, "objective/entropy": -16.58879852294922, "objective/kl": 8.708709716796875, "objective/non_score_reward": -0.8708709478378296, "objective/rlhf_reward": -5.483483791351318, "objective/scores": -0.5, "policy/approxkl_avg": 80.49099731445312, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6647894978523254, "step": 87, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9938620328903198 }, { "episode": 1424, "epoch": 0.025595858647589603, "loss/policy_avg": -0.10927846282720566, "lr": 2.9831288343558284e-06, "objective/entropy": 16.377220153808594, "objective/kl": 13.530142784118652, "objective/non_score_reward": -1.353014349937439, "objective/rlhf_reward": -3.2893511898079257, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 238.56805419921875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6255956888198853, "step": 88, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9985899925231934 }, { "episode": 1440, "epoch": 0.02588345256497825, "loss/policy_avg": 0.1418202817440033, "lr": 2.9829371165644172e-06, "objective/entropy": 156.89816284179688, "objective/kl": 12.802512168884277, "objective/non_score_reward": -1.2802512645721436, "objective/rlhf_reward": -3.296176548275064, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 294.27996826171875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6715450286865234, "step": 89, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9973416328430176 }, { "episode": 1456, "epoch": 0.026171046482366896, "loss/policy_avg": 0.008285747841000557, "lr": 2.982745398773006e-06, "objective/entropy": 97.516357421875, "objective/kl": 9.460161209106445, "objective/non_score_reward": -0.9460161328315735, "objective/rlhf_reward": 0.6159354835748676, "objective/scores": 1.1, "policy/approxkl_avg": 122.57460021972656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8431274890899658, "step": 90, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9984780550003052 }, { "episode": 1472, "epoch": 0.026458640399755547, "loss/policy_avg": 0.5656089782714844, "lr": 2.9825536809815953e-06, "objective/entropy": 103.79216766357422, "objective/kl": 8.058956146240234, "objective/non_score_reward": -0.8058955669403076, "objective/rlhf_reward": -2.8235821485519406, "objective/scores": 0.1, "policy/approxkl_avg": 123.79965209960938, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.46594178676605225, "step": 91, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9949241876602173 }, { "episode": 1488, "epoch": 0.026746234317144194, "loss/policy_avg": 0.6543309688568115, "lr": 2.982361963190184e-06, "objective/entropy": -186.4047393798828, "objective/kl": 11.067312240600586, "objective/non_score_reward": -1.1067311763763428, "objective/rlhf_reward": -2.479513700084622, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 161.9315948486328, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6305772066116333, "step": 92, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9993164539337158 }, { "episode": 1504, "epoch": 0.02703382823453284, "loss/policy_avg": 0.19107398390769958, "lr": 2.9821702453987733e-06, "objective/entropy": -34.54255294799805, "objective/kl": 7.058377265930176, "objective/non_score_reward": -0.7058378458023071, "objective/rlhf_reward": 1.5766486465930942, "objective/scores": 1.1, "policy/approxkl_avg": 83.1898193359375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5186392068862915, "step": 93, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9963679313659668 }, { "episode": 1520, "epoch": 0.027321422151921487, "loss/policy_avg": 0.0312882624566555, "lr": 2.981978527607362e-06, "objective/entropy": 19.131616592407227, "objective/kl": 7.02040958404541, "objective/non_score_reward": -0.7020410299301147, "objective/rlhf_reward": -4.808164119720459, "objective/scores": -0.5, "policy/approxkl_avg": 49.332481384277344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6216336488723755, "step": 94, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9971067905426025 }, { "episode": 1536, "epoch": 0.027609016069310134, "loss/policy_avg": 0.08140967786312103, "lr": 2.981786809815951e-06, "objective/entropy": 13.800872802734375, "objective/kl": 6.789825439453125, "objective/non_score_reward": -0.6789825558662415, "objective/rlhf_reward": -4.715930461883545, "objective/scores": -0.5, "policy/approxkl_avg": 50.50202560424805, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5023022890090942, "step": 95, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999751329421997 }, { "episode": 1552, "epoch": 0.02789660998669878, "loss/policy_avg": 0.1259385198354721, "lr": 2.98159509202454e-06, "objective/entropy": 1.0717048645019531, "objective/kl": 10.119159698486328, "objective/non_score_reward": -1.011915922164917, "objective/rlhf_reward": -3.6476640462875363, "objective/scores": 0.1, "policy/approxkl_avg": 155.03924560546875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5893478393554688, "step": 96, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999608039855957 }, { "episode": 1568, "epoch": 0.028184203904087428, "loss/policy_avg": 0.35828953981399536, "lr": 2.981403374233129e-06, "objective/entropy": 90.25312042236328, "objective/kl": 18.374267578125, "objective/non_score_reward": -1.8374266624450684, "objective/rlhf_reward": -2.949706649780273, "objective/scores": 1.1, "policy/approxkl_avg": 272.5823669433594, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4844147562980652, "step": 97, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9982144832611084 }, { "episode": 1584, "epoch": 0.028471797821476075, "loss/policy_avg": 0.37511101365089417, "lr": 2.981211656441718e-06, "objective/entropy": 57.79621505737305, "objective/kl": 11.18044662475586, "objective/non_score_reward": -1.1180447340011597, "objective/rlhf_reward": -6.4721784591674805, "objective/scores": -0.5, "policy/approxkl_avg": 282.01220703125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4804549217224121, "step": 98, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9971604347229004 }, { "episode": 1600, "epoch": 0.02875939173886472, "loss/policy_avg": 0.43748798966407776, "lr": 2.981019938650307e-06, "objective/entropy": -91.33480834960938, "objective/kl": 9.78958797454834, "objective/non_score_reward": -0.9789588451385498, "objective/rlhf_reward": -5.915835380554199, "objective/scores": -0.5, "policy/approxkl_avg": 210.7143096923828, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6887623071670532, "step": 99, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999546766281128 }, { "episode": 1616, "epoch": 0.029046985656253372, "loss/policy_avg": 0.012114331126213074, "lr": 2.980828220858896e-06, "objective/entropy": 21.23130226135254, "objective/kl": 3.5349526405334473, "objective/non_score_reward": -0.3534952402114868, "objective/rlhf_reward": 1.509737993835238, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 18.795394897460938, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.649235188961029, "step": 100, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0008366107940674 }, { "episode": 1632, "epoch": 0.02933457957364202, "loss/policy_avg": 0.08204736560583115, "lr": 2.9806365030674847e-06, "objective/entropy": -141.25718688964844, "objective/kl": 11.29146957397461, "objective/non_score_reward": -1.1291468143463135, "objective/rlhf_reward": -0.11658706367015803, "objective/scores": 1.1, "policy/approxkl_avg": 166.33197021484375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6492342352867126, "step": 101, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9966166019439697 }, { "episode": 1648, "epoch": 0.029622173491030666, "loss/policy_avg": 0.08084648847579956, "lr": 2.9804447852760735e-06, "objective/entropy": 111.48165893554688, "objective/kl": 7.421027183532715, "objective/non_score_reward": -0.7421026825904846, "objective/rlhf_reward": 1.4315892295911912, "objective/scores": 1.1, "policy/approxkl_avg": 77.68070983886719, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7161847949028015, "step": 102, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9976788759231567 }, { "episode": 1664, "epoch": 0.029909767408419313, "loss/policy_avg": 0.34747451543807983, "lr": 2.9802530674846627e-06, "objective/entropy": 11.092723846435547, "objective/kl": 10.786249160766602, "objective/non_score_reward": -1.078624963760376, "objective/rlhf_reward": -6.314499855041504, "objective/scores": -0.5, "policy/approxkl_avg": 129.8665008544922, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6497814655303955, "step": 103, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999152660369873 }, { "episode": 1680, "epoch": 0.03019736132580796, "loss/policy_avg": 0.19717364013195038, "lr": 2.9800613496932515e-06, "objective/entropy": 84.13946533203125, "objective/kl": 14.11801528930664, "objective/non_score_reward": -1.4118015766143799, "objective/rlhf_reward": -7.6472063064575195, "objective/scores": -0.5, "policy/approxkl_avg": 345.1640319824219, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7492287158966064, "step": 104, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9964749813079834 }, { "episode": 1696, "epoch": 0.030484955243196606, "loss/policy_avg": 0.31150949001312256, "lr": 2.9798696319018403e-06, "objective/entropy": 189.52505493164062, "objective/kl": 6.602322578430176, "objective/non_score_reward": -0.6602323055267334, "objective/rlhf_reward": -2.2409292221069332, "objective/scores": 0.1, "policy/approxkl_avg": 36.76777648925781, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7701338529586792, "step": 105, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9979405403137207 }, { "episode": 1712, "epoch": 0.030772549160585253, "loss/policy_avg": 0.863737940788269, "lr": 2.9796779141104296e-06, "objective/entropy": -57.84851837158203, "objective/kl": 10.454719543457031, "objective/non_score_reward": -1.0454717874526978, "objective/rlhf_reward": -3.7818873882293698, "objective/scores": 0.1, "policy/approxkl_avg": 81.14009094238281, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6733952164649963, "step": 106, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9979674816131592 }, { "episode": 1728, "epoch": 0.0310601430779739, "loss/policy_avg": 0.3714882731437683, "lr": 2.9794861963190184e-06, "objective/entropy": -48.674835205078125, "objective/kl": 8.692556381225586, "objective/non_score_reward": -0.869255542755127, "objective/rlhf_reward": -1.354315804616485, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 95.59877014160156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5013031959533691, "step": 107, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9974138736724854 }, { "episode": 1744, "epoch": 0.03134773699536255, "loss/policy_avg": 0.279630184173584, "lr": 2.9792944785276076e-06, "objective/entropy": 45.82620620727539, "objective/kl": 6.540558338165283, "objective/non_score_reward": -0.6540557742118835, "objective/rlhf_reward": -4.616223335266113, "objective/scores": -0.5, "policy/approxkl_avg": 68.67784118652344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.46648138761520386, "step": 108, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9995229244232178 }, { "episode": 1760, "epoch": 0.0316353309127512, "loss/policy_avg": 0.08991475403308868, "lr": 2.9791027607361964e-06, "objective/entropy": -64.23330688476562, "objective/kl": 8.345191955566406, "objective/non_score_reward": -0.8345192074775696, "objective/rlhf_reward": -2.938076882064342, "objective/scores": 0.1, "policy/approxkl_avg": 93.55230712890625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.48790040612220764, "step": 109, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9989550113677979 }, { "episode": 1776, "epoch": 0.031922924830139844, "loss/policy_avg": 0.28548339009284973, "lr": 2.9789110429447852e-06, "objective/entropy": 43.4534912109375, "objective/kl": 13.334844589233398, "objective/non_score_reward": -1.333484411239624, "objective/rlhf_reward": -4.933937734365463, "objective/scores": 0.1, "policy/approxkl_avg": 361.4084167480469, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.721168041229248, "step": 110, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000476121902466 }, { "episode": 1792, "epoch": 0.03221051874752849, "loss/policy_avg": 0.4107435941696167, "lr": 2.9787193251533744e-06, "objective/entropy": -182.18148803710938, "objective/kl": 7.509696006774902, "objective/non_score_reward": -0.750969648361206, "objective/rlhf_reward": 1.396121428906918, "objective/scores": 1.1, "policy/approxkl_avg": 77.65406036376953, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5355631113052368, "step": 111, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999549388885498 }, { "episode": 1808, "epoch": 0.03249811266491714, "loss/policy_avg": 0.3796160817146301, "lr": 2.9785276073619633e-06, "objective/entropy": 124.50952911376953, "objective/kl": 8.484485626220703, "objective/non_score_reward": -0.8484484553337097, "objective/rlhf_reward": -5.393794059753418, "objective/scores": -0.5, "policy/approxkl_avg": 126.5394287109375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6620683670043945, "step": 112, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9979877471923828 }, { "episode": 1824, "epoch": 0.032785706582305785, "loss/policy_avg": 0.26249387860298157, "lr": 2.978335889570552e-06, "objective/entropy": 25.24797821044922, "objective/kl": 13.403532028198242, "objective/non_score_reward": -1.34035325050354, "objective/rlhf_reward": -0.961412942409515, "objective/scores": 1.1, "policy/approxkl_avg": 187.52792358398438, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6351895332336426, "step": 113, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9988791942596436 }, { "episode": 1840, "epoch": 0.03307330049969443, "loss/policy_avg": 0.3651992380619049, "lr": 2.9781441717791413e-06, "objective/entropy": 95.998779296875, "objective/kl": 13.46788215637207, "objective/non_score_reward": -1.3467882871627808, "objective/rlhf_reward": -7.387153148651123, "objective/scores": -0.5, "policy/approxkl_avg": 247.23101806640625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6072109937667847, "step": 114, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9954397678375244 }, { "episode": 1856, "epoch": 0.03336089441708308, "loss/policy_avg": 0.3738645017147064, "lr": 2.97795245398773e-06, "objective/entropy": 247.19410705566406, "objective/kl": 14.608449935913086, "objective/non_score_reward": -1.4608449935913086, "objective/rlhf_reward": -7.843379974365234, "objective/scores": -0.5, "policy/approxkl_avg": 280.8343505859375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7322804927825928, "step": 115, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9977314472198486 }, { "episode": 1872, "epoch": 0.033648488334471725, "loss/policy_avg": 0.39061659574508667, "lr": 2.9777607361963193e-06, "objective/entropy": 58.74927520751953, "objective/kl": 11.686922073364258, "objective/non_score_reward": -1.1686923503875732, "objective/rlhf_reward": -6.674769401550293, "objective/scores": -0.5, "policy/approxkl_avg": 226.12789916992188, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.48344868421554565, "step": 116, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9999885559082031 }, { "episode": 1888, "epoch": 0.03393608225186037, "loss/policy_avg": 0.3333742022514343, "lr": 2.977569018404908e-06, "objective/entropy": -19.94247055053711, "objective/kl": 9.790740966796875, "objective/non_score_reward": -0.979074239730835, "objective/rlhf_reward": -0.9925778030764787, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 72.29800415039062, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5458022952079773, "step": 117, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.997154951095581 }, { "episode": 1904, "epoch": 0.03422367616924902, "loss/policy_avg": 0.09615316987037659, "lr": 2.977377300613497e-06, "objective/entropy": -167.89923095703125, "objective/kl": 8.815143585205078, "objective/non_score_reward": -0.8815143704414368, "objective/rlhf_reward": -3.1260574519634243, "objective/scores": 0.1, "policy/approxkl_avg": 45.062522888183594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8456206321716309, "step": 118, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0000126361846924 }, { "episode": 1920, "epoch": 0.034511270086637666, "loss/policy_avg": 0.08710992336273193, "lr": 2.977185582822086e-06, "objective/entropy": 263.03179931640625, "objective/kl": 11.179251670837402, "objective/non_score_reward": -1.1179251670837402, "objective/rlhf_reward": -4.071700690686702, "objective/scores": 0.1, "policy/approxkl_avg": 161.03988647460938, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7780265808105469, "step": 119, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9994699954986572 }, { "episode": 1936, "epoch": 0.03479886400402631, "loss/policy_avg": 0.13407912850379944, "lr": 2.976993865030675e-06, "objective/entropy": -70.0582504272461, "objective/kl": 6.793869972229004, "objective/non_score_reward": -0.6793869733810425, "objective/rlhf_reward": 0.2061711356628213, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 59.43596649169922, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7297772169113159, "step": 120, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998098611831665 }, { "episode": 1952, "epoch": 0.03508645792141496, "loss/policy_avg": 0.06745412945747375, "lr": 2.9768021472392642e-06, "objective/entropy": 70.17347717285156, "objective/kl": 9.706807136535645, "objective/non_score_reward": -0.9706807136535645, "objective/rlhf_reward": -1.9353115958737686, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 101.62091827392578, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5260573625564575, "step": 121, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0000791549682617 }, { "episode": 1968, "epoch": 0.03537405183880361, "loss/policy_avg": 0.056749723851680756, "lr": 2.976610429447853e-06, "objective/entropy": -228.255615234375, "objective/kl": 5.5145463943481445, "objective/non_score_reward": -0.5514546632766724, "objective/rlhf_reward": -1.8058185189962388, "objective/scores": 0.1, "policy/approxkl_avg": 43.354915618896484, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6759341955184937, "step": 122, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0004634857177734 }, { "episode": 1984, "epoch": 0.03566164575619225, "loss/policy_avg": 0.08238844573497772, "lr": 2.9764187116564414e-06, "objective/entropy": 241.84060668945312, "objective/kl": 9.057453155517578, "objective/non_score_reward": -0.905745267868042, "objective/rlhf_reward": -5.622981071472168, "objective/scores": -0.5, "policy/approxkl_avg": 59.88520050048828, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6288118362426758, "step": 123, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0009031295776367 }, { "episode": 2000, "epoch": 0.03594923967358091, "loss/policy_avg": 0.5170639753341675, "lr": 2.9762269938650307e-06, "objective/entropy": -229.75860595703125, "objective/kl": 6.897617816925049, "objective/non_score_reward": -0.6897618174552917, "objective/rlhf_reward": 0.1646718115198883, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 75.08253479003906, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.8155406713485718, "step": 124, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9993102550506592 }, { "episode": 2016, "epoch": 0.036236833590969554, "loss/policy_avg": 0.23536163568496704, "lr": 2.9760352760736195e-06, "objective/entropy": 104.1327896118164, "objective/kl": 8.855351448059082, "objective/non_score_reward": -0.8855351209640503, "objective/rlhf_reward": -0.6184214695703713, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 104.30943298339844, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7742270231246948, "step": 125, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.001922607421875 }, { "episode": 2032, "epoch": 0.0365244275083582, "loss/policy_avg": 0.3929940164089203, "lr": 2.9758435582822087e-06, "objective/entropy": 19.527324676513672, "objective/kl": 7.849102973937988, "objective/non_score_reward": -0.7849102020263672, "objective/rlhf_reward": -0.7396408528089523, "objective/scores": 0.6, "policy/approxkl_avg": 43.17992401123047, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.49572157859802246, "step": 126, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000518321990967 }, { "episode": 2048, "epoch": 0.03681202142574685, "loss/policy_avg": 0.02761128917336464, "lr": 2.9756518404907975e-06, "objective/entropy": -31.197162628173828, "objective/kl": 11.224246978759766, "objective/non_score_reward": -1.122424602508545, "objective/rlhf_reward": -6.48969841003418, "objective/scores": -0.5, "policy/approxkl_avg": 167.82305908203125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6054507493972778, "step": 127, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998534917831421 }, { "episode": 2064, "epoch": 0.037099615343135495, "loss/policy_avg": 0.8324223756790161, "lr": 2.9754601226993863e-06, "objective/entropy": -79.34418487548828, "objective/kl": 9.576016426086426, "objective/non_score_reward": -0.95760178565979, "objective/rlhf_reward": -5.83040714263916, "objective/scores": -0.5, "policy/approxkl_avg": 189.614990234375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6984212398529053, "step": 128, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9968839883804321 }, { "episode": 2080, "epoch": 0.03738720926052414, "loss/policy_avg": 0.13773420453071594, "lr": 2.9752684049079756e-06, "objective/entropy": 29.27914047241211, "objective/kl": 6.572282791137695, "objective/non_score_reward": -0.6572283506393433, "objective/rlhf_reward": -1.0247934012749966, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 35.22698974609375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4145284593105316, "step": 129, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000413417816162 }, { "episode": 2096, "epoch": 0.03767480317791279, "loss/policy_avg": 0.14625512063503265, "lr": 2.9750766871165644e-06, "objective/entropy": 248.92510986328125, "objective/kl": 8.044551849365234, "objective/non_score_reward": -0.8044552803039551, "objective/rlhf_reward": -5.21782112121582, "objective/scores": -0.5, "policy/approxkl_avg": 65.03269958496094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7755135893821716, "step": 130, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9952176809310913 }, { "episode": 2112, "epoch": 0.037962397095301435, "loss/policy_avg": 0.08024582266807556, "lr": 2.9748849693251536e-06, "objective/entropy": 122.44419860839844, "objective/kl": 17.353057861328125, "objective/non_score_reward": -1.735305666923523, "objective/rlhf_reward": -6.541222697496414, "objective/scores": 0.1, "policy/approxkl_avg": 378.7543029785156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7671464681625366, "step": 131, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9952489137649536 }, { "episode": 2128, "epoch": 0.03824999101269008, "loss/policy_avg": 0.052941206842660904, "lr": 2.9746932515337424e-06, "objective/entropy": 12.529216766357422, "objective/kl": 17.990718841552734, "objective/non_score_reward": -1.799072027206421, "objective/rlhf_reward": -2.7962879896163937, "objective/scores": 1.1, "policy/approxkl_avg": 328.53741455078125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.543836236000061, "step": 132, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9990971088409424 }, { "episode": 2144, "epoch": 0.03853758493007873, "loss/policy_avg": 0.08809210360050201, "lr": 2.9745015337423312e-06, "objective/entropy": 92.10977172851562, "objective/kl": 3.5646705627441406, "objective/non_score_reward": -0.35646697878837585, "objective/rlhf_reward": 2.974132025241852, "objective/scores": 1.1, "policy/approxkl_avg": 13.450565338134766, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5211790800094604, "step": 133, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0005075931549072 }, { "episode": 2160, "epoch": 0.038825178847467376, "loss/policy_avg": 0.25067800283432007, "lr": 2.9743098159509205e-06, "objective/entropy": -342.741455078125, "objective/kl": 13.730775833129883, "objective/non_score_reward": -1.373077630996704, "objective/rlhf_reward": -5.092310494184494, "objective/scores": 0.1, "policy/approxkl_avg": 226.23080444335938, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7523109912872314, "step": 134, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9971081018447876 }, { "episode": 2176, "epoch": 0.03911277276485602, "loss/policy_avg": -0.05823849141597748, "lr": 2.9741180981595093e-06, "objective/entropy": 177.39581298828125, "objective/kl": 2.4112541675567627, "objective/non_score_reward": -0.2411254346370697, "objective/rlhf_reward": -2.9645018577575684, "objective/scores": -0.5, "policy/approxkl_avg": 5.3866167068481445, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.4305086135864258, "step": 135, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0287272930145264 }, { "episode": 2192, "epoch": 0.03940036668224467, "loss/policy_avg": 0.39458757638931274, "lr": 2.973926380368098e-06, "objective/entropy": -71.98441314697266, "objective/kl": 11.294361114501953, "objective/non_score_reward": -1.1294360160827637, "objective/rlhf_reward": -1.5940250202429025, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 172.00338745117188, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7612372040748596, "step": 136, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9963352680206299 }, { "episode": 2208, "epoch": 0.039687960599633317, "loss/policy_avg": 0.6298972368240356, "lr": 2.9737346625766873e-06, "objective/entropy": -57.89506530761719, "objective/kl": 8.208264350891113, "objective/non_score_reward": -0.8208264112472534, "objective/rlhf_reward": -2.8833055704832073, "objective/scores": 0.1, "policy/approxkl_avg": 89.37754821777344, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.817412257194519, "step": 137, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9989038705825806 }, { "episode": 2224, "epoch": 0.03997555451702196, "loss/policy_avg": 0.07556813955307007, "lr": 2.973542944785276e-06, "objective/entropy": -190.96238708496094, "objective/kl": 15.059877395629883, "objective/non_score_reward": -1.5059877634048462, "objective/rlhf_reward": -1.6239511057734486, "objective/scores": 1.1, "policy/approxkl_avg": 377.60052490234375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6793420314788818, "step": 138, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9978358745574951 }, { "episode": 2240, "epoch": 0.04026314843441061, "loss/policy_avg": 0.1697106957435608, "lr": 2.9733512269938653e-06, "objective/entropy": 53.708675384521484, "objective/kl": 8.52203369140625, "objective/non_score_reward": -0.8522033095359802, "objective/rlhf_reward": -3.0088131859898564, "objective/scores": 0.1, "policy/approxkl_avg": 61.883544921875, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5295162796974182, "step": 139, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9995434284210205 }, { "episode": 2256, "epoch": 0.04055074235179926, "loss/policy_avg": 0.4851709008216858, "lr": 2.973159509202454e-06, "objective/entropy": -114.12245178222656, "objective/kl": 8.938103675842285, "objective/non_score_reward": -0.8938103914260864, "objective/rlhf_reward": -3.1752414911985394, "objective/scores": 0.1, "policy/approxkl_avg": 119.33918762207031, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.567252516746521, "step": 140, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9977459907531738 }, { "episode": 2272, "epoch": 0.040838336269187904, "loss/policy_avg": 0.011087119579315186, "lr": 2.972967791411043e-06, "objective/entropy": -39.64904022216797, "objective/kl": 10.655853271484375, "objective/non_score_reward": -1.0655853748321533, "objective/rlhf_reward": -2.139635088221107, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 89.38186645507812, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6346065998077393, "step": 141, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0012426376342773 }, { "episode": 2288, "epoch": 0.04112593018657655, "loss/policy_avg": 0.18634071946144104, "lr": 2.972776073619632e-06, "objective/entropy": -234.11532592773438, "objective/kl": 9.971248626708984, "objective/non_score_reward": -0.9971247911453247, "objective/rlhf_reward": -5.988499641418457, "objective/scores": -0.5, "policy/approxkl_avg": 161.76507568359375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6083584427833557, "step": 142, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9975717067718506 }, { "episode": 2304, "epoch": 0.0414135241039652, "loss/policy_avg": 0.842505693435669, "lr": 2.972584355828221e-06, "objective/entropy": 146.90762329101562, "objective/kl": 12.336867332458496, "objective/non_score_reward": -1.2336868047714233, "objective/rlhf_reward": -4.534747010469436, "objective/scores": 0.1, "policy/approxkl_avg": 239.418701171875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4192795753479004, "step": 143, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9997382164001465 }, { "episode": 2320, "epoch": 0.04170111802135385, "loss/policy_avg": 0.6685837507247925, "lr": 2.9723926380368102e-06, "objective/entropy": 41.359195709228516, "objective/kl": 9.3118896484375, "objective/non_score_reward": -0.9311891794204712, "objective/rlhf_reward": 0.6752433419227604, "objective/scores": 1.1, "policy/approxkl_avg": 151.73721313476562, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5029563307762146, "step": 144, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9993720054626465 }, { "episode": 2336, "epoch": 0.0419887119387425, "loss/policy_avg": 0.2414681613445282, "lr": 2.9722009202453986e-06, "objective/entropy": -22.124481201171875, "objective/kl": 10.415138244628906, "objective/non_score_reward": -1.0415138006210327, "objective/rlhf_reward": -3.7660551875829693, "objective/scores": 0.1, "policy/approxkl_avg": 117.5968246459961, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5334524512290955, "step": 145, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998509407043457 }, { "episode": 2352, "epoch": 0.042276305856131145, "loss/policy_avg": -0.1104317232966423, "lr": 2.972009202453988e-06, "objective/entropy": 89.73234558105469, "objective/kl": 9.888954162597656, "objective/non_score_reward": -0.9888954162597656, "objective/rlhf_reward": -3.5555818438529965, "objective/scores": 0.1, "policy/approxkl_avg": 113.99063110351562, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5904539823532104, "step": 146, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.002488851547241 }, { "episode": 2368, "epoch": 0.04256389977351979, "loss/policy_avg": 0.6591033935546875, "lr": 2.9718174846625767e-06, "objective/entropy": 149.4038543701172, "objective/kl": 10.047257423400879, "objective/non_score_reward": -1.0047259330749512, "objective/rlhf_reward": -6.018903732299805, "objective/scores": -0.5, "policy/approxkl_avg": 178.25811767578125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.9356397986412048, "step": 147, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.997607707977295 }, { "episode": 2384, "epoch": 0.04285149369090844, "loss/policy_avg": 0.544201135635376, "lr": 2.9716257668711655e-06, "objective/entropy": 132.01980590820312, "objective/kl": 9.59277629852295, "objective/non_score_reward": -0.9592776298522949, "objective/rlhf_reward": -3.437110504508018, "objective/scores": 0.1, "policy/approxkl_avg": 97.24102783203125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8296840190887451, "step": 148, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0014381408691406 }, { "episode": 2400, "epoch": 0.043139087608297086, "loss/policy_avg": 0.36106303334236145, "lr": 2.9714340490797547e-06, "objective/entropy": 260.59033203125, "objective/kl": 11.327485084533691, "objective/non_score_reward": -1.1327484846115112, "objective/rlhf_reward": -2.4082878849664073, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 112.34231567382812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6541630029678345, "step": 149, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9988218545913696 }, { "episode": 2416, "epoch": 0.04342668152568573, "loss/policy_avg": 0.30818748474121094, "lr": 2.9712423312883435e-06, "objective/entropy": 167.4129180908203, "objective/kl": 9.699304580688477, "objective/non_score_reward": -0.9699304103851318, "objective/rlhf_reward": -5.879721641540527, "objective/scores": -0.5, "policy/approxkl_avg": 82.35363006591797, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6681854724884033, "step": 150, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9972755908966064 }, { "episode": 2432, "epoch": 0.04371427544307438, "loss/policy_avg": 0.6923952102661133, "lr": 2.9710506134969323e-06, "objective/entropy": 48.51850128173828, "objective/kl": 7.859927177429199, "objective/non_score_reward": -0.7859926223754883, "objective/rlhf_reward": -2.743970593810081, "objective/scores": 0.1, "policy/approxkl_avg": 103.42765808105469, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7212280035018921, "step": 151, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9981834888458252 }, { "episode": 2448, "epoch": 0.044001869360463026, "loss/policy_avg": 0.11756162345409393, "lr": 2.9708588957055216e-06, "objective/entropy": -18.197547912597656, "objective/kl": 9.619758605957031, "objective/non_score_reward": -0.9619758129119873, "objective/rlhf_reward": -2.243783268992024, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 92.98655700683594, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6553933620452881, "step": 152, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9970009326934814 }, { "episode": 2464, "epoch": 0.04428946327785167, "loss/policy_avg": 0.1137843132019043, "lr": 2.9706671779141104e-06, "objective/entropy": 120.47866821289062, "objective/kl": 12.719396591186523, "objective/non_score_reward": -1.2719398736953735, "objective/rlhf_reward": -4.687759546935558, "objective/scores": 0.1, "policy/approxkl_avg": 160.37051391601562, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5731196403503418, "step": 153, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0005669593811035 }, { "episode": 2480, "epoch": 0.04457705719524032, "loss/policy_avg": 0.13933295011520386, "lr": 2.9704754601226996e-06, "objective/entropy": -150.15121459960938, "objective/kl": 5.141759395599365, "objective/non_score_reward": -0.5141758918762207, "objective/rlhf_reward": -4.056703567504883, "objective/scores": -0.5, "policy/approxkl_avg": 30.49239730834961, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7959345579147339, "step": 154, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0006468296051025 }, { "episode": 2496, "epoch": 0.04486465111262897, "loss/policy_avg": 0.0854576975107193, "lr": 2.9702837423312884e-06, "objective/entropy": 143.63348388671875, "objective/kl": 11.670942306518555, "objective/non_score_reward": -1.1670942306518555, "objective/rlhf_reward": -4.26837727278471, "objective/scores": 0.1, "policy/approxkl_avg": 232.97808837890625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.9138456583023071, "step": 155, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9975340366363525 }, { "episode": 2512, "epoch": 0.045152245030017614, "loss/policy_avg": 0.026141434907913208, "lr": 2.9700920245398772e-06, "objective/entropy": -4.6529083251953125, "objective/kl": 7.081835746765137, "objective/non_score_reward": -0.7081836462020874, "objective/rlhf_reward": -4.832734107971191, "objective/scores": -0.5, "policy/approxkl_avg": 31.927255630493164, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6679246425628662, "step": 156, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9991931915283203 }, { "episode": 2528, "epoch": 0.04543983894740626, "loss/policy_avg": 0.15662409365177155, "lr": 2.9699003067484665e-06, "objective/entropy": 175.9498291015625, "objective/kl": 6.773474216461182, "objective/non_score_reward": -0.6773474216461182, "objective/rlhf_reward": -2.3093897461891175, "objective/scores": 0.1, "policy/approxkl_avg": 34.37593078613281, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.47304588556289673, "step": 157, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9973795413970947 }, { "episode": 2544, "epoch": 0.04572743286479491, "loss/policy_avg": 0.23720163106918335, "lr": 2.9697085889570553e-06, "objective/entropy": -89.0332260131836, "objective/kl": 8.619776725769043, "objective/non_score_reward": -0.8619776964187622, "objective/rlhf_reward": -0.5241918011915412, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 29.201887130737305, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6532964110374451, "step": 158, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9988105297088623 }, { "episode": 2560, "epoch": 0.046015026782183555, "loss/policy_avg": 0.2267841100692749, "lr": 2.9695168711656445e-06, "objective/entropy": 1.6421661376953125, "objective/kl": 9.304061889648438, "objective/non_score_reward": -0.9304060935974121, "objective/rlhf_reward": 0.6783757746219639, "objective/scores": 1.1, "policy/approxkl_avg": 34.62976837158203, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7413831949234009, "step": 159, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9987226724624634 }, { "episode": 2576, "epoch": 0.0463026206995722, "loss/policy_avg": 0.5093711614608765, "lr": 2.9693251533742333e-06, "objective/entropy": -36.8858528137207, "objective/kl": 11.298078536987305, "objective/non_score_reward": -1.1298078298568726, "objective/rlhf_reward": -6.51923131942749, "objective/scores": -0.5, "policy/approxkl_avg": 135.19918823242188, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6970053911209106, "step": 160, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0019335746765137 }, { "episode": 2592, "epoch": 0.04659021461696085, "loss/policy_avg": 0.17401546239852905, "lr": 2.969133435582822e-06, "objective/entropy": 172.7603759765625, "objective/kl": 9.036718368530273, "objective/non_score_reward": -0.9036718606948853, "objective/rlhf_reward": -3.2146874427795407, "objective/scores": 0.1, "policy/approxkl_avg": 27.195972442626953, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5838289260864258, "step": 161, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9981424808502197 }, { "episode": 2608, "epoch": 0.046877808534349495, "loss/policy_avg": 0.44493553042411804, "lr": 2.9689417177914114e-06, "objective/entropy": -19.54338836669922, "objective/kl": 11.437591552734375, "objective/non_score_reward": -1.1437591314315796, "objective/rlhf_reward": -6.57503604888916, "objective/scores": -0.5, "policy/approxkl_avg": 146.39810180664062, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.468772292137146, "step": 162, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998663067817688 }, { "episode": 2624, "epoch": 0.04716540245173815, "loss/policy_avg": 2.674943447113037, "lr": 2.96875e-06, "objective/entropy": -46.63656234741211, "objective/kl": 3.6555447578430176, "objective/non_score_reward": -0.3655545115470886, "objective/rlhf_reward": 0.48519318274981194, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 8.321266174316406, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8199789524078369, "step": 163, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0079751014709473 }, { "episode": 2640, "epoch": 0.047452996369126796, "loss/policy_avg": 0.1934666782617569, "lr": 2.968558282208589e-06, "objective/entropy": -81.56670379638672, "objective/kl": 12.988276481628418, "objective/non_score_reward": -1.2988277673721313, "objective/rlhf_reward": -2.2715921520602436, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 293.0709228515625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7369424104690552, "step": 164, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.997572898864746 }, { "episode": 2656, "epoch": 0.04774059028651544, "loss/policy_avg": 0.15999506413936615, "lr": 2.968366564417178e-06, "objective/entropy": 46.440673828125, "objective/kl": 13.255866050720215, "objective/non_score_reward": -1.3255865573883057, "objective/rlhf_reward": -7.302346229553223, "objective/scores": -0.5, "policy/approxkl_avg": 191.44906616210938, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8060017824172974, "step": 165, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9992680549621582 }, { "episode": 2672, "epoch": 0.04802818420390409, "loss/policy_avg": -0.11308600753545761, "lr": 2.968174846625767e-06, "objective/entropy": 229.09007263183594, "objective/kl": 10.140392303466797, "objective/non_score_reward": -1.0140392780303955, "objective/rlhf_reward": -3.6561572611331936, "objective/scores": 0.1, "policy/approxkl_avg": 81.9848861694336, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.8131240606307983, "step": 166, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0012285709381104 }, { "episode": 2688, "epoch": 0.048315778121292736, "loss/policy_avg": 0.2457667887210846, "lr": 2.967983128834356e-06, "objective/entropy": -224.82394409179688, "objective/kl": 5.642745018005371, "objective/non_score_reward": -0.5642745494842529, "objective/rlhf_reward": 0.666621024965075, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 43.03296661376953, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.596450686454773, "step": 167, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9965918064117432 }, { "episode": 2704, "epoch": 0.04860337203868138, "loss/policy_avg": 0.16231290996074677, "lr": 2.9677914110429446e-06, "objective/entropy": 53.96503829956055, "objective/kl": 9.837738037109375, "objective/non_score_reward": -0.9837738275527954, "objective/rlhf_reward": -5.935094833374023, "objective/scores": -0.5, "policy/approxkl_avg": 144.24468994140625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5579663515090942, "step": 168, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9979324340820312 }, { "episode": 2720, "epoch": 0.04889096595607003, "loss/policy_avg": 0.5551764369010925, "lr": 2.967599693251534e-06, "objective/entropy": -89.17186737060547, "objective/kl": 4.356380939483643, "objective/non_score_reward": -0.4356381893157959, "objective/rlhf_reward": 0.08227607312529184, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 33.07392120361328, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.44729527831077576, "step": 169, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9999492168426514 }, { "episode": 2736, "epoch": 0.04917855987345868, "loss/policy_avg": 0.4932054579257965, "lr": 2.9674079754601227e-06, "objective/entropy": -11.498092651367188, "objective/kl": 10.226805686950684, "objective/non_score_reward": -1.022680401802063, "objective/rlhf_reward": -1.1670026972305505, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 102.75898742675781, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6330607533454895, "step": 170, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9996318817138672 }, { "episode": 2752, "epoch": 0.049466153790847324, "loss/policy_avg": 0.1898762285709381, "lr": 2.9672162576687115e-06, "objective/entropy": -70.60189056396484, "objective/kl": 9.331042289733887, "objective/non_score_reward": -0.9331042766571045, "objective/rlhf_reward": -3.3324170172214505, "objective/scores": 0.1, "policy/approxkl_avg": 114.848388671875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6309188604354858, "step": 171, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0009915828704834 }, { "episode": 2768, "epoch": 0.04975374770823597, "loss/policy_avg": 0.2297024428844452, "lr": 2.9670245398773007e-06, "objective/entropy": 16.196762084960938, "objective/kl": 11.903242111206055, "objective/non_score_reward": -1.1903241872787476, "objective/rlhf_reward": -2.638590636030708, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 139.62954711914062, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.705674946308136, "step": 172, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9981346130371094 }, { "episode": 2784, "epoch": 0.05004134162562462, "loss/policy_avg": 0.39737239480018616, "lr": 2.9668328220858895e-06, "objective/entropy": -89.89057922363281, "objective/kl": 7.809091567993164, "objective/non_score_reward": -0.7809092402458191, "objective/rlhf_reward": -5.123636722564697, "objective/scores": -0.5, "policy/approxkl_avg": 93.33058166503906, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.9152443408966064, "step": 173, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9968818426132202 }, { "episode": 2800, "epoch": 0.050328935543013265, "loss/policy_avg": 0.06229601055383682, "lr": 2.9666411042944783e-06, "objective/entropy": 105.8713607788086, "objective/kl": 10.667573928833008, "objective/non_score_reward": -1.0667574405670166, "objective/rlhf_reward": -2.6629095709958843, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 72.16740417480469, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5240480899810791, "step": 174, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9984283447265625 }, { "episode": 2816, "epoch": 0.05061652946040191, "loss/policy_avg": 0.23533995449543, "lr": 2.9664493865030676e-06, "objective/entropy": 73.5090103149414, "objective/kl": 8.236711502075195, "objective/non_score_reward": -0.8236711621284485, "objective/rlhf_reward": 1.1053153514862064, "objective/scores": 1.1, "policy/approxkl_avg": 86.34696197509766, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.549429714679718, "step": 175, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9980177879333496 }, { "episode": 2832, "epoch": 0.05090412337779056, "loss/policy_avg": 0.23864325881004333, "lr": 2.9662576687116564e-06, "objective/entropy": 44.60013198852539, "objective/kl": 11.681440353393555, "objective/non_score_reward": -1.1681439876556396, "objective/rlhf_reward": -4.272575950622558, "objective/scores": 0.1, "policy/approxkl_avg": 141.5290985107422, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6180237531661987, "step": 176, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.996826410293579 }, { "episode": 2848, "epoch": 0.051191717295179205, "loss/policy_avg": 0.12463901191949844, "lr": 2.9660659509202456e-06, "objective/entropy": -185.71621704101562, "objective/kl": 6.215152263641357, "objective/non_score_reward": -0.6215152740478516, "objective/rlhf_reward": 1.9139389447867874, "objective/scores": 1.1, "policy/approxkl_avg": 31.746864318847656, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6655900478363037, "step": 177, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9984078407287598 }, { "episode": 2864, "epoch": 0.05147931121256785, "loss/policy_avg": 0.41434115171432495, "lr": 2.9658742331288344e-06, "objective/entropy": 125.62101745605469, "objective/kl": 9.051776885986328, "objective/non_score_reward": -0.9051777124404907, "objective/rlhf_reward": -1.2207108795642851, "objective/scores": 0.6, "policy/approxkl_avg": 89.37495422363281, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6031736731529236, "step": 178, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9960637092590332 }, { "episode": 2880, "epoch": 0.0517669051299565, "loss/policy_avg": 0.0819600522518158, "lr": 2.9656825153374232e-06, "objective/entropy": 167.4649658203125, "objective/kl": 11.02088737487793, "objective/non_score_reward": -1.1020888090133667, "objective/rlhf_reward": -2.7464959226256473, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 87.59506225585938, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5513530969619751, "step": 179, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9990684986114502 }, { "episode": 2896, "epoch": 0.052054499047345146, "loss/policy_avg": 0.28252676129341125, "lr": 2.9654907975460125e-06, "objective/entropy": -71.35255432128906, "objective/kl": 6.714944362640381, "objective/non_score_reward": -0.6714943647384644, "objective/rlhf_reward": 1.7140224814414982, "objective/scores": 1.1, "policy/approxkl_avg": 61.805137634277344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5994826555252075, "step": 180, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9976210594177246 }, { "episode": 2912, "epoch": 0.05234209296473379, "loss/policy_avg": 0.0703403651714325, "lr": 2.9652990797546013e-06, "objective/entropy": -239.35452270507812, "objective/kl": 10.807499885559082, "objective/non_score_reward": -1.0807499885559082, "objective/rlhf_reward": -6.322999954223633, "objective/scores": -0.5, "policy/approxkl_avg": 269.2040710449219, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.626622200012207, "step": 181, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9963631629943848 }, { "episode": 2928, "epoch": 0.052629686882122446, "loss/policy_avg": 0.5756047964096069, "lr": 2.9651073619631905e-06, "objective/entropy": 81.2969741821289, "objective/kl": 9.503179550170898, "objective/non_score_reward": -0.9503180384635925, "objective/rlhf_reward": -1.6785659066596368, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 69.6463394165039, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4174951910972595, "step": 182, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000248432159424 }, { "episode": 2944, "epoch": 0.05291728079951109, "loss/policy_avg": 0.19677188992500305, "lr": 2.9649156441717793e-06, "objective/entropy": 97.329345703125, "objective/kl": 7.023814678192139, "objective/non_score_reward": -0.702381432056427, "objective/rlhf_reward": -4.809525966644287, "objective/scores": -0.5, "policy/approxkl_avg": 37.869384765625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5866813659667969, "step": 183, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9983813762664795 }, { "episode": 2960, "epoch": 0.05320487471689974, "loss/policy_avg": -0.09021998941898346, "lr": 2.964723926380368e-06, "objective/entropy": -104.08444213867188, "objective/kl": 8.810539245605469, "objective/non_score_reward": -0.8810538649559021, "objective/rlhf_reward": -3.1242154896259304, "objective/scores": 0.1, "policy/approxkl_avg": 64.01618957519531, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6541459560394287, "step": 184, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9997464418411255 }, { "episode": 2976, "epoch": 0.05349246863428839, "loss/policy_avg": 0.7093124389648438, "lr": 2.9645322085889574e-06, "objective/entropy": -2.2389583587646484, "objective/kl": 12.9491548538208, "objective/non_score_reward": -1.2949154376983643, "objective/rlhf_reward": -3.056955727116142, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 159.49282836914062, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5769963264465332, "step": 185, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998090386390686 }, { "episode": 2992, "epoch": 0.053780062551677034, "loss/policy_avg": 0.6172127723693848, "lr": 2.964340490797546e-06, "objective/entropy": 96.50690460205078, "objective/kl": 9.217771530151367, "objective/non_score_reward": -0.9217771291732788, "objective/rlhf_reward": 0.7128913417458538, "objective/scores": 1.1, "policy/approxkl_avg": 116.59093475341797, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5699312686920166, "step": 186, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9982131719589233 }, { "episode": 3008, "epoch": 0.05406765646906568, "loss/policy_avg": 0.1616460084915161, "lr": 2.964148773006135e-06, "objective/entropy": -13.621841430664062, "objective/kl": 11.028844833374023, "objective/non_score_reward": -1.1028845310211182, "objective/rlhf_reward": -2.8074182084837727, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 125.17231750488281, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6993151903152466, "step": 187, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9992413520812988 }, { "episode": 3024, "epoch": 0.05435525038645433, "loss/policy_avg": -0.022246820852160454, "lr": 2.9639570552147242e-06, "objective/entropy": -95.69093322753906, "objective/kl": 8.957221031188965, "objective/non_score_reward": -0.8957222700119019, "objective/rlhf_reward": -3.1828890204429623, "objective/scores": 0.1, "policy/approxkl_avg": 129.6190948486328, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7190344333648682, "step": 188, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0001869201660156 }, { "episode": 3040, "epoch": 0.054642844303842975, "loss/policy_avg": 0.3806031346321106, "lr": 2.9637653374233126e-06, "objective/entropy": 31.125537872314453, "objective/kl": 14.289737701416016, "objective/non_score_reward": -1.428973913192749, "objective/rlhf_reward": -5.315895533561706, "objective/scores": 0.1, "policy/approxkl_avg": 252.6319122314453, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4689924716949463, "step": 189, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998744010925293 }, { "episode": 3056, "epoch": 0.05493043822123162, "loss/policy_avg": 0.38266557455062866, "lr": 2.963573619631902e-06, "objective/entropy": -70.03289794921875, "objective/kl": 11.780672073364258, "objective/non_score_reward": -1.1780673265457153, "objective/rlhf_reward": -1.7885502024900646, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 211.99916076660156, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.736452043056488, "step": 190, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998718023300171 }, { "episode": 3072, "epoch": 0.05521803213862027, "loss/policy_avg": -0.1702420711517334, "lr": 2.9633819018404906e-06, "objective/entropy": -88.13114166259766, "objective/kl": 5.207786560058594, "objective/non_score_reward": -0.5207787156105042, "objective/rlhf_reward": -0.13570352919572182, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 66.82286834716797, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6916394233703613, "step": 191, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0128471851348877 }, { "episode": 3088, "epoch": 0.055505626056008915, "loss/policy_avg": 0.47809040546417236, "lr": 2.96319018404908e-06, "objective/entropy": -146.3225555419922, "objective/kl": 9.557235717773438, "objective/non_score_reward": -0.9557235240936279, "objective/rlhf_reward": -1.8754830089973764, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 62.17351531982422, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6510109901428223, "step": 192, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9980770349502563 }, { "episode": 3104, "epoch": 0.05579321997339756, "loss/policy_avg": 0.695833683013916, "lr": 2.9629984662576687e-06, "objective/entropy": 100.23960876464844, "objective/kl": 9.5051908493042, "objective/non_score_reward": -0.9505190849304199, "objective/rlhf_reward": -3.4020764887332913, "objective/scores": 0.1, "policy/approxkl_avg": 92.06460571289062, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7171235084533691, "step": 193, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999764084815979 }, { "episode": 3120, "epoch": 0.05608081389078621, "loss/policy_avg": 0.16662254929542542, "lr": 2.9628067484662575e-06, "objective/entropy": 166.84793090820312, "objective/kl": 12.172272682189941, "objective/non_score_reward": -1.2172273397445679, "objective/rlhf_reward": -4.4689091391861435, "objective/scores": 0.1, "policy/approxkl_avg": 133.61741638183594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5888961553573608, "step": 194, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9996709823608398 }, { "episode": 3136, "epoch": 0.056368407808174856, "loss/policy_avg": 0.7391001582145691, "lr": 2.9626150306748467e-06, "objective/entropy": 7.757408142089844, "objective/kl": 4.110318183898926, "objective/non_score_reward": -0.4110318422317505, "objective/rlhf_reward": 2.755872675776482, "objective/scores": 1.1, "policy/approxkl_avg": 21.918201446533203, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6208719611167908, "step": 195, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999298095703125 }, { "episode": 3152, "epoch": 0.0566560017255635, "loss/policy_avg": -0.04099520295858383, "lr": 2.9624233128834355e-06, "objective/entropy": -164.4539337158203, "objective/kl": 7.288479328155518, "objective/non_score_reward": -0.7288479804992676, "objective/rlhf_reward": 1.4846080929040912, "objective/scores": 1.1, "policy/approxkl_avg": 88.62603759765625, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7371472120285034, "step": 196, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9998602867126465 }, { "episode": 3168, "epoch": 0.05694359564295215, "loss/policy_avg": 0.10230283439159393, "lr": 2.9622315950920248e-06, "objective/entropy": -6.653453826904297, "objective/kl": 10.225364685058594, "objective/non_score_reward": -1.0225365161895752, "objective/rlhf_reward": -2.265317554744791, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 116.62469482421875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6673995852470398, "step": 197, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9955484867095947 }, { "episode": 3184, "epoch": 0.057231189560340796, "loss/policy_avg": 0.1610383689403534, "lr": 2.9620398773006136e-06, "objective/entropy": 83.79240417480469, "objective/kl": 6.000893592834473, "objective/non_score_reward": -0.6000893712043762, "objective/rlhf_reward": -2.000357484817505, "objective/scores": 0.1, "policy/approxkl_avg": 38.31947326660156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.49990415573120117, "step": 198, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9981744289398193 }, { "episode": 3200, "epoch": 0.05751878347772944, "loss/policy_avg": 2.8004019260406494, "lr": 2.9618481595092024e-06, "objective/entropy": 105.93596649169922, "objective/kl": 9.928138732910156, "objective/non_score_reward": -0.9928138852119446, "objective/rlhf_reward": 0.4287445038557056, "objective/scores": 1.1, "policy/approxkl_avg": 68.9993667602539, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5279573202133179, "step": 199, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0012660026550293 }, { "episode": 3216, "epoch": 0.05780637739511809, "loss/policy_avg": 0.1186942383646965, "lr": 2.9616564417177916e-06, "objective/entropy": 176.61386108398438, "objective/kl": 15.481854438781738, "objective/non_score_reward": -1.5481854677200317, "objective/rlhf_reward": -8.192741394042969, "objective/scores": -0.5, "policy/approxkl_avg": 287.8005676269531, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8114917278289795, "step": 200, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9980297088623047 }, { "episode": 3232, "epoch": 0.058093971312506744, "loss/policy_avg": 0.12549322843551636, "lr": 2.9614647239263804e-06, "objective/entropy": -43.08008575439453, "objective/kl": 1.7479121685028076, "objective/non_score_reward": -0.17479124665260315, "objective/rlhf_reward": 1.1256637878987084, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 1.0369627475738525, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6038674116134644, "step": 201, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001643657684326 }, { "episode": 3248, "epoch": 0.05838156522989539, "loss/policy_avg": 0.37325456738471985, "lr": 2.9612730061349692e-06, "objective/entropy": -55.67109680175781, "objective/kl": 16.567649841308594, "objective/non_score_reward": -1.6567649841308594, "objective/rlhf_reward": -8.627059936523438, "objective/scores": -0.5, "policy/approxkl_avg": 345.91241455078125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.688873291015625, "step": 202, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9956648349761963 }, { "episode": 3264, "epoch": 0.05866915914728404, "loss/policy_avg": 0.4203230142593384, "lr": 2.9610812883435585e-06, "objective/entropy": -102.62028503417969, "objective/kl": 8.865509033203125, "objective/non_score_reward": -0.8865509629249573, "objective/rlhf_reward": -0.622484960348579, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 93.6322021484375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6493798494338989, "step": 203, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9986227750778198 }, { "episode": 3280, "epoch": 0.058956753064672685, "loss/policy_avg": 0.6964031457901001, "lr": 2.9608895705521473e-06, "objective/entropy": 110.6749496459961, "objective/kl": 17.85771369934082, "objective/non_score_reward": -1.785771131515503, "objective/rlhf_reward": -9.143084526062012, "objective/scores": -0.5, "policy/approxkl_avg": 490.62164306640625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6564576625823975, "step": 204, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9979726076126099 }, { "episode": 3296, "epoch": 0.05924434698206133, "loss/policy_avg": 0.2527036964893341, "lr": 2.9606978527607365e-06, "objective/entropy": -116.59994506835938, "objective/kl": 11.40339469909668, "objective/non_score_reward": -1.1403393745422363, "objective/rlhf_reward": -2.161357662081718, "objective/scores": 0.6, "policy/approxkl_avg": 91.01628112792969, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6173946857452393, "step": 205, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9987362623214722 }, { "episode": 3312, "epoch": 0.05953194089944998, "loss/policy_avg": 0.2560387849807739, "lr": 2.9605061349693253e-06, "objective/entropy": -39.499755859375, "objective/kl": 6.948145866394043, "objective/non_score_reward": -0.6948145627975464, "objective/rlhf_reward": 1.6207415699958805, "objective/scores": 1.1, "policy/approxkl_avg": 74.67678833007812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6708812713623047, "step": 206, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.997892141342163 }, { "episode": 3328, "epoch": 0.059819534816838625, "loss/policy_avg": 0.27111512422561646, "lr": 2.960314417177914e-06, "objective/entropy": 84.36082458496094, "objective/kl": 7.624824523925781, "objective/non_score_reward": -0.7624824643135071, "objective/rlhf_reward": -5.049929618835449, "objective/scores": -0.5, "policy/approxkl_avg": 129.3857879638672, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.859719455242157, "step": 207, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9986274242401123 }, { "episode": 3344, "epoch": 0.06010712873422727, "loss/policy_avg": 0.3739089071750641, "lr": 2.9601226993865034e-06, "objective/entropy": -137.59747314453125, "objective/kl": 7.363832950592041, "objective/non_score_reward": -0.7363832592964172, "objective/rlhf_reward": -2.5455331265926358, "objective/scores": 0.1, "policy/approxkl_avg": 79.08015441894531, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6479380130767822, "step": 208, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9974188804626465 }, { "episode": 3360, "epoch": 0.06039472265161592, "loss/policy_avg": 0.2446056306362152, "lr": 2.959930981595092e-06, "objective/entropy": 87.66815185546875, "objective/kl": 6.449171543121338, "objective/non_score_reward": -0.6449171900749207, "objective/rlhf_reward": -2.1796687602996823, "objective/scores": 0.1, "policy/approxkl_avg": 41.2250862121582, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4830004572868347, "step": 209, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9996240139007568 }, { "episode": 3376, "epoch": 0.060682316569004566, "loss/policy_avg": 0.19204621016979218, "lr": 2.9597392638036814e-06, "objective/entropy": 67.65581512451172, "objective/kl": 9.747137069702148, "objective/non_score_reward": -0.9747136831283569, "objective/rlhf_reward": -3.4988546952605244, "objective/scores": 0.1, "policy/approxkl_avg": 113.63888549804688, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5157696604728699, "step": 210, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9994913339614868 }, { "episode": 3392, "epoch": 0.06096991048639321, "loss/policy_avg": -0.3198900520801544, "lr": 2.9595475460122702e-06, "objective/entropy": 64.95191955566406, "objective/kl": 6.794856071472168, "objective/non_score_reward": -0.6794856190681458, "objective/rlhf_reward": -2.317942655086517, "objective/scores": 0.1, "policy/approxkl_avg": 64.20761108398438, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6942774653434753, "step": 211, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.003568649291992 }, { "episode": 3408, "epoch": 0.06125750440378186, "loss/policy_avg": 0.8114681839942932, "lr": 2.959355828220859e-06, "objective/entropy": 88.22362518310547, "objective/kl": 10.00731372833252, "objective/non_score_reward": -1.0007314682006836, "objective/rlhf_reward": 0.39707423150539434, "objective/scores": 1.1, "policy/approxkl_avg": 100.05908203125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5509470105171204, "step": 212, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9995248317718506 }, { "episode": 3424, "epoch": 0.061545098321170506, "loss/policy_avg": 0.3652896285057068, "lr": 2.959164110429448e-06, "objective/entropy": -220.09613037109375, "objective/kl": 8.2984037399292, "objective/non_score_reward": -0.8298404216766357, "objective/rlhf_reward": -2.919361627101898, "objective/scores": 0.1, "policy/approxkl_avg": 93.64591979980469, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5828021168708801, "step": 213, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000081777572632 }, { "episode": 3440, "epoch": 0.06183269223855915, "loss/policy_avg": 0.028781473636627197, "lr": 2.9589723926380366e-06, "objective/entropy": 72.11604309082031, "objective/kl": 2.359449625015259, "objective/non_score_reward": -0.2359449863433838, "objective/rlhf_reward": -0.5437799677252769, "objective/scores": 0.1, "policy/approxkl_avg": 0.3476608991622925, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4451746940612793, "step": 214, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0019702911376953 }, { "episode": 3456, "epoch": 0.0621202861559478, "loss/policy_avg": 0.5238405466079712, "lr": 2.958780674846626e-06, "objective/entropy": 94.75743103027344, "objective/kl": 12.506973266601562, "objective/non_score_reward": -1.250697374343872, "objective/rlhf_reward": -4.602789735794067, "objective/scores": 0.1, "policy/approxkl_avg": 134.81378173828125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6946749091148376, "step": 215, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9987130165100098 }, { "episode": 3472, "epoch": 0.06240788007333645, "loss/policy_avg": 0.357485830783844, "lr": 2.9585889570552147e-06, "objective/entropy": -56.58507537841797, "objective/kl": 15.03990364074707, "objective/non_score_reward": -1.5039904117584229, "objective/rlhf_reward": -4.068550298886235, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 188.71969604492188, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5828025341033936, "step": 216, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9974546432495117 }, { "episode": 3488, "epoch": 0.0626954739907251, "loss/policy_avg": 0.5269087553024292, "lr": 2.9583972392638035e-06, "objective/entropy": -54.57160949707031, "objective/kl": 10.709373474121094, "objective/non_score_reward": -1.0709375143051147, "objective/rlhf_reward": -6.283750057220459, "objective/scores": -0.5, "policy/approxkl_avg": 49.47734069824219, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4404389560222626, "step": 217, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9983222484588623 }, { "episode": 3504, "epoch": 0.06298306790811374, "loss/policy_avg": 0.17589987814426422, "lr": 2.9582055214723927e-06, "objective/entropy": 64.53421020507812, "objective/kl": 7.162571907043457, "objective/non_score_reward": -0.7162571549415588, "objective/rlhf_reward": -0.7423223874726631, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 121.216552734375, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.4267624020576477, "step": 218, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9995992183685303 }, { "episode": 3520, "epoch": 0.0632706618255024, "loss/policy_avg": 0.26339495182037354, "lr": 2.9580138036809815e-06, "objective/entropy": 229.096435546875, "objective/kl": 11.569236755371094, "objective/non_score_reward": -1.156923532485962, "objective/rlhf_reward": -1.7039751603615017, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 217.75863647460938, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7306234836578369, "step": 219, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9954323768615723 }, { "episode": 3536, "epoch": 0.06355825574289103, "loss/policy_avg": 0.14159545302391052, "lr": 2.9578220858895708e-06, "objective/entropy": -69.43865966796875, "objective/kl": 5.585199356079102, "objective/non_score_reward": -0.5585199594497681, "objective/rlhf_reward": -0.11137341179040439, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 20.743200302124023, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7005258202552795, "step": 220, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.997767448425293 }, { "episode": 3552, "epoch": 0.06384584966027969, "loss/policy_avg": -0.5519238710403442, "lr": 2.9576303680981596e-06, "objective/entropy": 208.38470458984375, "objective/kl": 6.988656997680664, "objective/non_score_reward": -0.6988657712936401, "objective/rlhf_reward": -2.395462906360626, "objective/scores": 0.1, "policy/approxkl_avg": 93.66447448730469, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.7662152051925659, "step": 221, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0369105339050293 }, { "episode": 3568, "epoch": 0.06413344357766833, "loss/policy_avg": 0.047732796519994736, "lr": 2.9574386503067484e-06, "objective/entropy": 143.40084838867188, "objective/kl": 11.851188659667969, "objective/non_score_reward": -1.1851186752319336, "objective/rlhf_reward": -2.340474939346313, "objective/scores": 0.6, "policy/approxkl_avg": 151.91505432128906, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7239178419113159, "step": 222, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.001242160797119 }, { "episode": 3584, "epoch": 0.06442103749505698, "loss/policy_avg": 0.7117222547531128, "lr": 2.9572469325153376e-06, "objective/entropy": -145.98013305664062, "objective/kl": 12.114925384521484, "objective/non_score_reward": -1.2114924192428589, "objective/rlhf_reward": -3.1126364628473913, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 156.07591247558594, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7104471921920776, "step": 223, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9960601329803467 }, { "episode": 3600, "epoch": 0.06470863141244562, "loss/policy_avg": -0.4437275230884552, "lr": 2.9570552147239264e-06, "objective/entropy": 7.181800842285156, "objective/kl": 4.8645524978637695, "objective/non_score_reward": -0.4864552319049835, "objective/rlhf_reward": -1.5458209127187728, "objective/scores": 0.1, "policy/approxkl_avg": 67.22005462646484, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.3355029821395874, "step": 224, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.003509044647217 }, { "episode": 3616, "epoch": 0.06499622532983428, "loss/policy_avg": 0.09265188872814178, "lr": 2.9568634969325152e-06, "objective/entropy": -261.4568176269531, "objective/kl": 8.175820350646973, "objective/non_score_reward": -0.8175821304321289, "objective/rlhf_reward": -2.870328368991613, "objective/scores": 0.1, "policy/approxkl_avg": 53.4001350402832, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6872222423553467, "step": 225, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999666690826416 }, { "episode": 3632, "epoch": 0.06528381924722292, "loss/policy_avg": 0.27531012892723083, "lr": 2.9566717791411045e-06, "objective/entropy": 15.044029235839844, "objective/kl": 10.096330642700195, "objective/non_score_reward": -1.0096330642700195, "objective/rlhf_reward": -6.038532257080078, "objective/scores": -0.5, "policy/approxkl_avg": 120.44912719726562, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7716231346130371, "step": 226, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9978790283203125 }, { "episode": 3648, "epoch": 0.06557141316461157, "loss/policy_avg": 0.27537640929222107, "lr": 2.9564800613496933e-06, "objective/entropy": 171.693359375, "objective/kl": 13.582971572875977, "objective/non_score_reward": -1.3582972288131714, "objective/rlhf_reward": -2.5094699307691783, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 178.1964111328125, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6318575143814087, "step": 227, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9985480308532715 }, { "episode": 3664, "epoch": 0.06585900708200021, "loss/policy_avg": 0.5710182785987854, "lr": 2.9562883435582825e-06, "objective/entropy": 163.5762939453125, "objective/kl": 11.832403182983398, "objective/non_score_reward": -1.1832401752471924, "objective/rlhf_reward": -4.332961043715477, "objective/scores": 0.1, "policy/approxkl_avg": 84.51448059082031, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7185041904449463, "step": 228, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0005269050598145 }, { "episode": 3680, "epoch": 0.06614660099938886, "loss/policy_avg": 0.2001960575580597, "lr": 2.9560966257668713e-06, "objective/entropy": -31.42165756225586, "objective/kl": 11.342616081237793, "objective/non_score_reward": -1.1342616081237793, "objective/rlhf_reward": -2.5896352929639175, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 163.35409545898438, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5161740779876709, "step": 229, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9981510639190674 }, { "episode": 3696, "epoch": 0.06643419491677752, "loss/policy_avg": 0.033837247639894485, "lr": 2.95590490797546e-06, "objective/entropy": -127.0381088256836, "objective/kl": 12.982643127441406, "objective/non_score_reward": -1.298264503479004, "objective/rlhf_reward": -7.193058013916016, "objective/scores": -0.5, "policy/approxkl_avg": 175.97671508789062, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7343233823776245, "step": 230, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9994841814041138 }, { "episode": 3712, "epoch": 0.06672178883416616, "loss/policy_avg": 0.3433837890625, "lr": 2.9557131901840494e-06, "objective/entropy": 68.5723876953125, "objective/kl": 13.85904598236084, "objective/non_score_reward": -1.3859045505523682, "objective/rlhf_reward": -3.8102850029865896, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 221.48330688476562, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7557601928710938, "step": 231, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9970917701721191 }, { "episode": 3728, "epoch": 0.06700938275155481, "loss/policy_avg": 0.2773955464363098, "lr": 2.955521472392638e-06, "objective/entropy": -77.472412109375, "objective/kl": 8.535304069519043, "objective/non_score_reward": -0.8535304069519043, "objective/rlhf_reward": -0.4904027923357215, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 31.26820182800293, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.47358882427215576, "step": 232, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9977991580963135 }, { "episode": 3744, "epoch": 0.06729697666894345, "loss/policy_avg": 0.419773131608963, "lr": 2.9553297546012274e-06, "objective/entropy": 151.9324951171875, "objective/kl": 10.881217956542969, "objective/non_score_reward": -1.0881218910217285, "objective/rlhf_reward": -2.4050763649510696, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 90.420166015625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7324544787406921, "step": 233, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9985604286193848 }, { "episode": 3760, "epoch": 0.0675845705863321, "loss/policy_avg": 0.19571278989315033, "lr": 2.955138036809816e-06, "objective/entropy": 164.4075927734375, "objective/kl": 9.931008338928223, "objective/non_score_reward": -0.9931010007858276, "objective/rlhf_reward": -1.8496975473323207, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 90.80259704589844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4490511417388916, "step": 234, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9968398809432983 }, { "episode": 3776, "epoch": 0.06787216450372074, "loss/policy_avg": -0.0029441099613904953, "lr": 2.954946319018405e-06, "objective/entropy": -57.89764404296875, "objective/kl": 11.663187026977539, "objective/non_score_reward": -1.1663187742233276, "objective/rlhf_reward": -2.265275067090988, "objective/scores": 0.6, "policy/approxkl_avg": 56.57416915893555, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6279686689376831, "step": 235, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9994487762451172 }, { "episode": 3792, "epoch": 0.0681597584211094, "loss/policy_avg": 0.19873343408107758, "lr": 2.954754601226994e-06, "objective/entropy": 69.95574951171875, "objective/kl": 2.667611598968506, "objective/non_score_reward": -0.2667612135410309, "objective/rlhf_reward": -3.067044734954834, "objective/scores": -0.5, "policy/approxkl_avg": 23.46300506591797, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6580133438110352, "step": 236, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998089075088501 }, { "episode": 3808, "epoch": 0.06844735233849804, "loss/policy_avg": 0.06835653632879257, "lr": 2.9545628834355827e-06, "objective/entropy": -18.042882919311523, "objective/kl": 10.576539993286133, "objective/non_score_reward": -1.0576539039611816, "objective/rlhf_reward": -6.230615615844727, "objective/scores": -0.5, "policy/approxkl_avg": 49.800323486328125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.38686278462409973, "step": 237, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9957520961761475 }, { "episode": 3824, "epoch": 0.06873494625588669, "loss/policy_avg": 0.09460563957691193, "lr": 2.954371165644172e-06, "objective/entropy": -168.65773010253906, "objective/kl": 9.4718017578125, "objective/non_score_reward": -0.9471801519393921, "objective/rlhf_reward": -3.388720667362213, "objective/scores": 0.1, "policy/approxkl_avg": 84.32270812988281, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.3954545259475708, "step": 238, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000286817550659 }, { "episode": 3840, "epoch": 0.06902254017327533, "loss/policy_avg": -0.28287017345428467, "lr": 2.9541794478527607e-06, "objective/entropy": 191.8770751953125, "objective/kl": 5.481754302978516, "objective/non_score_reward": -0.5481754541397095, "objective/rlhf_reward": -4.192701816558838, "objective/scores": -0.5, "policy/approxkl_avg": 33.588523864746094, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.4681757688522339, "step": 239, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.001433849334717 }, { "episode": 3856, "epoch": 0.06931013409066399, "loss/policy_avg": 0.4215943217277527, "lr": 2.9539877300613495e-06, "objective/entropy": 122.14271545410156, "objective/kl": 11.92599868774414, "objective/non_score_reward": -1.1926000118255615, "objective/rlhf_reward": -4.370399898290634, "objective/scores": 0.1, "policy/approxkl_avg": 73.78562927246094, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6994770765304565, "step": 240, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9991042613983154 }, { "episode": 3872, "epoch": 0.06959772800805263, "loss/policy_avg": 0.14229349792003632, "lr": 2.9537960122699387e-06, "objective/entropy": 97.91790008544922, "objective/kl": 11.320087432861328, "objective/non_score_reward": -1.1320087909698486, "objective/rlhf_reward": -0.1280353426933285, "objective/scores": 1.1, "policy/approxkl_avg": 53.905311584472656, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7801535129547119, "step": 241, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0005578994750977 }, { "episode": 3888, "epoch": 0.06988532192544128, "loss/policy_avg": 0.6044175624847412, "lr": 2.9536042944785275e-06, "objective/entropy": -45.497032165527344, "objective/kl": 14.7174072265625, "objective/non_score_reward": -1.47174072265625, "objective/rlhf_reward": -3.9395517510938003, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 80.24563598632812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4917639493942261, "step": 242, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9982421398162842 }, { "episode": 3904, "epoch": 0.07017291584282992, "loss/policy_avg": 0.37928763031959534, "lr": 2.9534125766871168e-06, "objective/entropy": 85.73689270019531, "objective/kl": 3.559020519256592, "objective/non_score_reward": -0.3559020459651947, "objective/rlhf_reward": 2.9763918161392215, "objective/scores": 1.1, "policy/approxkl_avg": 1.9539512395858765, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5831518173217773, "step": 243, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.023195743560791 }, { "episode": 3920, "epoch": 0.07046050976021857, "loss/policy_avg": 0.06145331636071205, "lr": 2.9532208588957056e-06, "objective/entropy": -83.13604736328125, "objective/kl": 10.009160995483398, "objective/non_score_reward": -1.0009161233901978, "objective/rlhf_reward": -6.003664016723633, "objective/scores": -0.5, "policy/approxkl_avg": 33.118873596191406, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7157425880432129, "step": 244, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9978001117706299 }, { "episode": 3936, "epoch": 0.07074810367760721, "loss/policy_avg": 0.34171396493911743, "lr": 2.9530291411042944e-06, "objective/entropy": -138.12054443359375, "objective/kl": 13.933228492736816, "objective/non_score_reward": -1.3933229446411133, "objective/rlhf_reward": -2.649572645069334, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 283.137939453125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8090240955352783, "step": 245, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.99606454372406 }, { "episode": 3952, "epoch": 0.07103569759499587, "loss/policy_avg": 0.05803845077753067, "lr": 2.9528374233128836e-06, "objective/entropy": -0.7659759521484375, "objective/kl": 10.326383590698242, "objective/non_score_reward": -1.0326383113861084, "objective/rlhf_reward": -3.730553215742111, "objective/scores": 0.1, "policy/approxkl_avg": 82.2501449584961, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6106427311897278, "step": 246, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0012848377227783 }, { "episode": 3968, "epoch": 0.0713232915123845, "loss/policy_avg": 0.27074679732322693, "lr": 2.9526457055214724e-06, "objective/entropy": -100.07415771484375, "objective/kl": 9.287663459777832, "objective/non_score_reward": -0.9287664294242859, "objective/rlhf_reward": -1.592359433249507, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 4.526418209075928, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5503559112548828, "step": 247, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0001773834228516 }, { "episode": 3984, "epoch": 0.07161088542977316, "loss/policy_avg": 0.1150532141327858, "lr": 2.9524539877300617e-06, "objective/entropy": 37.045387268066406, "objective/kl": 9.54155158996582, "objective/non_score_reward": -0.9541550874710083, "objective/rlhf_reward": 0.5833795011043552, "objective/scores": 1.1, "policy/approxkl_avg": 118.93629455566406, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7818096876144409, "step": 248, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9984393119812012 }, { "episode": 4000, "epoch": 0.07189847934716181, "loss/policy_avg": 0.11583375930786133, "lr": 2.9522622699386505e-06, "objective/entropy": -29.975513458251953, "objective/kl": 11.691999435424805, "objective/non_score_reward": -1.169199824333191, "objective/rlhf_reward": -4.276799207925796, "objective/scores": 0.1, "policy/approxkl_avg": 95.38002014160156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6541903018951416, "step": 249, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9976664781570435 }, { "episode": 4016, "epoch": 0.07218607326455045, "loss/policy_avg": 0.424482524394989, "lr": 2.9520705521472393e-06, "objective/entropy": 108.46614074707031, "objective/kl": 7.699254989624023, "objective/non_score_reward": -0.7699254751205444, "objective/rlhf_reward": -2.679701870679855, "objective/scores": 0.1, "policy/approxkl_avg": 38.84117126464844, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.9610189199447632, "step": 250, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0005178451538086 }, { "episode": 4032, "epoch": 0.07247366718193911, "loss/policy_avg": -0.013865754008293152, "lr": 2.9518788343558285e-06, "objective/entropy": 2.4157180786132812, "objective/kl": 8.162057876586914, "objective/non_score_reward": -0.8162057399749756, "objective/rlhf_reward": -5.264822959899902, "objective/scores": -0.5, "policy/approxkl_avg": 42.96641540527344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6327460408210754, "step": 251, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9980905055999756 }, { "episode": 4048, "epoch": 0.07276126109932775, "loss/policy_avg": 0.12326370179653168, "lr": 2.9516871165644173e-06, "objective/entropy": -18.136714935302734, "objective/kl": 15.872564315795898, "objective/non_score_reward": -1.5872564315795898, "objective/rlhf_reward": -8.34902572631836, "objective/scores": -0.5, "policy/approxkl_avg": 183.87185668945312, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.614548921585083, "step": 252, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9986623525619507 }, { "episode": 4064, "epoch": 0.0730488550167164, "loss/policy_avg": 0.42759251594543457, "lr": 2.951495398773006e-06, "objective/entropy": 187.13995361328125, "objective/kl": 15.658686637878418, "objective/non_score_reward": -1.5658683776855469, "objective/rlhf_reward": -5.863473868370056, "objective/scores": 0.1, "policy/approxkl_avg": 77.336669921875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7136948704719543, "step": 253, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.998879075050354 }, { "episode": 4080, "epoch": 0.07333644893410504, "loss/policy_avg": 0.45976442098617554, "lr": 2.9513036809815954e-06, "objective/entropy": -140.05638122558594, "objective/kl": 3.995250940322876, "objective/non_score_reward": -0.39952513575553894, "objective/rlhf_reward": 2.8018994905054573, "objective/scores": 1.1, "policy/approxkl_avg": 19.618144989013672, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5429213047027588, "step": 254, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998197078704834 }, { "episode": 4096, "epoch": 0.0736240428514937, "loss/policy_avg": -0.02419188618659973, "lr": 2.951111963190184e-06, "objective/entropy": -150.40243530273438, "objective/kl": 5.681851387023926, "objective/non_score_reward": -0.5681850910186768, "objective/rlhf_reward": -4.272740364074707, "objective/scores": -0.5, "policy/approxkl_avg": 25.766756057739258, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.49843645095825195, "step": 255, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9990272521972656 }, { "episode": 4112, "epoch": 0.07391163676888234, "loss/policy_avg": 0.09387945383787155, "lr": 2.950920245398773e-06, "objective/entropy": 21.77161407470703, "objective/kl": 8.116277694702148, "objective/non_score_reward": -0.8116278648376465, "objective/rlhf_reward": -5.246511459350586, "objective/scores": -0.5, "policy/approxkl_avg": 51.24212646484375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.709270715713501, "step": 256, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9979963302612305 }, { "episode": 4128, "epoch": 0.07419923068627099, "loss/policy_avg": -0.4069734215736389, "lr": 2.950728527607362e-06, "objective/entropy": -174.08395385742188, "objective/kl": 8.566609382629395, "objective/non_score_reward": -0.8566610217094421, "objective/rlhf_reward": -1.7647844008809193, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 61.56124496459961, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.4532226324081421, "step": 257, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.002131938934326 }, { "episode": 4144, "epoch": 0.07448682460365963, "loss/policy_avg": 0.04940726235508919, "lr": 2.950536809815951e-06, "objective/entropy": 32.93096923828125, "objective/kl": 13.826756477355957, "objective/non_score_reward": -1.3826756477355957, "objective/rlhf_reward": -7.530702590942383, "objective/scores": -0.5, "policy/approxkl_avg": 152.21774291992188, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4292905330657959, "step": 258, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9970979690551758 }, { "episode": 4160, "epoch": 0.07477441852104828, "loss/policy_avg": 0.08529473841190338, "lr": 2.95034509202454e-06, "objective/entropy": 14.531261444091797, "objective/kl": 6.426525115966797, "objective/non_score_reward": -0.6426525712013245, "objective/rlhf_reward": 1.8293897002935413, "objective/scores": 1.1, "policy/approxkl_avg": 4.471156597137451, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.3390624523162842, "step": 259, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9988377094268799 }, { "episode": 4176, "epoch": 0.07506201243843692, "loss/policy_avg": 0.6565670371055603, "lr": 2.9501533742331287e-06, "objective/entropy": 13.341388702392578, "objective/kl": 13.047385215759277, "objective/non_score_reward": -1.3047385215759277, "objective/rlhf_reward": -3.0962477944054942, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 156.30238342285156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6474744081497192, "step": 260, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9990012645721436 }, { "episode": 4192, "epoch": 0.07534960635582558, "loss/policy_avg": 0.8194575309753418, "lr": 2.949961656441718e-06, "objective/entropy": 39.95905303955078, "objective/kl": 11.301864624023438, "objective/non_score_reward": -1.1301864385604858, "objective/rlhf_reward": -6.520745754241943, "objective/scores": -0.5, "policy/approxkl_avg": 100.30358123779297, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6240185499191284, "step": 261, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9996020793914795 }, { "episode": 4208, "epoch": 0.07563720027321422, "loss/policy_avg": 0.9927411079406738, "lr": 2.9497699386503067e-06, "objective/entropy": 214.742431640625, "objective/kl": 10.751565933227539, "objective/non_score_reward": -1.075156569480896, "objective/rlhf_reward": -3.900626084208488, "objective/scores": 0.1, "policy/approxkl_avg": 190.6407928466797, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5556049346923828, "step": 262, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0008435249328613 }, { "episode": 4224, "epoch": 0.07592479419060287, "loss/policy_avg": 0.1370810866355896, "lr": 2.949578220858896e-06, "objective/entropy": -30.638282775878906, "objective/kl": 4.496548652648926, "objective/non_score_reward": -0.4496549069881439, "objective/rlhf_reward": 0.14879166059023552, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 4.034202575683594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6356778740882874, "step": 263, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9985594749450684 }, { "episode": 4240, "epoch": 0.07621238810799151, "loss/policy_avg": 0.6080716848373413, "lr": 2.9493865030674847e-06, "objective/entropy": -11.881404876708984, "objective/kl": 10.906417846679688, "objective/non_score_reward": -1.0906418561935425, "objective/rlhf_reward": -2.5377384528246627, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 79.76661682128906, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5296034812927246, "step": 264, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9986059665679932 }, { "episode": 4256, "epoch": 0.07649998202538016, "loss/policy_avg": 0.21646641194820404, "lr": 2.9491947852760736e-06, "objective/entropy": 142.3988800048828, "objective/kl": 6.696286201477051, "objective/non_score_reward": -0.6696287393569946, "objective/rlhf_reward": 1.7214852511882786, "objective/scores": 1.1, "policy/approxkl_avg": 16.300674438476562, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6062541007995605, "step": 265, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.001516342163086 }, { "episode": 4272, "epoch": 0.0767875759427688, "loss/policy_avg": 0.13891346752643585, "lr": 2.9490030674846628e-06, "objective/entropy": -61.452545166015625, "objective/kl": 5.754723072052002, "objective/non_score_reward": -0.575472354888916, "objective/rlhf_reward": 0.09811072945594779, "objective/scores": 0.6, "policy/approxkl_avg": 18.09847068786621, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.41696321964263916, "step": 266, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999955177307129 }, { "episode": 4288, "epoch": 0.07707516986015746, "loss/policy_avg": 0.19311824440956116, "lr": 2.9488113496932516e-06, "objective/entropy": 32.384788513183594, "objective/kl": 10.006759643554688, "objective/non_score_reward": -1.000675916671753, "objective/rlhf_reward": -3.6027039051055905, "objective/scores": 0.1, "policy/approxkl_avg": 50.9388427734375, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.571370542049408, "step": 267, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998531460762024 }, { "episode": 4304, "epoch": 0.0773627637775461, "loss/policy_avg": -0.14073559641838074, "lr": 2.9486196319018404e-06, "objective/entropy": 85.10265350341797, "objective/kl": 15.816366195678711, "objective/non_score_reward": -1.5816365480422974, "objective/rlhf_reward": -5.9265462219715115, "objective/scores": 0.1, "policy/approxkl_avg": 122.70755004882812, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.756123423576355, "step": 268, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9997191429138184 }, { "episode": 4320, "epoch": 0.07765035769493475, "loss/policy_avg": 0.17501243948936462, "lr": 2.9484279141104296e-06, "objective/entropy": 93.92215728759766, "objective/kl": 9.422307968139648, "objective/non_score_reward": -0.9422306418418884, "objective/rlhf_reward": -3.368922537565231, "objective/scores": 0.1, "policy/approxkl_avg": 55.133872985839844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5802706480026245, "step": 269, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9981038570404053 }, { "episode": 4336, "epoch": 0.0779379516123234, "loss/policy_avg": 0.4402735233306885, "lr": 2.9482361963190184e-06, "objective/entropy": -10.538581848144531, "objective/kl": 10.759380340576172, "objective/non_score_reward": -1.075938105583191, "objective/rlhf_reward": -2.6418928853875263, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 119.47884368896484, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5788609385490417, "step": 270, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9987006187438965 }, { "episode": 4352, "epoch": 0.07822554552971205, "loss/policy_avg": 0.582582414150238, "lr": 2.9480444785276077e-06, "objective/entropy": -98.20463562011719, "objective/kl": 9.743408203125, "objective/non_score_reward": -0.9743408560752869, "objective/rlhf_reward": 0.5026364937424663, "objective/scores": 1.1, "policy/approxkl_avg": 72.30321502685547, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6191748380661011, "step": 271, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998560905456543 }, { "episode": 4368, "epoch": 0.0785131394471007, "loss/policy_avg": 0.6289035081863403, "lr": 2.9478527607361965e-06, "objective/entropy": -71.34103393554688, "objective/kl": 12.412795066833496, "objective/non_score_reward": -1.2412794828414917, "objective/rlhf_reward": -4.565117752552032, "objective/scores": 0.1, "policy/approxkl_avg": 164.9723358154297, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.736472487449646, "step": 272, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9970333576202393 }, { "episode": 4384, "epoch": 0.07880073336448934, "loss/policy_avg": 0.21773582696914673, "lr": 2.9476610429447853e-06, "objective/entropy": -0.68145751953125, "objective/kl": 13.457925796508789, "objective/non_score_reward": -1.3457924127578735, "objective/rlhf_reward": -4.98316973298788, "objective/scores": 0.1, "policy/approxkl_avg": 254.5444793701172, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8520537614822388, "step": 273, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9983816146850586 }, { "episode": 4400, "epoch": 0.079088327281878, "loss/policy_avg": 0.7082804441452026, "lr": 2.9474693251533745e-06, "objective/entropy": -72.47603607177734, "objective/kl": 11.576058387756348, "objective/non_score_reward": -1.1576058864593506, "objective/rlhf_reward": -6.630423545837402, "objective/scores": -0.5, "policy/approxkl_avg": 119.74085235595703, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6347866058349609, "step": 274, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9995064735412598 }, { "episode": 4416, "epoch": 0.07937592119926663, "loss/policy_avg": -0.11256889998912811, "lr": 2.9472776073619633e-06, "objective/entropy": -83.20799255371094, "objective/kl": 3.853982925415039, "objective/non_score_reward": -0.385398268699646, "objective/rlhf_reward": 2.8584069401025776, "objective/scores": 1.1, "policy/approxkl_avg": 0.9227147102355957, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.39738836884498596, "step": 275, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0011487007141113 }, { "episode": 4432, "epoch": 0.07966351511665529, "loss/policy_avg": 0.32202059030532837, "lr": 2.947085889570552e-06, "objective/entropy": 71.21481323242188, "objective/kl": 10.407328605651855, "objective/non_score_reward": -1.0407328605651855, "objective/rlhf_reward": -6.162931442260742, "objective/scores": -0.5, "policy/approxkl_avg": 108.96805572509766, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5844370126724243, "step": 276, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9993963241577148 }, { "episode": 4448, "epoch": 0.07995110903404393, "loss/policy_avg": -0.023769661784172058, "lr": 2.9468941717791414e-06, "objective/entropy": -13.402759552001953, "objective/kl": 12.098983764648438, "objective/non_score_reward": -1.2098984718322754, "objective/rlhf_reward": -6.839593887329102, "objective/scores": -0.5, "policy/approxkl_avg": 13.1106595993042, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7059996724128723, "step": 277, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9998667240142822 }, { "episode": 4464, "epoch": 0.08023870295143258, "loss/policy_avg": 0.3893451690673828, "lr": 2.9467024539877298e-06, "objective/entropy": 189.41259765625, "objective/kl": 6.5235137939453125, "objective/non_score_reward": -0.6523513793945312, "objective/rlhf_reward": -2.20940545797348, "objective/scores": 0.1, "policy/approxkl_avg": 24.074134826660156, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.798334002494812, "step": 278, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0013234615325928 }, { "episode": 4480, "epoch": 0.08052629686882122, "loss/policy_avg": 0.0914541631937027, "lr": 2.946510736196319e-06, "objective/entropy": 84.65312194824219, "objective/kl": 10.466255187988281, "objective/non_score_reward": -1.0466254949569702, "objective/rlhf_reward": 0.21349813938140905, "objective/scores": 1.1, "policy/approxkl_avg": 125.91380310058594, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7349311113357544, "step": 279, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9984486103057861 }, { "episode": 4496, "epoch": 0.08081389078620987, "loss/policy_avg": 0.26043936610221863, "lr": 2.946319018404908e-06, "objective/entropy": -58.848392486572266, "objective/kl": 10.368853569030762, "objective/non_score_reward": -1.0368852615356445, "objective/rlhf_reward": -1.7475412249565123, "objective/scores": 0.6, "policy/approxkl_avg": 51.58127975463867, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.612343430519104, "step": 280, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9989829063415527 }, { "episode": 4512, "epoch": 0.08110148470359851, "loss/policy_avg": 0.2319001704454422, "lr": 2.946127300613497e-06, "objective/entropy": -36.56258773803711, "objective/kl": 9.824468612670898, "objective/non_score_reward": -0.9824467897415161, "objective/rlhf_reward": -5.9297871589660645, "objective/scores": -0.5, "policy/approxkl_avg": 79.27013397216797, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7777178883552551, "step": 281, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9996426105499268 }, { "episode": 4528, "epoch": 0.08138907862098717, "loss/policy_avg": 0.2995202839374542, "lr": 2.945935582822086e-06, "objective/entropy": 48.848323822021484, "objective/kl": 16.30365753173828, "objective/non_score_reward": -1.6303660869598389, "objective/rlhf_reward": -6.1214640274643894, "objective/scores": 0.1, "policy/approxkl_avg": 165.35614013671875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6986551284790039, "step": 282, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9979193210601807 }, { "episode": 4544, "epoch": 0.08167667253837581, "loss/policy_avg": 0.47233566641807556, "lr": 2.9457438650306747e-06, "objective/entropy": 59.0998420715332, "objective/kl": 12.852258682250977, "objective/non_score_reward": -1.2852261066436768, "objective/rlhf_reward": -7.140904426574707, "objective/scores": -0.5, "policy/approxkl_avg": 134.68528747558594, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.42645326256752014, "step": 283, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9972941875457764 }, { "episode": 4560, "epoch": 0.08196426645576446, "loss/policy_avg": 0.36607012152671814, "lr": 2.945552147239264e-06, "objective/entropy": 53.86030578613281, "objective/kl": 5.768060684204102, "objective/non_score_reward": -0.5768060684204102, "objective/rlhf_reward": 2.0927755475044254, "objective/scores": 1.1, "policy/approxkl_avg": 36.973777770996094, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7312701940536499, "step": 284, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9965434074401855 }, { "episode": 4576, "epoch": 0.0822518603731531, "loss/policy_avg": -0.048812031745910645, "lr": 2.9453604294478527e-06, "objective/entropy": 65.61720275878906, "objective/kl": 8.64478588104248, "objective/non_score_reward": -0.864478588104248, "objective/rlhf_reward": 0.9420856922864917, "objective/scores": 1.1, "policy/approxkl_avg": 79.18849182128906, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6703629493713379, "step": 285, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9975764751434326 }, { "episode": 4592, "epoch": 0.08253945429054176, "loss/policy_avg": 0.12739452719688416, "lr": 2.945168711656442e-06, "objective/entropy": -44.89176940917969, "objective/kl": 4.6148176193237305, "objective/non_score_reward": -0.4614817500114441, "objective/rlhf_reward": 2.554072973877192, "objective/scores": 1.1, "policy/approxkl_avg": 13.531970024108887, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6312613487243652, "step": 286, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9990942478179932 }, { "episode": 4608, "epoch": 0.0828270482079304, "loss/policy_avg": 0.13534197211265564, "lr": 2.9449769938650308e-06, "objective/entropy": 234.43975830078125, "objective/kl": 9.361823081970215, "objective/non_score_reward": -0.9361822605133057, "objective/rlhf_reward": -3.3447290569543835, "objective/scores": 0.1, "policy/approxkl_avg": 136.4779815673828, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5726273059844971, "step": 287, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.99815034866333 }, { "episode": 4624, "epoch": 0.08311464212531905, "loss/policy_avg": 0.42366448044776917, "lr": 2.9447852760736196e-06, "objective/entropy": -113.96307373046875, "objective/kl": 8.458015441894531, "objective/non_score_reward": -0.845801591873169, "objective/rlhf_reward": -5.383206367492676, "objective/scores": -0.5, "policy/approxkl_avg": 80.63624572753906, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7157449126243591, "step": 288, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999110221862793 }, { "episode": 4640, "epoch": 0.0834022360427077, "loss/policy_avg": 0.5396846532821655, "lr": 2.944593558282209e-06, "objective/entropy": 132.98178100585938, "objective/kl": 13.068355560302734, "objective/non_score_reward": -1.3068355321884155, "objective/rlhf_reward": -2.8273421287536618, "objective/scores": 0.6, "policy/approxkl_avg": 203.39913940429688, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8317103385925293, "step": 289, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000296115875244 }, { "episode": 4656, "epoch": 0.08368982996009634, "loss/policy_avg": 0.19873766601085663, "lr": 2.9444018404907976e-06, "objective/entropy": -150.62936401367188, "objective/kl": 12.215543746948242, "objective/non_score_reward": -1.2215545177459717, "objective/rlhf_reward": -3.282097909514027, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 94.99072265625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8213875889778137, "step": 290, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9985655546188354 }, { "episode": 4672, "epoch": 0.083977423877485, "loss/policy_avg": 0.02723608911037445, "lr": 2.9442101226993864e-06, "objective/entropy": 122.41561889648438, "objective/kl": 8.34988784790039, "objective/non_score_reward": -0.8349887132644653, "objective/rlhf_reward": -5.3399553298950195, "objective/scores": -0.5, "policy/approxkl_avg": 45.76279067993164, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.34027427434921265, "step": 291, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0011491775512695 }, { "episode": 4688, "epoch": 0.08426501779487364, "loss/policy_avg": 0.32025736570358276, "lr": 2.9440184049079756e-06, "objective/entropy": -45.17048645019531, "objective/kl": 4.012126445770264, "objective/non_score_reward": -0.4012127220630646, "objective/rlhf_reward": 2.795149059593678, "objective/scores": 1.1, "policy/approxkl_avg": 35.060951232910156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4729318916797638, "step": 292, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9977355003356934 }, { "episode": 4704, "epoch": 0.08455261171226229, "loss/policy_avg": 0.22684016823768616, "lr": 2.9438266871165645e-06, "objective/entropy": 16.97724151611328, "objective/kl": 14.390579223632812, "objective/non_score_reward": -1.4390579462051392, "objective/rlhf_reward": -3.808820496277745, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 180.50299072265625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6010082960128784, "step": 293, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9963459968566895 }, { "episode": 4720, "epoch": 0.08484020562965093, "loss/policy_avg": 0.44294729828834534, "lr": 2.9436349693251537e-06, "objective/entropy": -67.02993774414062, "objective/kl": 11.463648796081543, "objective/non_score_reward": -1.1463651657104492, "objective/rlhf_reward": -0.1854602456092831, "objective/scores": 1.1, "policy/approxkl_avg": 74.088623046875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5088694095611572, "step": 294, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9967097043991089 }, { "episode": 4736, "epoch": 0.08512779954703958, "loss/policy_avg": 0.12860551476478577, "lr": 2.9434432515337425e-06, "objective/entropy": 73.75012969970703, "objective/kl": 11.233770370483398, "objective/non_score_reward": -1.1233770847320557, "objective/rlhf_reward": -4.093508290499448, "objective/scores": 0.1, "policy/approxkl_avg": 110.15925598144531, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.49316030740737915, "step": 295, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9951969385147095 }, { "episode": 4752, "epoch": 0.08541539346442822, "loss/policy_avg": 0.4201592803001404, "lr": 2.9432515337423313e-06, "objective/entropy": 43.006744384765625, "objective/kl": 12.237357139587402, "objective/non_score_reward": -1.2237358093261719, "objective/rlhf_reward": -6.8949432373046875, "objective/scores": -0.5, "policy/approxkl_avg": 152.20016479492188, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7024413347244263, "step": 296, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9959149360656738 }, { "episode": 4768, "epoch": 0.08570298738181688, "loss/policy_avg": 0.17788049578666687, "lr": 2.9430598159509205e-06, "objective/entropy": -236.15725708007812, "objective/kl": 9.120914459228516, "objective/non_score_reward": -0.9120914936065674, "objective/rlhf_reward": 0.751633965969086, "objective/scores": 1.1, "policy/approxkl_avg": 85.18730163574219, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6455779671669006, "step": 297, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998746395111084 }, { "episode": 4784, "epoch": 0.08599058129920552, "loss/policy_avg": -0.09744630753993988, "lr": 2.9428680981595093e-06, "objective/entropy": 28.533233642578125, "objective/kl": 6.665700912475586, "objective/non_score_reward": -0.6665701270103455, "objective/rlhf_reward": -4.666280269622803, "objective/scores": -0.5, "policy/approxkl_avg": 56.560096740722656, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.8575639724731445, "step": 298, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0005359649658203 }, { "episode": 4800, "epoch": 0.08627817521659417, "loss/policy_avg": 0.11877211183309555, "lr": 2.9426763803680986e-06, "objective/entropy": 163.72457885742188, "objective/kl": 7.44589900970459, "objective/non_score_reward": -0.7445899248123169, "objective/rlhf_reward": 1.4216401070356373, "objective/scores": 1.1, "policy/approxkl_avg": 10.336346626281738, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6966899633407593, "step": 299, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9998868703842163 }, { "episode": 4816, "epoch": 0.08656576913398281, "loss/policy_avg": 0.3418015241622925, "lr": 2.9424846625766874e-06, "objective/entropy": -112.08236694335938, "objective/kl": 12.987334251403809, "objective/non_score_reward": -1.2987333536148071, "objective/rlhf_reward": -0.7949335634708401, "objective/scores": 1.1, "policy/approxkl_avg": 233.1175994873047, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6791957020759583, "step": 300, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9992109537124634 }, { "episode": 4832, "epoch": 0.08685336305137147, "loss/policy_avg": 0.17691361904144287, "lr": 2.942292944785276e-06, "objective/entropy": 261.80804443359375, "objective/kl": 16.804275512695312, "objective/non_score_reward": -1.6804277896881104, "objective/rlhf_reward": -6.321710979938507, "objective/scores": 0.1, "policy/approxkl_avg": 268.57550048828125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6984161734580994, "step": 301, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9969482421875 }, { "episode": 4848, "epoch": 0.0871409569687601, "loss/policy_avg": 0.16571223735809326, "lr": 2.942101226993865e-06, "objective/entropy": 186.30453491210938, "objective/kl": 7.996967315673828, "objective/non_score_reward": -0.799696683883667, "objective/rlhf_reward": -5.198786735534668, "objective/scores": -0.5, "policy/approxkl_avg": 59.46959686279297, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5912961959838867, "step": 302, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0007529258728027 }, { "episode": 4864, "epoch": 0.08742855088614876, "loss/policy_avg": 0.19111140072345734, "lr": 2.941909509202454e-06, "objective/entropy": -6.660182952880859, "objective/kl": 9.802804946899414, "objective/non_score_reward": -0.9802805781364441, "objective/rlhf_reward": -5.9211225509643555, "objective/scores": -0.5, "policy/approxkl_avg": 100.47252655029297, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6817684173583984, "step": 303, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0000228881835938 }, { "episode": 4880, "epoch": 0.0877161448035374, "loss/policy_avg": 0.050571128726005554, "lr": 2.941717791411043e-06, "objective/entropy": 106.45433807373047, "objective/kl": 15.282678604125977, "objective/non_score_reward": -1.528267741203308, "objective/rlhf_reward": -8.11307144165039, "objective/scores": -0.5, "policy/approxkl_avg": 272.7275085449219, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5246777534484863, "step": 304, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0000553131103516 }, { "episode": 4896, "epoch": 0.08800373872092605, "loss/policy_avg": 0.5568979978561401, "lr": 2.941526073619632e-06, "objective/entropy": 16.014564514160156, "objective/kl": 9.838717460632324, "objective/non_score_reward": -0.9838719367980957, "objective/rlhf_reward": -5.935487747192383, "objective/scores": -0.5, "policy/approxkl_avg": 102.69760131835938, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5866303443908691, "step": 305, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998826026916504 }, { "episode": 4912, "epoch": 0.08829133263831469, "loss/policy_avg": 0.19343560934066772, "lr": 2.9413343558282207e-06, "objective/entropy": -7.7786407470703125, "objective/kl": 13.301519393920898, "objective/non_score_reward": -1.3301520347595215, "objective/rlhf_reward": -7.320608139038086, "objective/scores": -0.5, "policy/approxkl_avg": 99.96537780761719, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6204877495765686, "step": 306, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.997145414352417 }, { "episode": 4928, "epoch": 0.08857892655570335, "loss/policy_avg": 0.4353540539741516, "lr": 2.94114263803681e-06, "objective/entropy": 180.70339965820312, "objective/kl": 15.014328002929688, "objective/non_score_reward": -1.5014326572418213, "objective/rlhf_reward": -4.343870957911598, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 205.107666015625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5958458185195923, "step": 307, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999456763267517 }, { "episode": 4944, "epoch": 0.088866520473092, "loss/policy_avg": 0.044183149933815, "lr": 2.9409509202453987e-06, "objective/entropy": 147.0219268798828, "objective/kl": 7.249485969543457, "objective/non_score_reward": -0.7249486446380615, "objective/rlhf_reward": -2.49979438483715, "objective/scores": 0.1, "policy/approxkl_avg": 31.019502639770508, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.8432949185371399, "step": 308, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000023126602173 }, { "episode": 4960, "epoch": 0.08915411439048064, "loss/policy_avg": -0.1347326934337616, "lr": 2.940759202453988e-06, "objective/entropy": -102.82943725585938, "objective/kl": 8.855981826782227, "objective/non_score_reward": -0.8855981826782227, "objective/rlhf_reward": -5.542392730712891, "objective/scores": -0.5, "policy/approxkl_avg": 78.90658569335938, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6382081508636475, "step": 309, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9998550415039062 }, { "episode": 4976, "epoch": 0.0894417083078693, "loss/policy_avg": 0.2769806981086731, "lr": 2.9405674846625768e-06, "objective/entropy": 222.6593017578125, "objective/kl": 12.968841552734375, "objective/non_score_reward": -1.2968841791152954, "objective/rlhf_reward": -0.7875365525484082, "objective/scores": 1.1, "policy/approxkl_avg": 143.65463256835938, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.644939661026001, "step": 310, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0016446113586426 }, { "episode": 4992, "epoch": 0.08972930222525793, "loss/policy_avg": -0.31937313079833984, "lr": 2.9403757668711656e-06, "objective/entropy": 224.8351593017578, "objective/kl": 11.696734428405762, "objective/non_score_reward": -1.1696734428405762, "objective/rlhf_reward": -2.5559872857489925, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 78.87408447265625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.729614794254303, "step": 311, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0000832080841064 }, { "episode": 5008, "epoch": 0.09001689614264659, "loss/policy_avg": 0.6097627878189087, "lr": 2.940184049079755e-06, "objective/entropy": 106.34514617919922, "objective/kl": 8.760353088378906, "objective/non_score_reward": -0.8760353326797485, "objective/rlhf_reward": -5.504140853881836, "objective/scores": -0.5, "policy/approxkl_avg": 121.40504455566406, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6102486848831177, "step": 312, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0006418228149414 }, { "episode": 5024, "epoch": 0.09030449006003523, "loss/policy_avg": 0.2938482463359833, "lr": 2.9399923312883436e-06, "objective/entropy": -114.29545593261719, "objective/kl": 8.454465866088867, "objective/non_score_reward": -0.8454465866088867, "objective/rlhf_reward": -2.9817864209413525, "objective/scores": 0.1, "policy/approxkl_avg": 52.16905212402344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6143299341201782, "step": 313, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9990007877349854 }, { "episode": 5040, "epoch": 0.09059208397742388, "loss/policy_avg": 0.1566888689994812, "lr": 2.939800613496933e-06, "objective/entropy": 114.07573699951172, "objective/kl": 5.000811576843262, "objective/non_score_reward": -0.5000811219215393, "objective/rlhf_reward": -4.000324726104736, "objective/scores": -0.5, "policy/approxkl_avg": 17.11898422241211, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5816758871078491, "step": 314, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9986605644226074 }, { "episode": 5056, "epoch": 0.09087967789481252, "loss/policy_avg": 0.08153313398361206, "lr": 2.9396088957055217e-06, "objective/entropy": -2.472991943359375, "objective/kl": 15.302618026733398, "objective/non_score_reward": -1.5302616357803345, "objective/rlhf_reward": -4.5169265604654125, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 221.7631072998047, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7500788569450378, "step": 315, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.996685266494751 }, { "episode": 5072, "epoch": 0.09116727181220118, "loss/policy_avg": -0.22540059685707092, "lr": 2.9394171779141105e-06, "objective/entropy": 194.83139038085938, "objective/kl": 6.124555587768555, "objective/non_score_reward": -0.6124556064605713, "objective/rlhf_reward": -4.449822425842285, "objective/scores": -0.5, "policy/approxkl_avg": 52.98652648925781, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.4853079319000244, "step": 316, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0110392570495605 }, { "episode": 5088, "epoch": 0.09145486572958982, "loss/policy_avg": 0.33575987815856934, "lr": 2.9392254601226997e-06, "objective/entropy": -246.2069854736328, "objective/kl": 6.744620323181152, "objective/non_score_reward": -0.674461841583252, "objective/rlhf_reward": -0.7504362864064533, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 44.58488464355469, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5204676389694214, "step": 317, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0011723041534424 }, { "episode": 5104, "epoch": 0.09174245964697847, "loss/policy_avg": -0.03650900349020958, "lr": 2.9390337423312885e-06, "objective/entropy": -38.37519454956055, "objective/kl": 12.865215301513672, "objective/non_score_reward": -1.2865217924118042, "objective/rlhf_reward": -7.146087169647217, "objective/scores": -0.5, "policy/approxkl_avg": 179.89398193359375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7440187931060791, "step": 318, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0004358291625977 }, { "episode": 5120, "epoch": 0.09203005356436711, "loss/policy_avg": 0.39611124992370605, "lr": 2.9388420245398773e-06, "objective/entropy": -115.19349670410156, "objective/kl": 7.894246578216553, "objective/non_score_reward": -0.7894245982170105, "objective/rlhf_reward": -5.157698631286621, "objective/scores": -0.5, "policy/approxkl_avg": 43.43158721923828, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6976910829544067, "step": 319, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9998779296875 }, { "episode": 5136, "epoch": 0.09231764748175576, "loss/policy_avg": 0.6114602088928223, "lr": 2.9386503067484665e-06, "objective/entropy": 105.1552734375, "objective/kl": 13.109886169433594, "objective/non_score_reward": -1.3109886646270752, "objective/rlhf_reward": -0.8439547479152676, "objective/scores": 1.1, "policy/approxkl_avg": 112.3132095336914, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5804147720336914, "step": 320, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9989616870880127 }, { "episode": 5152, "epoch": 0.0926052413991444, "loss/policy_avg": 0.0743880569934845, "lr": 2.9384585889570554e-06, "objective/entropy": 77.44183349609375, "objective/kl": 8.181852340698242, "objective/non_score_reward": -0.8181852698326111, "objective/rlhf_reward": -1.1500348619380332, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 82.51998901367188, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5503402948379517, "step": 321, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0014610290527344 }, { "episode": 5168, "epoch": 0.09289283531653306, "loss/policy_avg": 0.2947062849998474, "lr": 2.9382668711656446e-06, "objective/entropy": 6.5170745849609375, "objective/kl": 14.03689956665039, "objective/non_score_reward": -1.4036900997161865, "objective/rlhf_reward": -1.214760041236877, "objective/scores": 1.1, "policy/approxkl_avg": 161.59352111816406, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5341845750808716, "step": 322, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9988269805908203 }, { "episode": 5184, "epoch": 0.0931804292339217, "loss/policy_avg": 0.2814280092716217, "lr": 2.938075153374233e-06, "objective/entropy": -135.19436645507812, "objective/kl": 8.1387357711792, "objective/non_score_reward": -0.8138736486434937, "objective/rlhf_reward": -2.8554944455623623, "objective/scores": 0.1, "policy/approxkl_avg": 29.818918228149414, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.581717312335968, "step": 323, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.997791051864624 }, { "episode": 5200, "epoch": 0.09346802315131035, "loss/policy_avg": 0.36961764097213745, "lr": 2.937883435582822e-06, "objective/entropy": -77.63428497314453, "objective/kl": 9.158490180969238, "objective/non_score_reward": -0.915848970413208, "objective/rlhf_reward": -5.663395881652832, "objective/scores": -0.5, "policy/approxkl_avg": 134.7732696533203, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5512831807136536, "step": 324, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.995112419128418 }, { "episode": 5216, "epoch": 0.09375561706869899, "loss/policy_avg": 0.11706581711769104, "lr": 2.937691717791411e-06, "objective/entropy": -31.434471130371094, "objective/kl": 15.156240463256836, "objective/non_score_reward": -1.5156242847442627, "objective/rlhf_reward": -8.06249713897705, "objective/scores": -0.5, "policy/approxkl_avg": 217.86083984375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.458157479763031, "step": 325, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.997313380241394 }, { "episode": 5232, "epoch": 0.09404321098608764, "loss/policy_avg": 0.08816379308700562, "lr": 2.9375e-06, "objective/entropy": 182.1945343017578, "objective/kl": 14.343957901000977, "objective/non_score_reward": -1.4343959093093872, "objective/rlhf_reward": -2.813864682556364, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 90.6133804321289, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.763087272644043, "step": 326, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9991672039031982 }, { "episode": 5248, "epoch": 0.0943308049034763, "loss/policy_avg": 0.4887702167034149, "lr": 2.937308282208589e-06, "objective/entropy": 276.0743103027344, "objective/kl": 19.531585693359375, "objective/non_score_reward": -1.9531588554382324, "objective/rlhf_reward": -7.412635026872159, "objective/scores": 0.1, "policy/approxkl_avg": 292.0692138671875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6329343318939209, "step": 327, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9973821640014648 }, { "episode": 5264, "epoch": 0.09461839882086494, "loss/policy_avg": 0.19927456974983215, "lr": 2.937116564417178e-06, "objective/entropy": -68.87179565429688, "objective/kl": 9.256897926330566, "objective/non_score_reward": -0.9256898164749146, "objective/rlhf_reward": -5.702759265899658, "objective/scores": -0.5, "policy/approxkl_avg": 59.61810302734375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.825851321220398, "step": 328, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.997621774673462 }, { "episode": 5280, "epoch": 0.09490599273825359, "loss/policy_avg": 0.05179551616311073, "lr": 2.9369248466257667e-06, "objective/entropy": 154.8633575439453, "objective/kl": 11.498334884643555, "objective/non_score_reward": -1.1498336791992188, "objective/rlhf_reward": -1.6756155534994333, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 41.51012420654297, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6221466064453125, "step": 329, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.999433994293213 }, { "episode": 5296, "epoch": 0.09519358665564223, "loss/policy_avg": 0.7231928110122681, "lr": 2.936733128834356e-06, "objective/entropy": 186.2045135498047, "objective/kl": 10.730362892150879, "objective/non_score_reward": -1.0730363130569458, "objective/rlhf_reward": -3.8921451330184933, "objective/scores": 0.1, "policy/approxkl_avg": 121.8580322265625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7908709645271301, "step": 330, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0002248287200928 }, { "episode": 5312, "epoch": 0.09548118057303089, "loss/policy_avg": 0.2208048403263092, "lr": 2.9365414110429447e-06, "objective/entropy": -59.700164794921875, "objective/kl": 16.838346481323242, "objective/non_score_reward": -1.6838349103927612, "objective/rlhf_reward": -8.735339164733887, "objective/scores": -0.5, "policy/approxkl_avg": 154.73214721679688, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7444063425064087, "step": 331, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9977869987487793 }, { "episode": 5328, "epoch": 0.09576877449041953, "loss/policy_avg": 0.06984035670757294, "lr": 2.936349693251534e-06, "objective/entropy": 54.201515197753906, "objective/kl": 8.278887748718262, "objective/non_score_reward": -0.8278888463973999, "objective/rlhf_reward": -5.311555862426758, "objective/scores": -0.5, "policy/approxkl_avg": 91.28977966308594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7553679943084717, "step": 332, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9997904300689697 }, { "episode": 5344, "epoch": 0.09605636840780818, "loss/policy_avg": 0.3257616460323334, "lr": 2.9361579754601228e-06, "objective/entropy": -28.56784439086914, "objective/kl": 12.646832466125488, "objective/non_score_reward": -1.2646832466125488, "objective/rlhf_reward": -7.058732986450195, "objective/scores": -0.5, "policy/approxkl_avg": 129.71322631835938, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5907671451568604, "step": 333, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9982057809829712 }, { "episode": 5360, "epoch": 0.09634396232519682, "loss/policy_avg": 0.4729722738265991, "lr": 2.9359662576687116e-06, "objective/entropy": -150.07943725585938, "objective/kl": 11.293041229248047, "objective/non_score_reward": -1.1293039321899414, "objective/rlhf_reward": -4.1172159075737, "objective/scores": 0.1, "policy/approxkl_avg": 144.25387573242188, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6657143831253052, "step": 334, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998594045639038 }, { "episode": 5376, "epoch": 0.09663155624258547, "loss/policy_avg": 0.2886536419391632, "lr": 2.935774539877301e-06, "objective/entropy": -131.1785125732422, "objective/kl": 10.785483360290527, "objective/non_score_reward": -1.0785483121871948, "objective/rlhf_reward": -3.9141932338476177, "objective/scores": 0.1, "policy/approxkl_avg": 52.560306549072266, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8351963758468628, "step": 335, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998948335647583 }, { "episode": 5392, "epoch": 0.09691915015997411, "loss/policy_avg": 1.0824708938598633, "lr": 2.9355828220858896e-06, "objective/entropy": -5.367637634277344, "objective/kl": 14.300538063049316, "objective/non_score_reward": -1.430053949356079, "objective/rlhf_reward": -2.7964971407663555, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 151.68798828125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5800391435623169, "step": 336, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9980026483535767 }, { "episode": 5408, "epoch": 0.09720674407736277, "loss/policy_avg": 0.3940733075141907, "lr": 2.935391104294479e-06, "objective/entropy": -165.78109741210938, "objective/kl": 15.360054016113281, "objective/non_score_reward": -1.5360053777694702, "objective/rlhf_reward": -1.7440215110778805, "objective/scores": 1.1, "policy/approxkl_avg": 114.84159088134766, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6686538457870483, "step": 337, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0000908374786377 }, { "episode": 5424, "epoch": 0.0974943379947514, "loss/policy_avg": 0.07578772306442261, "lr": 2.9351993865030677e-06, "objective/entropy": 17.821250915527344, "objective/kl": 17.395343780517578, "objective/non_score_reward": -1.739534616470337, "objective/rlhf_reward": -4.83543178655294, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 294.8856506347656, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6329281330108643, "step": 338, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9982552528381348 }, { "episode": 5440, "epoch": 0.09778193191214006, "loss/policy_avg": -0.0766182690858841, "lr": 2.9350076687116565e-06, "objective/entropy": -72.60086059570312, "objective/kl": 9.59086799621582, "objective/non_score_reward": -0.9590868949890137, "objective/rlhf_reward": -5.836347579956055, "objective/scores": -0.5, "policy/approxkl_avg": 61.654563903808594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8256030678749084, "step": 339, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.002091407775879 }, { "episode": 5456, "epoch": 0.0980695258295287, "loss/policy_avg": 0.039667725563049316, "lr": 2.9348159509202457e-06, "objective/entropy": 194.9036865234375, "objective/kl": 9.621345520019531, "objective/non_score_reward": -0.9621344804763794, "objective/rlhf_reward": -3.4485378623008724, "objective/scores": 0.1, "policy/approxkl_avg": 69.39463806152344, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.49096935987472534, "step": 340, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0042881965637207 }, { "episode": 5472, "epoch": 0.09835711974691735, "loss/policy_avg": 0.37447261810302734, "lr": 2.9346242331288345e-06, "objective/entropy": 5.140836715698242, "objective/kl": 12.80933666229248, "objective/non_score_reward": -1.2809334993362427, "objective/rlhf_reward": -3.001028003469978, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 136.77401733398438, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6044674515724182, "step": 341, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9971853494644165 }, { "episode": 5488, "epoch": 0.098644713664306, "loss/policy_avg": 0.07032056152820587, "lr": 2.9344325153374233e-06, "objective/entropy": 34.30079650878906, "objective/kl": 11.535228729248047, "objective/non_score_reward": -1.1535229682922363, "objective/rlhf_reward": -2.4913854024567943, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 80.33157348632812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5720343589782715, "step": 342, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9973433017730713 }, { "episode": 5504, "epoch": 0.09893230758169465, "loss/policy_avg": 0.039407968521118164, "lr": 2.9342407975460126e-06, "objective/entropy": 29.278091430664062, "objective/kl": 1.9333374500274658, "objective/non_score_reward": -0.19333375990390778, "objective/rlhf_reward": 1.0514937088164578, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 2.3760266304016113, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8676018714904785, "step": 343, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0017752647399902 }, { "episode": 5520, "epoch": 0.09921990149908329, "loss/policy_avg": 0.3448472023010254, "lr": 2.9340490797546014e-06, "objective/entropy": -73.16712951660156, "objective/kl": 12.475850105285645, "objective/non_score_reward": -1.2475850582122803, "objective/rlhf_reward": -2.066621486784193, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 89.76551818847656, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4174689054489136, "step": 344, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.997223973274231 }, { "episode": 5536, "epoch": 0.09950749541647194, "loss/policy_avg": 0.019372761249542236, "lr": 2.93385736196319e-06, "objective/entropy": 100.60924530029297, "objective/kl": 12.531920433044434, "objective/non_score_reward": -1.2531919479370117, "objective/rlhf_reward": -4.612768149375915, "objective/scores": 0.1, "policy/approxkl_avg": 26.791622161865234, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.47664302587509155, "step": 345, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0009078979492188 }, { "episode": 5552, "epoch": 0.0997950893338606, "loss/policy_avg": 0.6332914233207703, "lr": 2.933665644171779e-06, "objective/entropy": 134.54344177246094, "objective/kl": 11.1735200881958, "objective/non_score_reward": -1.11735200881958, "objective/rlhf_reward": -4.0694083034992214, "objective/scores": 0.1, "policy/approxkl_avg": 68.55665588378906, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.561040997505188, "step": 346, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.998960256576538 }, { "episode": 5568, "epoch": 0.10008268325124924, "loss/policy_avg": 0.11013670265674591, "lr": 2.9334739263803682e-06, "objective/entropy": -63.645904541015625, "objective/kl": 14.62928581237793, "objective/non_score_reward": -1.4629285335540771, "objective/rlhf_reward": -7.851714134216309, "objective/scores": -0.5, "policy/approxkl_avg": 86.95831298828125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7983701229095459, "step": 347, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9993393421173096 }, { "episode": 5584, "epoch": 0.10037027716863789, "loss/policy_avg": 0.5474449396133423, "lr": 2.933282208588957e-06, "objective/entropy": -133.3090362548828, "objective/kl": 13.566909790039062, "objective/non_score_reward": -1.3566908836364746, "objective/rlhf_reward": -1.0267638623714443, "objective/scores": 1.1, "policy/approxkl_avg": 108.0693359375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7627835273742676, "step": 348, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999748706817627 }, { "episode": 5600, "epoch": 0.10065787108602653, "loss/policy_avg": 0.07263286411762238, "lr": 2.933090490797546e-06, "objective/entropy": 126.58871459960938, "objective/kl": 9.20844554901123, "objective/non_score_reward": -0.920844554901123, "objective/rlhf_reward": 0.7166217207908634, "objective/scores": 1.1, "policy/approxkl_avg": 49.46379089355469, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.9294000864028931, "step": 349, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9979853630065918 }, { "episode": 5616, "epoch": 0.10094546500341518, "loss/policy_avg": 0.1791333109140396, "lr": 2.932898773006135e-06, "objective/entropy": 0.43863677978515625, "objective/kl": 8.826881408691406, "objective/non_score_reward": -0.8826882243156433, "objective/rlhf_reward": -1.130752792954445, "objective/scores": 0.6, "policy/approxkl_avg": 41.58755874633789, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6624951362609863, "step": 350, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9989933967590332 }, { "episode": 5632, "epoch": 0.10123305892080382, "loss/policy_avg": 0.20636233687400818, "lr": 2.932707055214724e-06, "objective/entropy": 150.28713989257812, "objective/kl": 8.030426025390625, "objective/non_score_reward": -0.8030425906181335, "objective/rlhf_reward": -5.212170124053955, "objective/scores": -0.5, "policy/approxkl_avg": 98.02102661132812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7396011352539062, "step": 351, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0004942417144775 }, { "episode": 5648, "epoch": 0.10152065283819248, "loss/policy_avg": 0.07704152166843414, "lr": 2.932515337423313e-06, "objective/entropy": 49.47189712524414, "objective/kl": 14.631547927856445, "objective/non_score_reward": -1.4631547927856445, "objective/rlhf_reward": -5.45261919721961, "objective/scores": 0.1, "policy/approxkl_avg": 92.44937133789062, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7191329002380371, "step": 352, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9998352527618408 }, { "episode": 5664, "epoch": 0.10180824675558112, "loss/policy_avg": 0.09739228338003159, "lr": 2.932323619631902e-06, "objective/entropy": -24.802486419677734, "objective/kl": 10.526655197143555, "objective/non_score_reward": -1.0526655912399292, "objective/rlhf_reward": 0.1893375083804134, "objective/scores": 1.1, "policy/approxkl_avg": 132.25132751464844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6535590887069702, "step": 353, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.997605562210083 }, { "episode": 5680, "epoch": 0.10209584067296977, "loss/policy_avg": 0.13763202726840973, "lr": 2.9321319018404907e-06, "objective/entropy": -119.98158264160156, "objective/kl": 5.913897514343262, "objective/non_score_reward": -0.5913897752761841, "objective/rlhf_reward": -1.9655589669942855, "objective/scores": 0.1, "policy/approxkl_avg": 31.558517456054688, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6357654333114624, "step": 354, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000248908996582 }, { "episode": 5696, "epoch": 0.10238343459035841, "loss/policy_avg": 0.527988851070404, "lr": 2.93194018404908e-06, "objective/entropy": -29.399810791015625, "objective/kl": 13.658191680908203, "objective/non_score_reward": -1.3658192157745361, "objective/rlhf_reward": -3.063276922702789, "objective/scores": 0.6, "policy/approxkl_avg": 92.77324676513672, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.728462815284729, "step": 355, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9965999126434326 }, { "episode": 5712, "epoch": 0.10267102850774706, "loss/policy_avg": 0.6842390298843384, "lr": 2.9317484662576688e-06, "objective/entropy": 109.09453582763672, "objective/kl": 10.667269706726074, "objective/non_score_reward": -1.0667269229888916, "objective/rlhf_reward": -2.710648416486338, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 69.14006042480469, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7146704196929932, "step": 356, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998762607574463 }, { "episode": 5728, "epoch": 0.1029586224251357, "loss/policy_avg": 0.07212770730257034, "lr": 2.9315567484662576e-06, "objective/entropy": -49.94731903076172, "objective/kl": 2.350062370300293, "objective/non_score_reward": -0.23500625789165497, "objective/rlhf_reward": -0.5400250017642976, "objective/scores": 0.1, "policy/approxkl_avg": 1.055467128753662, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5184276103973389, "step": 357, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0012760162353516 }, { "episode": 5744, "epoch": 0.10324621634252436, "loss/policy_avg": -0.052633900195360184, "lr": 2.931365030674847e-06, "objective/entropy": -12.867652893066406, "objective/kl": 7.817996025085449, "objective/non_score_reward": -0.7817996740341187, "objective/rlhf_reward": -5.127198219299316, "objective/scores": -0.5, "policy/approxkl_avg": 38.15538787841797, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7452627420425415, "step": 358, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9967327117919922 }, { "episode": 5760, "epoch": 0.103533810259913, "loss/policy_avg": 0.3177230954170227, "lr": 2.9311733128834356e-06, "objective/entropy": 126.2174301147461, "objective/kl": 15.039046287536621, "objective/non_score_reward": -1.5039048194885254, "objective/rlhf_reward": -8.015619277954102, "objective/scores": -0.5, "policy/approxkl_avg": 148.06988525390625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5116921067237854, "step": 359, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9969123601913452 }, { "episode": 5776, "epoch": 0.10382140417730165, "loss/policy_avg": 0.015442397445440292, "lr": 2.930981595092025e-06, "objective/entropy": 113.17227935791016, "objective/kl": 15.746637344360352, "objective/non_score_reward": -1.574663758277893, "objective/rlhf_reward": -5.898655241727829, "objective/scores": 0.1, "policy/approxkl_avg": 97.0933837890625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6202758550643921, "step": 360, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9986428022384644 }, { "episode": 5792, "epoch": 0.10410899809469029, "loss/policy_avg": 0.2362920194864273, "lr": 2.9307898773006137e-06, "objective/entropy": -133.8544464111328, "objective/kl": 9.721014022827148, "objective/non_score_reward": -0.9721014499664307, "objective/rlhf_reward": 0.5115940213203434, "objective/scores": 1.1, "policy/approxkl_avg": 94.33212280273438, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6412885189056396, "step": 361, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9975833892822266 }, { "episode": 5808, "epoch": 0.10439659201207895, "loss/policy_avg": 0.16871516406536102, "lr": 2.9305981595092025e-06, "objective/entropy": -65.79563903808594, "objective/kl": 13.480731964111328, "objective/non_score_reward": -1.3480732440948486, "objective/rlhf_reward": -2.468574051500532, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 170.740234375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7938269972801208, "step": 362, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9986903667449951 }, { "episode": 5824, "epoch": 0.10468418592946759, "loss/policy_avg": 0.2472498118877411, "lr": 2.9304064417177917e-06, "objective/entropy": 209.15557861328125, "objective/kl": 8.526037216186523, "objective/non_score_reward": -0.8526037335395813, "objective/rlhf_reward": 0.989584976434708, "objective/scores": 1.1, "policy/approxkl_avg": 67.1485366821289, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.9279449582099915, "step": 363, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000206470489502 }, { "episode": 5840, "epoch": 0.10497177984685624, "loss/policy_avg": -0.05213417112827301, "lr": 2.9302147239263805e-06, "objective/entropy": -235.3236541748047, "objective/kl": 16.88799285888672, "objective/non_score_reward": -1.688799262046814, "objective/rlhf_reward": -6.355197063088417, "objective/scores": 0.1, "policy/approxkl_avg": 340.7134094238281, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7127069234848022, "step": 364, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9987154006958008 }, { "episode": 5856, "epoch": 0.10525937376424489, "loss/policy_avg": 0.2484489381313324, "lr": 2.9300230061349698e-06, "objective/entropy": -33.450096130371094, "objective/kl": 13.260076522827148, "objective/non_score_reward": -1.326007604598999, "objective/rlhf_reward": -4.9040304780006405, "objective/scores": 0.1, "policy/approxkl_avg": 131.92379760742188, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5392622947692871, "step": 365, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9975396394729614 }, { "episode": 5872, "epoch": 0.10554696768163353, "loss/policy_avg": -0.4047367572784424, "lr": 2.9298312883435586e-06, "objective/entropy": 82.97904968261719, "objective/kl": 7.515501499176025, "objective/non_score_reward": -0.7515501976013184, "objective/rlhf_reward": -0.082481582404348, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 43.22978973388672, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.5681832432746887, "step": 366, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9999988079071045 }, { "episode": 5888, "epoch": 0.10583456159902219, "loss/policy_avg": 0.2642351984977722, "lr": 2.929639570552147e-06, "objective/entropy": 75.05047607421875, "objective/kl": 7.34889030456543, "objective/non_score_reward": -0.734889030456543, "objective/rlhf_reward": 1.4604437291622165, "objective/scores": 1.1, "policy/approxkl_avg": 24.294891357421875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6089534759521484, "step": 367, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9984581470489502 }, { "episode": 5904, "epoch": 0.10612215551641083, "loss/policy_avg": 0.4635845720767975, "lr": 2.929447852760736e-06, "objective/entropy": 111.49870300292969, "objective/kl": 14.92940902709961, "objective/non_score_reward": -1.4929410219192505, "objective/rlhf_reward": -1.5717640727758404, "objective/scores": 1.1, "policy/approxkl_avg": 170.13226318359375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.896259069442749, "step": 368, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.99690842628479 }, { "episode": 5920, "epoch": 0.10640974943379948, "loss/policy_avg": 0.0428597554564476, "lr": 2.929256134969325e-06, "objective/entropy": -17.536102294921875, "objective/kl": 13.898289680480957, "objective/non_score_reward": -1.3898290395736694, "objective/rlhf_reward": -1.1593163371086117, "objective/scores": 1.1, "policy/approxkl_avg": 113.92097473144531, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5009787678718567, "step": 369, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998550534248352 }, { "episode": 5936, "epoch": 0.10669734335118812, "loss/policy_avg": 0.4961566627025604, "lr": 2.9290644171779142e-06, "objective/entropy": 40.97224426269531, "objective/kl": 8.79596996307373, "objective/non_score_reward": -0.8795971274375916, "objective/rlhf_reward": -0.5946695550691811, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 88.30393981933594, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6913511753082275, "step": 370, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9971789121627808 }, { "episode": 5952, "epoch": 0.10698493726857677, "loss/policy_avg": 0.3505915701389313, "lr": 2.928872699386503e-06, "objective/entropy": -14.86764907836914, "objective/kl": 13.301910400390625, "objective/non_score_reward": -1.3301911354064941, "objective/rlhf_reward": -3.495935942205499, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 67.43904113769531, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5367602109909058, "step": 371, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998258352279663 }, { "episode": 5968, "epoch": 0.10727253118596541, "loss/policy_avg": 0.04512263089418411, "lr": 2.928680981595092e-06, "objective/entropy": 106.7132568359375, "objective/kl": 7.053742408752441, "objective/non_score_reward": -0.7053742408752441, "objective/rlhf_reward": -2.4214967399835583, "objective/scores": 0.1, "policy/approxkl_avg": 64.6439437866211, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6978051066398621, "step": 372, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0001304149627686 }, { "episode": 5984, "epoch": 0.10756012510335407, "loss/policy_avg": 0.10745556652545929, "lr": 2.928489263803681e-06, "objective/entropy": -95.63072967529297, "objective/kl": 16.271547317504883, "objective/non_score_reward": -1.6271545886993408, "objective/rlhf_reward": -2.1086182355880734, "objective/scores": 1.1, "policy/approxkl_avg": 83.08837890625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8408117294311523, "step": 373, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9974865913391113 }, { "episode": 6000, "epoch": 0.10784771902074271, "loss/policy_avg": 0.18899598717689514, "lr": 2.92829754601227e-06, "objective/entropy": -149.71536254882812, "objective/kl": 12.09565544128418, "objective/non_score_reward": -1.2095654010772705, "objective/rlhf_reward": -0.438261783123016, "objective/scores": 1.1, "policy/approxkl_avg": 102.32780456542969, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5978922843933105, "step": 374, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9985668659210205 }, { "episode": 6016, "epoch": 0.10813531293813136, "loss/policy_avg": 0.0514984093606472, "lr": 2.928105828220859e-06, "objective/entropy": 53.81720733642578, "objective/kl": 13.123618125915527, "objective/non_score_reward": -1.3123618364334106, "objective/rlhf_reward": -3.4246185078945865, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 97.61923217773438, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.561882495880127, "step": 375, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9986340999603271 }, { "episode": 6032, "epoch": 0.10842290685552, "loss/policy_avg": 0.05128341168165207, "lr": 2.927914110429448e-06, "objective/entropy": -142.33819580078125, "objective/kl": 12.197196960449219, "objective/non_score_reward": -1.219719648361206, "objective/rlhf_reward": -3.274758789602833, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 29.826644897460938, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7544271945953369, "step": 376, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0010008811950684 }, { "episode": 6048, "epoch": 0.10871050077290866, "loss/policy_avg": 0.3710365891456604, "lr": 2.9277223926380367e-06, "objective/entropy": -158.48403930664062, "objective/kl": 12.475811958312988, "objective/non_score_reward": -1.2475812435150146, "objective/rlhf_reward": -6.990324974060059, "objective/scores": -0.5, "policy/approxkl_avg": 35.358436584472656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7211358547210693, "step": 377, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9964120388031006 }, { "episode": 6064, "epoch": 0.1089980946902973, "loss/policy_avg": 0.44148433208465576, "lr": 2.927530674846626e-06, "objective/entropy": 203.32000732421875, "objective/kl": 11.192790985107422, "objective/non_score_reward": -1.1192790269851685, "objective/rlhf_reward": -0.07711610794067347, "objective/scores": 1.1, "policy/approxkl_avg": 140.77037048339844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.49844634532928467, "step": 378, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.996138334274292 }, { "episode": 6080, "epoch": 0.10928568860768595, "loss/policy_avg": 0.07269307225942612, "lr": 2.9273389570552148e-06, "objective/entropy": -48.06731414794922, "objective/kl": 7.008990287780762, "objective/non_score_reward": -0.7008991241455078, "objective/rlhf_reward": -0.40359642207622537, "objective/scores": 0.6, "policy/approxkl_avg": 30.824771881103516, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.765288233757019, "step": 379, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0009591579437256 }, { "episode": 6096, "epoch": 0.10957328252507459, "loss/policy_avg": 0.3331525921821594, "lr": 2.9271472392638036e-06, "objective/entropy": -129.8748779296875, "objective/kl": 6.912174224853516, "objective/non_score_reward": -0.6912174224853516, "objective/rlhf_reward": -2.3648696750402447, "objective/scores": 0.1, "policy/approxkl_avg": 86.31391143798828, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7057325839996338, "step": 380, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999925136566162 }, { "episode": 6112, "epoch": 0.10986087644246324, "loss/policy_avg": 0.2209397256374359, "lr": 2.926955521472393e-06, "objective/entropy": -17.206472396850586, "objective/kl": 12.392889022827148, "objective/non_score_reward": -1.2392890453338623, "objective/rlhf_reward": -3.2238229076067606, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 61.33112716674805, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7327935695648193, "step": 381, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999511957168579 }, { "episode": 6128, "epoch": 0.11014847035985188, "loss/policy_avg": 0.015440240502357483, "lr": 2.9267638036809816e-06, "objective/entropy": 136.05276489257812, "objective/kl": 10.93641471862793, "objective/non_score_reward": -1.0936416387557983, "objective/rlhf_reward": -3.9745664507150646, "objective/scores": 0.1, "policy/approxkl_avg": 7.861133575439453, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.642856240272522, "step": 382, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0005943775177 }, { "episode": 6144, "epoch": 0.11043606427724054, "loss/policy_avg": 0.4618483781814575, "lr": 2.926572085889571e-06, "objective/entropy": 248.1273193359375, "objective/kl": 10.450383186340332, "objective/non_score_reward": -1.0450382232666016, "objective/rlhf_reward": -6.180152893066406, "objective/scores": -0.5, "policy/approxkl_avg": 22.98204231262207, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.585578441619873, "step": 383, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9986340999603271 }, { "episode": 6160, "epoch": 0.11072365819462919, "loss/policy_avg": 0.527269721031189, "lr": 2.9263803680981597e-06, "objective/entropy": -85.81866455078125, "objective/kl": 7.232128143310547, "objective/non_score_reward": -0.7232127785682678, "objective/rlhf_reward": -0.4928510844707489, "objective/scores": 0.6, "policy/approxkl_avg": 35.018524169921875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6665592789649963, "step": 384, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9987972974777222 }, { "episode": 6176, "epoch": 0.11101125211201783, "loss/policy_avg": 0.012356449849903584, "lr": 2.9261886503067485e-06, "objective/entropy": -87.99043273925781, "objective/kl": 15.716808319091797, "objective/non_score_reward": -1.5716807842254639, "objective/rlhf_reward": -1.8867232263088223, "objective/scores": 1.1, "policy/approxkl_avg": 112.06707763671875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8166660070419312, "step": 385, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9980388879776 }, { "episode": 6192, "epoch": 0.11129884602940648, "loss/policy_avg": 0.21891015768051147, "lr": 2.9259969325153377e-06, "objective/entropy": 149.92320251464844, "objective/kl": 15.745594024658203, "objective/non_score_reward": -1.5745596885681152, "objective/rlhf_reward": -8.298238754272461, "objective/scores": -0.5, "policy/approxkl_avg": 270.8348693847656, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5218294858932495, "step": 386, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9982883930206299 }, { "episode": 6208, "epoch": 0.11158643994679512, "loss/policy_avg": 0.1566510945558548, "lr": 2.9258052147239265e-06, "objective/entropy": 279.3895263671875, "objective/kl": 14.221332550048828, "objective/non_score_reward": -1.4221333265304565, "objective/rlhf_reward": -7.688533306121826, "objective/scores": -0.5, "policy/approxkl_avg": 161.9359588623047, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.9130169153213501, "step": 387, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9984962940216064 }, { "episode": 6224, "epoch": 0.11187403386418378, "loss/policy_avg": 0.3156845271587372, "lr": 2.9256134969325158e-06, "objective/entropy": -91.46309661865234, "objective/kl": 5.5192670822143555, "objective/non_score_reward": -0.5519267320632935, "objective/rlhf_reward": -4.207706928253174, "objective/scores": -0.5, "policy/approxkl_avg": 80.50323486328125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4393225312232971, "step": 388, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9989399909973145 }, { "episode": 6240, "epoch": 0.11216162778157242, "loss/policy_avg": -0.24140848219394684, "lr": 2.925421779141104e-06, "objective/entropy": -6.909185409545898, "objective/kl": 7.97199821472168, "objective/non_score_reward": -0.7971999049186707, "objective/rlhf_reward": -1.066093357578788, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 99.13380432128906, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.4111158847808838, "step": 389, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.03024959564209 }, { "episode": 6256, "epoch": 0.11244922169896107, "loss/policy_avg": 0.10580594837665558, "lr": 2.9252300613496934e-06, "objective/entropy": 139.240234375, "objective/kl": 7.618363857269287, "objective/non_score_reward": -0.7618364095687866, "objective/rlhf_reward": -0.6473455265164376, "objective/scores": 0.6, "policy/approxkl_avg": 75.60391998291016, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6820776462554932, "step": 390, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000373363494873 }, { "episode": 6272, "epoch": 0.11273681561634971, "loss/policy_avg": 0.23864039778709412, "lr": 2.925038343558282e-06, "objective/entropy": -165.75607299804688, "objective/kl": 9.443279266357422, "objective/non_score_reward": -0.9443280696868896, "objective/rlhf_reward": -5.777312278747559, "objective/scores": -0.5, "policy/approxkl_avg": 73.20270538330078, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6525593996047974, "step": 391, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9987553358078003 }, { "episode": 6288, "epoch": 0.11302440953373837, "loss/policy_avg": 0.01731543242931366, "lr": 2.924846625766871e-06, "objective/entropy": 202.19996643066406, "objective/kl": 15.700738906860352, "objective/non_score_reward": -1.5700738430023193, "objective/rlhf_reward": -5.880295610427856, "objective/scores": 0.1, "policy/approxkl_avg": 44.909934997558594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5537660121917725, "step": 392, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9973206520080566 }, { "episode": 6304, "epoch": 0.113312003451127, "loss/policy_avg": 0.37011197209358215, "lr": 2.9246549079754602e-06, "objective/entropy": -100.13412475585938, "objective/kl": 7.34639835357666, "objective/non_score_reward": -0.7346398234367371, "objective/rlhf_reward": -0.5385593235492707, "objective/scores": 0.6, "policy/approxkl_avg": 39.805572509765625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5902200937271118, "step": 393, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9974175691604614 }, { "episode": 6320, "epoch": 0.11359959736851566, "loss/policy_avg": 0.1728161722421646, "lr": 2.924463190184049e-06, "objective/entropy": 12.221923828125, "objective/kl": 8.158146858215332, "objective/non_score_reward": -0.8158146142959595, "objective/rlhf_reward": -2.863258576393127, "objective/scores": 0.1, "policy/approxkl_avg": 17.08139419555664, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6115730404853821, "step": 394, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9969968795776367 }, { "episode": 6336, "epoch": 0.1138871912859043, "loss/policy_avg": 0.1638275384902954, "lr": 2.924271472392638e-06, "objective/entropy": -92.78604125976562, "objective/kl": 10.350639343261719, "objective/non_score_reward": -1.0350639820098877, "objective/rlhf_reward": -3.7402558535337445, "objective/scores": 0.1, "policy/approxkl_avg": 192.456298828125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5784432888031006, "step": 395, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9983556270599365 }, { "episode": 6352, "epoch": 0.11417478520329295, "loss/policy_avg": 0.008675817400217056, "lr": 2.924079754601227e-06, "objective/entropy": 14.630111694335938, "objective/kl": 11.06743049621582, "objective/non_score_reward": -1.1067430973052979, "objective/rlhf_reward": -4.026972463726997, "objective/scores": 0.1, "policy/approxkl_avg": 110.01115417480469, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.9663007259368896, "step": 396, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9992257356643677 }, { "episode": 6368, "epoch": 0.11446237912068159, "loss/policy_avg": 0.22570039331912994, "lr": 2.923888036809816e-06, "objective/entropy": -62.594932556152344, "objective/kl": 11.640623092651367, "objective/non_score_reward": -1.1640625, "objective/rlhf_reward": -2.533543454782043, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 53.970237731933594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6753679513931274, "step": 397, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9984841346740723 }, { "episode": 6384, "epoch": 0.11474997303807025, "loss/policy_avg": 0.18725144863128662, "lr": 2.923696319018405e-06, "objective/entropy": -118.94644165039062, "objective/kl": 10.655905723571777, "objective/non_score_reward": -1.0655906200408936, "objective/rlhf_reward": -3.8623625993728634, "objective/scores": 0.1, "policy/approxkl_avg": 19.7579345703125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.645980954170227, "step": 398, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0001778602600098 }, { "episode": 6400, "epoch": 0.11503756695545889, "loss/policy_avg": 0.04687364026904106, "lr": 2.923504601226994e-06, "objective/entropy": 107.14441680908203, "objective/kl": 5.046243667602539, "objective/non_score_reward": -0.5046243071556091, "objective/rlhf_reward": 2.381502674520016, "objective/scores": 1.1, "policy/approxkl_avg": 28.62555503845215, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6493499279022217, "step": 399, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.00180721282959 }, { "episode": 6416, "epoch": 0.11532516087284754, "loss/policy_avg": 0.18883462250232697, "lr": 2.9233128834355827e-06, "objective/entropy": 93.51673126220703, "objective/kl": 9.593416213989258, "objective/non_score_reward": -0.9593416452407837, "objective/rlhf_reward": -1.4373664021492005, "objective/scores": 0.6, "policy/approxkl_avg": 43.221275329589844, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5561137199401855, "step": 400, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9976122379302979 }, { "episode": 6432, "epoch": 0.11561275479023618, "loss/policy_avg": 0.07294710725545883, "lr": 2.923121165644172e-06, "objective/entropy": 41.92622375488281, "objective/kl": 10.513269424438477, "objective/non_score_reward": -1.0513269901275635, "objective/rlhf_reward": -6.205307960510254, "objective/scores": -0.5, "policy/approxkl_avg": 136.67205810546875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.40027546882629395, "step": 401, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000309944152832 }, { "episode": 6448, "epoch": 0.11590034870762483, "loss/policy_avg": -0.12561793625354767, "lr": 2.9229294478527608e-06, "objective/entropy": 82.13600158691406, "objective/kl": 7.441690444946289, "objective/non_score_reward": -0.7441689968109131, "objective/rlhf_reward": -0.05295715177175664, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 81.62566375732422, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7616924047470093, "step": 402, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0073320865631104 }, { "episode": 6464, "epoch": 0.11618794262501349, "loss/policy_avg": 0.19540420174598694, "lr": 2.92273773006135e-06, "objective/entropy": -97.04734802246094, "objective/kl": 9.546144485473633, "objective/non_score_reward": -0.954614520072937, "objective/rlhf_reward": -5.818458080291748, "objective/scores": -0.5, "policy/approxkl_avg": 41.53440475463867, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6575814485549927, "step": 403, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998401403427124 }, { "episode": 6480, "epoch": 0.11647553654240213, "loss/policy_avg": 0.2724316716194153, "lr": 2.922546012269939e-06, "objective/entropy": -222.66941833496094, "objective/kl": 6.35382080078125, "objective/non_score_reward": -0.6353820562362671, "objective/rlhf_reward": -2.141528314352035, "objective/scores": 0.1, "policy/approxkl_avg": 45.857295989990234, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5307646989822388, "step": 404, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0005037784576416 }, { "episode": 6496, "epoch": 0.11676313045979078, "loss/policy_avg": -0.19320714473724365, "lr": 2.9223542944785276e-06, "objective/entropy": 128.687744140625, "objective/kl": 9.543111801147461, "objective/non_score_reward": -0.954311192035675, "objective/rlhf_reward": -2.2131247407832912, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 83.3912353515625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5714800953865051, "step": 405, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.014221668243408 }, { "episode": 6512, "epoch": 0.11705072437717942, "loss/policy_avg": -0.02320697158575058, "lr": 2.922162576687117e-06, "objective/entropy": -111.4472427368164, "objective/kl": 14.546286582946777, "objective/non_score_reward": -1.4546287059783936, "objective/rlhf_reward": -5.418514943122863, "objective/scores": 0.1, "policy/approxkl_avg": 198.98716735839844, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6330698728561401, "step": 406, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.000318765640259 }, { "episode": 6528, "epoch": 0.11733831829456808, "loss/policy_avg": -0.05689065158367157, "lr": 2.9219708588957057e-06, "objective/entropy": -94.29441833496094, "objective/kl": 6.670037746429443, "objective/non_score_reward": -0.6670037508010864, "objective/rlhf_reward": 1.731984862685204, "objective/scores": 1.1, "policy/approxkl_avg": 58.32909393310547, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4962747097015381, "step": 407, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0033135414123535 }, { "episode": 6544, "epoch": 0.11762591221195672, "loss/policy_avg": 0.058236684650182724, "lr": 2.9217791411042945e-06, "objective/entropy": 73.15773010253906, "objective/kl": 10.113113403320312, "objective/non_score_reward": -1.011311411857605, "objective/rlhf_reward": -3.6452456325292584, "objective/scores": 0.1, "policy/approxkl_avg": 100.70352172851562, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4618947505950928, "step": 408, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9986381530761719 }, { "episode": 6560, "epoch": 0.11791350612934537, "loss/policy_avg": 0.2816429138183594, "lr": 2.9215874233128837e-06, "objective/entropy": 199.59906005859375, "objective/kl": 8.274097442626953, "objective/non_score_reward": -0.8274096846580505, "objective/rlhf_reward": -5.309638977050781, "objective/scores": -0.5, "policy/approxkl_avg": 47.900413513183594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4910353124141693, "step": 409, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.000108480453491 }, { "episode": 6576, "epoch": 0.11820110004673401, "loss/policy_avg": 0.43750858306884766, "lr": 2.9213957055214725e-06, "objective/entropy": 63.67340087890625, "objective/kl": 17.44285011291504, "objective/non_score_reward": -1.7442851066589355, "objective/rlhf_reward": -4.577140188217163, "objective/scores": 0.6, "policy/approxkl_avg": 116.11654663085938, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7501018047332764, "step": 410, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9976780414581299 }, { "episode": 6592, "epoch": 0.11848869396412266, "loss/policy_avg": 0.32210612297058105, "lr": 2.9212039877300618e-06, "objective/entropy": -220.1966552734375, "objective/kl": 11.985102653503418, "objective/non_score_reward": -1.1985102891921997, "objective/rlhf_reward": -4.39404130578041, "objective/scores": 0.1, "policy/approxkl_avg": 161.2996826171875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5652351379394531, "step": 411, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9979429244995117 }, { "episode": 6608, "epoch": 0.1187762878815113, "loss/policy_avg": 0.1565137803554535, "lr": 2.92101226993865e-06, "objective/entropy": -16.16411781311035, "objective/kl": 14.623466491699219, "objective/non_score_reward": -1.4623467922210693, "objective/rlhf_reward": -7.849387168884277, "objective/scores": -0.5, "policy/approxkl_avg": 106.36109924316406, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5213359594345093, "step": 412, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.997380256652832 }, { "episode": 6624, "epoch": 0.11906388179889996, "loss/policy_avg": 0.16901980340480804, "lr": 2.9208205521472394e-06, "objective/entropy": -32.831336975097656, "objective/kl": 9.713844299316406, "objective/non_score_reward": -0.9713844060897827, "objective/rlhf_reward": -1.9381265444325761, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 27.6256160736084, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5835360288619995, "step": 413, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000209093093872 }, { "episode": 6640, "epoch": 0.1193514757162886, "loss/policy_avg": -0.03889453411102295, "lr": 2.920628834355828e-06, "objective/entropy": 3.4859085083007812, "objective/kl": 7.356728553771973, "objective/non_score_reward": -0.7356729507446289, "objective/rlhf_reward": -1.3385716415086562, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 150.19898986816406, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6149972677230835, "step": 414, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.001542568206787 }, { "episode": 6656, "epoch": 0.11963906963367725, "loss/policy_avg": 0.07131887972354889, "lr": 2.920437116564417e-06, "objective/entropy": 135.44573974609375, "objective/kl": 6.854515075683594, "objective/non_score_reward": -0.6854515075683594, "objective/rlhf_reward": -4.7418060302734375, "objective/scores": -0.5, "policy/approxkl_avg": 11.425703048706055, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6121791005134583, "step": 415, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.997809886932373 }, { "episode": 6672, "epoch": 0.11992666355106589, "loss/policy_avg": -0.02449220046401024, "lr": 2.9202453987730062e-06, "objective/entropy": -228.9163818359375, "objective/kl": 3.998107433319092, "objective/non_score_reward": -0.3998107314109802, "objective/rlhf_reward": 0.8007570147514342, "objective/scores": 0.6, "policy/approxkl_avg": 1.9464802742004395, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6649434566497803, "step": 416, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9998199939727783 }, { "episode": 6688, "epoch": 0.12021425746845454, "loss/policy_avg": 0.1401190161705017, "lr": 2.920053680981595e-06, "objective/entropy": 109.60284423828125, "objective/kl": 11.780817031860352, "objective/non_score_reward": -1.178081750869751, "objective/rlhf_reward": -6.712327003479004, "objective/scores": -0.5, "policy/approxkl_avg": 101.54756927490234, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6730775833129883, "step": 417, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9986844062805176 }, { "episode": 6704, "epoch": 0.12050185138584318, "loss/policy_avg": -0.057150907814502716, "lr": 2.919861963190184e-06, "objective/entropy": 114.74760437011719, "objective/kl": 13.613632202148438, "objective/non_score_reward": -1.361363172531128, "objective/rlhf_reward": -7.44545316696167, "objective/scores": -0.5, "policy/approxkl_avg": 83.6063232421875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7602818012237549, "step": 418, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.99805748462677 }, { "episode": 6720, "epoch": 0.12078944530323184, "loss/policy_avg": 0.33992117643356323, "lr": 2.919670245398773e-06, "objective/entropy": 95.6858139038086, "objective/kl": 13.743134498596191, "objective/non_score_reward": -1.3743133544921875, "objective/rlhf_reward": -3.3745473048844676, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 54.830780029296875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.745945930480957, "step": 419, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9975709915161133 }, { "episode": 6736, "epoch": 0.12107703922062048, "loss/policy_avg": 0.04007640480995178, "lr": 2.919478527607362e-06, "objective/entropy": 158.9964599609375, "objective/kl": 11.750536918640137, "objective/non_score_reward": -1.175053596496582, "objective/rlhf_reward": -4.300214721262455, "objective/scores": 0.1, "policy/approxkl_avg": 262.0268249511719, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.691627025604248, "step": 420, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.001215934753418 }, { "episode": 6752, "epoch": 0.12136463313800913, "loss/policy_avg": 0.3095911741256714, "lr": 2.919286809815951e-06, "objective/entropy": -35.89824676513672, "objective/kl": 11.786452293395996, "objective/non_score_reward": -1.178645372390747, "objective/rlhf_reward": -4.314581578969955, "objective/scores": 0.1, "policy/approxkl_avg": 36.27446746826172, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8371065855026245, "step": 421, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9973646402359009 }, { "episode": 6768, "epoch": 0.12165222705539779, "loss/policy_avg": 0.22105564177036285, "lr": 2.91909509202454e-06, "objective/entropy": 15.297624588012695, "objective/kl": 9.64991283416748, "objective/non_score_reward": -0.9649913311004639, "objective/rlhf_reward": -3.459965533018112, "objective/scores": 0.1, "policy/approxkl_avg": 37.608909606933594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6050717830657959, "step": 422, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.99948251247406 }, { "episode": 6784, "epoch": 0.12193982097278643, "loss/policy_avg": 0.18998616933822632, "lr": 2.9189033742331287e-06, "objective/entropy": -28.578590393066406, "objective/kl": 5.720818519592285, "objective/non_score_reward": -0.5720819234848022, "objective/rlhf_reward": -0.3409164053963978, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 1.9535547494888306, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5882730484008789, "step": 423, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9995017051696777 }, { "episode": 6800, "epoch": 0.12222741489017508, "loss/policy_avg": 0.6313632726669312, "lr": 2.918711656441718e-06, "objective/entropy": 219.71212768554688, "objective/kl": 10.877071380615234, "objective/non_score_reward": -1.0877070426940918, "objective/rlhf_reward": -6.350828170776367, "objective/scores": -0.5, "policy/approxkl_avg": 94.11060333251953, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6537734270095825, "step": 424, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9975165128707886 }, { "episode": 6816, "epoch": 0.12251500880756372, "loss/policy_avg": 0.024708323180675507, "lr": 2.918519938650307e-06, "objective/entropy": -79.56177520751953, "objective/kl": 16.1546573638916, "objective/non_score_reward": -1.615465760231018, "objective/rlhf_reward": -2.0618630260229107, "objective/scores": 1.1, "policy/approxkl_avg": 151.0323028564453, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7285983562469482, "step": 425, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9983255863189697 }, { "episode": 6832, "epoch": 0.12280260272495237, "loss/policy_avg": 0.07644349336624146, "lr": 2.918328220858896e-06, "objective/entropy": -197.13235473632812, "objective/kl": 7.779457092285156, "objective/non_score_reward": -0.7779456973075867, "objective/rlhf_reward": -1.5555234988599567, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 37.899497985839844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6256439685821533, "step": 426, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9981521368026733 }, { "episode": 6848, "epoch": 0.12309019664234101, "loss/policy_avg": 0.26216840744018555, "lr": 2.918136503067485e-06, "objective/entropy": 4.240196228027344, "objective/kl": 9.129347801208496, "objective/non_score_reward": -0.9129348397254944, "objective/rlhf_reward": -5.651739120483398, "objective/scores": -0.5, "policy/approxkl_avg": 51.532691955566406, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6356798410415649, "step": 427, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9993443489074707 }, { "episode": 6864, "epoch": 0.12337779055972967, "loss/policy_avg": 0.08391077816486359, "lr": 2.9179447852760736e-06, "objective/entropy": 116.065185546875, "objective/kl": 12.789249420166016, "objective/non_score_reward": -1.2789249420166016, "objective/rlhf_reward": -4.71570006608963, "objective/scores": 0.1, "policy/approxkl_avg": 107.84225463867188, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6037728786468506, "step": 428, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9945564270019531 }, { "episode": 6880, "epoch": 0.1236653844771183, "loss/policy_avg": 0.5531384944915771, "lr": 2.917753067484663e-06, "objective/entropy": 259.3071594238281, "objective/kl": 14.592714309692383, "objective/non_score_reward": -1.4592714309692383, "objective/rlhf_reward": -4.175226414174425, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 109.08944702148438, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6318076848983765, "step": 429, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9996614456176758 }, { "episode": 6896, "epoch": 0.12395297839450696, "loss/policy_avg": 0.21222534775733948, "lr": 2.9175613496932517e-06, "objective/entropy": -58.68345642089844, "objective/kl": 11.372722625732422, "objective/non_score_reward": -1.1372722387313843, "objective/rlhf_reward": -1.625370000244352, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 109.58206939697266, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.47258567810058594, "step": 430, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9987989664077759 }, { "episode": 6912, "epoch": 0.1242405723118956, "loss/policy_avg": 0.22482730448246002, "lr": 2.9173696319018405e-06, "objective/entropy": 67.2789535522461, "objective/kl": 9.9976806640625, "objective/non_score_reward": -0.9997680187225342, "objective/rlhf_reward": -3.5990721903741356, "objective/scores": 0.1, "policy/approxkl_avg": 59.97844696044922, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5720049142837524, "step": 431, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9989635944366455 }, { "episode": 6928, "epoch": 0.12452816622928425, "loss/policy_avg": 0.4322272539138794, "lr": 2.9171779141104297e-06, "objective/entropy": -290.14111328125, "objective/kl": 14.833064079284668, "objective/non_score_reward": -1.4833064079284668, "objective/rlhf_reward": -4.108396942886422, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 118.15055847167969, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7155340313911438, "step": 432, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9969213008880615 }, { "episode": 6944, "epoch": 0.1248157601466729, "loss/policy_avg": 0.8206951022148132, "lr": 2.9169861963190185e-06, "objective/entropy": 81.16971588134766, "objective/kl": 12.39436149597168, "objective/non_score_reward": -1.2394360303878784, "objective/rlhf_reward": -0.55774433016777, "objective/scores": 1.1, "policy/approxkl_avg": 56.21806335449219, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.9199215173721313, "step": 433, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.997793436050415 }, { "episode": 6960, "epoch": 0.12510335406406153, "loss/policy_avg": 0.12932038307189941, "lr": 2.9167944785276073e-06, "objective/entropy": -108.72267150878906, "objective/kl": 9.69011116027832, "objective/non_score_reward": -0.969011127948761, "objective/rlhf_reward": -3.476044631004333, "objective/scores": 0.1, "policy/approxkl_avg": 113.92694091796875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7485846281051636, "step": 434, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000093936920166 }, { "episode": 6976, "epoch": 0.1253909479814502, "loss/policy_avg": 0.25388216972351074, "lr": 2.916602760736196e-06, "objective/entropy": 131.2146453857422, "objective/kl": 12.779239654541016, "objective/non_score_reward": -1.2779240608215332, "objective/rlhf_reward": -7.111696243286133, "objective/scores": -0.5, "policy/approxkl_avg": 195.51593017578125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.3997848629951477, "step": 435, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9982001781463623 }, { "episode": 6992, "epoch": 0.12567854189883884, "loss/policy_avg": 0.34683698415756226, "lr": 2.9164110429447854e-06, "objective/entropy": 94.69212341308594, "objective/kl": 8.857991218566895, "objective/non_score_reward": -0.8857991695404053, "objective/rlhf_reward": -5.543196678161621, "objective/scores": -0.5, "policy/approxkl_avg": 67.66943359375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6433195471763611, "step": 436, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9968899488449097 }, { "episode": 7008, "epoch": 0.12596613581622748, "loss/policy_avg": 0.4975810945034027, "lr": 2.916219325153374e-06, "objective/entropy": 99.60516357421875, "objective/kl": 17.621997833251953, "objective/non_score_reward": -1.762199878692627, "objective/rlhf_reward": -5.223970900448869, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 126.27960205078125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.9998393654823303, "step": 437, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9977905750274658 }, { "episode": 7024, "epoch": 0.12625372973361612, "loss/policy_avg": -0.40271705389022827, "lr": 2.916027607361963e-06, "objective/entropy": -58.73802947998047, "objective/kl": 11.266298294067383, "objective/non_score_reward": -1.1266300678253174, "objective/rlhf_reward": -2.68169152286918, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 36.971275329589844, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.5164896249771118, "step": 438, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0037713050842285 }, { "episode": 7040, "epoch": 0.1265413236510048, "loss/policy_avg": 0.21917138993740082, "lr": 2.9158358895705522e-06, "objective/entropy": -138.54464721679688, "objective/kl": 17.991540908813477, "objective/non_score_reward": -1.799154281616211, "objective/rlhf_reward": -5.073910774961982, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 85.51068878173828, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4984607994556427, "step": 439, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9979517459869385 }, { "episode": 7056, "epoch": 0.12682891756839343, "loss/policy_avg": 0.14226309955120087, "lr": 2.915644171779141e-06, "objective/entropy": -70.7157974243164, "objective/kl": 17.338790893554688, "objective/non_score_reward": -1.7338790893554688, "objective/rlhf_reward": -4.535516625642776, "objective/scores": 0.6, "policy/approxkl_avg": 281.78021240234375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6945629119873047, "step": 440, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9976835250854492 }, { "episode": 7072, "epoch": 0.12711651148578207, "loss/policy_avg": 0.7052698135375977, "lr": 2.9154524539877303e-06, "objective/entropy": -51.95797348022461, "objective/kl": 11.16838264465332, "objective/non_score_reward": -1.1168383359909058, "objective/rlhf_reward": -4.067353239655494, "objective/scores": 0.1, "policy/approxkl_avg": 145.99929809570312, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7760474681854248, "step": 441, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9976942539215088 }, { "episode": 7088, "epoch": 0.12740410540317074, "loss/policy_avg": 0.2039000242948532, "lr": 2.915260736196319e-06, "objective/entropy": 70.03821563720703, "objective/kl": 10.011724472045898, "objective/non_score_reward": -1.0011723041534424, "objective/rlhf_reward": -3.604689425230026, "objective/scores": 0.1, "policy/approxkl_avg": 79.66981506347656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5425417423248291, "step": 442, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9985206127166748 }, { "episode": 7104, "epoch": 0.12769169932055938, "loss/policy_avg": 0.046909917145967484, "lr": 2.915069018404908e-06, "objective/entropy": -109.3448486328125, "objective/kl": 10.856573104858398, "objective/non_score_reward": -1.0856573581695557, "objective/rlhf_reward": -3.9426295816898342, "objective/scores": 0.1, "policy/approxkl_avg": 99.86489868164062, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5526108741760254, "step": 443, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9977734088897705 }, { "episode": 7120, "epoch": 0.12797929323794802, "loss/policy_avg": 0.3472464084625244, "lr": 2.914877300613497e-06, "objective/entropy": 133.83642578125, "objective/kl": 8.187213897705078, "objective/non_score_reward": -0.8187214136123657, "objective/rlhf_reward": -1.6130260578995808, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 60.895503997802734, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6827129125595093, "step": 444, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9999275207519531 }, { "episode": 7136, "epoch": 0.12826688715533666, "loss/policy_avg": 0.030289731919765472, "lr": 2.914685582822086e-06, "objective/entropy": -95.79446411132812, "objective/kl": 12.967110633850098, "objective/non_score_reward": -1.2967112064361572, "objective/rlhf_reward": -4.786844870448112, "objective/scores": 0.1, "policy/approxkl_avg": 158.4815673828125, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6717748641967773, "step": 445, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9999089241027832 }, { "episode": 7152, "epoch": 0.12855448107272532, "loss/policy_avg": 0.3508151173591614, "lr": 2.9144938650306748e-06, "objective/entropy": -29.020809173583984, "objective/kl": 10.402070045471191, "objective/non_score_reward": -1.0402069091796875, "objective/rlhf_reward": -1.2371087416422095, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 22.769393920898438, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6285759210586548, "step": 446, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9999580383300781 }, { "episode": 7168, "epoch": 0.12884207499011396, "loss/policy_avg": 0.5095102190971375, "lr": 2.914302147239264e-06, "objective/entropy": 4.9149627685546875, "objective/kl": 11.270059585571289, "objective/non_score_reward": -1.1270060539245605, "objective/rlhf_reward": -0.10802436470985377, "objective/scores": 1.1, "policy/approxkl_avg": 42.081207275390625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6107115745544434, "step": 447, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9997491836547852 }, { "episode": 7184, "epoch": 0.1291296689075026, "loss/policy_avg": 0.15434984862804413, "lr": 2.914110429447853e-06, "objective/entropy": 93.47442626953125, "objective/kl": 5.827376842498779, "objective/non_score_reward": -0.5827376842498779, "objective/rlhf_reward": -4.330950736999512, "objective/scores": -0.5, "policy/approxkl_avg": 56.859039306640625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5905852913856506, "step": 448, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9985287189483643 }, { "episode": 7200, "epoch": 0.12941726282489124, "loss/policy_avg": 0.05858859419822693, "lr": 2.913918711656442e-06, "objective/entropy": -168.11392211914062, "objective/kl": 12.220864295959473, "objective/non_score_reward": -1.2220864295959473, "objective/rlhf_reward": -6.888345718383789, "objective/scores": -0.5, "policy/approxkl_avg": 64.31013488769531, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6738055348396301, "step": 449, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9984550476074219 }, { "episode": 7216, "epoch": 0.1297048567422799, "loss/policy_avg": 0.6401174068450928, "lr": 2.913726993865031e-06, "objective/entropy": 143.6673126220703, "objective/kl": 16.03870964050293, "objective/non_score_reward": -1.6038709878921509, "objective/rlhf_reward": -6.015483951568603, "objective/scores": 0.1, "policy/approxkl_avg": 215.43011474609375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.852541446685791, "step": 450, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9969959259033203 }, { "episode": 7232, "epoch": 0.12999245065966855, "loss/policy_avg": -0.20923450589179993, "lr": 2.9135352760736196e-06, "objective/entropy": -48.08766174316406, "objective/kl": 3.6980080604553223, "objective/non_score_reward": -0.36980074644088745, "objective/rlhf_reward": -3.479203224182129, "objective/scores": -0.5, "policy/approxkl_avg": 28.520061492919922, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.49264973402023315, "step": 451, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0105478763580322 }, { "episode": 7248, "epoch": 0.1302800445770572, "loss/policy_avg": 0.2910463511943817, "lr": 2.913343558282209e-06, "objective/entropy": -25.366256713867188, "objective/kl": 18.143402099609375, "objective/non_score_reward": -1.814340353012085, "objective/rlhf_reward": -9.25736141204834, "objective/scores": -0.5, "policy/approxkl_avg": 229.06854248046875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5178902745246887, "step": 452, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9988359212875366 }, { "episode": 7264, "epoch": 0.13056763849444583, "loss/policy_avg": 0.019646476954221725, "lr": 2.9131518404907977e-06, "objective/entropy": 163.14837646484375, "objective/kl": 15.703920364379883, "objective/non_score_reward": -1.5703920125961304, "objective/rlhf_reward": -4.548234538237253, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 97.07273864746094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.629855751991272, "step": 453, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0000038146972656 }, { "episode": 7280, "epoch": 0.1308552324118345, "loss/policy_avg": 0.03351786732673645, "lr": 2.912960122699387e-06, "objective/entropy": 106.53297424316406, "objective/kl": 15.689282417297363, "objective/non_score_reward": -1.5689281225204468, "objective/rlhf_reward": -8.275712966918945, "objective/scores": -0.5, "policy/approxkl_avg": 168.91738891601562, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5836728811264038, "step": 454, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0003838539123535 }, { "episode": 7296, "epoch": 0.13114282632922314, "loss/policy_avg": 0.025853008031845093, "lr": 2.9127684049079757e-06, "objective/entropy": -120.41839599609375, "objective/kl": 14.210792541503906, "objective/non_score_reward": -1.4210792779922485, "objective/rlhf_reward": -4.022457575023758, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 25.721412658691406, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5456880331039429, "step": 455, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998733639717102 }, { "episode": 7312, "epoch": 0.13143042024661178, "loss/policy_avg": 0.11350809037685394, "lr": 2.912576687116564e-06, "objective/entropy": 95.34707641601562, "objective/kl": 14.430747985839844, "objective/non_score_reward": -1.4430747032165527, "objective/rlhf_reward": -5.3722985744476315, "objective/scores": 0.1, "policy/approxkl_avg": 132.41342163085938, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5699273347854614, "step": 456, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.995617151260376 }, { "episode": 7328, "epoch": 0.13171801416400042, "loss/policy_avg": 0.5415328741073608, "lr": 2.9123849693251534e-06, "objective/entropy": -35.476009368896484, "objective/kl": 8.802606582641602, "objective/non_score_reward": -0.880260705947876, "objective/rlhf_reward": -3.1210429728031155, "objective/scores": 0.1, "policy/approxkl_avg": 70.38357543945312, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6030310988426208, "step": 457, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000958204269409 }, { "episode": 7344, "epoch": 0.1320056080813891, "loss/policy_avg": 0.18966403603553772, "lr": 2.912193251533742e-06, "objective/entropy": 160.48800659179688, "objective/kl": 14.643467903137207, "objective/non_score_reward": -1.4643468856811523, "objective/rlhf_reward": -1.45738730430603, "objective/scores": 1.1, "policy/approxkl_avg": 121.34632110595703, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7794003486633301, "step": 458, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9975476264953613 }, { "episode": 7360, "epoch": 0.13229320199877773, "loss/policy_avg": 0.038652483373880386, "lr": 2.9120015337423314e-06, "objective/entropy": -118.91299438476562, "objective/kl": 8.336200714111328, "objective/non_score_reward": -0.8336200714111328, "objective/rlhf_reward": -1.7303604072967347, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 37.65449523925781, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7707014083862305, "step": 459, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998978614807129 }, { "episode": 7376, "epoch": 0.13258079591616637, "loss/policy_avg": 0.2408444583415985, "lr": 2.91180981595092e-06, "objective/entropy": 8.406875610351562, "objective/kl": 12.288520812988281, "objective/non_score_reward": -1.2288521528244019, "objective/rlhf_reward": -6.915408134460449, "objective/scores": -0.5, "policy/approxkl_avg": 36.923194885253906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7405025959014893, "step": 460, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999840497970581 }, { "episode": 7392, "epoch": 0.13286838983355503, "loss/policy_avg": 0.15941619873046875, "lr": 2.911618098159509e-06, "objective/entropy": 113.75840759277344, "objective/kl": 7.790184020996094, "objective/non_score_reward": -0.7790184617042542, "objective/rlhf_reward": -2.7160738766193386, "objective/scores": 0.1, "policy/approxkl_avg": 42.8814697265625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7081342935562134, "step": 461, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9986152648925781 }, { "episode": 7408, "epoch": 0.13315598375094367, "loss/policy_avg": 0.012417584657669067, "lr": 2.9114263803680982e-06, "objective/entropy": -38.02949142456055, "objective/kl": 7.286294937133789, "objective/non_score_reward": -0.728629469871521, "objective/rlhf_reward": -4.914518356323242, "objective/scores": -0.5, "policy/approxkl_avg": 56.29476547241211, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.38131576776504517, "step": 462, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9974234104156494 }, { "episode": 7424, "epoch": 0.1334435776683323, "loss/policy_avg": -0.01223007496446371, "lr": 2.911234662576687e-06, "objective/entropy": -89.77444458007812, "objective/kl": 6.793033599853516, "objective/non_score_reward": -0.6793034076690674, "objective/rlhf_reward": -2.3172135859727856, "objective/scores": 0.1, "policy/approxkl_avg": 0.7712490558624268, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5063762664794922, "step": 463, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0153064727783203 }, { "episode": 7440, "epoch": 0.13373117158572095, "loss/policy_avg": 0.7638596892356873, "lr": 2.9110429447852763e-06, "objective/entropy": -124.14325714111328, "objective/kl": 12.126808166503906, "objective/non_score_reward": -1.2126808166503906, "objective/rlhf_reward": -4.450723177194595, "objective/scores": 0.1, "policy/approxkl_avg": 31.479761123657227, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6185396313667297, "step": 464, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9997889995574951 }, { "episode": 7456, "epoch": 0.13401876550310962, "loss/policy_avg": -0.23089107871055603, "lr": 2.910851226993865e-06, "objective/entropy": 12.087081909179688, "objective/kl": 9.909775733947754, "objective/non_score_reward": -0.9909776449203491, "objective/rlhf_reward": 0.4360893458127979, "objective/scores": 1.1, "policy/approxkl_avg": 77.47049713134766, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6658675670623779, "step": 465, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0070953369140625 }, { "episode": 7472, "epoch": 0.13430635942049826, "loss/policy_avg": 0.0948818027973175, "lr": 2.910659509202454e-06, "objective/entropy": 121.13216400146484, "objective/kl": 3.1745963096618652, "objective/non_score_reward": -0.3174596130847931, "objective/rlhf_reward": 0.33428153776733316, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 2.6572422981262207, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6701205372810364, "step": 466, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0000531673431396 }, { "episode": 7488, "epoch": 0.1345939533378869, "loss/policy_avg": 0.06078179180622101, "lr": 2.910467791411043e-06, "objective/entropy": 109.86882019042969, "objective/kl": 11.02462387084961, "objective/non_score_reward": -1.1024622917175293, "objective/rlhf_reward": -4.009849047660827, "objective/scores": 0.1, "policy/approxkl_avg": 32.87171936035156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5864859819412231, "step": 467, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9973387718200684 }, { "episode": 7504, "epoch": 0.13488154725527554, "loss/policy_avg": 0.3744744658470154, "lr": 2.910276073619632e-06, "objective/entropy": 224.23721313476562, "objective/kl": 11.501972198486328, "objective/non_score_reward": -1.1501970291137695, "objective/rlhf_reward": -4.200788414478302, "objective/scores": 0.1, "policy/approxkl_avg": 118.61778259277344, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.9559670090675354, "step": 468, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999631643295288 }, { "episode": 7520, "epoch": 0.1351691411726642, "loss/policy_avg": 0.3056102395057678, "lr": 2.9100843558282208e-06, "objective/entropy": -3.655149459838867, "objective/kl": 9.39276123046875, "objective/non_score_reward": -0.9392762184143066, "objective/rlhf_reward": 0.642895066738129, "objective/scores": 1.1, "policy/approxkl_avg": 105.86042785644531, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5348349809646606, "step": 469, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9986598491668701 }, { "episode": 7536, "epoch": 0.13545673509005285, "loss/policy_avg": 0.26237964630126953, "lr": 2.90989263803681e-06, "objective/entropy": -334.96240234375, "objective/kl": 11.451269149780273, "objective/non_score_reward": -1.1451269388198853, "objective/rlhf_reward": -0.18050771057605708, "objective/scores": 1.1, "policy/approxkl_avg": 53.953277587890625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8247783184051514, "step": 470, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9965124130249023 }, { "episode": 7552, "epoch": 0.1357443290074415, "loss/policy_avg": 0.2353116273880005, "lr": 2.909700920245399e-06, "objective/entropy": -169.8182373046875, "objective/kl": 14.575862884521484, "objective/non_score_reward": -1.4575862884521484, "objective/rlhf_reward": -1.430345384776592, "objective/scores": 1.1, "policy/approxkl_avg": 134.40069580078125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6239136457443237, "step": 471, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998337745666504 }, { "episode": 7568, "epoch": 0.13603192292483013, "loss/policy_avg": 0.39453232288360596, "lr": 2.909509202453988e-06, "objective/entropy": 88.90142822265625, "objective/kl": 10.912429809570312, "objective/non_score_reward": -1.091243028640747, "objective/rlhf_reward": 0.03502755761146581, "objective/scores": 1.1, "policy/approxkl_avg": 41.724300384521484, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8498834371566772, "step": 472, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9997682571411133 }, { "episode": 7584, "epoch": 0.1363195168422188, "loss/policy_avg": 0.8089221119880676, "lr": 2.909317484662577e-06, "objective/entropy": 245.41949462890625, "objective/kl": 19.53654670715332, "objective/non_score_reward": -1.9536547660827637, "objective/rlhf_reward": -6.258360086885049, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 363.8136291503906, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6021559834480286, "step": 473, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.999751091003418 }, { "episode": 7600, "epoch": 0.13660711075960744, "loss/policy_avg": 0.5075941681861877, "lr": 2.9091257668711657e-06, "objective/entropy": -48.53450012207031, "objective/kl": 5.752985000610352, "objective/non_score_reward": -0.5752984285354614, "objective/rlhf_reward": 2.098806151747704, "objective/scores": 1.1, "policy/approxkl_avg": 1.8740129470825195, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.45245441794395447, "step": 474, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0002760887145996 }, { "episode": 7616, "epoch": 0.13689470467699608, "loss/policy_avg": 0.11127430945634842, "lr": 2.908934049079755e-06, "objective/entropy": -34.70821762084961, "objective/kl": 13.361335754394531, "objective/non_score_reward": -1.336133599281311, "objective/rlhf_reward": -3.2218282840409618, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 80.18733978271484, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6081266403198242, "step": 475, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9994287490844727 }, { "episode": 7632, "epoch": 0.13718229859438472, "loss/policy_avg": 0.10813181102275848, "lr": 2.9087423312883437e-06, "objective/entropy": 111.931640625, "objective/kl": 6.184762954711914, "objective/non_score_reward": -0.6184762716293335, "objective/rlhf_reward": -0.6490763976898899, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 10.89632511138916, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4316413700580597, "step": 476, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000701904296875 }, { "episode": 7648, "epoch": 0.13746989251177338, "loss/policy_avg": 0.4999096989631653, "lr": 2.908550613496933e-06, "objective/entropy": -403.03533935546875, "objective/kl": 9.01245403289795, "objective/non_score_reward": -0.9012453556060791, "objective/rlhf_reward": -5.604981422424316, "objective/scores": -0.5, "policy/approxkl_avg": 74.18182373046875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5862891674041748, "step": 477, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9975061416625977 }, { "episode": 7664, "epoch": 0.13775748642916202, "loss/policy_avg": 0.14539723098278046, "lr": 2.9083588957055213e-06, "objective/entropy": -126.68431091308594, "objective/kl": 18.38888168334961, "objective/non_score_reward": -1.8388880491256714, "objective/rlhf_reward": -9.355552673339844, "objective/scores": -0.5, "policy/approxkl_avg": 197.85671997070312, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6251143217086792, "step": 478, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9994767904281616 }, { "episode": 7680, "epoch": 0.13804508034655066, "loss/policy_avg": 0.0330502912402153, "lr": 2.9081671779141105e-06, "objective/entropy": -44.419708251953125, "objective/kl": 15.150094985961914, "objective/non_score_reward": -1.5150094032287598, "objective/rlhf_reward": -8.060037612915039, "objective/scores": -0.5, "policy/approxkl_avg": 151.68356323242188, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7994130253791809, "step": 479, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9995399713516235 }, { "episode": 7696, "epoch": 0.13833267426393933, "loss/policy_avg": 0.14779314398765564, "lr": 2.9079754601226994e-06, "objective/entropy": 105.36286926269531, "objective/kl": 8.258722305297852, "objective/non_score_reward": -0.8258723616600037, "objective/rlhf_reward": -2.9034894019365307, "objective/scores": 0.1, "policy/approxkl_avg": 67.71802520751953, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5357914566993713, "step": 480, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9967106580734253 }, { "episode": 7712, "epoch": 0.13862026818132797, "loss/policy_avg": 0.27561917901039124, "lr": 2.907783742331288e-06, "objective/entropy": -65.93132019042969, "objective/kl": 11.474952697753906, "objective/non_score_reward": -1.1474955081939697, "objective/rlhf_reward": -6.589982032775879, "objective/scores": -0.5, "policy/approxkl_avg": 69.8592529296875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.572990894317627, "step": 481, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9971063137054443 }, { "episode": 7728, "epoch": 0.1389078620987166, "loss/policy_avg": 0.31458884477615356, "lr": 2.9075920245398774e-06, "objective/entropy": -12.270210266113281, "objective/kl": 11.645172119140625, "objective/non_score_reward": -1.1645172834396362, "objective/rlhf_reward": -1.7343501045715537, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 109.0627670288086, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.47612980008125305, "step": 482, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9962890148162842 }, { "episode": 7744, "epoch": 0.13919545601610525, "loss/policy_avg": 0.6003249883651733, "lr": 2.907400306748466e-06, "objective/entropy": 110.20198059082031, "objective/kl": 10.82795524597168, "objective/non_score_reward": -1.0827956199645996, "objective/rlhf_reward": -6.331182479858398, "objective/scores": -0.5, "policy/approxkl_avg": 79.25099182128906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.49656587839126587, "step": 483, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9974627494812012 }, { "episode": 7760, "epoch": 0.13948304993349392, "loss/policy_avg": -0.30824440717697144, "lr": 2.907208588957055e-06, "objective/entropy": 195.908447265625, "objective/kl": 11.146462440490723, "objective/non_score_reward": -1.1146461963653564, "objective/rlhf_reward": -4.058584606647491, "objective/scores": 0.1, "policy/approxkl_avg": 49.840267181396484, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7134747505187988, "step": 484, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.003911256790161 }, { "episode": 7776, "epoch": 0.13977064385088256, "loss/policy_avg": 0.2714402973651886, "lr": 2.9070168711656443e-06, "objective/entropy": 200.92269897460938, "objective/kl": 9.644775390625, "objective/non_score_reward": -0.9644776582717896, "objective/rlhf_reward": 0.5420892477035526, "objective/scores": 1.1, "policy/approxkl_avg": 84.43876647949219, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5147348642349243, "step": 485, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9978854656219482 }, { "episode": 7792, "epoch": 0.1400582377682712, "loss/policy_avg": 0.06527984887361526, "lr": 2.906825153374233e-06, "objective/entropy": 108.83843994140625, "objective/kl": 6.777806282043457, "objective/non_score_reward": -0.6777806282043457, "objective/rlhf_reward": 1.6888774499297146, "objective/scores": 1.1, "policy/approxkl_avg": 19.67444610595703, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6244113445281982, "step": 486, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9988651275634766 }, { "episode": 7808, "epoch": 0.14034583168565984, "loss/policy_avg": 0.16577281057834625, "lr": 2.9066334355828223e-06, "objective/entropy": 89.5904541015625, "objective/kl": 5.467947483062744, "objective/non_score_reward": -0.5467947125434875, "objective/rlhf_reward": -1.7871789395809172, "objective/scores": 0.1, "policy/approxkl_avg": 26.6594181060791, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.589790940284729, "step": 487, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9983232021331787 }, { "episode": 7824, "epoch": 0.1406334256030485, "loss/policy_avg": 0.04526926949620247, "lr": 2.906441717791411e-06, "objective/entropy": -265.4620056152344, "objective/kl": 5.726231575012207, "objective/non_score_reward": -0.5726232528686523, "objective/rlhf_reward": 2.109507152438164, "objective/scores": 1.1, "policy/approxkl_avg": 27.101959228515625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5848753452301025, "step": 488, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9989309310913086 }, { "episode": 7840, "epoch": 0.14092101952043715, "loss/policy_avg": 0.4120207726955414, "lr": 2.90625e-06, "objective/entropy": -195.201171875, "objective/kl": 11.044137001037598, "objective/non_score_reward": -1.104413628578186, "objective/rlhf_reward": -4.017654529213905, "objective/scores": 0.1, "policy/approxkl_avg": 52.49024200439453, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6557003259658813, "step": 489, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9995417594909668 }, { "episode": 7856, "epoch": 0.1412086134378258, "loss/policy_avg": 0.07290078699588776, "lr": 2.906058282208589e-06, "objective/entropy": -118.10462951660156, "objective/kl": 13.856963157653809, "objective/non_score_reward": -1.3856964111328125, "objective/rlhf_reward": -5.142785763740539, "objective/scores": 0.1, "policy/approxkl_avg": 48.285400390625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5914438366889954, "step": 490, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999072790145874 }, { "episode": 7872, "epoch": 0.14149620735521443, "loss/policy_avg": 0.19345812499523163, "lr": 2.905866564417178e-06, "objective/entropy": -72.50758361816406, "objective/kl": 10.323736190795898, "objective/non_score_reward": -1.0323736667633057, "objective/rlhf_reward": -3.7294944696128365, "objective/scores": 0.1, "policy/approxkl_avg": 89.49455261230469, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6444407105445862, "step": 491, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9993422031402588 }, { "episode": 7888, "epoch": 0.1417838012726031, "loss/policy_avg": 0.2960240840911865, "lr": 2.905674846625767e-06, "objective/entropy": -52.92985534667969, "objective/kl": 10.259614944458008, "objective/non_score_reward": -1.0259615182876587, "objective/rlhf_reward": 0.2961540907621387, "objective/scores": 1.1, "policy/approxkl_avg": 101.89286804199219, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6554427742958069, "step": 492, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9968318939208984 }, { "episode": 7904, "epoch": 0.14207139518999173, "loss/policy_avg": 0.18822041153907776, "lr": 2.905483128834356e-06, "objective/entropy": 212.44216918945312, "objective/kl": 15.652924537658691, "objective/non_score_reward": -1.5652923583984375, "objective/rlhf_reward": -4.599310164869415, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 205.89048767089844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5811524987220764, "step": 493, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9962797164916992 }, { "episode": 7920, "epoch": 0.14235898910738037, "loss/policy_avg": 0.12098196893930435, "lr": 2.905291411042945e-06, "objective/entropy": 153.53717041015625, "objective/kl": 20.49001693725586, "objective/non_score_reward": -2.049001693725586, "objective/rlhf_reward": -6.073301257864509, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 77.01814270019531, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7395473718643188, "step": 494, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9977500438690186 }, { "episode": 7936, "epoch": 0.142646583024769, "loss/policy_avg": -0.01829097419977188, "lr": 2.905099693251534e-06, "objective/entropy": 66.50902557373047, "objective/kl": 10.364259719848633, "objective/non_score_reward": -1.036426067352295, "objective/rlhf_reward": -1.2219853147279947, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 26.25365447998047, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.4803291857242584, "step": 495, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9991976022720337 }, { "episode": 7952, "epoch": 0.14293417694215768, "loss/policy_avg": 0.10506206750869751, "lr": 2.904907975460123e-06, "objective/entropy": 136.27267456054688, "objective/kl": 6.317910194396973, "objective/non_score_reward": -0.6317909955978394, "objective/rlhf_reward": -4.527163982391357, "objective/scores": -0.5, "policy/approxkl_avg": 16.371524810791016, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6706969738006592, "step": 496, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9975488185882568 }, { "episode": 7968, "epoch": 0.14322177085954632, "loss/policy_avg": 0.4831598401069641, "lr": 2.9047162576687117e-06, "objective/entropy": -31.924362182617188, "objective/kl": 11.602783203125, "objective/non_score_reward": -1.1602783203125, "objective/rlhf_reward": -4.2411130428314205, "objective/scores": 0.1, "policy/approxkl_avg": 87.01671600341797, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7876095771789551, "step": 497, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.99977707862854 }, { "episode": 7984, "epoch": 0.14350936477693496, "loss/policy_avg": 1.053896427154541, "lr": 2.904524539877301e-06, "objective/entropy": 100.03763580322266, "objective/kl": 9.37060260772705, "objective/non_score_reward": -0.9370602369308472, "objective/rlhf_reward": -1.6255346558251715, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 7.041189193725586, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6169747114181519, "step": 498, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0003209114074707 }, { "episode": 8000, "epoch": 0.14379695869432363, "loss/policy_avg": 0.15767307579517365, "lr": 2.9043328220858897e-06, "objective/entropy": 32.90543746948242, "objective/kl": 7.48246955871582, "objective/non_score_reward": -0.7482469081878662, "objective/rlhf_reward": -1.3311283044224842, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 31.293258666992188, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7192215919494629, "step": 499, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.001739978790283 }, { "episode": 8016, "epoch": 0.14408455261171227, "loss/policy_avg": 0.3422059416770935, "lr": 2.904141104294479e-06, "objective/entropy": -277.40203857421875, "objective/kl": 13.774396896362305, "objective/non_score_reward": -1.3774398565292358, "objective/rlhf_reward": -7.509759426116943, "objective/scores": -0.5, "policy/approxkl_avg": 74.98161315917969, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6245834827423096, "step": 500, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9970264434814453 }, { "episode": 8032, "epoch": 0.1443721465291009, "loss/policy_avg": 0.4801827669143677, "lr": 2.9039493865030673e-06, "objective/entropy": 83.53890228271484, "objective/kl": 8.603704452514648, "objective/non_score_reward": -0.8603705167770386, "objective/rlhf_reward": -0.5177629336130348, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 94.87248992919922, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7735698223114014, "step": 501, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000962257385254 }, { "episode": 8048, "epoch": 0.14465974044648955, "loss/policy_avg": 0.1936042606830597, "lr": 2.9037576687116566e-06, "objective/entropy": 16.39226531982422, "objective/kl": 13.502885818481445, "objective/non_score_reward": -1.3502888679504395, "objective/rlhf_reward": -3.453743944840367, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 83.78414916992188, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7048830986022949, "step": 502, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9994542598724365 }, { "episode": 8064, "epoch": 0.14494733436387822, "loss/policy_avg": 0.1588769406080246, "lr": 2.9035659509202454e-06, "objective/entropy": -40.00954055786133, "objective/kl": 18.89105987548828, "objective/non_score_reward": -1.8891057968139648, "objective/rlhf_reward": -4.632704351783964, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 181.8993682861328, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5775229930877686, "step": 503, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.997877836227417 }, { "episode": 8080, "epoch": 0.14523492828126686, "loss/policy_avg": 0.6248252987861633, "lr": 2.903374233128834e-06, "objective/entropy": -210.8176727294922, "objective/kl": 10.124932289123535, "objective/non_score_reward": -1.0124932527542114, "objective/rlhf_reward": -3.649973204731941, "objective/scores": 0.1, "policy/approxkl_avg": 49.988563537597656, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6010805368423462, "step": 504, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9989054203033447 }, { "episode": 8096, "epoch": 0.1455225221986555, "loss/policy_avg": 0.1622433364391327, "lr": 2.9031825153374234e-06, "objective/entropy": 17.784866333007812, "objective/kl": 9.719334602355957, "objective/non_score_reward": -0.9719333648681641, "objective/rlhf_reward": 0.5122664213180546, "objective/scores": 1.1, "policy/approxkl_avg": 56.30583953857422, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6292353272438049, "step": 505, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0021533966064453 }, { "episode": 8112, "epoch": 0.14581011611604414, "loss/policy_avg": -0.1938256323337555, "lr": 2.9029907975460122e-06, "objective/entropy": 127.38702392578125, "objective/kl": 5.040970802307129, "objective/non_score_reward": -0.5040971636772156, "objective/rlhf_reward": -1.6163885876536368, "objective/scores": 0.1, "policy/approxkl_avg": 21.422080993652344, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.5696605443954468, "step": 506, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0037827491760254 }, { "episode": 8128, "epoch": 0.1460977100334328, "loss/policy_avg": 0.38081973791122437, "lr": 2.902799079754601e-06, "objective/entropy": 42.37715530395508, "objective/kl": 7.989552021026611, "objective/non_score_reward": -0.7989552021026611, "objective/rlhf_reward": -1.073114486710105, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 50.31044006347656, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6788329482078552, "step": 507, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.99943208694458 }, { "episode": 8144, "epoch": 0.14638530395082144, "loss/policy_avg": 0.3784164786338806, "lr": 2.9026073619631903e-06, "objective/entropy": -50.67826843261719, "objective/kl": 13.639227867126465, "objective/non_score_reward": -1.3639228343963623, "objective/rlhf_reward": -7.455691337585449, "objective/scores": -0.5, "policy/approxkl_avg": 109.05412292480469, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8064265847206116, "step": 508, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9993607997894287 }, { "episode": 8160, "epoch": 0.14667289786821008, "loss/policy_avg": 0.13403823971748352, "lr": 2.902415644171779e-06, "objective/entropy": 96.67460632324219, "objective/kl": 12.861349105834961, "objective/non_score_reward": -1.2861348390579224, "objective/rlhf_reward": -3.540419537488537, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 196.5169677734375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.47650325298309326, "step": 509, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0008530616760254 }, { "episode": 8176, "epoch": 0.14696049178559872, "loss/policy_avg": 0.2548585534095764, "lr": 2.9022239263803683e-06, "objective/entropy": -259.3787841796875, "objective/kl": 13.765344619750977, "objective/non_score_reward": -1.376534342765808, "objective/rlhf_reward": -5.106137281656265, "objective/scores": 0.1, "policy/approxkl_avg": 124.93861389160156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.578956663608551, "step": 510, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9970561265945435 }, { "episode": 8192, "epoch": 0.1472480857029874, "loss/policy_avg": 0.32806700468063354, "lr": 2.902032208588957e-06, "objective/entropy": -89.41710662841797, "objective/kl": 11.446915626525879, "objective/non_score_reward": -1.1446915864944458, "objective/rlhf_reward": -2.7539375975456943, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 100.32626342773438, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6500498056411743, "step": 511, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9999818801879883 }, { "episode": 8208, "epoch": 0.14753567962037603, "loss/policy_avg": 0.1273409128189087, "lr": 2.901840490797546e-06, "objective/entropy": 69.84151458740234, "objective/kl": 18.043598175048828, "objective/non_score_reward": -1.804359793663025, "objective/rlhf_reward": -4.8174391150474545, "objective/scores": 0.6, "policy/approxkl_avg": 85.71723937988281, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5488656163215637, "step": 512, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.998365879058838 }, { "episode": 8224, "epoch": 0.14782327353776467, "loss/policy_avg": 1.1218316555023193, "lr": 2.901648773006135e-06, "objective/entropy": -103.88362121582031, "objective/kl": 12.342060089111328, "objective/non_score_reward": -1.2342060804367065, "objective/rlhf_reward": -4.536824202537536, "objective/scores": 0.1, "policy/approxkl_avg": 1.8322827816009521, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.735711932182312, "step": 513, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000443458557129 }, { "episode": 8240, "epoch": 0.1481108674551533, "loss/policy_avg": -0.014876842498779297, "lr": 2.901457055214724e-06, "objective/entropy": 75.55338287353516, "objective/kl": 12.932259559631348, "objective/non_score_reward": -1.2932257652282715, "objective/rlhf_reward": -3.439569846789042, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 28.75712776184082, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.526807963848114, "step": 514, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9995429515838623 }, { "episode": 8256, "epoch": 0.14839846137254198, "loss/policy_avg": 0.11381683498620987, "lr": 2.901265337423313e-06, "objective/entropy": 28.295608520507812, "objective/kl": 12.695294380187988, "objective/non_score_reward": -1.2695293426513672, "objective/rlhf_reward": -4.678117549419403, "objective/scores": 0.1, "policy/approxkl_avg": 62.5381965637207, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4214191138744354, "step": 515, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000277519226074 }, { "episode": 8272, "epoch": 0.14868605528993062, "loss/policy_avg": 0.031907178461551666, "lr": 2.901073619631902e-06, "objective/entropy": 202.26161193847656, "objective/kl": 7.357514381408691, "objective/non_score_reward": -0.7357515096664429, "objective/rlhf_reward": -2.5430058300495144, "objective/scores": 0.1, "policy/approxkl_avg": 9.950460433959961, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7936433553695679, "step": 516, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0013363361358643 }, { "episode": 8288, "epoch": 0.14897364920731926, "loss/policy_avg": 0.1753254234790802, "lr": 2.900881901840491e-06, "objective/entropy": -103.65457153320312, "objective/kl": 12.325798034667969, "objective/non_score_reward": -1.2325799465179443, "objective/rlhf_reward": -2.807613672987495, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 124.17157745361328, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6873651742935181, "step": 517, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0010011196136475 }, { "episode": 8304, "epoch": 0.1492612431247079, "loss/policy_avg": 0.49640148878097534, "lr": 2.90069018404908e-06, "objective/entropy": -23.451457977294922, "objective/kl": 11.856928825378418, "objective/non_score_reward": -1.1856927871704102, "objective/rlhf_reward": -2.7953601283597305, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 100.8171157836914, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.530035138130188, "step": 518, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.997998595237732 }, { "episode": 8320, "epoch": 0.14954883704209657, "loss/policy_avg": -0.026080047711730003, "lr": 2.900498466257669e-06, "objective/entropy": -170.7906951904297, "objective/kl": 5.801922798156738, "objective/non_score_reward": -0.5801923871040344, "objective/rlhf_reward": 2.0792303770780567, "objective/scores": 1.1, "policy/approxkl_avg": 14.545293807983398, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4394223093986511, "step": 519, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9988439083099365 }, { "episode": 8336, "epoch": 0.1498364309594852, "loss/policy_avg": 0.18013785779476166, "lr": 2.9003067484662577e-06, "objective/entropy": -30.96051788330078, "objective/kl": 15.340864181518555, "objective/non_score_reward": -1.5340864658355713, "objective/rlhf_reward": -5.736345967650413, "objective/scores": 0.1, "policy/approxkl_avg": 13.957459449768066, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7148102521896362, "step": 520, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0002388954162598 }, { "episode": 8352, "epoch": 0.15012402487687385, "loss/policy_avg": 0.21995435655117035, "lr": 2.900115030674847e-06, "objective/entropy": 118.80613708496094, "objective/kl": 13.875422477722168, "objective/non_score_reward": -1.3875422477722168, "objective/rlhf_reward": -5.150168991088867, "objective/scores": 0.1, "policy/approxkl_avg": 30.807886123657227, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6429756879806519, "step": 521, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9986985921859741 }, { "episode": 8368, "epoch": 0.1504116187942625, "loss/policy_avg": 0.44822782278060913, "lr": 2.8999233128834357e-06, "objective/entropy": 131.02780151367188, "objective/kl": 8.63813304901123, "objective/non_score_reward": -0.8638133406639099, "objective/rlhf_reward": -3.0552533924579617, "objective/scores": 0.1, "policy/approxkl_avg": 54.1556510925293, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8205841779708862, "step": 522, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9984720945358276 }, { "episode": 8384, "epoch": 0.15069921271165115, "loss/policy_avg": 0.2645617723464966, "lr": 2.8997315950920245e-06, "objective/entropy": -156.31088256835938, "objective/kl": 7.035023212432861, "objective/non_score_reward": -0.7035022974014282, "objective/rlhf_reward": -4.814008712768555, "objective/scores": -0.5, "policy/approxkl_avg": 37.48949432373047, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.49438732862472534, "step": 523, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9992444515228271 }, { "episode": 8400, "epoch": 0.1509868066290398, "loss/policy_avg": 0.09130215644836426, "lr": 2.8995398773006133e-06, "objective/entropy": -78.74007415771484, "objective/kl": 8.69231128692627, "objective/non_score_reward": -0.8692312240600586, "objective/rlhf_reward": -0.5532056882393088, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 96.52299499511719, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.46065402030944824, "step": 524, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998188853263855 }, { "episode": 8416, "epoch": 0.15127440054642843, "loss/policy_avg": 0.13580232858657837, "lr": 2.8993481595092026e-06, "objective/entropy": -72.97970581054688, "objective/kl": 7.367366313934326, "objective/non_score_reward": -0.7367366552352905, "objective/rlhf_reward": -4.946946620941162, "objective/scores": -0.5, "policy/approxkl_avg": 13.75210189819336, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7579455375671387, "step": 525, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000337600708008 }, { "episode": 8432, "epoch": 0.1515619944638171, "loss/policy_avg": 0.11613625288009644, "lr": 2.8991564417177914e-06, "objective/entropy": 83.00586700439453, "objective/kl": 12.637749671936035, "objective/non_score_reward": -1.2637748718261719, "objective/rlhf_reward": -3.2302709474888553, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 89.52025604248047, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5902446508407593, "step": 526, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9989619255065918 }, { "episode": 8448, "epoch": 0.15184958838120574, "loss/policy_avg": 0.03080570697784424, "lr": 2.89896472392638e-06, "objective/entropy": 106.05413055419922, "objective/kl": 8.069011688232422, "objective/non_score_reward": -0.8069012761116028, "objective/rlhf_reward": -0.3038861199629035, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 105.38276672363281, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5903005599975586, "step": 527, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9979126453399658 }, { "episode": 8464, "epoch": 0.15213718229859438, "loss/policy_avg": 0.3046284317970276, "lr": 2.8987730061349694e-06, "objective/entropy": 152.33953857421875, "objective/kl": 14.629440307617188, "objective/non_score_reward": -1.4629439115524292, "objective/rlhf_reward": -4.189916288078415, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 60.608787536621094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6111043691635132, "step": 528, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9986634254455566 }, { "episode": 8480, "epoch": 0.15242477621598302, "loss/policy_avg": 0.3153807520866394, "lr": 2.8985812883435582e-06, "objective/entropy": 11.483592987060547, "objective/kl": 7.498883247375488, "objective/non_score_reward": -0.7498883008956909, "objective/rlhf_reward": -2.5995532035827633, "objective/scores": 0.1, "policy/approxkl_avg": 43.66455841064453, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.530735969543457, "step": 529, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.997819185256958 }, { "episode": 8496, "epoch": 0.1527123701333717, "loss/policy_avg": -0.4344179630279541, "lr": 2.8983895705521475e-06, "objective/entropy": 172.50282287597656, "objective/kl": 9.609460830688477, "objective/non_score_reward": -0.9609460234642029, "objective/rlhf_reward": -5.843784332275391, "objective/scores": -0.5, "policy/approxkl_avg": 74.63162231445312, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.8059056997299194, "step": 530, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.002572536468506 }, { "episode": 8512, "epoch": 0.15299996405076033, "loss/policy_avg": 0.09429708868265152, "lr": 2.8981978527607363e-06, "objective/entropy": 294.640625, "objective/kl": 18.882694244384766, "objective/non_score_reward": -1.888269305229187, "objective/rlhf_reward": -7.153077340126037, "objective/scores": 0.1, "policy/approxkl_avg": 111.17985534667969, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7101346254348755, "step": 531, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9989078044891357 }, { "episode": 8528, "epoch": 0.15328755796814897, "loss/policy_avg": 0.15176677703857422, "lr": 2.898006134969325e-06, "objective/entropy": 136.55715942382812, "objective/kl": 12.547300338745117, "objective/non_score_reward": -1.2547301054000854, "objective/rlhf_reward": -0.6189204812049862, "objective/scores": 1.1, "policy/approxkl_avg": 103.96243286132812, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.8805792331695557, "step": 532, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000082015991211 }, { "episode": 8544, "epoch": 0.1535751518855376, "loss/policy_avg": 0.2759408950805664, "lr": 2.8978144171779143e-06, "objective/entropy": 157.77984619140625, "objective/kl": 8.116591453552246, "objective/non_score_reward": -0.8116590976715088, "objective/rlhf_reward": -5.246636390686035, "objective/scores": -0.5, "policy/approxkl_avg": 10.0709867477417, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7335650324821472, "step": 533, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999115228652954 }, { "episode": 8560, "epoch": 0.15386274580292628, "loss/policy_avg": 0.49007368087768555, "lr": 2.897622699386503e-06, "objective/entropy": -148.64157104492188, "objective/kl": 15.987449645996094, "objective/non_score_reward": -1.5987448692321777, "objective/rlhf_reward": -5.994979923963546, "objective/scores": 0.1, "policy/approxkl_avg": 167.39520263671875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6771738529205322, "step": 534, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.995469093322754 }, { "episode": 8576, "epoch": 0.15415033972031492, "loss/policy_avg": 0.2475529909133911, "lr": 2.897430981595092e-06, "objective/entropy": 63.576107025146484, "objective/kl": 8.813644409179688, "objective/non_score_reward": -0.8813644051551819, "objective/rlhf_reward": -3.125457620620727, "objective/scores": 0.1, "policy/approxkl_avg": 47.86650848388672, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7202204465866089, "step": 535, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9946643114089966 }, { "episode": 8592, "epoch": 0.15443793363770356, "loss/policy_avg": 0.3695685565471649, "lr": 2.897239263803681e-06, "objective/entropy": 72.53046417236328, "objective/kl": 15.155887603759766, "objective/non_score_reward": -1.5155887603759766, "objective/rlhf_reward": -1.6623550713062283, "objective/scores": 1.1, "policy/approxkl_avg": 99.77005004882812, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.731490969657898, "step": 536, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9989895820617676 }, { "episode": 8608, "epoch": 0.1547255275550922, "loss/policy_avg": 0.19564469158649445, "lr": 2.89704754601227e-06, "objective/entropy": 104.1103515625, "objective/kl": 12.909381866455078, "objective/non_score_reward": -1.290938377380371, "objective/rlhf_reward": -4.7637532413005825, "objective/scores": 0.1, "policy/approxkl_avg": 49.42005920410156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4913232922554016, "step": 537, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9961007833480835 }, { "episode": 8624, "epoch": 0.15501312147248086, "loss/policy_avg": 0.0638544037938118, "lr": 2.896855828220859e-06, "objective/entropy": 120.96896362304688, "objective/kl": 17.086078643798828, "objective/non_score_reward": -1.708607792854309, "objective/rlhf_reward": -6.434431171417236, "objective/scores": 0.1, "policy/approxkl_avg": 112.74113464355469, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.625002920627594, "step": 538, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9973227977752686 }, { "episode": 8640, "epoch": 0.1553007153898695, "loss/policy_avg": 0.4011816382408142, "lr": 2.896664110429448e-06, "objective/entropy": 95.19891357421875, "objective/kl": 5.403826713562012, "objective/non_score_reward": -0.5403826236724854, "objective/rlhf_reward": -4.161530494689941, "objective/scores": -0.5, "policy/approxkl_avg": 49.818931579589844, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.44589221477508545, "step": 539, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.999190330505371 }, { "episode": 8656, "epoch": 0.15558830930725814, "loss/policy_avg": 0.006004150491207838, "lr": 2.896472392638037e-06, "objective/entropy": -98.64430236816406, "objective/kl": 13.903255462646484, "objective/non_score_reward": -1.3903255462646484, "objective/rlhf_reward": -3.4385958335557323, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 96.46614074707031, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5993703603744507, "step": 540, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9983282089233398 }, { "episode": 8672, "epoch": 0.1558759032246468, "loss/policy_avg": 0.25113996863365173, "lr": 2.896280674846626e-06, "objective/entropy": 28.177112579345703, "objective/kl": 10.27774715423584, "objective/non_score_reward": -1.027774691581726, "objective/rlhf_reward": -3.711098855733871, "objective/scores": 0.1, "policy/approxkl_avg": 15.50900650024414, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.38805845379829407, "step": 541, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9975563287734985 }, { "episode": 8688, "epoch": 0.15616349714203545, "loss/policy_avg": 0.22983065247535706, "lr": 2.896088957055215e-06, "objective/entropy": -43.48884963989258, "objective/kl": 16.519594192504883, "objective/non_score_reward": -1.6519595384597778, "objective/rlhf_reward": -4.207838369905948, "objective/scores": 0.6, "policy/approxkl_avg": 145.91189575195312, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4935316741466522, "step": 542, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9969571828842163 }, { "episode": 8704, "epoch": 0.1564510910594241, "loss/policy_avg": 0.5034677386283875, "lr": 2.895897239263804e-06, "objective/entropy": -180.35511779785156, "objective/kl": 11.565031051635742, "objective/non_score_reward": -1.1565032005310059, "objective/rlhf_reward": -0.22601289153099025, "objective/scores": 1.1, "policy/approxkl_avg": 45.29411315917969, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6911122798919678, "step": 543, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9962868690490723 }, { "episode": 8720, "epoch": 0.15673868497681273, "loss/policy_avg": 0.02647099643945694, "lr": 2.895705521472393e-06, "objective/entropy": -299.92938232421875, "objective/kl": 12.21584701538086, "objective/non_score_reward": -1.2215845584869385, "objective/rlhf_reward": -4.486337954550981, "objective/scores": 0.1, "policy/approxkl_avg": 101.26480102539062, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5777586698532104, "step": 544, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0000030994415283 }, { "episode": 8736, "epoch": 0.1570262788942014, "loss/policy_avg": 0.0650666356086731, "lr": 2.8955138036809817e-06, "objective/entropy": -25.684341430664062, "objective/kl": 12.18275260925293, "objective/non_score_reward": -1.2182753086090088, "objective/rlhf_reward": -0.47310084700584376, "objective/scores": 1.1, "policy/approxkl_avg": 73.90670776367188, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5378080606460571, "step": 545, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9970163106918335 }, { "episode": 8752, "epoch": 0.15731387281159004, "loss/policy_avg": 0.3169589042663574, "lr": 2.8953220858895705e-06, "objective/entropy": 100.88613891601562, "objective/kl": 7.750062942504883, "objective/non_score_reward": -0.7750062942504883, "objective/rlhf_reward": -5.100025177001953, "objective/scores": -0.5, "policy/approxkl_avg": 76.69898986816406, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7943099737167358, "step": 546, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0006103515625 }, { "episode": 8768, "epoch": 0.15760146672897868, "loss/policy_avg": 0.15454688668251038, "lr": 2.8951303680981593e-06, "objective/entropy": -160.29241943359375, "objective/kl": 12.760615348815918, "objective/non_score_reward": -1.2760615348815918, "objective/rlhf_reward": -4.704246199131012, "objective/scores": 0.1, "policy/approxkl_avg": 11.389159202575684, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6418617963790894, "step": 547, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000383138656616 }, { "episode": 8784, "epoch": 0.15788906064636732, "loss/policy_avg": -0.0003520110622048378, "lr": 2.8949386503067486e-06, "objective/entropy": 29.00008773803711, "objective/kl": 12.984089851379395, "objective/non_score_reward": -1.2984089851379395, "objective/rlhf_reward": -2.2699168368589606, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 37.11572265625, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.479004442691803, "step": 548, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0004520416259766 }, { "episode": 8800, "epoch": 0.158176654563756, "loss/policy_avg": -0.15695317089557648, "lr": 2.8947469325153374e-06, "objective/entropy": -84.50102233886719, "objective/kl": 18.578369140625, "objective/non_score_reward": -1.8578369617462158, "objective/rlhf_reward": -5.698014610509077, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 86.01622009277344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4987892508506775, "step": 549, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9977936744689941 }, { "episode": 8816, "epoch": 0.15846424848114463, "loss/policy_avg": -0.01714038848876953, "lr": 2.894555214723926e-06, "objective/entropy": -128.65835571289062, "objective/kl": 5.4775614738464355, "objective/non_score_reward": -0.5477561950683594, "objective/rlhf_reward": -4.1910247802734375, "objective/scores": -0.5, "policy/approxkl_avg": 34.354248046875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5765225291252136, "step": 550, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9985828399658203 }, { "episode": 8832, "epoch": 0.15875184239853327, "loss/policy_avg": 0.19021713733673096, "lr": 2.8943634969325154e-06, "objective/entropy": -61.83000946044922, "objective/kl": 10.497882843017578, "objective/non_score_reward": -1.0497883558273315, "objective/rlhf_reward": -3.7991533041000363, "objective/scores": 0.1, "policy/approxkl_avg": 79.45492553710938, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.4981521964073181, "step": 551, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998138666152954 }, { "episode": 8848, "epoch": 0.1590394363159219, "loss/policy_avg": 0.06787601113319397, "lr": 2.8941717791411042e-06, "objective/entropy": 38.238014221191406, "objective/kl": 15.474870681762695, "objective/non_score_reward": -1.5474871397018433, "objective/rlhf_reward": -8.189949035644531, "objective/scores": -0.5, "policy/approxkl_avg": 82.42859649658203, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5804091095924377, "step": 552, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9993863105773926 }, { "episode": 8864, "epoch": 0.15932703023331057, "loss/policy_avg": 0.30045267939567566, "lr": 2.8939800613496935e-06, "objective/entropy": -27.927902221679688, "objective/kl": 10.419059753417969, "objective/non_score_reward": -1.0419061183929443, "objective/rlhf_reward": 0.23237573504447973, "objective/scores": 1.1, "policy/approxkl_avg": 209.59939575195312, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6831308603286743, "step": 553, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0001933574676514 }, { "episode": 8880, "epoch": 0.1596146241506992, "loss/policy_avg": 0.2085457146167755, "lr": 2.8937883435582823e-06, "objective/entropy": 35.96397018432617, "objective/kl": 9.092164993286133, "objective/non_score_reward": -0.9092164635658264, "objective/rlhf_reward": -1.9035325507322947, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 39.68387222290039, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5189159512519836, "step": 554, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000199794769287 }, { "episode": 8896, "epoch": 0.15990221806808785, "loss/policy_avg": 0.3386860191822052, "lr": 2.893596625766871e-06, "objective/entropy": 12.668701171875, "objective/kl": 14.887137413024902, "objective/non_score_reward": -1.4887138605117798, "objective/rlhf_reward": -5.554855218529701, "objective/scores": 0.1, "policy/approxkl_avg": 57.799781799316406, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8298596143722534, "step": 555, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.001885414123535 }, { "episode": 8912, "epoch": 0.1601898119854765, "loss/policy_avg": 0.2826036214828491, "lr": 2.8934049079754603e-06, "objective/entropy": 291.82574462890625, "objective/kl": 11.674125671386719, "objective/non_score_reward": -1.167412519454956, "objective/rlhf_reward": -6.669650077819824, "objective/scores": -0.5, "policy/approxkl_avg": 129.4175567626953, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7528454065322876, "step": 556, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.999050498008728 }, { "episode": 8928, "epoch": 0.16047740590286516, "loss/policy_avg": 0.10318401455879211, "lr": 2.893213190184049e-06, "objective/entropy": 120.07017517089844, "objective/kl": 4.665571212768555, "objective/non_score_reward": -0.46655717492103577, "objective/rlhf_reward": -1.4662286698818208, "objective/scores": 0.1, "policy/approxkl_avg": 5.405817031860352, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5051665306091309, "step": 557, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.997723937034607 }, { "episode": 8944, "epoch": 0.1607649998202538, "loss/policy_avg": -0.29497459530830383, "lr": 2.893021472392638e-06, "objective/entropy": 37.394432067871094, "objective/kl": 9.016752243041992, "objective/non_score_reward": -0.9016750454902649, "objective/rlhf_reward": -1.6592889232205705, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 60.83589172363281, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6633663773536682, "step": 558, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.003632068634033 }, { "episode": 8960, "epoch": 0.16105259373764244, "loss/policy_avg": 0.27766960859298706, "lr": 2.892829754601227e-06, "objective/entropy": 162.43531799316406, "objective/kl": 14.33592414855957, "objective/non_score_reward": -1.433592438697815, "objective/rlhf_reward": -2.8106508597147197, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 65.74765014648438, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7771644592285156, "step": 559, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998669147491455 }, { "episode": 8976, "epoch": 0.1613401876550311, "loss/policy_avg": 0.10618109256029129, "lr": 2.892638036809816e-06, "objective/entropy": -214.1174774169922, "objective/kl": 7.891653537750244, "objective/non_score_reward": -0.7891653776168823, "objective/rlhf_reward": 1.2433385044336323, "objective/scores": 1.1, "policy/approxkl_avg": 46.754520416259766, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6963553428649902, "step": 560, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9995391368865967 }, { "episode": 8992, "epoch": 0.16162778157241975, "loss/policy_avg": 0.16953638195991516, "lr": 2.892446319018405e-06, "objective/entropy": -72.53345489501953, "objective/kl": 14.390440940856934, "objective/non_score_reward": -1.4390441179275513, "objective/rlhf_reward": -5.356176471710205, "objective/scores": 0.1, "policy/approxkl_avg": 157.7322235107422, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4842403531074524, "step": 561, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9982311725616455 }, { "episode": 9008, "epoch": 0.1619153754898084, "loss/policy_avg": -0.21345758438110352, "lr": 2.892254601226994e-06, "objective/entropy": 70.15734100341797, "objective/kl": 8.587663650512695, "objective/non_score_reward": -0.8587663769721985, "objective/rlhf_reward": -5.435065269470215, "objective/scores": -0.5, "policy/approxkl_avg": 48.32007598876953, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6229357719421387, "step": 562, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0034308433532715 }, { "episode": 9024, "epoch": 0.16220296940719703, "loss/policy_avg": 0.628226637840271, "lr": 2.892062883435583e-06, "objective/entropy": 71.88837432861328, "objective/kl": 13.11463737487793, "objective/non_score_reward": -1.3114639520645142, "objective/rlhf_reward": -7.245855331420898, "objective/scores": -0.5, "policy/approxkl_avg": 86.9578628540039, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6590292453765869, "step": 563, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999263048171997 }, { "episode": 9040, "epoch": 0.1624905633245857, "loss/policy_avg": 0.4895268380641937, "lr": 2.891871165644172e-06, "objective/entropy": -137.79327392578125, "objective/kl": 11.846264839172363, "objective/non_score_reward": -1.1846263408660889, "objective/rlhf_reward": -6.7385053634643555, "objective/scores": -0.5, "policy/approxkl_avg": 49.822731018066406, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6190036535263062, "step": 564, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9992012977600098 }, { "episode": 9056, "epoch": 0.16277815724197434, "loss/policy_avg": 0.02343292161822319, "lr": 2.891679447852761e-06, "objective/entropy": -80.07128143310547, "objective/kl": 8.20994758605957, "objective/non_score_reward": -0.8209947943687439, "objective/rlhf_reward": -2.8839792668819424, "objective/scores": 0.1, "policy/approxkl_avg": 1.4821832180023193, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7127578258514404, "step": 565, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.004193067550659 }, { "episode": 9072, "epoch": 0.16306575115936298, "loss/policy_avg": 0.0619128942489624, "lr": 2.89148773006135e-06, "objective/entropy": -55.37126922607422, "objective/kl": 17.618186950683594, "objective/non_score_reward": -1.7618186473846436, "objective/rlhf_reward": -4.12355539643881, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 126.11732482910156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5896855592727661, "step": 566, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.002530097961426 }, { "episode": 9088, "epoch": 0.16335334507675162, "loss/policy_avg": 0.33223459124565125, "lr": 2.8912960122699385e-06, "objective/entropy": 44.957420349121094, "objective/kl": 12.796667098999023, "objective/non_score_reward": -1.2796669006347656, "objective/rlhf_reward": -7.1186676025390625, "objective/scores": -0.5, "policy/approxkl_avg": 8.001811027526855, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4202909469604492, "step": 567, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0002686977386475 }, { "episode": 9104, "epoch": 0.16364093899414028, "loss/policy_avg": 0.46875202655792236, "lr": 2.8911042944785277e-06, "objective/entropy": 75.29147338867188, "objective/kl": 14.741684913635254, "objective/non_score_reward": -1.4741685390472412, "objective/rlhf_reward": -4.071845050129006, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 119.49514770507812, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4648001790046692, "step": 568, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998049259185791 }, { "episode": 9120, "epoch": 0.16392853291152892, "loss/policy_avg": 0.037746116518974304, "lr": 2.8909125766871165e-06, "objective/entropy": -114.35173797607422, "objective/kl": 17.751426696777344, "objective/non_score_reward": -1.7751425504684448, "objective/rlhf_reward": -4.176851276994917, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 63.35493850708008, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7318030595779419, "step": 569, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9995402097702026 }, { "episode": 9136, "epoch": 0.16421612682891756, "loss/policy_avg": 0.12556898593902588, "lr": 2.8907208588957053e-06, "objective/entropy": 275.97406005859375, "objective/kl": 16.65966796875, "objective/non_score_reward": -1.6659668684005737, "objective/rlhf_reward": -3.740148504019949, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 182.56800842285156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7940037250518799, "step": 570, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.998062252998352 }, { "episode": 9152, "epoch": 0.1645037207463062, "loss/policy_avg": 0.19311276078224182, "lr": 2.8905291411042946e-06, "objective/entropy": -13.802238464355469, "objective/kl": 11.889385223388672, "objective/non_score_reward": -1.1889386177062988, "objective/rlhf_reward": -0.3557543516159054, "objective/scores": 1.1, "policy/approxkl_avg": 168.97225952148438, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5607097148895264, "step": 571, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9966776371002197 }, { "episode": 9168, "epoch": 0.16479131466369487, "loss/policy_avg": -0.0551675409078598, "lr": 2.8903374233128834e-06, "objective/entropy": 199.9774627685547, "objective/kl": 10.514884948730469, "objective/non_score_reward": -1.0514883995056152, "objective/rlhf_reward": -3.8059536576271054, "objective/scores": 0.1, "policy/approxkl_avg": 40.9141960144043, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5954399108886719, "step": 572, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0013375282287598 }, { "episode": 9184, "epoch": 0.1650789085810835, "loss/policy_avg": 0.5291420817375183, "lr": 2.890145705521472e-06, "objective/entropy": -87.00729370117188, "objective/kl": 17.926801681518555, "objective/non_score_reward": -1.792680263519287, "objective/rlhf_reward": -5.048014911190544, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 174.02410888671875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6229202747344971, "step": 573, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9971938133239746 }, { "episode": 9200, "epoch": 0.16536650249847215, "loss/policy_avg": 0.026936478912830353, "lr": 2.8899539877300614e-06, "objective/entropy": 16.224300384521484, "objective/kl": 12.667734146118164, "objective/non_score_reward": -1.2667735815048218, "objective/rlhf_reward": -0.667094303667545, "objective/scores": 1.1, "policy/approxkl_avg": 61.12175369262695, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6178743243217468, "step": 574, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9982073307037354 }, { "episode": 9216, "epoch": 0.1656540964158608, "loss/policy_avg": 0.3519830107688904, "lr": 2.8897622699386502e-06, "objective/entropy": -118.76617431640625, "objective/kl": 12.919654846191406, "objective/non_score_reward": -1.2919654846191406, "objective/rlhf_reward": -3.5060024909382927, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 151.7967529296875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4337049126625061, "step": 575, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.996576189994812 }, { "episode": 9232, "epoch": 0.16594169033324946, "loss/policy_avg": 0.10243120789527893, "lr": 2.8895705521472395e-06, "objective/entropy": -136.20498657226562, "objective/kl": 8.501260757446289, "objective/non_score_reward": -0.8501260280609131, "objective/rlhf_reward": -1.8442450006871969, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 44.17273712158203, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6579757928848267, "step": 576, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9993293285369873 }, { "episode": 9248, "epoch": 0.1662292842506381, "loss/policy_avg": 0.09091467410326004, "lr": 2.8893788343558283e-06, "objective/entropy": -303.64678955078125, "objective/kl": 8.604415893554688, "objective/non_score_reward": -0.8604416847229004, "objective/rlhf_reward": -1.6169378116455784, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 31.75139045715332, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7412898540496826, "step": 577, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9997444152832031 }, { "episode": 9264, "epoch": 0.16651687816802674, "loss/policy_avg": 0.21000587940216064, "lr": 2.889187116564417e-06, "objective/entropy": 239.27052307128906, "objective/kl": 10.989913940429688, "objective/non_score_reward": -1.0989913940429688, "objective/rlhf_reward": 0.004034423828125355, "objective/scores": 1.1, "policy/approxkl_avg": 27.15020179748535, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5851229429244995, "step": 578, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0004048347473145 }, { "episode": 9280, "epoch": 0.1668044720854154, "loss/policy_avg": 0.22401443123817444, "lr": 2.8889953987730063e-06, "objective/entropy": 83.70755004882812, "objective/kl": 11.797503471374512, "objective/non_score_reward": -1.1797504425048828, "objective/rlhf_reward": -1.7952826365244117, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 76.76632690429688, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6816673278808594, "step": 579, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9980626106262207 }, { "episode": 9296, "epoch": 0.16709206600280405, "loss/policy_avg": -0.2864213287830353, "lr": 2.888803680981595e-06, "objective/entropy": -0.46259307861328125, "objective/kl": 8.927458763122559, "objective/non_score_reward": -0.8927459716796875, "objective/rlhf_reward": 0.8290162324905399, "objective/scores": 1.1, "policy/approxkl_avg": 12.566313743591309, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.4375641345977783, "step": 580, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000452995300293 }, { "episode": 9312, "epoch": 0.16737965992019269, "loss/policy_avg": 0.34181928634643555, "lr": 2.8886119631901844e-06, "objective/entropy": -10.957565307617188, "objective/kl": 11.041678428649902, "objective/non_score_reward": -1.1041678190231323, "objective/rlhf_reward": -4.016671425104141, "objective/scores": 0.1, "policy/approxkl_avg": 72.29110717773438, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5227146148681641, "step": 581, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998535394668579 }, { "episode": 9328, "epoch": 0.16766725383758133, "loss/policy_avg": 0.7440632581710815, "lr": 2.888420245398773e-06, "objective/entropy": -169.97946166992188, "objective/kl": 5.892565727233887, "objective/non_score_reward": -0.5892565846443176, "objective/rlhf_reward": -1.9570263683795928, "objective/scores": 0.1, "policy/approxkl_avg": 42.19253158569336, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6428844928741455, "step": 582, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9971792697906494 }, { "episode": 9344, "epoch": 0.16795484775497, "loss/policy_avg": 0.039995964616537094, "lr": 2.888228527607362e-06, "objective/entropy": 11.100467681884766, "objective/kl": 8.740276336669922, "objective/non_score_reward": -0.874027669429779, "objective/rlhf_reward": -3.0961106330156323, "objective/scores": 0.1, "policy/approxkl_avg": 68.66104125976562, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5335364937782288, "step": 583, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9975062608718872 }, { "episode": 9360, "epoch": 0.16824244167235863, "loss/policy_avg": 0.18156462907791138, "lr": 2.8880368098159512e-06, "objective/entropy": 146.76071166992188, "objective/kl": 13.685348510742188, "objective/non_score_reward": -1.368534803390503, "objective/rlhf_reward": -2.5504204376947612, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 198.45315551757812, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6299135684967041, "step": 584, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.996903896331787 }, { "episode": 9376, "epoch": 0.16853003558974727, "loss/policy_avg": 0.8130825757980347, "lr": 2.88784509202454e-06, "objective/entropy": 111.27020263671875, "objective/kl": 10.760910034179688, "objective/non_score_reward": -1.076090931892395, "objective/rlhf_reward": -3.9043638467788693, "objective/scores": 0.1, "policy/approxkl_avg": 103.25462341308594, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5371990203857422, "step": 585, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9976208209991455 }, { "episode": 9392, "epoch": 0.1688176295071359, "loss/policy_avg": 0.39816397428512573, "lr": 2.887653374233129e-06, "objective/entropy": -99.50511932373047, "objective/kl": 9.237605094909668, "objective/non_score_reward": -0.9237604737281799, "objective/rlhf_reward": -5.695041656494141, "objective/scores": -0.5, "policy/approxkl_avg": 37.84246063232422, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5522249341011047, "step": 586, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9989943504333496 }, { "episode": 9408, "epoch": 0.16910522342452458, "loss/policy_avg": 0.16980989277362823, "lr": 2.887461656441718e-06, "objective/entropy": -199.70681762695312, "objective/kl": 11.351678848266602, "objective/non_score_reward": -1.1351678371429443, "objective/rlhf_reward": -2.1406713783740994, "objective/scores": 0.6, "policy/approxkl_avg": 99.28809356689453, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6828469038009644, "step": 587, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9995384216308594 }, { "episode": 9424, "epoch": 0.16939281734191322, "loss/policy_avg": 0.4977327585220337, "lr": 2.887269938650307e-06, "objective/entropy": 92.04341888427734, "objective/kl": 7.678049087524414, "objective/non_score_reward": -0.7678048610687256, "objective/rlhf_reward": -2.671219593286514, "objective/scores": 0.1, "policy/approxkl_avg": 24.31591796875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6588965654373169, "step": 588, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9978811740875244 }, { "episode": 9440, "epoch": 0.16968041125930186, "loss/policy_avg": 0.7905654311180115, "lr": 2.887078220858896e-06, "objective/entropy": 132.01498413085938, "objective/kl": 14.539533615112305, "objective/non_score_reward": -1.453953504562378, "objective/rlhf_reward": -5.4158137202262875, "objective/scores": 0.1, "policy/approxkl_avg": 65.95307159423828, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.46912258863449097, "step": 589, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9991271495819092 }, { "episode": 9456, "epoch": 0.1699680051766905, "loss/policy_avg": -0.066254623234272, "lr": 2.8868865030674845e-06, "objective/entropy": 71.34278869628906, "objective/kl": 14.032512664794922, "objective/non_score_reward": -1.4032511711120605, "objective/rlhf_reward": -7.613004684448242, "objective/scores": -0.5, "policy/approxkl_avg": 56.8614501953125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6078362464904785, "step": 590, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000101089477539 }, { "episode": 9472, "epoch": 0.17025559909407917, "loss/policy_avg": 0.5789448618888855, "lr": 2.8866947852760737e-06, "objective/entropy": 48.365203857421875, "objective/kl": 10.217412948608398, "objective/non_score_reward": -1.0217413902282715, "objective/rlhf_reward": -3.6869654193520542, "objective/scores": 0.1, "policy/approxkl_avg": 16.687175750732422, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.48520535230636597, "step": 591, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0006895065307617 }, { "episode": 9488, "epoch": 0.1705431930114678, "loss/policy_avg": 0.4344612956047058, "lr": 2.8865030674846625e-06, "objective/entropy": 252.8432159423828, "objective/kl": 17.551427841186523, "objective/non_score_reward": -1.7551428079605103, "objective/rlhf_reward": -9.020570755004883, "objective/scores": -0.5, "policy/approxkl_avg": 154.91946411132812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7674171924591064, "step": 592, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9977104663848877 }, { "episode": 9504, "epoch": 0.17083078692885645, "loss/policy_avg": 0.45362526178359985, "lr": 2.8863113496932513e-06, "objective/entropy": 19.733963012695312, "objective/kl": 12.605318069458008, "objective/non_score_reward": -1.2605319023132324, "objective/rlhf_reward": -4.642127639055252, "objective/scores": 0.1, "policy/approxkl_avg": 236.24461364746094, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7392984628677368, "step": 593, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.99709153175354 }, { "episode": 9520, "epoch": 0.1711183808462451, "loss/policy_avg": 0.05706373229622841, "lr": 2.8861196319018406e-06, "objective/entropy": -80.88824462890625, "objective/kl": 10.57326889038086, "objective/non_score_reward": -1.0573269128799438, "objective/rlhf_reward": 0.1706922888755802, "objective/scores": 1.1, "policy/approxkl_avg": 220.6209716796875, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6385011672973633, "step": 594, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.000190258026123 }, { "episode": 9536, "epoch": 0.17140597476363376, "loss/policy_avg": 0.4798924922943115, "lr": 2.8859279141104294e-06, "objective/entropy": 96.04144287109375, "objective/kl": 13.669404029846191, "objective/non_score_reward": -1.3669404983520508, "objective/rlhf_reward": -1.0677618443965908, "objective/scores": 1.1, "policy/approxkl_avg": 79.78167724609375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7598651647567749, "step": 595, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9998369216918945 }, { "episode": 9552, "epoch": 0.1716935686810224, "loss/policy_avg": 0.6355167627334595, "lr": 2.8857361963190186e-06, "objective/entropy": 64.89462280273438, "objective/kl": 13.15363883972168, "objective/non_score_reward": -1.315363883972168, "objective/rlhf_reward": -4.861455595493316, "objective/scores": 0.1, "policy/approxkl_avg": 57.076480865478516, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.47427719831466675, "step": 596, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9995226860046387 }, { "episode": 9568, "epoch": 0.17198116259841104, "loss/policy_avg": 0.10829915851354599, "lr": 2.8855444785276074e-06, "objective/entropy": 79.2706298828125, "objective/kl": 16.379791259765625, "objective/non_score_reward": -1.63797926902771, "objective/rlhf_reward": -6.151916986703872, "objective/scores": 0.1, "policy/approxkl_avg": 103.64852142333984, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6097987294197083, "step": 597, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9987919330596924 }, { "episode": 9584, "epoch": 0.1722687565157997, "loss/policy_avg": 0.13417291641235352, "lr": 2.8853527607361962e-06, "objective/entropy": -54.99534606933594, "objective/kl": 13.775299072265625, "objective/non_score_reward": -1.3775299787521362, "objective/rlhf_reward": -3.110119885206222, "objective/scores": 0.6, "policy/approxkl_avg": 41.135406494140625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6685090661048889, "step": 598, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9994128942489624 }, { "episode": 9600, "epoch": 0.17255635043318834, "loss/policy_avg": 0.2789982557296753, "lr": 2.8851610429447855e-06, "objective/entropy": 6.352031707763672, "objective/kl": 15.038141250610352, "objective/non_score_reward": -1.5038139820098877, "objective/rlhf_reward": -3.61525604724884, "objective/scores": 0.6, "policy/approxkl_avg": 80.10298919677734, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.47064271569252014, "step": 599, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9999215602874756 }, { "episode": 9616, "epoch": 0.17284394435057698, "loss/policy_avg": 0.17868322134017944, "lr": 2.8849693251533743e-06, "objective/entropy": 100.41844940185547, "objective/kl": 14.95844841003418, "objective/non_score_reward": -1.495845079421997, "objective/rlhf_reward": -3.8606740257897716, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 51.73733139038086, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8421609997749329, "step": 600, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.001194953918457 }, { "episode": 9632, "epoch": 0.17313153826796562, "loss/policy_avg": 0.6214461326599121, "lr": 2.884777607361963e-06, "objective/entropy": 231.32444763183594, "objective/kl": 14.899240493774414, "objective/non_score_reward": -1.4899241924285889, "objective/rlhf_reward": -3.035977606416914, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 157.85556030273438, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5734395980834961, "step": 601, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9957280158996582 }, { "episode": 9648, "epoch": 0.1734191321853543, "loss/policy_avg": 0.16177789866924286, "lr": 2.8845858895705523e-06, "objective/entropy": -75.67822265625, "objective/kl": 13.02596664428711, "objective/non_score_reward": -1.3025965690612793, "objective/rlhf_reward": -0.8103865742683407, "objective/scores": 1.1, "policy/approxkl_avg": 51.60423278808594, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7318694591522217, "step": 602, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9996418952941895 }, { "episode": 9664, "epoch": 0.17370672610274293, "loss/policy_avg": 0.22588306665420532, "lr": 2.884394171779141e-06, "objective/entropy": 132.67103576660156, "objective/kl": 13.433195114135742, "objective/non_score_reward": -1.3433196544647217, "objective/rlhf_reward": -4.973278379440307, "objective/scores": 0.1, "policy/approxkl_avg": 99.59487915039062, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6894341707229614, "step": 603, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9990489482879639 }, { "episode": 9680, "epoch": 0.17399432002013157, "loss/policy_avg": 0.22847160696983337, "lr": 2.8842024539877304e-06, "objective/entropy": 122.22633361816406, "objective/kl": 20.479631423950195, "objective/non_score_reward": -2.0479629039764404, "objective/rlhf_reward": -3.7918519139289852, "objective/scores": 1.1, "policy/approxkl_avg": 103.07848358154297, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8397064805030823, "step": 604, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9971545934677124 }, { "episode": 9696, "epoch": 0.1742819139375202, "loss/policy_avg": 0.397056519985199, "lr": 2.884010736196319e-06, "objective/entropy": -62.73558044433594, "objective/kl": 10.932550430297852, "objective/non_score_reward": -1.0932550430297852, "objective/rlhf_reward": -3.9730204105377194, "objective/scores": 0.1, "policy/approxkl_avg": 28.36017417907715, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5593264102935791, "step": 605, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9984571933746338 }, { "episode": 9712, "epoch": 0.17456950785490888, "loss/policy_avg": 0.28502804040908813, "lr": 2.883819018404908e-06, "objective/entropy": 72.55328369140625, "objective/kl": 12.644740104675293, "objective/non_score_reward": -1.2644741535186768, "objective/rlhf_reward": -7.057896614074707, "objective/scores": -0.5, "policy/approxkl_avg": 103.56513214111328, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.49720489978790283, "step": 606, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9984281063079834 }, { "episode": 9728, "epoch": 0.17485710177229752, "loss/policy_avg": 0.49002373218536377, "lr": 2.8836273006134972e-06, "objective/entropy": -66.95445251464844, "objective/kl": 12.276761054992676, "objective/non_score_reward": -1.2276760339736938, "objective/rlhf_reward": -6.910704135894775, "objective/scores": -0.5, "policy/approxkl_avg": 128.71206665039062, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5521280765533447, "step": 607, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998512625694275 }, { "episode": 9744, "epoch": 0.17514469568968616, "loss/policy_avg": 0.15580351650714874, "lr": 2.883435582822086e-06, "objective/entropy": -14.935211181640625, "objective/kl": 12.201488494873047, "objective/non_score_reward": -1.2201488018035889, "objective/rlhf_reward": -4.480595326423645, "objective/scores": 0.1, "policy/approxkl_avg": 72.03843688964844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6915831565856934, "step": 608, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9991041421890259 }, { "episode": 9760, "epoch": 0.1754322896070748, "loss/policy_avg": -0.07565896213054657, "lr": 2.883243865030675e-06, "objective/entropy": -222.00851440429688, "objective/kl": 13.098295211791992, "objective/non_score_reward": -1.3098294734954834, "objective/rlhf_reward": -3.4144895031777134, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 23.588008880615234, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4547193646430969, "step": 609, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0002942085266113 }, { "episode": 9776, "epoch": 0.17571988352446347, "loss/policy_avg": -0.02608119696378708, "lr": 2.883052147239264e-06, "objective/entropy": -214.93325805664062, "objective/kl": 13.121513366699219, "objective/non_score_reward": -1.3121511936187744, "objective/rlhf_reward": -2.324885469616625, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 37.12300109863281, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7193573713302612, "step": 610, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.001455307006836 }, { "episode": 9792, "epoch": 0.1760074774418521, "loss/policy_avg": 0.05792073905467987, "lr": 2.882860429447853e-06, "objective/entropy": -38.28462600708008, "objective/kl": 6.414361000061035, "objective/non_score_reward": -0.6414362192153931, "objective/rlhf_reward": -4.565744876861572, "objective/scores": -0.5, "policy/approxkl_avg": 27.29863929748535, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7934014797210693, "step": 611, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9998834133148193 }, { "episode": 9808, "epoch": 0.17629507135924075, "loss/policy_avg": 0.34763196110725403, "lr": 2.8826687116564417e-06, "objective/entropy": -187.356201171875, "objective/kl": 14.694540977478027, "objective/non_score_reward": -1.4694541692733765, "objective/rlhf_reward": -7.877816200256348, "objective/scores": -0.5, "policy/approxkl_avg": 164.05078125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.48596644401550293, "step": 612, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.99922776222229 }, { "episode": 9824, "epoch": 0.17658266527662939, "loss/policy_avg": 0.034425608813762665, "lr": 2.8824769938650305e-06, "objective/entropy": 30.521377563476562, "objective/kl": 14.546671867370605, "objective/non_score_reward": -1.4546672105789185, "objective/rlhf_reward": -5.4186688423156735, "objective/scores": 0.1, "policy/approxkl_avg": 92.84369659423828, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6131436824798584, "step": 613, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000127077102661 }, { "episode": 9840, "epoch": 0.17687025919401805, "loss/policy_avg": 0.6788812875747681, "lr": 2.8822852760736197e-06, "objective/entropy": -29.50526237487793, "objective/kl": 11.87314224243164, "objective/non_score_reward": -1.1873142719268799, "objective/rlhf_reward": -1.8255377455961435, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 69.99320220947266, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6484935283660889, "step": 614, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9981306791305542 }, { "episode": 9856, "epoch": 0.1771578531114067, "loss/policy_avg": 0.018577001988887787, "lr": 2.8820935582822085e-06, "objective/entropy": 91.57209777832031, "objective/kl": 15.141888618469238, "objective/non_score_reward": -1.5141887664794922, "objective/rlhf_reward": -8.056755065917969, "objective/scores": -0.5, "policy/approxkl_avg": 128.7979736328125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5703952312469482, "step": 615, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9988903999328613 }, { "episode": 9872, "epoch": 0.17744544702879533, "loss/policy_avg": -0.041117969900369644, "lr": 2.8819018404907974e-06, "objective/entropy": 137.1066436767578, "objective/kl": 6.767946243286133, "objective/non_score_reward": -0.6767945885658264, "objective/rlhf_reward": -2.30717841386795, "objective/scores": 0.1, "policy/approxkl_avg": 16.107406616210938, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6602547764778137, "step": 616, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9993090629577637 }, { "episode": 9888, "epoch": 0.177733040946184, "loss/policy_avg": 0.3238484859466553, "lr": 2.8817101226993866e-06, "objective/entropy": -231.63035583496094, "objective/kl": 9.579354286193848, "objective/non_score_reward": -0.9579353928565979, "objective/rlhf_reward": -5.8317413330078125, "objective/scores": -0.5, "policy/approxkl_avg": 95.74900817871094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.772737979888916, "step": 617, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9988899230957031 }, { "episode": 9904, "epoch": 0.17802063486357264, "loss/policy_avg": -0.08733166754245758, "lr": 2.8815184049079754e-06, "objective/entropy": 94.04405975341797, "objective/kl": 10.622529983520508, "objective/non_score_reward": -1.0622529983520508, "objective/rlhf_reward": -1.849012157320976, "objective/scores": 0.6, "policy/approxkl_avg": 161.30787658691406, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6008134484291077, "step": 618, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9983432292938232 }, { "episode": 9920, "epoch": 0.17830822878096128, "loss/policy_avg": 0.22342449426651, "lr": 2.8813266871165646e-06, "objective/entropy": 23.68462562561035, "objective/kl": 11.187488555908203, "objective/non_score_reward": -1.1187489032745361, "objective/rlhf_reward": -6.4749956130981445, "objective/scores": -0.5, "policy/approxkl_avg": 71.44534301757812, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.44539380073547363, "step": 619, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9987552165985107 }, { "episode": 9936, "epoch": 0.17859582269834992, "loss/policy_avg": 0.22347092628479004, "lr": 2.8811349693251534e-06, "objective/entropy": 175.77520751953125, "objective/kl": 17.442081451416016, "objective/non_score_reward": -1.7442083358764648, "objective/rlhf_reward": -8.97683334350586, "objective/scores": -0.5, "policy/approxkl_avg": 161.70484924316406, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4676833748817444, "step": 620, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9993557929992676 }, { "episode": 9952, "epoch": 0.1788834166157386, "loss/policy_avg": 0.09556691348552704, "lr": 2.8809432515337422e-06, "objective/entropy": 166.47305297851562, "objective/kl": 14.810133934020996, "objective/non_score_reward": -1.4810134172439575, "objective/rlhf_reward": -5.524053907394409, "objective/scores": 0.1, "policy/approxkl_avg": 101.40066528320312, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5872761011123657, "step": 621, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000371217727661 }, { "episode": 9968, "epoch": 0.17917101053312723, "loss/policy_avg": 0.30460673570632935, "lr": 2.8807515337423315e-06, "objective/entropy": -225.1236572265625, "objective/kl": 13.543222427368164, "objective/non_score_reward": -1.3543224334716797, "objective/rlhf_reward": -1.0172893762588497, "objective/scores": 1.1, "policy/approxkl_avg": 82.92485046386719, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5670063495635986, "step": 622, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9980478286743164 }, { "episode": 9984, "epoch": 0.17945860445051587, "loss/policy_avg": 0.2849801778793335, "lr": 2.8805598159509203e-06, "objective/entropy": 96.21595764160156, "objective/kl": 12.93740463256836, "objective/non_score_reward": -1.2937402725219727, "objective/rlhf_reward": -4.774961328506469, "objective/scores": 0.1, "policy/approxkl_avg": 142.59600830078125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.47951966524124146, "step": 623, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9973121881484985 }, { "episode": 10000, "epoch": 0.1797461983679045, "loss/policy_avg": 0.25585901737213135, "lr": 2.880368098159509e-06, "objective/entropy": 194.88865661621094, "objective/kl": 6.153909206390381, "objective/non_score_reward": -0.6153908967971802, "objective/rlhf_reward": 0.4621554904270444, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 31.100088119506836, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7605673670768738, "step": 624, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.002366065979004 }, { "episode": 10016, "epoch": 0.18003379228529318, "loss/policy_avg": 0.24915774166584015, "lr": 2.8801763803680983e-06, "objective/entropy": 120.67022705078125, "objective/kl": 8.496511459350586, "objective/non_score_reward": -0.8496510982513428, "objective/rlhf_reward": -5.398604393005371, "objective/scores": -0.5, "policy/approxkl_avg": 42.57280731201172, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6938654184341431, "step": 625, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9988131523132324 }, { "episode": 10032, "epoch": 0.18032138620268182, "loss/policy_avg": 0.4236345887184143, "lr": 2.879984662576687e-06, "objective/entropy": -12.667598724365234, "objective/kl": 11.722315788269043, "objective/non_score_reward": -1.1722315549850464, "objective/rlhf_reward": -4.2889264062047, "objective/scores": 0.1, "policy/approxkl_avg": 72.98892211914062, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6948565244674683, "step": 626, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9976496696472168 }, { "episode": 10048, "epoch": 0.18060898012007046, "loss/policy_avg": 0.16770553588867188, "lr": 2.8797929447852764e-06, "objective/entropy": 57.50523376464844, "objective/kl": 17.840484619140625, "objective/non_score_reward": -1.7840485572814941, "objective/rlhf_reward": -9.136194229125977, "objective/scores": -0.5, "policy/approxkl_avg": 120.76194763183594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6823735237121582, "step": 627, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9997957944869995 }, { "episode": 10064, "epoch": 0.1808965740374591, "loss/policy_avg": 0.4961099624633789, "lr": 2.879601226993865e-06, "objective/entropy": -25.265766143798828, "objective/kl": 10.27054214477539, "objective/non_score_reward": -1.0270541906356812, "objective/rlhf_reward": -1.1844979419719903, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 57.600494384765625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6599663496017456, "step": 628, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999619722366333 }, { "episode": 10080, "epoch": 0.18118416795484776, "loss/policy_avg": 0.562252938747406, "lr": 2.879409509202454e-06, "objective/entropy": 54.366180419921875, "objective/kl": 13.384576797485352, "objective/non_score_reward": -1.3384575843811035, "objective/rlhf_reward": -0.9538304716348645, "objective/scores": 1.1, "policy/approxkl_avg": 58.41983413696289, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6028238534927368, "step": 629, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9995472431182861 }, { "episode": 10096, "epoch": 0.1814717618722364, "loss/policy_avg": 0.9494496583938599, "lr": 2.8792177914110432e-06, "objective/entropy": 102.92730712890625, "objective/kl": 11.970917701721191, "objective/non_score_reward": -1.1970919370651245, "objective/rlhf_reward": -3.055034414927164, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 71.0523910522461, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5434826612472534, "step": 630, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000587224960327 }, { "episode": 10112, "epoch": 0.18175935578962504, "loss/policy_avg": 0.06726402044296265, "lr": 2.879026073619632e-06, "objective/entropy": -49.708221435546875, "objective/kl": 11.118875503540039, "objective/non_score_reward": -1.1118874549865723, "objective/rlhf_reward": -1.5238312228929727, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 84.36143493652344, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7672351002693176, "step": 631, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9989380836486816 }, { "episode": 10128, "epoch": 0.18204694970701368, "loss/policy_avg": -0.2285957634449005, "lr": 2.8788343558282213e-06, "objective/entropy": 167.11105346679688, "objective/kl": 11.83863353729248, "objective/non_score_reward": -1.1838631629943848, "objective/rlhf_reward": -3.1791934733658582, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 22.718273162841797, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.45381465554237366, "step": 632, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0061585903167725 }, { "episode": 10144, "epoch": 0.18233454362440235, "loss/policy_avg": 1.1582974195480347, "lr": 2.87864263803681e-06, "objective/entropy": -151.89218139648438, "objective/kl": 7.8154096603393555, "objective/non_score_reward": -0.7815409898757935, "objective/rlhf_reward": -2.7261639595031735, "objective/scores": 0.1, "policy/approxkl_avg": 20.87982177734375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6609333753585815, "step": 633, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000443458557129 }, { "episode": 10160, "epoch": 0.182622137541791, "loss/policy_avg": -0.023708324879407883, "lr": 2.878450920245399e-06, "objective/entropy": 37.85424041748047, "objective/kl": 12.195389747619629, "objective/non_score_reward": -1.2195390462875366, "objective/rlhf_reward": -1.9544372900736064, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 84.37319946289062, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.563186764717102, "step": 634, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0016093254089355 }, { "episode": 10176, "epoch": 0.18290973145917963, "loss/policy_avg": 0.22720938920974731, "lr": 2.8782592024539877e-06, "objective/entropy": 70.03634643554688, "objective/kl": 14.060079574584961, "objective/non_score_reward": -1.406008005142212, "objective/rlhf_reward": -7.624032020568848, "objective/scores": -0.5, "policy/approxkl_avg": 70.89201354980469, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.449720174074173, "step": 635, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9980741739273071 }, { "episode": 10192, "epoch": 0.1831973253765683, "loss/policy_avg": 0.5389465689659119, "lr": 2.8780674846625765e-06, "objective/entropy": -199.75421142578125, "objective/kl": 15.233451843261719, "objective/non_score_reward": -1.5233452320098877, "objective/rlhf_reward": -8.093381881713867, "objective/scores": -0.5, "policy/approxkl_avg": 33.79878234863281, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.48563235998153687, "step": 636, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998356819152832 }, { "episode": 10208, "epoch": 0.18348491929395694, "loss/policy_avg": 0.37082988023757935, "lr": 2.8778757668711657e-06, "objective/entropy": 29.62700653076172, "objective/kl": 10.318947792053223, "objective/non_score_reward": -1.0318948030471802, "objective/rlhf_reward": -1.2038601979028907, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 19.83915138244629, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5535821914672852, "step": 637, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0009536743164062 }, { "episode": 10224, "epoch": 0.18377251321134558, "loss/policy_avg": 0.01469859853386879, "lr": 2.8776840490797546e-06, "objective/entropy": 13.477630615234375, "objective/kl": 14.343404769897461, "objective/non_score_reward": -1.4343405961990356, "objective/rlhf_reward": -5.337362265586853, "objective/scores": 0.1, "policy/approxkl_avg": 108.1371078491211, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4337630867958069, "step": 638, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9978924989700317 }, { "episode": 10240, "epoch": 0.18406010712873422, "loss/policy_avg": 0.3221081495285034, "lr": 2.8774923312883434e-06, "objective/entropy": 282.66595458984375, "objective/kl": 11.63412857055664, "objective/non_score_reward": -1.1634126901626587, "objective/rlhf_reward": -0.2536508500576016, "objective/scores": 1.1, "policy/approxkl_avg": 111.42243957519531, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7095741033554077, "step": 639, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9970577955245972 }, { "episode": 10256, "epoch": 0.18434770104612289, "loss/policy_avg": 0.08750347793102264, "lr": 2.8773006134969326e-06, "objective/entropy": -36.40140914916992, "objective/kl": 12.267041206359863, "objective/non_score_reward": -1.2267042398452759, "objective/rlhf_reward": -6.9068169593811035, "objective/scores": -0.5, "policy/approxkl_avg": 62.82714080810547, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5733236074447632, "step": 640, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000546932220459 }, { "episode": 10272, "epoch": 0.18463529496351153, "loss/policy_avg": 0.3036690652370453, "lr": 2.8771088957055214e-06, "objective/entropy": 49.865753173828125, "objective/kl": 11.792703628540039, "objective/non_score_reward": -1.1792702674865723, "objective/rlhf_reward": -4.317080801725387, "objective/scores": 0.1, "policy/approxkl_avg": 35.753089904785156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.695482611656189, "step": 641, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9989582300186157 }, { "episode": 10288, "epoch": 0.18492288888090017, "loss/policy_avg": 0.7012618780136108, "lr": 2.8769171779141106e-06, "objective/entropy": 92.8203125, "objective/kl": 10.009756088256836, "objective/non_score_reward": -1.0009756088256836, "objective/rlhf_reward": -6.003902435302734, "objective/scores": -0.5, "policy/approxkl_avg": 160.71142578125, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.45697641372680664, "step": 642, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9986211061477661 }, { "episode": 10304, "epoch": 0.1852104827982888, "loss/policy_avg": 0.18139493465423584, "lr": 2.8767254601226994e-06, "objective/entropy": 35.44036865234375, "objective/kl": 11.35489273071289, "objective/non_score_reward": -1.1354892253875732, "objective/rlhf_reward": -6.541956901550293, "objective/scores": -0.5, "policy/approxkl_avg": 73.70870971679688, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6653156876564026, "step": 643, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9983882904052734 }, { "episode": 10320, "epoch": 0.18549807671567747, "loss/policy_avg": 0.21259146928787231, "lr": 2.8765337423312883e-06, "objective/entropy": 65.79669189453125, "objective/kl": 5.1166887283325195, "objective/non_score_reward": -0.5116689205169678, "objective/rlhf_reward": -1.6466757565736772, "objective/scores": 0.1, "policy/approxkl_avg": 6.101611137390137, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.44910210371017456, "step": 644, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9965652227401733 }, { "episode": 10336, "epoch": 0.1857856706330661, "loss/policy_avg": 1.021752953529358, "lr": 2.8763420245398775e-06, "objective/entropy": -34.64234161376953, "objective/kl": 15.709405899047852, "objective/non_score_reward": -1.5709404945373535, "objective/rlhf_reward": -5.883762127161026, "objective/scores": 0.1, "policy/approxkl_avg": 142.4956817626953, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5723711252212524, "step": 645, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9976788759231567 }, { "episode": 10352, "epoch": 0.18607326455045475, "loss/policy_avg": 0.24777646362781525, "lr": 2.8761503067484663e-06, "objective/entropy": 77.03242492675781, "objective/kl": 15.705501556396484, "objective/non_score_reward": -1.5705503225326538, "objective/rlhf_reward": -5.882201319932937, "objective/scores": 0.1, "policy/approxkl_avg": 104.79753875732422, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.619193434715271, "step": 646, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9973654747009277 }, { "episode": 10368, "epoch": 0.1863608584678434, "loss/policy_avg": 0.05733855068683624, "lr": 2.8759585889570555e-06, "objective/entropy": 105.6750717163086, "objective/kl": 13.765717506408691, "objective/non_score_reward": -1.3765718936920166, "objective/rlhf_reward": -5.1062874555587765, "objective/scores": 0.1, "policy/approxkl_avg": 96.42398834228516, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.2780163288116455, "step": 647, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9983540773391724 }, { "episode": 10384, "epoch": 0.18664845238523206, "loss/policy_avg": 0.4622950553894043, "lr": 2.8757668711656443e-06, "objective/entropy": -98.47879028320312, "objective/kl": 7.556210517883301, "objective/non_score_reward": -0.755621075630188, "objective/rlhf_reward": -0.09876528823492192, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 68.89460754394531, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7213411927223206, "step": 648, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9966726303100586 }, { "episode": 10400, "epoch": 0.1869360463026207, "loss/policy_avg": 0.8236960172653198, "lr": 2.875575153374233e-06, "objective/entropy": 18.49713897705078, "objective/kl": 15.854425430297852, "objective/non_score_reward": -1.5854425430297852, "objective/rlhf_reward": -8.34177017211914, "objective/scores": -0.5, "policy/approxkl_avg": 67.23236083984375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8861958980560303, "step": 649, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0006723403930664 }, { "episode": 10416, "epoch": 0.18722364022000934, "loss/policy_avg": -0.11324408650398254, "lr": 2.8753834355828224e-06, "objective/entropy": -102.02124786376953, "objective/kl": 5.1298675537109375, "objective/non_score_reward": -0.5129867792129517, "objective/rlhf_reward": 0.8717719123351846, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 28.45832633972168, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.4376002848148346, "step": 650, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0088820457458496 }, { "episode": 10432, "epoch": 0.18751123413739798, "loss/policy_avg": 0.2450694590806961, "lr": 2.875191717791411e-06, "objective/entropy": 33.388450622558594, "objective/kl": 13.52882194519043, "objective/non_score_reward": -1.3528821468353271, "objective/rlhf_reward": -2.4878097816717357, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 11.315901756286621, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.41914981603622437, "step": 651, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0015416145324707 }, { "episode": 10448, "epoch": 0.18779882805478665, "loss/policy_avg": 0.009298861026763916, "lr": 2.875e-06, "objective/entropy": -16.950359344482422, "objective/kl": 9.792383193969727, "objective/non_score_reward": -0.9792382717132568, "objective/rlhf_reward": 0.4830468982458118, "objective/scores": 1.1, "policy/approxkl_avg": 14.477102279663086, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6265867948532104, "step": 652, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000293016433716 }, { "episode": 10464, "epoch": 0.1880864219721753, "loss/policy_avg": 0.4039332866668701, "lr": 2.8748082822085892e-06, "objective/entropy": 66.02500915527344, "objective/kl": 8.322999954223633, "objective/non_score_reward": -0.8322999477386475, "objective/rlhf_reward": -5.32919979095459, "objective/scores": -0.5, "policy/approxkl_avg": 12.993392944335938, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6458040475845337, "step": 653, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9998199939727783 }, { "episode": 10480, "epoch": 0.18837401588956393, "loss/policy_avg": 0.15707165002822876, "lr": 2.874616564417178e-06, "objective/entropy": 171.90771484375, "objective/kl": 11.948579788208008, "objective/non_score_reward": -1.1948580741882324, "objective/rlhf_reward": -6.77943229675293, "objective/scores": -0.5, "policy/approxkl_avg": 10.851991653442383, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6504145860671997, "step": 654, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9997503757476807 }, { "episode": 10496, "epoch": 0.1886616098069526, "loss/policy_avg": 0.07920925319194794, "lr": 2.8744248466257673e-06, "objective/entropy": 61.45829391479492, "objective/kl": 12.758872985839844, "objective/non_score_reward": -1.275887370109558, "objective/rlhf_reward": -3.1561381918954208, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 101.437744140625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8374680876731873, "step": 655, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0019378662109375 }, { "episode": 10512, "epoch": 0.18894920372434124, "loss/policy_avg": 0.555253267288208, "lr": 2.8742331288343557e-06, "objective/entropy": -175.50808715820312, "objective/kl": 14.069064140319824, "objective/non_score_reward": -1.4069066047668457, "objective/rlhf_reward": -5.227626121044159, "objective/scores": 0.1, "policy/approxkl_avg": 81.80171203613281, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.590570330619812, "step": 656, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998897910118103 }, { "episode": 10528, "epoch": 0.18923679764172988, "loss/policy_avg": 0.11733473837375641, "lr": 2.874041411042945e-06, "objective/entropy": 114.00068664550781, "objective/kl": 18.550704956054688, "objective/non_score_reward": -1.8550705909729004, "objective/rlhf_reward": -7.020282661914825, "objective/scores": 0.1, "policy/approxkl_avg": 203.83114624023438, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6477522850036621, "step": 657, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9994854927062988 }, { "episode": 10544, "epoch": 0.18952439155911852, "loss/policy_avg": 0.15716442465782166, "lr": 2.8738496932515337e-06, "objective/entropy": 280.2091064453125, "objective/kl": 15.730361938476562, "objective/non_score_reward": -1.5730363130569458, "objective/rlhf_reward": -8.292144775390625, "objective/scores": -0.5, "policy/approxkl_avg": 69.69169616699219, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 1.0129367113113403, "step": 658, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.002072334289551 }, { "episode": 10560, "epoch": 0.18981198547650718, "loss/policy_avg": 0.6117931604385376, "lr": 2.8736579754601225e-06, "objective/entropy": -17.964401245117188, "objective/kl": 12.338603973388672, "objective/non_score_reward": -1.2338604927062988, "objective/rlhf_reward": -6.935441970825195, "objective/scores": -0.5, "policy/approxkl_avg": 42.43081283569336, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6273235082626343, "step": 659, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0019495487213135 }, { "episode": 10576, "epoch": 0.19009957939389582, "loss/policy_avg": 0.1620130091905594, "lr": 2.8734662576687117e-06, "objective/entropy": -31.008544921875, "objective/kl": 8.896329879760742, "objective/non_score_reward": -0.8896329998970032, "objective/rlhf_reward": -1.6111206514405567, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 8.579498291015625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8015791177749634, "step": 660, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0010838508605957 }, { "episode": 10592, "epoch": 0.19038717331128446, "loss/policy_avg": 0.30528926849365234, "lr": 2.8732745398773006e-06, "objective/entropy": -81.17588806152344, "objective/kl": 12.422649383544922, "objective/non_score_reward": -1.2422648668289185, "objective/rlhf_reward": -2.569059437513351, "objective/scores": 0.6, "policy/approxkl_avg": 26.168109893798828, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6268837451934814, "step": 661, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000943899154663 }, { "episode": 10608, "epoch": 0.1906747672286731, "loss/policy_avg": 0.21409085392951965, "lr": 2.8730828220858894e-06, "objective/entropy": 109.389404296875, "objective/kl": 14.156780242919922, "objective/non_score_reward": -1.415677785873413, "objective/rlhf_reward": -1.2627113521099087, "objective/scores": 1.1, "policy/approxkl_avg": 74.9066390991211, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6278871297836304, "step": 662, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9998223781585693 }, { "episode": 10624, "epoch": 0.19096236114606177, "loss/policy_avg": 0.4462759494781494, "lr": 2.8728911042944786e-06, "objective/entropy": 9.342247009277344, "objective/kl": 15.820850372314453, "objective/non_score_reward": -1.5820850133895874, "objective/rlhf_reward": -4.205633642450843, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 166.1714630126953, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4893154501914978, "step": 663, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9985361099243164 }, { "episode": 10640, "epoch": 0.1912499550634504, "loss/policy_avg": 0.26638445258140564, "lr": 2.8726993865030674e-06, "objective/entropy": 177.13156127929688, "objective/kl": 9.343158721923828, "objective/non_score_reward": -0.9343159794807434, "objective/rlhf_reward": -5.7372636795043945, "objective/scores": -0.5, "policy/approxkl_avg": 26.281099319458008, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6856767535209656, "step": 664, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9989508390426636 }, { "episode": 10656, "epoch": 0.19153754898083905, "loss/policy_avg": 0.16023144125938416, "lr": 2.8725076687116566e-06, "objective/entropy": 78.05880737304688, "objective/kl": 7.6621246337890625, "objective/non_score_reward": -0.7662124633789062, "objective/rlhf_reward": -2.664849868416786, "objective/scores": 0.1, "policy/approxkl_avg": 50.030677795410156, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.3968629837036133, "step": 665, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0032896995544434 }, { "episode": 10672, "epoch": 0.1918251428982277, "loss/policy_avg": 0.06800419837236404, "lr": 2.8723159509202455e-06, "objective/entropy": 59.63671875, "objective/kl": 12.568862915039062, "objective/non_score_reward": -1.25688636302948, "objective/rlhf_reward": -7.02754545211792, "objective/scores": -0.5, "policy/approxkl_avg": 222.47463989257812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5737947821617126, "step": 666, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0001986026763916 }, { "episode": 10688, "epoch": 0.19211273681561636, "loss/policy_avg": 0.553016185760498, "lr": 2.8721242331288343e-06, "objective/entropy": 235.48336791992188, "objective/kl": 10.159873962402344, "objective/non_score_reward": -1.0159873962402344, "objective/rlhf_reward": -2.3306164304415384, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 54.860572814941406, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7995113134384155, "step": 667, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0003960132598877 }, { "episode": 10704, "epoch": 0.192400330733005, "loss/policy_avg": -0.27713173627853394, "lr": 2.8719325153374235e-06, "objective/entropy": 32.731258392333984, "objective/kl": 14.078591346740723, "objective/non_score_reward": -1.407859206199646, "objective/rlhf_reward": -5.231436854600906, "objective/scores": 0.1, "policy/approxkl_avg": 41.393226623535156, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6517434120178223, "step": 668, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0037641525268555 }, { "episode": 10720, "epoch": 0.19268792465039364, "loss/policy_avg": 0.40979307889938354, "lr": 2.8717407975460123e-06, "objective/entropy": -46.12900924682617, "objective/kl": 10.303600311279297, "objective/non_score_reward": -1.0303599834442139, "objective/rlhf_reward": -3.7214399039745327, "objective/scores": 0.1, "policy/approxkl_avg": 37.29252624511719, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6077979803085327, "step": 669, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998400092124939 }, { "episode": 10736, "epoch": 0.19297551856778228, "loss/policy_avg": 0.25220245122909546, "lr": 2.8715490797546015e-06, "objective/entropy": 105.48408508300781, "objective/kl": 17.024581909179688, "objective/non_score_reward": -1.702458143234253, "objective/rlhf_reward": -4.409832602739334, "objective/scores": 0.6, "policy/approxkl_avg": 208.58787536621094, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7229609489440918, "step": 670, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999326467514038 }, { "episode": 10752, "epoch": 0.19326311248517095, "loss/policy_avg": -0.009634248912334442, "lr": 2.8713573619631903e-06, "objective/entropy": -176.94802856445312, "objective/kl": 12.598766326904297, "objective/non_score_reward": -1.2598767280578613, "objective/rlhf_reward": -4.639506882429123, "objective/scores": 0.1, "policy/approxkl_avg": 41.82705307006836, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.590308666229248, "step": 671, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9989120960235596 }, { "episode": 10768, "epoch": 0.19355070640255959, "loss/policy_avg": 0.18840017914772034, "lr": 2.871165644171779e-06, "objective/entropy": 130.62278747558594, "objective/kl": 11.168953895568848, "objective/non_score_reward": -1.1168954372406006, "objective/rlhf_reward": -6.467581748962402, "objective/scores": -0.5, "policy/approxkl_avg": 15.816328048706055, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6742215752601624, "step": 672, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0026257038116455 }, { "episode": 10784, "epoch": 0.19383830031994823, "loss/policy_avg": 0.23392519354820251, "lr": 2.8709739263803684e-06, "objective/entropy": 127.62983703613281, "objective/kl": 7.803813934326172, "objective/non_score_reward": -0.780381441116333, "objective/rlhf_reward": -1.1741145131754236, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 32.454715728759766, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6811597347259521, "step": 673, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9976636171340942 }, { "episode": 10800, "epoch": 0.1941258942373369, "loss/policy_avg": -0.1319524347782135, "lr": 2.870782208588957e-06, "objective/entropy": -20.453079223632812, "objective/kl": 13.015239715576172, "objective/non_score_reward": -1.301524043083191, "objective/rlhf_reward": -7.206096172332764, "objective/scores": -0.5, "policy/approxkl_avg": 11.110284805297852, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7017459869384766, "step": 674, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0026445388793945 }, { "episode": 10816, "epoch": 0.19441348815472553, "loss/policy_avg": 0.04714903235435486, "lr": 2.870590490797546e-06, "objective/entropy": 303.12890625, "objective/kl": 7.725805282592773, "objective/non_score_reward": -0.7725805044174194, "objective/rlhf_reward": -5.090322017669678, "objective/scores": -0.5, "policy/approxkl_avg": 25.820777893066406, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7819321155548096, "step": 675, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9988234043121338 }, { "episode": 10832, "epoch": 0.19470108207211417, "loss/policy_avg": 0.23574650287628174, "lr": 2.8703987730061352e-06, "objective/entropy": -194.20651245117188, "objective/kl": 14.129228591918945, "objective/non_score_reward": -1.412922978401184, "objective/rlhf_reward": -1.2516918838024136, "objective/scores": 1.1, "policy/approxkl_avg": 34.932640075683594, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.635951817035675, "step": 676, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9998043775558472 }, { "episode": 10848, "epoch": 0.1949886759895028, "loss/policy_avg": 1.922278881072998, "lr": 2.870207055214724e-06, "objective/entropy": 202.7659454345703, "objective/kl": 8.443321228027344, "objective/non_score_reward": -0.8443321585655212, "objective/rlhf_reward": -5.377328872680664, "objective/scores": -0.5, "policy/approxkl_avg": 80.10383605957031, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7711433172225952, "step": 677, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0003726482391357 }, { "episode": 10864, "epoch": 0.19527626990689148, "loss/policy_avg": 0.08129671216011047, "lr": 2.870015337423313e-06, "objective/entropy": -35.23242950439453, "objective/kl": 12.633974075317383, "objective/non_score_reward": -1.263397455215454, "objective/rlhf_reward": -7.053589820861816, "objective/scores": -0.5, "policy/approxkl_avg": 148.25845336914062, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8183538913726807, "step": 678, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9973478317260742 }, { "episode": 10880, "epoch": 0.19556386382428012, "loss/policy_avg": 0.07529361546039581, "lr": 2.8698236196319017e-06, "objective/entropy": -160.49822998046875, "objective/kl": 14.277109146118164, "objective/non_score_reward": -1.4277108907699585, "objective/rlhf_reward": -7.710843086242676, "objective/scores": -0.5, "policy/approxkl_avg": 79.69904327392578, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6668201684951782, "step": 679, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9965490102767944 }, { "episode": 10896, "epoch": 0.19585145774166876, "loss/policy_avg": -0.24820567667484283, "lr": 2.869631901840491e-06, "objective/entropy": -144.34962463378906, "objective/kl": 7.242027759552002, "objective/non_score_reward": -0.7242028117179871, "objective/rlhf_reward": -2.4968111276626583, "objective/scores": 0.1, "policy/approxkl_avg": 4.443586349487305, "policy/clipfrac_avg": 2.0, "policy/entropy_avg": 0.5139869451522827, "step": 680, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0061938762664795 }, { "episode": 10912, "epoch": 0.1961390516590574, "loss/policy_avg": -0.08409806340932846, "lr": 2.8694401840490797e-06, "objective/entropy": -54.80754852294922, "objective/kl": 12.144088745117188, "objective/non_score_reward": -1.2144087553024292, "objective/rlhf_reward": -1.9339162155401437, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 2.676344156265259, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8483699560165405, "step": 681, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0018930435180664 }, { "episode": 10928, "epoch": 0.19642664557644607, "loss/policy_avg": 0.12138275802135468, "lr": 2.8692484662576685e-06, "objective/entropy": 70.32062530517578, "objective/kl": 12.482439041137695, "objective/non_score_reward": -1.248243808746338, "objective/rlhf_reward": -4.592975145578384, "objective/scores": 0.1, "policy/approxkl_avg": 6.664492130279541, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5509634017944336, "step": 682, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0010645389556885 }, { "episode": 10944, "epoch": 0.1967142394938347, "loss/policy_avg": 0.2954164147377014, "lr": 2.8690567484662578e-06, "objective/entropy": 46.32169723510742, "objective/kl": 7.934720039367676, "objective/non_score_reward": -0.7934720516204834, "objective/rlhf_reward": 1.2261117786169056, "objective/scores": 1.1, "policy/approxkl_avg": 69.41313934326172, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5250649452209473, "step": 683, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.001229763031006 }, { "episode": 10960, "epoch": 0.19700183341122335, "loss/policy_avg": 0.33847346901893616, "lr": 2.8688650306748466e-06, "objective/entropy": 136.00357055664062, "objective/kl": 11.347841262817383, "objective/non_score_reward": -1.1347841024398804, "objective/rlhf_reward": -0.13913632035255397, "objective/scores": 1.1, "policy/approxkl_avg": 39.6363525390625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5623573064804077, "step": 684, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.99955153465271 }, { "episode": 10976, "epoch": 0.197289427328612, "loss/policy_avg": 0.0871865451335907, "lr": 2.868673312883436e-06, "objective/entropy": -171.5851593017578, "objective/kl": 11.16063404083252, "objective/non_score_reward": -1.1160634756088257, "objective/rlhf_reward": -4.064253827929496, "objective/scores": 0.1, "policy/approxkl_avg": 69.5920181274414, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.790228009223938, "step": 685, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9968866109848022 }, { "episode": 10992, "epoch": 0.19757702124600066, "loss/policy_avg": 0.17770184576511383, "lr": 2.8684815950920246e-06, "objective/entropy": 117.78619384765625, "objective/kl": 14.742965698242188, "objective/non_score_reward": -1.4742965698242188, "objective/rlhf_reward": -5.497186398506164, "objective/scores": 0.1, "policy/approxkl_avg": 171.13133239746094, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5882231593132019, "step": 686, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9990973472595215 }, { "episode": 11008, "epoch": 0.1978646151633893, "loss/policy_avg": 0.4302918314933777, "lr": 2.8682898773006134e-06, "objective/entropy": 86.18273162841797, "objective/kl": 11.427553176879883, "objective/non_score_reward": -1.142755389213562, "objective/rlhf_reward": -6.571021556854248, "objective/scores": -0.5, "policy/approxkl_avg": 177.87432861328125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.468346506357193, "step": 687, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9976341724395752 }, { "episode": 11024, "epoch": 0.19815220908077794, "loss/policy_avg": 0.49217331409454346, "lr": 2.8680981595092026e-06, "objective/entropy": -60.496864318847656, "objective/kl": 19.168210983276367, "objective/non_score_reward": -1.9168212413787842, "objective/rlhf_reward": -5.54457879282621, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 153.42633056640625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6640913486480713, "step": 688, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000704765319824 }, { "episode": 11040, "epoch": 0.19843980299816658, "loss/policy_avg": 0.5149247646331787, "lr": 2.8679064417177915e-06, "objective/entropy": 217.9249725341797, "objective/kl": 10.10614013671875, "objective/non_score_reward": -1.010614037513733, "objective/rlhf_reward": 0.3575438499450687, "objective/scores": 1.1, "policy/approxkl_avg": 21.427753448486328, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5551241636276245, "step": 689, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0004658699035645 }, { "episode": 11056, "epoch": 0.19872739691555524, "loss/policy_avg": 2.387340784072876, "lr": 2.8677147239263803e-06, "objective/entropy": 90.62098693847656, "objective/kl": 13.577856063842773, "objective/non_score_reward": -1.357785701751709, "objective/rlhf_reward": -5.031142449378967, "objective/scores": 0.1, "policy/approxkl_avg": 76.07826232910156, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5787378549575806, "step": 690, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0003950595855713 }, { "episode": 11072, "epoch": 0.19901499083294388, "loss/policy_avg": 0.07591082900762558, "lr": 2.8675230061349695e-06, "objective/entropy": 192.91531372070312, "objective/kl": 18.27234649658203, "objective/non_score_reward": -1.8272345066070557, "objective/rlhf_reward": -5.186231674925361, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 53.839683532714844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5381182432174683, "step": 691, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.999629259109497 }, { "episode": 11088, "epoch": 0.19930258475033252, "loss/policy_avg": 0.24866509437561035, "lr": 2.8673312883435583e-06, "objective/entropy": 162.3503875732422, "objective/kl": 18.564533233642578, "objective/non_score_reward": -1.8564532995224, "objective/rlhf_reward": -3.025813496112823, "objective/scores": 1.1, "policy/approxkl_avg": 120.78541564941406, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5609779357910156, "step": 692, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9961520433425903 }, { "episode": 11104, "epoch": 0.1995901786677212, "loss/policy_avg": -0.0011049304157495499, "lr": 2.8671395705521475e-06, "objective/entropy": 96.88809967041016, "objective/kl": 7.879084587097168, "objective/non_score_reward": -0.7879084944725037, "objective/rlhf_reward": -2.7516339480876923, "objective/scores": 0.1, "policy/approxkl_avg": 6.773414611816406, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.45891791582107544, "step": 693, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0016512870788574 }, { "episode": 11120, "epoch": 0.19987777258510983, "loss/policy_avg": 0.10084787011146545, "lr": 2.8669478527607364e-06, "objective/entropy": -41.20310974121094, "objective/kl": 13.97619915008545, "objective/non_score_reward": -1.3976198434829712, "objective/rlhf_reward": -1.190479493141174, "objective/scores": 1.1, "policy/approxkl_avg": 52.299827575683594, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5506526231765747, "step": 694, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9993311166763306 }, { "episode": 11136, "epoch": 0.20016536650249847, "loss/policy_avg": 0.06375744938850403, "lr": 2.866756134969325e-06, "objective/entropy": -108.92198181152344, "objective/kl": 9.946971893310547, "objective/non_score_reward": -0.9946972131729126, "objective/rlhf_reward": -5.97878885269165, "objective/scores": -0.5, "policy/approxkl_avg": 2.416961193084717, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5260124802589417, "step": 695, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9992403984069824 }, { "episode": 11152, "epoch": 0.2004529604198871, "loss/policy_avg": 0.08030396699905396, "lr": 2.8665644171779144e-06, "objective/entropy": 123.5355453491211, "objective/kl": 15.836409568786621, "objective/non_score_reward": -1.5836410522460938, "objective/rlhf_reward": -8.334564208984375, "objective/scores": -0.5, "policy/approxkl_avg": 43.76978302001953, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6828103065490723, "step": 696, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9998550415039062 }, { "episode": 11168, "epoch": 0.20074055433727578, "loss/policy_avg": 0.29151037335395813, "lr": 2.866372699386503e-06, "objective/entropy": -57.21668243408203, "objective/kl": 15.339473724365234, "objective/non_score_reward": -1.5339473485946655, "objective/rlhf_reward": -1.7357893645763394, "objective/scores": 1.1, "policy/approxkl_avg": 91.99110412597656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5846511721611023, "step": 697, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9978073835372925 }, { "episode": 11184, "epoch": 0.20102814825466442, "loss/policy_avg": 0.11156813055276871, "lr": 2.8661809815950924e-06, "objective/entropy": 4.086456298828125, "objective/kl": 12.176067352294922, "objective/non_score_reward": -1.2176066637039185, "objective/rlhf_reward": -4.470426669716835, "objective/scores": 0.1, "policy/approxkl_avg": 22.13668441772461, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7282131910324097, "step": 698, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0009164810180664 }, { "episode": 11200, "epoch": 0.20131574217205306, "loss/policy_avg": 0.30346500873565674, "lr": 2.8659892638036812e-06, "objective/entropy": -137.93194580078125, "objective/kl": 9.752511978149414, "objective/non_score_reward": -0.975251317024231, "objective/rlhf_reward": -3.501005089282989, "objective/scores": 0.1, "policy/approxkl_avg": 11.240784645080566, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4937957525253296, "step": 699, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9995919466018677 }, { "episode": 11216, "epoch": 0.2016033360894417, "loss/policy_avg": 0.2282610535621643, "lr": 2.86579754601227e-06, "objective/entropy": 57.311073303222656, "objective/kl": 14.862542152404785, "objective/non_score_reward": -1.486254334449768, "objective/rlhf_reward": -4.120188410553049, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 18.98688507080078, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6022450923919678, "step": 700, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9996930360794067 }, { "episode": 11232, "epoch": 0.20189093000683037, "loss/policy_avg": 0.3324587941169739, "lr": 2.865605828220859e-06, "objective/entropy": 49.462989807128906, "objective/kl": 7.709723472595215, "objective/non_score_reward": -0.7709723711013794, "objective/rlhf_reward": -5.083889484405518, "objective/scores": -0.5, "policy/approxkl_avg": 28.05154800415039, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.41537797451019287, "step": 701, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0002284049987793 }, { "episode": 11248, "epoch": 0.202178523924219, "loss/policy_avg": 0.16259673237800598, "lr": 2.8654141104294477e-06, "objective/entropy": 60.31419372558594, "objective/kl": 13.977958679199219, "objective/non_score_reward": -1.3977960348129272, "objective/rlhf_reward": -5.191184258460998, "objective/scores": 0.1, "policy/approxkl_avg": 202.6460418701172, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7732006311416626, "step": 702, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9978877305984497 }, { "episode": 11264, "epoch": 0.20246611784160765, "loss/policy_avg": 0.2512626051902771, "lr": 2.865222392638037e-06, "objective/entropy": 156.12353515625, "objective/kl": 16.06157112121582, "objective/non_score_reward": -1.6061570644378662, "objective/rlhf_reward": -3.50090917641041, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 191.80953979492188, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6488738059997559, "step": 703, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9996211528778076 }, { "episode": 11280, "epoch": 0.20275371175899629, "loss/policy_avg": 0.15953761339187622, "lr": 2.8650306748466257e-06, "objective/entropy": -43.2182731628418, "objective/kl": 16.940393447875977, "objective/non_score_reward": -1.6940394639968872, "objective/rlhf_reward": -2.3761578559875485, "objective/scores": 1.1, "policy/approxkl_avg": 112.27688598632812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.45469629764556885, "step": 704, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9988117218017578 }, { "episode": 11296, "epoch": 0.20304130567638495, "loss/policy_avg": 0.6711443066596985, "lr": 2.8648389570552145e-06, "objective/entropy": -33.72349548339844, "objective/kl": 11.539321899414062, "objective/non_score_reward": -1.1539320945739746, "objective/rlhf_reward": -2.8823953429857885, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 61.21073913574219, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5703375339508057, "step": 705, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9978996515274048 }, { "episode": 11312, "epoch": 0.2033288995937736, "loss/policy_avg": 0.7480028867721558, "lr": 2.8646472392638038e-06, "objective/entropy": 220.5673828125, "objective/kl": 7.203307151794434, "objective/non_score_reward": -0.7203306555747986, "objective/rlhf_reward": -1.2772026694455916, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 139.5849609375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.786684513092041, "step": 706, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9992549419403076 }, { "episode": 11328, "epoch": 0.20361649351116223, "loss/policy_avg": 0.00046522170305252075, "lr": 2.8644555214723926e-06, "objective/entropy": 77.21452331542969, "objective/kl": 15.027108192443848, "objective/non_score_reward": -1.5027105808258057, "objective/rlhf_reward": -8.010842323303223, "objective/scores": -0.5, "policy/approxkl_avg": 49.50825500488281, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7454473972320557, "step": 707, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9993491172790527 }, { "episode": 11344, "epoch": 0.20390408742855087, "loss/policy_avg": 0.4308600425720215, "lr": 2.864263803680982e-06, "objective/entropy": 105.22822570800781, "objective/kl": 10.61517333984375, "objective/non_score_reward": -1.0615174770355225, "objective/rlhf_reward": -1.3223507746469703, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 15.070667266845703, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5496830940246582, "step": 708, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998878836631775 }, { "episode": 11360, "epoch": 0.20419168134593954, "loss/policy_avg": 0.02560766041278839, "lr": 2.8640720858895706e-06, "objective/entropy": 52.49213790893555, "objective/kl": 12.071894645690918, "objective/non_score_reward": -1.2071894407272339, "objective/rlhf_reward": -1.9050386294138162, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 22.58022689819336, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.678420901298523, "step": 709, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0016746520996094 }, { "episode": 11376, "epoch": 0.20447927526332818, "loss/policy_avg": 0.14677026867866516, "lr": 2.8638803680981594e-06, "objective/entropy": -44.507568359375, "objective/kl": 10.297760009765625, "objective/non_score_reward": -1.0297759771347046, "objective/rlhf_reward": -6.119103908538818, "objective/scores": -0.5, "policy/approxkl_avg": 21.161418914794922, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.69853675365448, "step": 710, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9988245964050293 }, { "episode": 11392, "epoch": 0.20476686918071682, "loss/policy_avg": 0.3421841859817505, "lr": 2.8636886503067487e-06, "objective/entropy": 1.61920166015625, "objective/kl": 10.758232116699219, "objective/non_score_reward": -1.0758233070373535, "objective/rlhf_reward": -6.303293228149414, "objective/scores": -0.5, "policy/approxkl_avg": 68.81875610351562, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.852238118648529, "step": 711, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9968268871307373 }, { "episode": 11408, "epoch": 0.2050544630981055, "loss/policy_avg": 0.48776674270629883, "lr": 2.8634969325153375e-06, "objective/entropy": -14.844409942626953, "objective/kl": 10.682808876037598, "objective/non_score_reward": -1.0682809352874756, "objective/rlhf_reward": -3.873123502731323, "objective/scores": 0.1, "policy/approxkl_avg": 63.03434371948242, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7068368196487427, "step": 712, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9972708225250244 }, { "episode": 11424, "epoch": 0.20534205701549413, "loss/policy_avg": 0.8941645622253418, "lr": 2.8633052147239263e-06, "objective/entropy": -93.52375030517578, "objective/kl": 6.7023420333862305, "objective/non_score_reward": -0.6702341437339783, "objective/rlhf_reward": -4.680936813354492, "objective/scores": -0.5, "policy/approxkl_avg": 46.55957794189453, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7821812629699707, "step": 713, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9999279975891113 }, { "episode": 11440, "epoch": 0.20562965093288277, "loss/policy_avg": 0.5883785486221313, "lr": 2.8631134969325155e-06, "objective/entropy": -66.35493469238281, "objective/kl": 16.39910125732422, "objective/non_score_reward": -1.6399102210998535, "objective/rlhf_reward": -6.159640645980835, "objective/scores": 0.1, "policy/approxkl_avg": 99.94831848144531, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5211118459701538, "step": 714, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999578833580017 }, { "episode": 11456, "epoch": 0.2059172448502714, "loss/policy_avg": 0.673638105392456, "lr": 2.8629217791411043e-06, "objective/entropy": 56.791744232177734, "objective/kl": 13.02347183227539, "objective/non_score_reward": -1.302347183227539, "objective/rlhf_reward": -7.209388732910156, "objective/scores": -0.5, "policy/approxkl_avg": 57.524818420410156, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6059812307357788, "step": 715, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9978859424591064 }, { "episode": 11472, "epoch": 0.20620483876766008, "loss/policy_avg": 0.033942222595214844, "lr": 2.8627300613496936e-06, "objective/entropy": -125.00245666503906, "objective/kl": 16.061996459960938, "objective/non_score_reward": -1.6061995029449463, "objective/rlhf_reward": -2.0247983470559117, "objective/scores": 1.1, "policy/approxkl_avg": 95.87913513183594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.49507004022598267, "step": 716, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000415563583374 }, { "episode": 11488, "epoch": 0.20649243268504872, "loss/policy_avg": 0.32607385516166687, "lr": 2.8625383435582824e-06, "objective/entropy": 165.55465698242188, "objective/kl": 17.082618713378906, "objective/non_score_reward": -1.7082619667053223, "objective/rlhf_reward": -5.008219312104295, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 79.94935607910156, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5599914789199829, "step": 717, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9984850883483887 }, { "episode": 11504, "epoch": 0.20678002660243736, "loss/policy_avg": 0.18473944067955017, "lr": 2.862346625766871e-06, "objective/entropy": 144.7602996826172, "objective/kl": 13.379175186157227, "objective/non_score_reward": -1.3379178047180176, "objective/rlhf_reward": -4.951671248674392, "objective/scores": 0.1, "policy/approxkl_avg": 234.73941040039062, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8746376037597656, "step": 718, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9992458820343018 }, { "episode": 11520, "epoch": 0.207067620519826, "loss/policy_avg": 0.3600703775882721, "lr": 2.8621549079754604e-06, "objective/entropy": 63.41236877441406, "objective/kl": 14.644229888916016, "objective/non_score_reward": -1.4644229412078857, "objective/rlhf_reward": -5.457692122459411, "objective/scores": 0.1, "policy/approxkl_avg": 130.00485229492188, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.47121816873550415, "step": 719, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9991137981414795 }, { "episode": 11536, "epoch": 0.20735521443721466, "loss/policy_avg": 0.23683911561965942, "lr": 2.8619631901840492e-06, "objective/entropy": 5.525566101074219, "objective/kl": 11.11286735534668, "objective/non_score_reward": -1.1112868785858154, "objective/rlhf_reward": -2.8888880302577764, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 63.8202018737793, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.42511242628097534, "step": 720, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9963274002075195 }, { "episode": 11552, "epoch": 0.2076428083546033, "loss/policy_avg": 0.6022622585296631, "lr": 2.8617714723926384e-06, "objective/entropy": -102.19918823242188, "objective/kl": 15.452737808227539, "objective/non_score_reward": -1.545273780822754, "objective/rlhf_reward": -3.257376019598219, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 49.32636642456055, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7975940704345703, "step": 721, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9964704513549805 }, { "episode": 11568, "epoch": 0.20793040227199194, "loss/policy_avg": 0.24385926127433777, "lr": 2.8615797546012273e-06, "objective/entropy": 32.55232238769531, "objective/kl": 14.009422302246094, "objective/non_score_reward": -1.400942325592041, "objective/rlhf_reward": -3.2037693619728085, "objective/scores": 0.6, "policy/approxkl_avg": 153.17926025390625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8219155073165894, "step": 722, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0011699199676514 }, { "episode": 11584, "epoch": 0.20821799618938058, "loss/policy_avg": 0.3180992007255554, "lr": 2.861388036809816e-06, "objective/entropy": 72.60052490234375, "objective/kl": 15.588754653930664, "objective/non_score_reward": -1.558875560760498, "objective/rlhf_reward": -8.235502243041992, "objective/scores": -0.5, "policy/approxkl_avg": 123.99591064453125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7457439303398132, "step": 723, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9982542991638184 }, { "episode": 11600, "epoch": 0.20850559010676925, "loss/policy_avg": 0.08571982383728027, "lr": 2.861196319018405e-06, "objective/entropy": 43.14987564086914, "objective/kl": 17.108150482177734, "objective/non_score_reward": -1.7108149528503418, "objective/rlhf_reward": -2.4432600051164624, "objective/scores": 1.1, "policy/approxkl_avg": 57.58824920654297, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.49476999044418335, "step": 724, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9984831809997559 }, { "episode": 11616, "epoch": 0.2087931840241579, "loss/policy_avg": 0.21315881609916687, "lr": 2.8610046012269937e-06, "objective/entropy": -128.3315887451172, "objective/kl": 15.486307144165039, "objective/non_score_reward": -1.548630714416504, "objective/rlhf_reward": -1.7945227384567257, "objective/scores": 1.1, "policy/approxkl_avg": 213.23350524902344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5425740480422974, "step": 725, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9989008903503418 }, { "episode": 11632, "epoch": 0.20908077794154653, "loss/policy_avg": 0.5638971328735352, "lr": 2.860812883435583e-06, "objective/entropy": 91.92890930175781, "objective/kl": 10.46470832824707, "objective/non_score_reward": -1.0464708805084229, "objective/rlhf_reward": -3.7858832538127896, "objective/scores": 0.1, "policy/approxkl_avg": 65.73753356933594, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7266730070114136, "step": 726, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9996936321258545 }, { "episode": 11648, "epoch": 0.20936837185893517, "loss/policy_avg": 0.33472561836242676, "lr": 2.8606211656441717e-06, "objective/entropy": 194.0995330810547, "objective/kl": 17.15127182006836, "objective/non_score_reward": -1.7151273488998413, "objective/rlhf_reward": -6.460509246587753, "objective/scores": 0.1, "policy/approxkl_avg": 155.51809692382812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7743821144104004, "step": 727, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9982187747955322 }, { "episode": 11664, "epoch": 0.20965596577632384, "loss/policy_avg": 0.0969974622130394, "lr": 2.8604294478527605e-06, "objective/entropy": 252.77114868164062, "objective/kl": 13.091619491577148, "objective/non_score_reward": -1.3091620206832886, "objective/rlhf_reward": -2.3129289641391964, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 98.46098327636719, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.9146468639373779, "step": 728, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.998947262763977 }, { "episode": 11680, "epoch": 0.20994355969371248, "loss/policy_avg": 1.0030015707015991, "lr": 2.8602377300613498e-06, "objective/entropy": -29.64310073852539, "objective/kl": 9.100172996520996, "objective/non_score_reward": -0.9100174307823181, "objective/rlhf_reward": -5.640069961547852, "objective/scores": -0.5, "policy/approxkl_avg": 66.61669921875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6575506329536438, "step": 729, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9992115497589111 }, { "episode": 11696, "epoch": 0.21023115361110112, "loss/policy_avg": 0.07021422684192657, "lr": 2.8600460122699386e-06, "objective/entropy": -20.677194595336914, "objective/kl": 11.13044548034668, "objective/non_score_reward": -1.1130445003509521, "objective/rlhf_reward": -6.452178001403809, "objective/scores": -0.5, "policy/approxkl_avg": 20.34206771850586, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4357107877731323, "step": 730, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9978418350219727 }, { "episode": 11712, "epoch": 0.21051874752848979, "loss/policy_avg": 0.11832322180271149, "lr": 2.859854294478528e-06, "objective/entropy": -303.6877136230469, "objective/kl": 14.444772720336914, "objective/non_score_reward": -1.4444773197174072, "objective/rlhf_reward": -5.377909517288208, "objective/scores": 0.1, "policy/approxkl_avg": 57.83880615234375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7242922782897949, "step": 731, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0005035400390625 }, { "episode": 11728, "epoch": 0.21080634144587843, "loss/policy_avg": 0.2551548480987549, "lr": 2.8596625766871166e-06, "objective/entropy": 227.5770721435547, "objective/kl": 18.140262603759766, "objective/non_score_reward": -1.8140263557434082, "objective/rlhf_reward": -4.332386408687803, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 55.3328857421875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6282124519348145, "step": 732, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0000762939453125 }, { "episode": 11744, "epoch": 0.21109393536326707, "loss/policy_avg": 0.02594660222530365, "lr": 2.8594708588957054e-06, "objective/entropy": 29.276161193847656, "objective/kl": 18.943225860595703, "objective/non_score_reward": -1.8943226337432861, "objective/rlhf_reward": -9.577290534973145, "objective/scores": -0.5, "policy/approxkl_avg": 128.90655517578125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6192996501922607, "step": 733, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.997767448425293 }, { "episode": 11760, "epoch": 0.2113815292806557, "loss/policy_avg": 0.03574896976351738, "lr": 2.8592791411042947e-06, "objective/entropy": -71.63544464111328, "objective/kl": 11.780060768127441, "objective/non_score_reward": -1.1780060529708862, "objective/rlhf_reward": -1.788305324257585, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 23.68338394165039, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6172910332679749, "step": 734, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9973077774047852 }, { "episode": 11776, "epoch": 0.21166912319804437, "loss/policy_avg": 0.6160891056060791, "lr": 2.8590874233128835e-06, "objective/entropy": -90.55964660644531, "objective/kl": 15.64774227142334, "objective/non_score_reward": -1.5647742748260498, "objective/rlhf_reward": -3.8590969800949093, "objective/scores": 0.6, "policy/approxkl_avg": 52.503440856933594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6182564496994019, "step": 735, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9989289045333862 }, { "episode": 11792, "epoch": 0.211956717115433, "loss/policy_avg": 0.3826139569282532, "lr": 2.8588957055214727e-06, "objective/entropy": -80.17495727539062, "objective/kl": 13.256806373596191, "objective/non_score_reward": -1.3256807327270508, "objective/rlhf_reward": -2.3790036782037944, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 100.29702758789062, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7060404419898987, "step": 736, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999138593673706 }, { "episode": 11808, "epoch": 0.21224431103282165, "loss/policy_avg": -0.032949626445770264, "lr": 2.8587039877300615e-06, "objective/entropy": -176.73757934570312, "objective/kl": 8.910408973693848, "objective/non_score_reward": -0.8910409212112427, "objective/rlhf_reward": -5.564163684844971, "objective/scores": -0.5, "policy/approxkl_avg": 92.01910400390625, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.49018532037734985, "step": 737, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0024096965789795 }, { "episode": 11824, "epoch": 0.2125319049502103, "loss/policy_avg": 0.06443023681640625, "lr": 2.8585122699386503e-06, "objective/entropy": -30.911895751953125, "objective/kl": 12.735221862792969, "objective/non_score_reward": -1.2735222578048706, "objective/rlhf_reward": -0.6940891504287716, "objective/scores": 1.1, "policy/approxkl_avg": 67.95742797851562, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6443949341773987, "step": 738, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9994258880615234 }, { "episode": 11840, "epoch": 0.21281949886759896, "loss/policy_avg": 0.8280965089797974, "lr": 2.8583205521472396e-06, "objective/entropy": -47.22200012207031, "objective/kl": 11.071340560913086, "objective/non_score_reward": -1.1071341037750244, "objective/rlhf_reward": -6.428536415100098, "objective/scores": -0.5, "policy/approxkl_avg": 87.060302734375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5341061353683472, "step": 739, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9975091218948364 }, { "episode": 11856, "epoch": 0.2131070927849876, "loss/policy_avg": 0.17557145655155182, "lr": 2.8581288343558284e-06, "objective/entropy": -33.48394775390625, "objective/kl": 12.476408004760742, "objective/non_score_reward": -1.24764084815979, "objective/rlhf_reward": -2.0668442777704925, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 54.93560791015625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5069207549095154, "step": 740, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9968397617340088 }, { "episode": 11872, "epoch": 0.21339468670237624, "loss/policy_avg": 0.07476645708084106, "lr": 2.857937116564417e-06, "objective/entropy": -147.0701904296875, "objective/kl": 16.620323181152344, "objective/non_score_reward": -1.6620323657989502, "objective/rlhf_reward": -6.248129403591156, "objective/scores": 0.1, "policy/approxkl_avg": 201.01736450195312, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.700377345085144, "step": 741, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000122547149658 }, { "episode": 11888, "epoch": 0.21368228061976488, "loss/policy_avg": 0.13703355193138123, "lr": 2.8577453987730064e-06, "objective/entropy": -27.77845001220703, "objective/kl": 11.785709381103516, "objective/non_score_reward": -1.1785709857940674, "objective/rlhf_reward": -4.314283764362335, "objective/scores": 0.1, "policy/approxkl_avg": 119.8775863647461, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6977899074554443, "step": 742, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9989674091339111 }, { "episode": 11904, "epoch": 0.21396987453715355, "loss/policy_avg": 0.5084174871444702, "lr": 2.8575536809815952e-06, "objective/entropy": -178.72799682617188, "objective/kl": 6.797292709350586, "objective/non_score_reward": -0.6797292828559875, "objective/rlhf_reward": -2.3189171761274334, "objective/scores": 0.1, "policy/approxkl_avg": 19.976099014282227, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.889729380607605, "step": 743, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9978809356689453 }, { "episode": 11920, "epoch": 0.2142574684545422, "loss/policy_avg": 0.08717440068721771, "lr": 2.8573619631901845e-06, "objective/entropy": -100.47814178466797, "objective/kl": 11.974678993225098, "objective/non_score_reward": -1.1974678039550781, "objective/rlhf_reward": -4.38987118601799, "objective/scores": 0.1, "policy/approxkl_avg": 116.80694580078125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6132055521011353, "step": 744, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9981837272644043 }, { "episode": 11936, "epoch": 0.21454506237193083, "loss/policy_avg": 0.1440531611442566, "lr": 2.857170245398773e-06, "objective/entropy": -37.25544738769531, "objective/kl": 12.166690826416016, "objective/non_score_reward": -1.2166690826416016, "objective/rlhf_reward": -1.9429572268736093, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 31.051591873168945, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7306356430053711, "step": 745, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000868558883667 }, { "episode": 11952, "epoch": 0.21483265628931947, "loss/policy_avg": 0.22620095312595367, "lr": 2.856978527607362e-06, "objective/entropy": -61.764095306396484, "objective/kl": 12.740127563476562, "objective/non_score_reward": -1.274012804031372, "objective/rlhf_reward": -0.6960513353347775, "objective/scores": 1.1, "policy/approxkl_avg": 113.7810287475586, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7009084224700928, "step": 746, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9981598854064941 }, { "episode": 11968, "epoch": 0.21512025020670814, "loss/policy_avg": 0.14071348309516907, "lr": 2.856786809815951e-06, "objective/entropy": 29.96725845336914, "objective/kl": 12.842681884765625, "objective/non_score_reward": -1.2842683792114258, "objective/rlhf_reward": -2.213354293943617, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 2.990078926086426, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4330785572528839, "step": 747, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9996414184570312 }, { "episode": 11984, "epoch": 0.21540784412409678, "loss/policy_avg": 0.24370655417442322, "lr": 2.8565950920245397e-06, "objective/entropy": -147.6442108154297, "objective/kl": 16.267515182495117, "objective/non_score_reward": -1.6267515420913696, "objective/rlhf_reward": -2.107005929946899, "objective/scores": 1.1, "policy/approxkl_avg": 191.24424743652344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5275372862815857, "step": 748, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9969547986984253 }, { "episode": 12000, "epoch": 0.21569543804148542, "loss/policy_avg": 0.16116216778755188, "lr": 2.856403374233129e-06, "objective/entropy": -21.843975067138672, "objective/kl": 13.669893264770508, "objective/non_score_reward": -1.3669893741607666, "objective/rlhf_reward": -7.467957496643066, "objective/scores": -0.5, "policy/approxkl_avg": 52.672183990478516, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.9156934022903442, "step": 749, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0002281665802 }, { "episode": 12016, "epoch": 0.21598303195887408, "loss/policy_avg": 0.539941668510437, "lr": 2.8562116564417177e-06, "objective/entropy": 195.0397186279297, "objective/kl": 7.716229438781738, "objective/non_score_reward": -0.7716230154037476, "objective/rlhf_reward": -5.086491584777832, "objective/scores": -0.5, "policy/approxkl_avg": 57.51429748535156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7705237865447998, "step": 750, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0006752014160156 }, { "episode": 12032, "epoch": 0.21627062587626272, "loss/policy_avg": 0.711264967918396, "lr": 2.8560199386503065e-06, "objective/entropy": -299.9069519042969, "objective/kl": 12.776535987854004, "objective/non_score_reward": -1.277653694152832, "objective/rlhf_reward": -4.710614657402038, "objective/scores": 0.1, "policy/approxkl_avg": 82.51051330566406, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6128495335578918, "step": 751, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.9975297451019287 }, { "episode": 12048, "epoch": 0.21655821979365136, "loss/policy_avg": 0.4313165545463562, "lr": 2.8558282208588958e-06, "objective/entropy": 8.31052017211914, "objective/kl": 17.218887329101562, "objective/non_score_reward": -1.7218886613845825, "objective/rlhf_reward": -3.9638356163513393, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 92.13627624511719, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.56125807762146, "step": 752, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.996793270111084 }, { "episode": 12064, "epoch": 0.21684581371104, "loss/policy_avg": 0.2670312821865082, "lr": 2.8556365030674846e-06, "objective/entropy": 1.7406082153320312, "objective/kl": 10.113969802856445, "objective/non_score_reward": -1.011396884918213, "objective/rlhf_reward": -2.0981764336692645, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 66.017822265625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6029417514801025, "step": 753, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0002827644348145 }, { "episode": 12080, "epoch": 0.21713340762842867, "loss/policy_avg": 0.22361746430397034, "lr": 2.855444785276074e-06, "objective/entropy": -6.838325500488281, "objective/kl": 9.408108711242676, "objective/non_score_reward": -0.9408108592033386, "objective/rlhf_reward": -3.363243496417999, "objective/scores": 0.1, "policy/approxkl_avg": 45.14617156982422, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7355029582977295, "step": 754, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000067949295044 }, { "episode": 12096, "epoch": 0.2174210015458173, "loss/policy_avg": 0.4131731688976288, "lr": 2.8552530674846626e-06, "objective/entropy": -199.2462158203125, "objective/kl": 11.438251495361328, "objective/non_score_reward": -1.1438250541687012, "objective/rlhf_reward": -6.575300216674805, "objective/scores": -0.5, "policy/approxkl_avg": 29.276948928833008, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.49213117361068726, "step": 755, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0014610290527344 }, { "episode": 12112, "epoch": 0.21770859546320595, "loss/policy_avg": 0.2563888430595398, "lr": 2.8550613496932514e-06, "objective/entropy": -50.35034942626953, "objective/kl": 10.818819046020508, "objective/non_score_reward": -1.0818817615509033, "objective/rlhf_reward": -2.7234071231523327, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 5.181286811828613, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6795003414154053, "step": 756, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9993948936462402 }, { "episode": 12128, "epoch": 0.2179961893805946, "loss/policy_avg": 0.27015259861946106, "lr": 2.8548696319018407e-06, "objective/entropy": 50.269439697265625, "objective/kl": 11.087736129760742, "objective/non_score_reward": -1.1087735891342163, "objective/rlhf_reward": -0.035094296932220104, "objective/scores": 1.1, "policy/approxkl_avg": 4.901422500610352, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6422553062438965, "step": 757, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998644471168518 }, { "episode": 12144, "epoch": 0.21828378329798326, "loss/policy_avg": 0.6295909881591797, "lr": 2.8546779141104295e-06, "objective/entropy": 40.25965118408203, "objective/kl": 14.175272941589355, "objective/non_score_reward": -1.417527198791504, "objective/rlhf_reward": -2.7463900640022487, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 221.42971801757812, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5903568267822266, "step": 758, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998144268989563 }, { "episode": 12160, "epoch": 0.2185713772153719, "loss/policy_avg": 0.6265522241592407, "lr": 2.8544861963190187e-06, "objective/entropy": -87.25729370117188, "objective/kl": 10.555778503417969, "objective/non_score_reward": -1.0555777549743652, "objective/rlhf_reward": 0.17768906950950658, "objective/scores": 1.1, "policy/approxkl_avg": 90.87506866455078, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6326598525047302, "step": 759, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0008277893066406 }, { "episode": 12176, "epoch": 0.21885897113276054, "loss/policy_avg": 0.12339673936367035, "lr": 2.8542944785276075e-06, "objective/entropy": -33.543636322021484, "objective/kl": 7.962390899658203, "objective/non_score_reward": -0.7962390184402466, "objective/rlhf_reward": -2.784956073760986, "objective/scores": 0.1, "policy/approxkl_avg": 20.08357810974121, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8025330901145935, "step": 760, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9965462684631348 }, { "episode": 12192, "epoch": 0.21914656505014918, "loss/policy_avg": 0.1615457832813263, "lr": 2.8541027607361963e-06, "objective/entropy": 145.47601318359375, "objective/kl": 15.407791137695312, "objective/non_score_reward": -1.5407792329788208, "objective/rlhf_reward": -8.163117408752441, "objective/scores": -0.5, "policy/approxkl_avg": 79.53536224365234, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5605360269546509, "step": 761, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9999860525131226 }, { "episode": 12208, "epoch": 0.21943415896753785, "loss/policy_avg": 0.30102694034576416, "lr": 2.8539110429447856e-06, "objective/entropy": 27.323680877685547, "objective/kl": 3.660177707672119, "objective/non_score_reward": -0.36601775884628296, "objective/rlhf_reward": 2.935928934812546, "objective/scores": 1.1, "policy/approxkl_avg": 2.197434902191162, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.36402425169944763, "step": 762, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9990768432617188 }, { "episode": 12224, "epoch": 0.21972175288492649, "loss/policy_avg": 0.18315057456493378, "lr": 2.8537193251533744e-06, "objective/entropy": 72.46862030029297, "objective/kl": 10.31401252746582, "objective/non_score_reward": -1.0314011573791504, "objective/rlhf_reward": -1.7256047189235686, "objective/scores": 0.6, "policy/approxkl_avg": 9.106523513793945, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7303919792175293, "step": 763, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0000040531158447 }, { "episode": 12240, "epoch": 0.22000934680231513, "loss/policy_avg": 0.5514622330665588, "lr": 2.853527607361963e-06, "objective/entropy": -74.59797668457031, "objective/kl": 14.129312515258789, "objective/non_score_reward": -1.4129313230514526, "objective/rlhf_reward": -1.2517252624034878, "objective/scores": 1.1, "policy/approxkl_avg": 46.52693176269531, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4529426395893097, "step": 764, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9981811046600342 }, { "episode": 12256, "epoch": 0.22029694071970377, "loss/policy_avg": -0.04296427220106125, "lr": 2.8533358895705524e-06, "objective/entropy": -14.170623779296875, "objective/kl": 16.591388702392578, "objective/non_score_reward": -1.6591390371322632, "objective/rlhf_reward": -8.636556625366211, "objective/scores": -0.5, "policy/approxkl_avg": 108.84127807617188, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7804018259048462, "step": 765, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0000200271606445 }, { "episode": 12272, "epoch": 0.22058453463709243, "loss/policy_avg": 0.14860758185386658, "lr": 2.8531441717791412e-06, "objective/entropy": 234.34619140625, "objective/kl": 14.954992294311523, "objective/non_score_reward": -1.4954993724822998, "objective/rlhf_reward": -3.8592914364495616, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 31.390544891357422, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7560025453567505, "step": 766, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9960116147994995 }, { "episode": 12288, "epoch": 0.22087212855448107, "loss/policy_avg": 0.027001656591892242, "lr": 2.85295245398773e-06, "objective/entropy": -231.42864990234375, "objective/kl": 14.624351501464844, "objective/non_score_reward": -1.462435245513916, "objective/rlhf_reward": -1.449740996956825, "objective/scores": 1.1, "policy/approxkl_avg": 125.69221496582031, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6178066730499268, "step": 767, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9983733892440796 }, { "episode": 12304, "epoch": 0.2211597224718697, "loss/policy_avg": 0.10003480315208435, "lr": 2.852760736196319e-06, "objective/entropy": 105.91529846191406, "objective/kl": 11.028611183166504, "objective/non_score_reward": -1.1028611660003662, "objective/rlhf_reward": -0.011445081233977916, "objective/scores": 1.1, "policy/approxkl_avg": 99.16251373291016, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4474494457244873, "step": 768, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0001845359802246 }, { "episode": 12320, "epoch": 0.22144731638925838, "loss/policy_avg": 0.3084249794483185, "lr": 2.852569018404908e-06, "objective/entropy": 311.5811767578125, "objective/kl": 12.777912139892578, "objective/non_score_reward": -1.2777912616729736, "objective/rlhf_reward": -4.711165154725313, "objective/scores": 0.1, "policy/approxkl_avg": 87.19984436035156, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.747369110584259, "step": 769, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.000682830810547 }, { "episode": 12336, "epoch": 0.22173491030664702, "loss/policy_avg": 0.1643168181180954, "lr": 2.852377300613497e-06, "objective/entropy": -229.23110961914062, "objective/kl": 13.623214721679688, "objective/non_score_reward": -1.362321376800537, "objective/rlhf_reward": -7.449285507202148, "objective/scores": -0.5, "policy/approxkl_avg": 68.4518051147461, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7264796495437622, "step": 770, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0006937980651855 }, { "episode": 12352, "epoch": 0.22202250422403566, "loss/policy_avg": -0.010787129402160645, "lr": 2.8521855828220857e-06, "objective/entropy": 124.02545166015625, "objective/kl": 7.704123497009277, "objective/non_score_reward": -0.7704123258590698, "objective/rlhf_reward": -5.081649303436279, "objective/scores": -0.5, "policy/approxkl_avg": 28.008346557617188, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6329702138900757, "step": 771, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0013375282287598 }, { "episode": 12368, "epoch": 0.2223100981414243, "loss/policy_avg": 0.2653355300426483, "lr": 2.851993865030675e-06, "objective/entropy": 23.56524658203125, "objective/kl": 15.354362487792969, "objective/non_score_reward": -1.5354361534118652, "objective/rlhf_reward": -5.741744464635849, "objective/scores": 0.1, "policy/approxkl_avg": 111.05146789550781, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6844107508659363, "step": 772, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9988980293273926 }, { "episode": 12384, "epoch": 0.22259769205881297, "loss/policy_avg": 0.08450818061828613, "lr": 2.8518021472392637e-06, "objective/entropy": 342.4418029785156, "objective/kl": 15.424835205078125, "objective/non_score_reward": -1.5424836874008179, "objective/rlhf_reward": -5.769934868812561, "objective/scores": 0.1, "policy/approxkl_avg": 132.2557373046875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8565220236778259, "step": 773, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9969991445541382 }, { "episode": 12400, "epoch": 0.2228852859762016, "loss/policy_avg": -0.5032927393913269, "lr": 2.851610429447853e-06, "objective/entropy": -107.81707763671875, "objective/kl": 13.79594612121582, "objective/non_score_reward": -1.3795948028564453, "objective/rlhf_reward": -3.118379211425781, "objective/scores": 0.6, "policy/approxkl_avg": 41.49142837524414, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.35487908124923706, "step": 774, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.006086826324463 }, { "episode": 12416, "epoch": 0.22317287989359025, "loss/policy_avg": 0.14595381915569305, "lr": 2.8514187116564418e-06, "objective/entropy": -56.38682556152344, "objective/kl": 13.150504112243652, "objective/non_score_reward": -1.3150502443313599, "objective/rlhf_reward": -3.3127899570035293, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 46.86200714111328, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7194310426712036, "step": 775, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0001683235168457 }, { "episode": 12432, "epoch": 0.2234604738109789, "loss/policy_avg": -0.2638096213340759, "lr": 2.8512269938650306e-06, "objective/entropy": 76.923095703125, "objective/kl": 12.93875503540039, "objective/non_score_reward": -1.2938756942749023, "objective/rlhf_reward": -7.175502777099609, "objective/scores": -0.5, "policy/approxkl_avg": 27.019851684570312, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.5464330911636353, "step": 776, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.014585018157959 }, { "episode": 12448, "epoch": 0.22374806772836756, "loss/policy_avg": 0.19038286805152893, "lr": 2.85103527607362e-06, "objective/entropy": 99.66128540039062, "objective/kl": 11.428020477294922, "objective/non_score_reward": -1.142802119255066, "objective/rlhf_reward": -4.1712084174156185, "objective/scores": 0.1, "policy/approxkl_avg": 27.72539520263672, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.533401608467102, "step": 777, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9978110790252686 }, { "episode": 12464, "epoch": 0.2240356616457562, "loss/policy_avg": 0.31564778089523315, "lr": 2.8508435582822086e-06, "objective/entropy": 43.836891174316406, "objective/kl": 15.314764022827148, "objective/non_score_reward": -1.5314764976501465, "objective/rlhf_reward": -1.7259061098098751, "objective/scores": 1.1, "policy/approxkl_avg": 83.3494644165039, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7181559205055237, "step": 778, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9969323873519897 }, { "episode": 12480, "epoch": 0.22432325556314484, "loss/policy_avg": 0.15969273447990417, "lr": 2.8506518404907974e-06, "objective/entropy": 62.45077896118164, "objective/kl": 15.108039855957031, "objective/non_score_reward": -1.5108040571212769, "objective/rlhf_reward": -5.643216168880462, "objective/scores": 0.1, "policy/approxkl_avg": 79.81277465820312, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6362130641937256, "step": 779, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9979162216186523 }, { "episode": 12496, "epoch": 0.22461084948053348, "loss/policy_avg": -0.1570049226284027, "lr": 2.8504601226993867e-06, "objective/entropy": 130.28684997558594, "objective/kl": 15.398730278015137, "objective/non_score_reward": -1.5398731231689453, "objective/rlhf_reward": -3.759492194652557, "objective/scores": 0.6, "policy/approxkl_avg": 36.37596893310547, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5687062740325928, "step": 780, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0024261474609375 }, { "episode": 12512, "epoch": 0.22489844339792214, "loss/policy_avg": 0.1517828404903412, "lr": 2.8502684049079755e-06, "objective/entropy": 66.80693817138672, "objective/kl": 7.023012161254883, "objective/non_score_reward": -0.7023012042045593, "objective/rlhf_reward": -1.1473453245764835, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 70.58222961425781, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5671324729919434, "step": 781, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999403953552246 }, { "episode": 12528, "epoch": 0.22518603731531078, "loss/policy_avg": 0.3743288516998291, "lr": 2.8500766871165647e-06, "objective/entropy": 37.623748779296875, "objective/kl": 9.943754196166992, "objective/non_score_reward": -0.994375467300415, "objective/rlhf_reward": -1.5775016754865643, "objective/scores": 0.6, "policy/approxkl_avg": 14.220436096191406, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5091394186019897, "step": 782, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9982104301452637 }, { "episode": 12544, "epoch": 0.22547363123269942, "loss/policy_avg": 0.4069502055644989, "lr": 2.8498849693251535e-06, "objective/entropy": 184.85443115234375, "objective/kl": 14.495233535766602, "objective/non_score_reward": -1.4495233297348022, "objective/rlhf_reward": -7.798093795776367, "objective/scores": -0.5, "policy/approxkl_avg": 5.534117221832275, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6540646553039551, "step": 783, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.999969720840454 }, { "episode": 12560, "epoch": 0.22576122515008806, "loss/policy_avg": -0.15900185704231262, "lr": 2.8496932515337423e-06, "objective/entropy": -7.934391021728516, "objective/kl": 14.886871337890625, "objective/non_score_reward": -1.4886871576309204, "objective/rlhf_reward": -7.954748630523682, "objective/scores": -0.5, "policy/approxkl_avg": 22.033672332763672, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6111799478530884, "step": 784, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0002901554107666 }, { "episode": 12576, "epoch": 0.22604881906747673, "loss/policy_avg": 0.05465098097920418, "lr": 2.8495015337423316e-06, "objective/entropy": 184.02117919921875, "objective/kl": 11.928532600402832, "objective/non_score_reward": -1.192853331565857, "objective/rlhf_reward": -4.371413117647171, "objective/scores": 0.1, "policy/approxkl_avg": 45.86432647705078, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8463940620422363, "step": 785, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000133752822876 }, { "episode": 12592, "epoch": 0.22633641298486537, "loss/policy_avg": 0.3418072462081909, "lr": 2.8493098159509204e-06, "objective/entropy": -23.809371948242188, "objective/kl": 15.163887023925781, "objective/non_score_reward": -1.5163884162902832, "objective/rlhf_reward": -8.065553665161133, "objective/scores": -0.5, "policy/approxkl_avg": 78.87910461425781, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8011717796325684, "step": 786, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.99837064743042 }, { "episode": 12608, "epoch": 0.226624006902254, "loss/policy_avg": -0.3748244047164917, "lr": 2.8491180981595096e-06, "objective/entropy": 126.95550537109375, "objective/kl": 14.842533111572266, "objective/non_score_reward": -1.4842532873153687, "objective/rlhf_reward": -5.537013149261474, "objective/scores": 0.1, "policy/approxkl_avg": 80.93215942382812, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6581273674964905, "step": 787, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0006587505340576 }, { "episode": 12624, "epoch": 0.22691160081964268, "loss/policy_avg": 0.230320543050766, "lr": 2.8489263803680984e-06, "objective/entropy": 95.23011779785156, "objective/kl": 14.606027603149414, "objective/non_score_reward": -1.4606029987335205, "objective/rlhf_reward": -4.238291654650288, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 115.39529418945312, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.728987455368042, "step": 788, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9998013973236084 }, { "episode": 12640, "epoch": 0.22719919473703132, "loss/policy_avg": -0.3885463774204254, "lr": 2.8487346625766872e-06, "objective/entropy": 97.80613708496094, "objective/kl": 11.48002815246582, "objective/non_score_reward": -1.1480028629302979, "objective/rlhf_reward": -6.592011451721191, "objective/scores": -0.5, "policy/approxkl_avg": 59.53124237060547, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.594038724899292, "step": 789, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.010626792907715 }, { "episode": 12656, "epoch": 0.22748678865441996, "loss/policy_avg": 0.3055153489112854, "lr": 2.848542944785276e-06, "objective/entropy": 199.88526916503906, "objective/kl": 13.171801567077637, "objective/non_score_reward": -1.3171800374984741, "objective/rlhf_reward": -3.6068607918625935, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 85.62678527832031, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5295522212982178, "step": 790, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9962729215621948 }, { "episode": 12672, "epoch": 0.2277743825718086, "loss/policy_avg": 0.18343961238861084, "lr": 2.848351226993865e-06, "objective/entropy": 212.48171997070312, "objective/kl": 14.552225112915039, "objective/non_score_reward": -1.4552226066589355, "objective/rlhf_reward": -7.820890426635742, "objective/scores": -0.5, "policy/approxkl_avg": 26.826194763183594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7555446028709412, "step": 791, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9981155395507812 }, { "episode": 12688, "epoch": 0.22806197648919727, "loss/policy_avg": 1.651806116104126, "lr": 2.848159509202454e-06, "objective/entropy": 22.058094024658203, "objective/kl": 12.990779876708984, "objective/non_score_reward": -1.2990779876708984, "objective/rlhf_reward": -4.796312069892883, "objective/scores": 0.1, "policy/approxkl_avg": 81.44281005859375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7748013734817505, "step": 792, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999983549118042 }, { "episode": 12704, "epoch": 0.2283495704065859, "loss/policy_avg": 0.020992066711187363, "lr": 2.847967791411043e-06, "objective/entropy": 100.20834350585938, "objective/kl": 20.087814331054688, "objective/non_score_reward": -2.0087814331054688, "objective/rlhf_reward": -6.373266016662704, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 137.36184692382812, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6344403028488159, "step": 793, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9986518621444702 }, { "episode": 12720, "epoch": 0.22863716432397455, "loss/policy_avg": 0.06677938997745514, "lr": 2.8477760736196317e-06, "objective/entropy": 7.002399444580078, "objective/kl": 10.064220428466797, "objective/non_score_reward": -1.0064222812652588, "objective/rlhf_reward": -3.62568869292736, "objective/scores": 0.1, "policy/approxkl_avg": 43.89893341064453, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5471144318580627, "step": 794, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9984047412872314 }, { "episode": 12736, "epoch": 0.22892475824136319, "loss/policy_avg": -0.07828734815120697, "lr": 2.847584355828221e-06, "objective/entropy": -104.39112854003906, "objective/kl": 18.24449920654297, "objective/non_score_reward": -1.824450135231018, "objective/rlhf_reward": -9.297800064086914, "objective/scores": -0.5, "policy/approxkl_avg": 33.5714225769043, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6325054168701172, "step": 795, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9970802068710327 }, { "episode": 12752, "epoch": 0.22921235215875185, "loss/policy_avg": 0.28975850343704224, "lr": 2.8473926380368097e-06, "objective/entropy": 133.83016967773438, "objective/kl": 8.593679428100586, "objective/non_score_reward": -0.8593680262565613, "objective/rlhf_reward": -5.437472343444824, "objective/scores": -0.5, "policy/approxkl_avg": 8.80305004119873, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.729377269744873, "step": 796, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9978275299072266 }, { "episode": 12768, "epoch": 0.2294999460761405, "loss/policy_avg": 1.422573208808899, "lr": 2.847200920245399e-06, "objective/entropy": 149.96119689941406, "objective/kl": 16.894641876220703, "objective/non_score_reward": -1.6894640922546387, "objective/rlhf_reward": -6.357856726646423, "objective/scores": 0.1, "policy/approxkl_avg": 230.54080200195312, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8895937204360962, "step": 797, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0016024112701416 }, { "episode": 12784, "epoch": 0.22978753999352913, "loss/policy_avg": 0.19597771763801575, "lr": 2.8470092024539878e-06, "objective/entropy": 95.1851577758789, "objective/kl": 12.648846626281738, "objective/non_score_reward": -1.2648844718933105, "objective/rlhf_reward": -7.059537887573242, "objective/scores": -0.5, "policy/approxkl_avg": 10.24577522277832, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7796587944030762, "step": 798, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000627279281616 }, { "episode": 12800, "epoch": 0.23007513391091777, "loss/policy_avg": 0.13448825478553772, "lr": 2.8468174846625766e-06, "objective/entropy": 102.28286743164062, "objective/kl": 13.572122573852539, "objective/non_score_reward": -1.3572125434875488, "objective/rlhf_reward": -1.0288498461246487, "objective/scores": 1.1, "policy/approxkl_avg": 100.61666870117188, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6195476055145264, "step": 799, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9980723857879639 }, { "episode": 12816, "epoch": 0.23036272782830644, "loss/policy_avg": 0.6187319159507751, "lr": 2.846625766871166e-06, "objective/entropy": -39.129539489746094, "objective/kl": 18.39947509765625, "objective/non_score_reward": -1.8399477005004883, "objective/rlhf_reward": -2.9597911596298214, "objective/scores": 1.1, "policy/approxkl_avg": 77.12504577636719, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5955438613891602, "step": 800, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.001791000366211 }, { "episode": 12832, "epoch": 0.23065032174569508, "loss/policy_avg": 0.3273153305053711, "lr": 2.8464340490797546e-06, "objective/entropy": 13.417747497558594, "objective/kl": 7.873808860778809, "objective/non_score_reward": -0.7873809337615967, "objective/rlhf_reward": -1.324694927009653, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 17.407360076904297, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5497540235519409, "step": 801, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9995551109313965 }, { "episode": 12848, "epoch": 0.23093791566308372, "loss/policy_avg": 0.3061869144439697, "lr": 2.8462423312883434e-06, "objective/entropy": -106.52912902832031, "objective/kl": 10.210708618164062, "objective/non_score_reward": -1.0210708379745483, "objective/rlhf_reward": -6.084282875061035, "objective/scores": -0.5, "policy/approxkl_avg": 96.2752456665039, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4860496520996094, "step": 802, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9976447820663452 }, { "episode": 12864, "epoch": 0.23122550958047236, "loss/policy_avg": 0.1686146855354309, "lr": 2.8460506134969327e-06, "objective/entropy": 29.669536590576172, "objective/kl": 18.03110122680664, "objective/non_score_reward": -1.803110122680664, "objective/rlhf_reward": -4.288721655250761, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 38.228782653808594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8174216151237488, "step": 803, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9982223510742188 }, { "episode": 12880, "epoch": 0.23151310349786103, "loss/policy_avg": 0.25656819343566895, "lr": 2.8458588957055215e-06, "objective/entropy": -39.153350830078125, "objective/kl": 9.698554039001465, "objective/non_score_reward": -0.9698554277420044, "objective/rlhf_reward": 0.5205781698226932, "objective/scores": 1.1, "policy/approxkl_avg": 9.279545783996582, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4837508201599121, "step": 804, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9997928142547607 }, { "episode": 12896, "epoch": 0.23180069741524967, "loss/policy_avg": 0.08962078392505646, "lr": 2.8456671779141107e-06, "objective/entropy": 91.72572326660156, "objective/kl": 9.199845314025879, "objective/non_score_reward": -0.9199845790863037, "objective/rlhf_reward": -1.9466050426165262, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 36.79070281982422, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7538923025131226, "step": 805, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9967260360717773 }, { "episode": 12912, "epoch": 0.2320882913326383, "loss/policy_avg": 0.2165328860282898, "lr": 2.8454754601226995e-06, "objective/entropy": 141.5177001953125, "objective/kl": 16.128833770751953, "objective/non_score_reward": -1.61288321018219, "objective/rlhf_reward": -4.328826548830543, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 8.008130073547363, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.76326584815979, "step": 806, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9970195293426514 }, { "episode": 12928, "epoch": 0.23237588525002698, "loss/policy_avg": 0.33014577627182007, "lr": 2.8452837423312883e-06, "objective/entropy": -7.563770294189453, "objective/kl": 15.92538070678711, "objective/non_score_reward": -1.5925382375717163, "objective/rlhf_reward": -8.370153427124023, "objective/scores": -0.5, "policy/approxkl_avg": 122.01593017578125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6476025581359863, "step": 807, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9975717067718506 }, { "episode": 12944, "epoch": 0.23266347916741562, "loss/policy_avg": 0.2228621542453766, "lr": 2.8450920245398776e-06, "objective/entropy": -146.32801818847656, "objective/kl": 14.984716415405273, "objective/non_score_reward": -1.498471736907959, "objective/rlhf_reward": -7.993886947631836, "objective/scores": -0.5, "policy/approxkl_avg": 8.571311950683594, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6879932880401611, "step": 808, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9982060194015503 }, { "episode": 12960, "epoch": 0.23295107308480426, "loss/policy_avg": -0.1637599915266037, "lr": 2.8449003067484664e-06, "objective/entropy": -171.0908203125, "objective/kl": 4.425614833831787, "objective/non_score_reward": -0.44256141781806946, "objective/rlhf_reward": -1.370245734602213, "objective/scores": 0.1, "policy/approxkl_avg": 7.86245059967041, "policy/clipfrac_avg": 2.0, "policy/entropy_avg": 0.36712339520454407, "step": 809, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001098871231079 }, { "episode": 12976, "epoch": 0.2332386670021929, "loss/policy_avg": 0.16715390980243683, "lr": 2.8447085889570556e-06, "objective/entropy": -126.9478530883789, "objective/kl": 13.796289443969727, "objective/non_score_reward": -1.379629135131836, "objective/rlhf_reward": -5.118516108393669, "objective/scores": 0.1, "policy/approxkl_avg": 139.91973876953125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6227424144744873, "step": 810, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9997196197509766 }, { "episode": 12992, "epoch": 0.23352626091958156, "loss/policy_avg": 0.8460204601287842, "lr": 2.8445168711656444e-06, "objective/entropy": -108.99869537353516, "objective/kl": 14.032926559448242, "objective/non_score_reward": -1.4032926559448242, "objective/rlhf_reward": -3.2131706982851025, "objective/scores": 0.6, "policy/approxkl_avg": 148.4869384765625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5060499906539917, "step": 811, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9975128173828125 }, { "episode": 13008, "epoch": 0.2338138548369702, "loss/policy_avg": 0.5107466578483582, "lr": 2.8443251533742332e-06, "objective/entropy": 26.51531219482422, "objective/kl": 12.1517333984375, "objective/non_score_reward": -1.215173363685608, "objective/rlhf_reward": -2.4606932833790776, "objective/scores": 0.6, "policy/approxkl_avg": 44.8768310546875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6984570622444153, "step": 812, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9982036352157593 }, { "episode": 13024, "epoch": 0.23410144875435884, "loss/policy_avg": 0.5176931619644165, "lr": 2.844133435582822e-06, "objective/entropy": 87.28473663330078, "objective/kl": 11.600730895996094, "objective/non_score_reward": -1.160073161125183, "objective/rlhf_reward": -4.240292406082153, "objective/scores": 0.1, "policy/approxkl_avg": 86.09700012207031, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5015227794647217, "step": 813, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9994313716888428 }, { "episode": 13040, "epoch": 0.23438904267174748, "loss/policy_avg": 0.2525648772716522, "lr": 2.843941717791411e-06, "objective/entropy": -90.09027099609375, "objective/kl": 17.53409767150879, "objective/non_score_reward": -1.7534098625183105, "objective/rlhf_reward": -5.188810522827218, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 133.2302703857422, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6975289583206177, "step": 814, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.996279239654541 }, { "episode": 13056, "epoch": 0.23467663658913615, "loss/policy_avg": 0.0005577714182436466, "lr": 2.84375e-06, "objective/entropy": 87.36531829833984, "objective/kl": 8.974178314208984, "objective/non_score_reward": -0.8974178433418274, "objective/rlhf_reward": -3.189671283960342, "objective/scores": 0.1, "policy/approxkl_avg": 1.3939369916915894, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5556995868682861, "step": 815, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0007152557373047 }, { "episode": 13072, "epoch": 0.2349642305065248, "loss/policy_avg": -0.10415857285261154, "lr": 2.843558282208589e-06, "objective/entropy": 281.04083251953125, "objective/kl": 10.391765594482422, "objective/non_score_reward": -1.0391765832901, "objective/rlhf_reward": -3.7567061990499493, "objective/scores": 0.1, "policy/approxkl_avg": 10.441307067871094, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.8066496849060059, "step": 816, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.001943588256836 }, { "episode": 13088, "epoch": 0.23525182442391343, "loss/policy_avg": 0.2472541332244873, "lr": 2.8433665644171777e-06, "objective/entropy": 124.40581512451172, "objective/kl": 11.622451782226562, "objective/non_score_reward": -1.1622451543807983, "objective/rlhf_reward": -2.248980677127838, "objective/scores": 0.6, "policy/approxkl_avg": 6.343780517578125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5752452611923218, "step": 817, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999915599822998 }, { "episode": 13104, "epoch": 0.23553941834130207, "loss/policy_avg": 0.2929553687572479, "lr": 2.843174846625767e-06, "objective/entropy": -18.095306396484375, "objective/kl": 10.766685485839844, "objective/non_score_reward": -1.076668620109558, "objective/rlhf_reward": -6.306674480438232, "objective/scores": -0.5, "policy/approxkl_avg": 29.248212814331055, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.44631391763687134, "step": 818, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9986329078674316 }, { "episode": 13120, "epoch": 0.23582701225869074, "loss/policy_avg": 0.2811235189437866, "lr": 2.8429831288343558e-06, "objective/entropy": -93.74215698242188, "objective/kl": 10.5712890625, "objective/non_score_reward": -1.05712890625, "objective/rlhf_reward": -3.8285156697034832, "objective/scores": 0.1, "policy/approxkl_avg": 14.029077529907227, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6386555433273315, "step": 819, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0000128746032715 }, { "episode": 13136, "epoch": 0.23611460617607938, "loss/policy_avg": 0.321929007768631, "lr": 2.842791411042945e-06, "objective/entropy": -222.2208251953125, "objective/kl": 10.463525772094727, "objective/non_score_reward": -1.0463526248931885, "objective/rlhf_reward": -3.7854103505611416, "objective/scores": 0.1, "policy/approxkl_avg": 34.59898376464844, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7040956616401672, "step": 820, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0001721382141113 }, { "episode": 13152, "epoch": 0.23640220009346802, "loss/policy_avg": 0.2252596765756607, "lr": 2.842599693251534e-06, "objective/entropy": -5.484672546386719, "objective/kl": 7.207294940948486, "objective/non_score_reward": -0.7207294702529907, "objective/rlhf_reward": 0.04080113327386714, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 4.542896270751953, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5736124515533447, "step": 821, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999035120010376 }, { "episode": 13168, "epoch": 0.23668979401085666, "loss/policy_avg": 0.17349383234977722, "lr": 2.8424079754601226e-06, "objective/entropy": 260.90020751953125, "objective/kl": 13.826977729797363, "objective/non_score_reward": -1.3826978206634521, "objective/rlhf_reward": -7.530791282653809, "objective/scores": -0.5, "policy/approxkl_avg": 39.94245529174805, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8797262907028198, "step": 822, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9994349479675293 }, { "episode": 13184, "epoch": 0.23697738792824533, "loss/policy_avg": -0.0034087272360920906, "lr": 2.842216257668712e-06, "objective/entropy": -51.63107681274414, "objective/kl": 10.85478401184082, "objective/non_score_reward": -1.0854783058166504, "objective/rlhf_reward": 0.058087015151977894, "objective/scores": 1.1, "policy/approxkl_avg": 9.927780151367188, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6021898984909058, "step": 823, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.001553535461426 }, { "episode": 13200, "epoch": 0.23726498184563397, "loss/policy_avg": 0.42283856868743896, "lr": 2.8420245398773006e-06, "objective/entropy": 200.68020629882812, "objective/kl": 10.381429672241211, "objective/non_score_reward": -1.0381429195404053, "objective/rlhf_reward": -6.152571678161621, "objective/scores": -0.5, "policy/approxkl_avg": 74.65203857421875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7309496998786926, "step": 824, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9966254234313965 }, { "episode": 13216, "epoch": 0.2375525757630226, "loss/policy_avg": -0.09755183756351471, "lr": 2.84183282208589e-06, "objective/entropy": -69.01538848876953, "objective/kl": 5.6848883628845215, "objective/non_score_reward": -0.5684888362884521, "objective/rlhf_reward": 2.1260446399450306, "objective/scores": 1.1, "policy/approxkl_avg": 1.0148842334747314, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.44518405199050903, "step": 825, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0009875297546387 }, { "episode": 13232, "epoch": 0.23784016968041127, "loss/policy_avg": -0.25965848565101624, "lr": 2.8416411042944787e-06, "objective/entropy": 171.923828125, "objective/kl": 8.58846664428711, "objective/non_score_reward": -0.8588467836380005, "objective/rlhf_reward": -3.035387037694454, "objective/scores": 0.1, "policy/approxkl_avg": 39.28413391113281, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5065795183181763, "step": 826, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.007476806640625 }, { "episode": 13248, "epoch": 0.2381277635977999, "loss/policy_avg": 0.05290607735514641, "lr": 2.8414493865030675e-06, "objective/entropy": 31.982357025146484, "objective/kl": 14.847391128540039, "objective/non_score_reward": -1.484739065170288, "objective/rlhf_reward": -5.538956558704376, "objective/scores": 0.1, "policy/approxkl_avg": 134.27171325683594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.3722934126853943, "step": 827, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.999661922454834 }, { "episode": 13264, "epoch": 0.23841535751518855, "loss/policy_avg": 0.2654344141483307, "lr": 2.8412576687116567e-06, "objective/entropy": 174.80413818359375, "objective/kl": 10.663211822509766, "objective/non_score_reward": -1.0663211345672607, "objective/rlhf_reward": -3.8652845233678814, "objective/scores": 0.1, "policy/approxkl_avg": 17.65331268310547, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6540871262550354, "step": 828, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0004587173461914 }, { "episode": 13280, "epoch": 0.2387029514325772, "loss/policy_avg": 0.29667001962661743, "lr": 2.8410659509202455e-06, "objective/entropy": -67.4645004272461, "objective/kl": 11.012588500976562, "objective/non_score_reward": -1.1012588739395142, "objective/rlhf_reward": -4.005035495758056, "objective/scores": 0.1, "policy/approxkl_avg": 39.841941833496094, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6608229279518127, "step": 829, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9974405765533447 }, { "episode": 13296, "epoch": 0.23899054534996586, "loss/policy_avg": 0.0870223194360733, "lr": 2.8408742331288343e-06, "objective/entropy": 218.6323699951172, "objective/kl": 13.182598114013672, "objective/non_score_reward": -1.3182597160339355, "objective/rlhf_reward": -3.6111795954114063, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 191.98348999023438, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8216352462768555, "step": 830, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9983137845993042 }, { "episode": 13312, "epoch": 0.2392781392673545, "loss/policy_avg": 0.10631455481052399, "lr": 2.8406825153374236e-06, "objective/entropy": -176.3839569091797, "objective/kl": 10.902888298034668, "objective/non_score_reward": -1.0902888774871826, "objective/rlhf_reward": -6.3611555099487305, "objective/scores": -0.5, "policy/approxkl_avg": 30.94891929626465, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6645264029502869, "step": 831, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9989513158798218 }, { "episode": 13328, "epoch": 0.23956573318474314, "loss/policy_avg": 0.5899176001548767, "lr": 2.8404907975460124e-06, "objective/entropy": 200.20742797851562, "objective/kl": 18.23828125, "objective/non_score_reward": -1.8238282203674316, "objective/rlhf_reward": -9.295312881469727, "objective/scores": -0.5, "policy/approxkl_avg": 94.45753479003906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5188368558883667, "step": 832, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0002236366271973 }, { "episode": 13344, "epoch": 0.23985332710213178, "loss/policy_avg": 0.34776580333709717, "lr": 2.8402990797546016e-06, "objective/entropy": 3.6189041137695312, "objective/kl": 14.347872734069824, "objective/non_score_reward": -1.4347872734069824, "objective/rlhf_reward": -5.339149034023285, "objective/scores": 0.1, "policy/approxkl_avg": 63.92158889770508, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.49274176359176636, "step": 833, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999204158782959 }, { "episode": 13360, "epoch": 0.24014092101952045, "loss/policy_avg": -0.2418670505285263, "lr": 2.84010736196319e-06, "objective/entropy": 307.7858581542969, "objective/kl": 11.471115112304688, "objective/non_score_reward": -1.1471115350723267, "objective/rlhf_reward": -4.188446259498596, "objective/scores": 0.1, "policy/approxkl_avg": 50.762351989746094, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.7961260676383972, "step": 834, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0257229804992676 }, { "episode": 13376, "epoch": 0.2404285149369091, "loss/policy_avg": 0.4314028024673462, "lr": 2.8399156441717792e-06, "objective/entropy": 106.43098449707031, "objective/kl": 12.59414291381836, "objective/non_score_reward": -1.2594143152236938, "objective/rlhf_reward": -0.6376570820808407, "objective/scores": 1.1, "policy/approxkl_avg": 49.60033416748047, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6837184429168701, "step": 835, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000203847885132 }, { "episode": 13392, "epoch": 0.24071610885429773, "loss/policy_avg": -0.4717941880226135, "lr": 2.839723926380368e-06, "objective/entropy": 156.29014587402344, "objective/kl": 13.116241455078125, "objective/non_score_reward": -1.3116241693496704, "objective/rlhf_reward": -3.690237252917841, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 150.56942749023438, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6304831504821777, "step": 836, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.002359390258789 }, { "episode": 13408, "epoch": 0.24100370277168637, "loss/policy_avg": 0.48564639687538147, "lr": 2.839532208588957e-06, "objective/entropy": 25.292335510253906, "objective/kl": 10.876035690307617, "objective/non_score_reward": -1.0876035690307617, "objective/rlhf_reward": -2.403002808766301, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 81.79934692382812, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7601375579833984, "step": 837, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0018410682678223 }, { "episode": 13424, "epoch": 0.24129129668907504, "loss/policy_avg": 0.2227717936038971, "lr": 2.839340490797546e-06, "objective/entropy": 290.1324157714844, "objective/kl": 8.968036651611328, "objective/non_score_reward": -0.896803617477417, "objective/rlhf_reward": -5.587214469909668, "objective/scores": -0.5, "policy/approxkl_avg": 6.772152900695801, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6874889135360718, "step": 838, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9978574514389038 }, { "episode": 13440, "epoch": 0.24157889060646368, "loss/policy_avg": 0.378933846950531, "lr": 2.839148773006135e-06, "objective/entropy": 83.87255096435547, "objective/kl": 16.091838836669922, "objective/non_score_reward": -1.6091837882995605, "objective/rlhf_reward": -8.436735153198242, "objective/scores": -0.5, "policy/approxkl_avg": 171.8421630859375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7673331499099731, "step": 839, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9988526105880737 }, { "episode": 13456, "epoch": 0.24186648452385232, "loss/policy_avg": 0.05077691376209259, "lr": 2.838957055214724e-06, "objective/entropy": 201.8306884765625, "objective/kl": 10.442170143127441, "objective/non_score_reward": -1.0442171096801758, "objective/rlhf_reward": -6.176868438720703, "objective/scores": -0.5, "policy/approxkl_avg": 42.752803802490234, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7712172269821167, "step": 840, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.1516284942626953 }, { "episode": 13472, "epoch": 0.24215407844124096, "loss/policy_avg": 0.6697203516960144, "lr": 2.838765337423313e-06, "objective/entropy": -228.6722869873047, "objective/kl": 12.05379867553711, "objective/non_score_reward": -1.2053799629211426, "objective/rlhf_reward": -4.421519672870636, "objective/scores": 0.1, "policy/approxkl_avg": 23.305578231811523, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6277028918266296, "step": 841, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998668909072876 }, { "episode": 13488, "epoch": 0.24244167235862962, "loss/policy_avg": -0.059078969061374664, "lr": 2.8385736196319018e-06, "objective/entropy": 41.68458557128906, "objective/kl": 9.514412879943848, "objective/non_score_reward": -0.9514412879943848, "objective/rlhf_reward": -3.4057652115821835, "objective/scores": 0.1, "policy/approxkl_avg": 1.8853363990783691, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6527504324913025, "step": 842, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000281810760498 }, { "episode": 13504, "epoch": 0.24272926627601826, "loss/policy_avg": 0.03922227397561073, "lr": 2.838381901840491e-06, "objective/entropy": -126.3974838256836, "objective/kl": 15.604471206665039, "objective/non_score_reward": -1.560447096824646, "objective/rlhf_reward": -5.841788208484649, "objective/scores": 0.1, "policy/approxkl_avg": 42.71725082397461, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5101525187492371, "step": 843, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998733401298523 }, { "episode": 13520, "epoch": 0.2430168601934069, "loss/policy_avg": -0.07494255900382996, "lr": 2.83819018404908e-06, "objective/entropy": -109.69844055175781, "objective/kl": 12.058096885681152, "objective/non_score_reward": -1.2058095932006836, "objective/rlhf_reward": -4.423238492012024, "objective/scores": 0.1, "policy/approxkl_avg": 71.72029876708984, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6301126480102539, "step": 844, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 2.0005083084106445 }, { "episode": 13536, "epoch": 0.24330445411079557, "loss/policy_avg": -0.0021638330072164536, "lr": 2.8379984662576686e-06, "objective/entropy": -55.14314651489258, "objective/kl": 16.980009078979492, "objective/non_score_reward": -1.6980011463165283, "objective/rlhf_reward": -6.392004287242889, "objective/scores": 0.1, "policy/approxkl_avg": 58.57079315185547, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5378575921058655, "step": 845, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9998762607574463 }, { "episode": 13552, "epoch": 0.2435920480281842, "loss/policy_avg": 0.18776272237300873, "lr": 2.837806748466258e-06, "objective/entropy": 213.31375122070312, "objective/kl": 13.382487297058105, "objective/non_score_reward": -1.3382488489151, "objective/rlhf_reward": -0.9529952764511105, "objective/scores": 1.1, "policy/approxkl_avg": 47.32990646362305, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5546428561210632, "step": 846, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.999443769454956 }, { "episode": 13568, "epoch": 0.24387964194557285, "loss/policy_avg": -0.10050228238105774, "lr": 2.8376150306748467e-06, "objective/entropy": -1.5470504760742188, "objective/kl": 5.421267509460449, "objective/non_score_reward": -0.5421267151832581, "objective/rlhf_reward": -1.768506808578968, "objective/scores": 0.1, "policy/approxkl_avg": 13.883844375610352, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.690306544303894, "step": 847, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0045018196105957 }, { "episode": 13584, "epoch": 0.2441672358629615, "loss/policy_avg": 0.14597558975219727, "lr": 2.837423312883436e-06, "objective/entropy": -17.603618621826172, "objective/kl": 11.631009101867676, "objective/non_score_reward": -1.1631009578704834, "objective/rlhf_reward": -4.252403473854065, "objective/scores": 0.1, "policy/approxkl_avg": 40.45585250854492, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5793694257736206, "step": 848, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.997261643409729 }, { "episode": 13600, "epoch": 0.24445482978035016, "loss/policy_avg": 0.3591553866863251, "lr": 2.8372315950920247e-06, "objective/entropy": 140.2003173828125, "objective/kl": 16.13811492919922, "objective/non_score_reward": -1.613811731338501, "objective/rlhf_reward": -6.055246709287166, "objective/scores": 0.1, "policy/approxkl_avg": 67.97541809082031, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7342993021011353, "step": 849, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9992564916610718 }, { "episode": 13616, "epoch": 0.2447424236977388, "loss/policy_avg": 0.30371835827827454, "lr": 2.8370398773006135e-06, "objective/entropy": 231.95297241210938, "objective/kl": 17.194393157958984, "objective/non_score_reward": -1.7194395065307617, "objective/rlhf_reward": -4.755051466003929, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 10.407341003417969, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5657638311386108, "step": 850, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0007481575012207 }, { "episode": 13632, "epoch": 0.24503001761512744, "loss/policy_avg": 0.5038444399833679, "lr": 2.8368481595092027e-06, "objective/entropy": 60.03744125366211, "objective/kl": 14.280261993408203, "objective/non_score_reward": -1.4280261993408203, "objective/rlhf_reward": -3.764693687634404, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 37.566680908203125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.611575186252594, "step": 851, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9986273050308228 }, { "episode": 13648, "epoch": 0.24531761153251608, "loss/policy_avg": 0.25327011942863464, "lr": 2.8366564417177915e-06, "objective/entropy": 80.94923400878906, "objective/kl": 13.566845893859863, "objective/non_score_reward": -1.356684684753418, "objective/rlhf_reward": -1.0267389029264447, "objective/scores": 1.1, "policy/approxkl_avg": 30.468521118164062, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.743263840675354, "step": 852, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.002877712249756 }, { "episode": 13664, "epoch": 0.24560520544990475, "loss/policy_avg": 0.7088375091552734, "lr": 2.8364647239263804e-06, "objective/entropy": -19.6234130859375, "objective/kl": 18.899595260620117, "objective/non_score_reward": -1.889959454536438, "objective/rlhf_reward": -5.735009069713662, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 87.53837585449219, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6533883213996887, "step": 853, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9973728656768799 }, { "episode": 13680, "epoch": 0.24589279936729339, "loss/policy_avg": 0.21842418611049652, "lr": 2.8362730061349696e-06, "objective/entropy": -2.3630218505859375, "objective/kl": 10.437822341918945, "objective/non_score_reward": -1.043782353401184, "objective/rlhf_reward": -1.775129473209381, "objective/scores": 0.6, "policy/approxkl_avg": 25.123273849487305, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4626666307449341, "step": 854, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.997999906539917 }, { "episode": 13696, "epoch": 0.24618039328468203, "loss/policy_avg": 0.30842268466949463, "lr": 2.8360812883435584e-06, "objective/entropy": 73.10386657714844, "objective/kl": 14.02588176727295, "objective/non_score_reward": -1.402587890625, "objective/rlhf_reward": -7.6103515625, "objective/scores": -0.5, "policy/approxkl_avg": 44.31865692138672, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.47557830810546875, "step": 855, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000145435333252 }, { "episode": 13712, "epoch": 0.24646798720207067, "loss/policy_avg": 0.1986391693353653, "lr": 2.835889570552147e-06, "objective/entropy": 41.59130096435547, "objective/kl": 11.414254188537598, "objective/non_score_reward": -1.141425371170044, "objective/rlhf_reward": -6.565701484680176, "objective/scores": -0.5, "policy/approxkl_avg": 57.20241165161133, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5653365850448608, "step": 856, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.001561164855957 }, { "episode": 13728, "epoch": 0.24675558111945933, "loss/policy_avg": -0.09086053818464279, "lr": 2.835697852760736e-06, "objective/entropy": 129.108154296875, "objective/kl": 11.600625038146973, "objective/non_score_reward": -1.160062551498413, "objective/rlhf_reward": -4.240250265598297, "objective/scores": 0.1, "policy/approxkl_avg": 43.625953674316406, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.8041683435440063, "step": 857, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.001258373260498 }, { "episode": 13744, "epoch": 0.24704317503684797, "loss/policy_avg": 0.4292464852333069, "lr": 2.8355061349693253e-06, "objective/entropy": -7.69146728515625, "objective/kl": 8.978507995605469, "objective/non_score_reward": -0.8978508710861206, "objective/rlhf_reward": -0.6676843806516852, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 29.219491958618164, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.43980222940444946, "step": 858, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9975333213806152 }, { "episode": 13760, "epoch": 0.2473307689542366, "loss/policy_avg": -0.1740269660949707, "lr": 2.835314417177914e-06, "objective/entropy": 199.1434783935547, "objective/kl": 10.360536575317383, "objective/non_score_reward": -1.0360536575317383, "objective/rlhf_reward": -3.74421471953392, "objective/scores": 0.1, "policy/approxkl_avg": 45.653465270996094, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.46243250370025635, "step": 859, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.001521110534668 }, { "episode": 13776, "epoch": 0.24761836287162525, "loss/policy_avg": 0.11918877065181732, "lr": 2.835122699386503e-06, "objective/entropy": 72.46167755126953, "objective/kl": 14.164287567138672, "objective/non_score_reward": -1.416428565979004, "objective/rlhf_reward": -7.665714263916016, "objective/scores": -0.5, "policy/approxkl_avg": 134.235107421875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4980497360229492, "step": 860, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9996706247329712 }, { "episode": 13792, "epoch": 0.24790595678901392, "loss/policy_avg": 0.05426352471113205, "lr": 2.834930981595092e-06, "objective/entropy": 4.267814636230469, "objective/kl": 14.633834838867188, "objective/non_score_reward": -1.463383436203003, "objective/rlhf_reward": -1.453533565998077, "objective/scores": 1.1, "policy/approxkl_avg": 25.47060203552246, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.566750168800354, "step": 861, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998802900314331 }, { "episode": 13808, "epoch": 0.24819355070640256, "loss/policy_avg": 0.5575066804885864, "lr": 2.834739263803681e-06, "objective/entropy": 206.50341796875, "objective/kl": 13.636398315429688, "objective/non_score_reward": -1.3636398315429688, "objective/rlhf_reward": -1.0545593261718746, "objective/scores": 1.1, "policy/approxkl_avg": 26.75678062438965, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4932931065559387, "step": 862, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9983261823654175 }, { "episode": 13824, "epoch": 0.2484811446237912, "loss/policy_avg": 0.23257222771644592, "lr": 2.83454754601227e-06, "objective/entropy": -39.31481170654297, "objective/kl": 14.039112091064453, "objective/non_score_reward": -1.4039111137390137, "objective/rlhf_reward": -7.615644454956055, "objective/scores": -0.5, "policy/approxkl_avg": 62.90751647949219, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.643797755241394, "step": 863, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9972407817840576 }, { "episode": 13840, "epoch": 0.24876873854117984, "loss/policy_avg": 0.02133660763502121, "lr": 2.834355828220859e-06, "objective/entropy": -178.11383056640625, "objective/kl": 11.982830047607422, "objective/non_score_reward": -1.1982829570770264, "objective/rlhf_reward": -4.393131679296493, "objective/scores": 0.1, "policy/approxkl_avg": 53.20249938964844, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7372720241546631, "step": 864, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0001306533813477 }, { "episode": 13856, "epoch": 0.2490563324585685, "loss/policy_avg": 0.11362017691135406, "lr": 2.8341641104294478e-06, "objective/entropy": 79.2239990234375, "objective/kl": 13.957067489624023, "objective/non_score_reward": -1.3957067728042603, "objective/rlhf_reward": -5.182827150821685, "objective/scores": 0.1, "policy/approxkl_avg": 67.16444396972656, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6697244048118591, "step": 865, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998111367225647 }, { "episode": 13872, "epoch": 0.24934392637595715, "loss/policy_avg": -0.15431474149227142, "lr": 2.833972392638037e-06, "objective/entropy": 85.38504028320312, "objective/kl": 12.748974800109863, "objective/non_score_reward": -1.274897575378418, "objective/rlhf_reward": -4.699590167403221, "objective/scores": 0.1, "policy/approxkl_avg": 95.32827758789062, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4564756155014038, "step": 866, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0008811950683594 }, { "episode": 13888, "epoch": 0.2496315202933458, "loss/policy_avg": 0.5219398140907288, "lr": 2.833780674846626e-06, "objective/entropy": -68.0516128540039, "objective/kl": 10.130060195922852, "objective/non_score_reward": -1.0130060911178589, "objective/rlhf_reward": -3.6520243942737576, "objective/scores": 0.1, "policy/approxkl_avg": 20.153987884521484, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.576066255569458, "step": 867, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9983216524124146 }, { "episode": 13904, "epoch": 0.24991911421073446, "loss/policy_avg": -0.27920252084732056, "lr": 2.8335889570552146e-06, "objective/entropy": 37.78301239013672, "objective/kl": 8.389419555664062, "objective/non_score_reward": -0.8389419317245483, "objective/rlhf_reward": -5.355767726898193, "objective/scores": -0.5, "policy/approxkl_avg": 48.89885711669922, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.3526889383792877, "step": 868, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0450096130371094 }, { "episode": 13920, "epoch": 0.25020670812812307, "loss/policy_avg": 0.23919862508773804, "lr": 2.833397239263804e-06, "objective/entropy": 96.82152557373047, "objective/kl": 14.15482234954834, "objective/non_score_reward": -1.415482521057129, "objective/rlhf_reward": -5.261929965019226, "objective/scores": 0.1, "policy/approxkl_avg": 81.41618347167969, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7077710628509521, "step": 869, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0002362728118896 }, { "episode": 13936, "epoch": 0.25049430204551176, "loss/policy_avg": -0.008724374696612358, "lr": 2.8332055214723927e-06, "objective/entropy": 5.678382873535156, "objective/kl": 8.6428861618042, "objective/non_score_reward": -0.8642886877059937, "objective/rlhf_reward": -0.5334355577242103, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 18.883581161499023, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6166142225265503, "step": 870, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9976524114608765 }, { "episode": 13952, "epoch": 0.2507818959629004, "loss/policy_avg": 0.5849788188934326, "lr": 2.833013803680982e-06, "objective/entropy": 7.5515899658203125, "objective/kl": 12.30251693725586, "objective/non_score_reward": -1.2302517890930176, "objective/rlhf_reward": -3.259147563547479, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 45.453224182128906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8170486688613892, "step": 871, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998389482498169 }, { "episode": 13968, "epoch": 0.25106948988028904, "loss/policy_avg": 1.0887643098831177, "lr": 2.8328220858895707e-06, "objective/entropy": 19.560287475585938, "objective/kl": 9.816727638244629, "objective/non_score_reward": -0.9816729426383972, "objective/rlhf_reward": -3.526691591739654, "objective/scores": 0.1, "policy/approxkl_avg": 59.10675048828125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4642179012298584, "step": 872, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9975433349609375 }, { "episode": 13984, "epoch": 0.2513570837976777, "loss/policy_avg": 0.27298757433891296, "lr": 2.8326303680981595e-06, "objective/entropy": -61.813533782958984, "objective/kl": 14.266277313232422, "objective/non_score_reward": -1.4266278743743896, "objective/rlhf_reward": -3.3065116763114926, "objective/scores": 0.6, "policy/approxkl_avg": 34.11764907836914, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.49072712659835815, "step": 873, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9989274740219116 }, { "episode": 14000, "epoch": 0.2516446777150663, "loss/policy_avg": 0.22000867128372192, "lr": 2.8324386503067487e-06, "objective/entropy": -82.48310089111328, "objective/kl": 12.360732078552246, "objective/non_score_reward": -1.2360732555389404, "objective/rlhf_reward": -6.944293022155762, "objective/scores": -0.5, "policy/approxkl_avg": 76.64207458496094, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.4945850372314453, "step": 874, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999876856803894 }, { "episode": 14016, "epoch": 0.25193227163245496, "loss/policy_avg": 0.2379000037908554, "lr": 2.8322469325153376e-06, "objective/entropy": 11.343524932861328, "objective/kl": 13.053705215454102, "objective/non_score_reward": -1.305370569229126, "objective/rlhf_reward": -7.221482276916504, "objective/scores": -0.5, "policy/approxkl_avg": 109.89395141601562, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5588850975036621, "step": 875, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9999418258666992 }, { "episode": 14032, "epoch": 0.2522198655498436, "loss/policy_avg": 0.48280632495880127, "lr": 2.8320552147239268e-06, "objective/entropy": 169.83905029296875, "objective/kl": 12.475770950317383, "objective/non_score_reward": -1.2475769519805908, "objective/rlhf_reward": -3.042896787600453, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 133.74017333984375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.40729820728302, "step": 876, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9965707063674927 }, { "episode": 14048, "epoch": 0.25250745946723224, "loss/policy_avg": 0.5907744765281677, "lr": 2.8318634969325156e-06, "objective/entropy": 206.38848876953125, "objective/kl": 10.661521911621094, "objective/non_score_reward": -1.0661522150039673, "objective/rlhf_reward": -3.8646090537309643, "objective/scores": 0.1, "policy/approxkl_avg": 19.813899993896484, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.71628737449646, "step": 877, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9990339279174805 }, { "episode": 14064, "epoch": 0.25279505338462094, "loss/policy_avg": 0.2785950005054474, "lr": 2.8316717791411044e-06, "objective/entropy": 162.6572723388672, "objective/kl": 12.341489791870117, "objective/non_score_reward": -1.2341489791870117, "objective/rlhf_reward": -2.012877260090086, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 85.18682861328125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8230720162391663, "step": 878, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0007762908935547 }, { "episode": 14080, "epoch": 0.2530826473020096, "loss/policy_avg": 0.5439757108688354, "lr": 2.8314800613496932e-06, "objective/entropy": -29.994464874267578, "objective/kl": 9.197661399841309, "objective/non_score_reward": -0.9197661876678467, "objective/rlhf_reward": -3.279064661264419, "objective/scores": 0.1, "policy/approxkl_avg": 27.833585739135742, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.591437816619873, "step": 879, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.997772216796875 }, { "episode": 14096, "epoch": 0.2533702412193982, "loss/policy_avg": 0.41080114245414734, "lr": 2.831288343558282e-06, "objective/entropy": 239.91580200195312, "objective/kl": 9.134315490722656, "objective/non_score_reward": -0.9134315252304077, "objective/rlhf_reward": -5.653726100921631, "objective/scores": -0.5, "policy/approxkl_avg": 22.719074249267578, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5987927913665771, "step": 880, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0004446506500244 }, { "episode": 14112, "epoch": 0.25365783513678686, "loss/policy_avg": 0.6375956535339355, "lr": 2.8310966257668713e-06, "objective/entropy": -158.66209411621094, "objective/kl": 10.0148344039917, "objective/non_score_reward": -1.0014833211898804, "objective/rlhf_reward": -1.082214523793432, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 21.38302230834961, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6316667795181274, "step": 881, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0006909370422363 }, { "episode": 14128, "epoch": 0.2539454290541755, "loss/policy_avg": 0.45256638526916504, "lr": 2.83090490797546e-06, "objective/entropy": 207.23558044433594, "objective/kl": 18.251401901245117, "objective/non_score_reward": -1.8251402378082275, "objective/rlhf_reward": -4.90056095123291, "objective/scores": 0.6, "policy/approxkl_avg": 26.723499298095703, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5192215442657471, "step": 882, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.000243663787842 }, { "episode": 14144, "epoch": 0.25423302297156414, "loss/policy_avg": -0.07051656395196915, "lr": 2.830713190184049e-06, "objective/entropy": 99.65924072265625, "objective/kl": 11.526399612426758, "objective/non_score_reward": -1.1526398658752441, "objective/rlhf_reward": -4.2105597615242, "objective/scores": 0.1, "policy/approxkl_avg": 7.548530101776123, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6270474195480347, "step": 883, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0014915466308594 }, { "episode": 14160, "epoch": 0.2545206168889528, "loss/policy_avg": 0.15961593389511108, "lr": 2.830521472392638e-06, "objective/entropy": -13.243667602539062, "objective/kl": 14.339917182922363, "objective/non_score_reward": -1.43399178981781, "objective/rlhf_reward": -1.3359672188758847, "objective/scores": 1.1, "policy/approxkl_avg": 100.59516906738281, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.570610523223877, "step": 884, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9982887506484985 }, { "episode": 14176, "epoch": 0.2548082108063415, "loss/policy_avg": 0.2716251015663147, "lr": 2.830329754601227e-06, "objective/entropy": 43.207088470458984, "objective/kl": 15.224469184875488, "objective/non_score_reward": -1.522447109222412, "objective/rlhf_reward": -1.6897883176803585, "objective/scores": 1.1, "policy/approxkl_avg": 12.94150161743164, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.45153629779815674, "step": 885, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.001307725906372 }, { "episode": 14192, "epoch": 0.2550958047237301, "loss/policy_avg": 0.19296292960643768, "lr": 2.830138036809816e-06, "objective/entropy": -110.1358642578125, "objective/kl": 8.883868217468262, "objective/non_score_reward": -0.8883869051933289, "objective/rlhf_reward": -5.553547382354736, "objective/scores": -0.5, "policy/approxkl_avg": 7.389582633972168, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.713127613067627, "step": 886, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0009400844573975 }, { "episode": 14208, "epoch": 0.25538339864111875, "loss/policy_avg": -0.08597195148468018, "lr": 2.829946319018405e-06, "objective/entropy": 135.94149780273438, "objective/kl": 12.527217864990234, "objective/non_score_reward": -1.2527216672897339, "objective/rlhf_reward": -0.6108868777751919, "objective/scores": 1.1, "policy/approxkl_avg": 2.5981717109680176, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6131302714347839, "step": 887, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0009326934814453 }, { "episode": 14224, "epoch": 0.2556709925585074, "loss/policy_avg": 0.06909796595573425, "lr": 2.8297546012269938e-06, "objective/entropy": 86.01097869873047, "objective/kl": 10.155022621154785, "objective/non_score_reward": -1.0155022144317627, "objective/rlhf_reward": -1.1382899030458655, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 16.276836395263672, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6549547910690308, "step": 888, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9995278120040894 }, { "episode": 14240, "epoch": 0.25595858647589603, "loss/policy_avg": 0.1944853961467743, "lr": 2.829562883435583e-06, "objective/entropy": 23.008251190185547, "objective/kl": 10.702659606933594, "objective/non_score_reward": -1.0702658891677856, "objective/rlhf_reward": -3.88106365352869, "objective/scores": 0.1, "policy/approxkl_avg": 92.89102935791016, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.557334840297699, "step": 889, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0014572143554688 }, { "episode": 14256, "epoch": 0.2562461803932847, "loss/policy_avg": 0.18296848237514496, "lr": 2.829371165644172e-06, "objective/entropy": 236.90658569335938, "objective/kl": 17.1822566986084, "objective/non_score_reward": -1.7182257175445557, "objective/rlhf_reward": -6.472903227806091, "objective/scores": 0.1, "policy/approxkl_avg": 32.678916931152344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6069942116737366, "step": 890, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0001883506774902 }, { "episode": 14272, "epoch": 0.2565337743106733, "loss/policy_avg": 0.04921949282288551, "lr": 2.829179447852761e-06, "objective/entropy": 108.23074340820312, "objective/kl": 9.07606315612793, "objective/non_score_reward": -0.907606303691864, "objective/rlhf_reward": 0.7695747852325443, "objective/scores": 1.1, "policy/approxkl_avg": 1.1709768772125244, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6781247854232788, "step": 891, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0007190704345703 }, { "episode": 14288, "epoch": 0.25682136822806195, "loss/policy_avg": 0.263433575630188, "lr": 2.82898773006135e-06, "objective/entropy": -11.065872192382812, "objective/kl": 15.533244132995605, "objective/non_score_reward": -1.5533244609832764, "objective/rlhf_reward": -4.090591879860435, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 90.50593566894531, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6565124988555908, "step": 892, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.996107816696167 }, { "episode": 14304, "epoch": 0.25710896214545065, "loss/policy_avg": 0.03790378198027611, "lr": 2.8287960122699387e-06, "objective/entropy": -195.91549682617188, "objective/kl": 4.9871110916137695, "objective/non_score_reward": -0.49871110916137695, "objective/rlhf_reward": -0.047433326916630936, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 2.2991867065429688, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5137837529182434, "step": 893, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9998875856399536 }, { "episode": 14320, "epoch": 0.2573965560628393, "loss/policy_avg": 0.16121526062488556, "lr": 2.828604294478528e-06, "objective/entropy": -126.01262664794922, "objective/kl": 13.482461929321289, "objective/non_score_reward": -1.3482462167739868, "objective/rlhf_reward": -3.5681560590592136, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 6.479033470153809, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5413044691085815, "step": 894, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9979188442230225 }, { "episode": 14336, "epoch": 0.25768414998022793, "loss/policy_avg": 0.03230078145861626, "lr": 2.8284125766871167e-06, "objective/entropy": -3.0018844604492188, "objective/kl": 11.91611385345459, "objective/non_score_reward": -1.191611409187317, "objective/rlhf_reward": -4.366445696353912, "objective/scores": 0.1, "policy/approxkl_avg": 78.21747589111328, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6705281734466553, "step": 895, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9984080791473389 }, { "episode": 14352, "epoch": 0.25797174389761657, "loss/policy_avg": 0.3060767948627472, "lr": 2.8282208588957055e-06, "objective/entropy": 61.28870391845703, "objective/kl": 8.302905082702637, "objective/non_score_reward": -0.8302905559539795, "objective/rlhf_reward": -2.921162268519401, "objective/scores": 0.1, "policy/approxkl_avg": 5.191162586212158, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.553315281867981, "step": 896, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.997650384902954 }, { "episode": 14368, "epoch": 0.2582593378150052, "loss/policy_avg": 0.17983271181583405, "lr": 2.8280291411042947e-06, "objective/entropy": -20.331504821777344, "objective/kl": 17.491222381591797, "objective/non_score_reward": -1.7491222620010376, "objective/rlhf_reward": -8.996489524841309, "objective/scores": -0.5, "policy/approxkl_avg": 119.10844421386719, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5886894464492798, "step": 897, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9989356994628906 }, { "episode": 14384, "epoch": 0.25854693173239385, "loss/policy_avg": 0.1394512951374054, "lr": 2.8278374233128836e-06, "objective/entropy": 127.6997299194336, "objective/kl": 15.690263748168945, "objective/non_score_reward": -1.5690264701843262, "objective/rlhf_reward": -8.276105880737305, "objective/scores": -0.5, "policy/approxkl_avg": 46.133121490478516, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6130183935165405, "step": 898, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.997514247894287 }, { "episode": 14400, "epoch": 0.2588345256497825, "loss/policy_avg": 0.2098160684108734, "lr": 2.827645705521473e-06, "objective/entropy": -21.335540771484375, "objective/kl": 10.539255142211914, "objective/non_score_reward": -1.0539255142211914, "objective/rlhf_reward": -3.815702205896377, "objective/scores": 0.1, "policy/approxkl_avg": 31.069839477539062, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7244166731834412, "step": 899, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9994945526123047 }, { "episode": 14416, "epoch": 0.2591221195671711, "loss/policy_avg": 0.642552375793457, "lr": 2.8274539877300616e-06, "objective/entropy": -27.118850708007812, "objective/kl": 15.130813598632812, "objective/non_score_reward": -1.5130811929702759, "objective/rlhf_reward": -3.652324831485748, "objective/scores": 0.6, "policy/approxkl_avg": 107.5416259765625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6859394311904907, "step": 900, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.00144100189209 }, { "episode": 14432, "epoch": 0.2594097134845598, "loss/policy_avg": 0.5634386539459229, "lr": 2.8272622699386504e-06, "objective/entropy": -81.20954895019531, "objective/kl": 9.424870491027832, "objective/non_score_reward": -0.9424870610237122, "objective/rlhf_reward": -1.647241981998954, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 161.22662353515625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.49134987592697144, "step": 901, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9995436668395996 }, { "episode": 14448, "epoch": 0.25969730740194846, "loss/policy_avg": 0.12854906916618347, "lr": 2.8270705521472392e-06, "objective/entropy": 49.876365661621094, "objective/kl": 13.830504417419434, "objective/non_score_reward": -1.383050560951233, "objective/rlhf_reward": -7.532202243804932, "objective/scores": -0.5, "policy/approxkl_avg": 109.16975402832031, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6257975697517395, "step": 902, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9994562864303589 }, { "episode": 14464, "epoch": 0.2599849013193371, "loss/policy_avg": -0.2561631202697754, "lr": 2.826878834355828e-06, "objective/entropy": -208.84307861328125, "objective/kl": 16.52663803100586, "objective/non_score_reward": -1.6526635885238647, "objective/rlhf_reward": -4.210654458403587, "objective/scores": 0.6, "policy/approxkl_avg": 27.665565490722656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7057524919509888, "step": 903, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998441457748413 }, { "episode": 14480, "epoch": 0.26027249523672574, "loss/policy_avg": 0.011652922257781029, "lr": 2.8266871165644173e-06, "objective/entropy": 39.806941986083984, "objective/kl": 8.505756378173828, "objective/non_score_reward": -0.8505756855010986, "objective/rlhf_reward": -3.0023026227951046, "objective/scores": 0.1, "policy/approxkl_avg": 97.56808471679688, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.49984896183013916, "step": 904, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9997618198394775 }, { "episode": 14496, "epoch": 0.2605600891541144, "loss/policy_avg": 0.3299306035041809, "lr": 2.826495398773006e-06, "objective/entropy": 186.02059936523438, "objective/kl": 12.209084510803223, "objective/non_score_reward": -1.2209084033966064, "objective/rlhf_reward": -2.760927232281242, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 42.59535217285156, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8104973435401917, "step": 905, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0011796951293945 }, { "episode": 14512, "epoch": 0.260847683071503, "loss/policy_avg": 0.4473887085914612, "lr": 2.826303680981595e-06, "objective/entropy": 236.0746307373047, "objective/kl": 15.661829948425293, "objective/non_score_reward": -1.5661829710006714, "objective/rlhf_reward": -8.264732360839844, "objective/scores": -0.5, "policy/approxkl_avg": 174.1883087158203, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6012299656867981, "step": 906, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.999688982963562 }, { "episode": 14528, "epoch": 0.26113527698889166, "loss/policy_avg": 0.26635417342185974, "lr": 2.826111963190184e-06, "objective/entropy": -117.22915649414062, "objective/kl": 19.672523498535156, "objective/non_score_reward": -1.967252492904663, "objective/rlhf_reward": -9.869009971618652, "objective/scores": -0.5, "policy/approxkl_avg": 142.4681396484375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6905121803283691, "step": 907, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998607873916626 }, { "episode": 14544, "epoch": 0.26142287090628036, "loss/policy_avg": 0.463941752910614, "lr": 2.825920245398773e-06, "objective/entropy": 193.2610626220703, "objective/kl": 11.647161483764648, "objective/non_score_reward": -1.1647162437438965, "objective/rlhf_reward": -2.2588648259639736, "objective/scores": 0.6, "policy/approxkl_avg": 40.81477355957031, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6915194988250732, "step": 908, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9976000785827637 }, { "episode": 14560, "epoch": 0.261710464823669, "loss/policy_avg": 0.4663980007171631, "lr": 2.825728527607362e-06, "objective/entropy": -9.778663635253906, "objective/kl": 11.958295822143555, "objective/non_score_reward": -1.1958296298980713, "objective/rlhf_reward": -1.8595997437250344, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 25.867328643798828, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.38230326771736145, "step": 909, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9996697902679443 }, { "episode": 14576, "epoch": 0.26199805874105764, "loss/policy_avg": 0.21209433674812317, "lr": 2.825536809815951e-06, "objective/entropy": 162.04637145996094, "objective/kl": 15.277240753173828, "objective/non_score_reward": -1.527724266052246, "objective/rlhf_reward": -5.710896825790405, "objective/scores": 0.1, "policy/approxkl_avg": 14.56328010559082, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7583177089691162, "step": 910, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000117063522339 }, { "episode": 14592, "epoch": 0.2622856526584463, "loss/policy_avg": 0.6738499402999878, "lr": 2.8253450920245398e-06, "objective/entropy": 169.78570556640625, "objective/kl": 12.763500213623047, "objective/non_score_reward": -1.2763500213623047, "objective/rlhf_reward": -4.705400294065475, "objective/scores": 0.1, "policy/approxkl_avg": 50.89369201660156, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.712113082408905, "step": 911, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9986307621002197 }, { "episode": 14608, "epoch": 0.2625732465758349, "loss/policy_avg": -0.1670105904340744, "lr": 2.825153374233129e-06, "objective/entropy": 129.38778686523438, "objective/kl": 8.210895538330078, "objective/non_score_reward": -0.8210896253585815, "objective/rlhf_reward": -5.284358501434326, "objective/scores": -0.5, "policy/approxkl_avg": 5.988738536834717, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.5074647665023804, "step": 912, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.003474473953247 }, { "episode": 14624, "epoch": 0.26286084049322356, "loss/policy_avg": 0.8385502099990845, "lr": 2.824961656441718e-06, "objective/entropy": 2.270915985107422, "objective/kl": 16.01020622253418, "objective/non_score_reward": -1.6010206937789917, "objective/rlhf_reward": -6.004082834720611, "objective/scores": 0.1, "policy/approxkl_avg": 122.61514282226562, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6589502692222595, "step": 913, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0001535415649414 }, { "episode": 14640, "epoch": 0.2631484344106122, "loss/policy_avg": 0.5200643539428711, "lr": 2.824769938650307e-06, "objective/entropy": 41.12443542480469, "objective/kl": 14.1666259765625, "objective/non_score_reward": -1.4166628122329712, "objective/rlhf_reward": -1.2666512787342068, "objective/scores": 1.1, "policy/approxkl_avg": 110.20585632324219, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7196406126022339, "step": 914, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9971590042114258 }, { "episode": 14656, "epoch": 0.26343602832800084, "loss/policy_avg": 0.056996263563632965, "lr": 2.824578220858896e-06, "objective/entropy": 147.31983947753906, "objective/kl": 7.540309906005859, "objective/non_score_reward": -0.7540310025215149, "objective/rlhf_reward": -5.0161237716674805, "objective/scores": -0.5, "policy/approxkl_avg": 13.796720504760742, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5611803531646729, "step": 915, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9968600273132324 }, { "episode": 14672, "epoch": 0.26372362224538953, "loss/policy_avg": 0.24829980731010437, "lr": 2.8243865030674847e-06, "objective/entropy": 31.06182861328125, "objective/kl": 10.875221252441406, "objective/non_score_reward": -1.0875221490859985, "objective/rlhf_reward": -2.227382304445777, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 7.782899379730225, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.635535717010498, "step": 916, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9965479373931885 }, { "episode": 14688, "epoch": 0.2640112161627782, "loss/policy_avg": 0.06934709846973419, "lr": 2.824194785276074e-06, "objective/entropy": 142.6950225830078, "objective/kl": 6.470335960388184, "objective/non_score_reward": -0.6470335721969604, "objective/rlhf_reward": 0.3355845019805703, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 37.148372650146484, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6092413663864136, "step": 917, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998875617980957 }, { "episode": 14704, "epoch": 0.2642988100801668, "loss/policy_avg": 0.20597806572914124, "lr": 2.8240030674846627e-06, "objective/entropy": 272.4873046875, "objective/kl": 14.78536319732666, "objective/non_score_reward": -1.4785361289978027, "objective/rlhf_reward": -7.914144992828369, "objective/scores": -0.5, "policy/approxkl_avg": 106.7894287109375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7212300300598145, "step": 918, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0013232231140137 }, { "episode": 14720, "epoch": 0.26458640399755545, "loss/policy_avg": -0.0837036743760109, "lr": 2.8238113496932515e-06, "objective/entropy": 48.787288665771484, "objective/kl": 13.830099105834961, "objective/non_score_reward": -1.383009910583496, "objective/rlhf_reward": -7.532039165496826, "objective/scores": -0.5, "policy/approxkl_avg": 13.889322280883789, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6313380599021912, "step": 919, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0004570484161377 }, { "episode": 14736, "epoch": 0.2648739979149441, "loss/policy_avg": 0.5091476440429688, "lr": 2.8236196319018408e-06, "objective/entropy": 169.26397705078125, "objective/kl": 15.42192554473877, "objective/non_score_reward": -1.5421926975250244, "objective/rlhf_reward": -5.768770357966423, "objective/scores": 0.1, "policy/approxkl_avg": 256.7361145019531, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6787434816360474, "step": 920, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9991068840026855 }, { "episode": 14752, "epoch": 0.26516159183233273, "loss/policy_avg": 0.153235524892807, "lr": 2.8234279141104296e-06, "objective/entropy": -151.74813842773438, "objective/kl": 8.891946792602539, "objective/non_score_reward": -0.8891947269439697, "objective/rlhf_reward": 0.8432209208607677, "objective/scores": 1.1, "policy/approxkl_avg": 21.438941955566406, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6721331477165222, "step": 921, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9993393421173096 }, { "episode": 14768, "epoch": 0.2654491857497214, "loss/policy_avg": 0.1505753993988037, "lr": 2.823236196319019e-06, "objective/entropy": 103.85417175292969, "objective/kl": 9.798246383666992, "objective/non_score_reward": -0.9798246026039124, "objective/rlhf_reward": -2.363039045539453, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 3.5156641006469727, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5437833070755005, "step": 922, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9987695217132568 }, { "episode": 14784, "epoch": 0.26573677966711007, "loss/policy_avg": 0.031007010489702225, "lr": 2.823044478527607e-06, "objective/entropy": -236.375732421875, "objective/kl": 14.377126693725586, "objective/non_score_reward": -1.4377126693725586, "objective/rlhf_reward": -5.350850439071655, "objective/scores": 0.1, "policy/approxkl_avg": 107.49105072021484, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7668004631996155, "step": 923, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9992296695709229 }, { "episode": 14800, "epoch": 0.2660243735844987, "loss/policy_avg": 1.054640769958496, "lr": 2.8228527607361964e-06, "objective/entropy": 308.00750732421875, "objective/kl": 16.157392501831055, "objective/non_score_reward": -1.615739345550537, "objective/rlhf_reward": -3.539238218904707, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 54.42896270751953, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7293402552604675, "step": 924, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9982367753982544 }, { "episode": 14816, "epoch": 0.26631196750188735, "loss/policy_avg": 0.8975625038146973, "lr": 2.8226610429447852e-06, "objective/entropy": 24.863666534423828, "objective/kl": 9.843988418579102, "objective/non_score_reward": -0.9843988418579102, "objective/rlhf_reward": -3.5375953972339627, "objective/scores": 0.1, "policy/approxkl_avg": 22.406509399414062, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.34417903423309326, "step": 925, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9979350566864014 }, { "episode": 14832, "epoch": 0.266599561419276, "loss/policy_avg": 0.7326955795288086, "lr": 2.822469325153374e-06, "objective/entropy": -177.4724578857422, "objective/kl": 18.637279510498047, "objective/non_score_reward": -1.8637280464172363, "objective/rlhf_reward": -5.850791875187474, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 165.49850463867188, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6023860573768616, "step": 926, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9989757537841797 }, { "episode": 14848, "epoch": 0.2668871553366646, "loss/policy_avg": 0.18156574666500092, "lr": 2.8222776073619633e-06, "objective/entropy": 98.12123107910156, "objective/kl": 16.134002685546875, "objective/non_score_reward": -1.6134004592895508, "objective/rlhf_reward": -8.453601837158203, "objective/scores": -0.5, "policy/approxkl_avg": 92.71282958984375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4930706322193146, "step": 927, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.999679446220398 }, { "episode": 14864, "epoch": 0.26717474925405327, "loss/policy_avg": 0.6215415596961975, "lr": 2.822085889570552e-06, "objective/entropy": -99.99117279052734, "objective/kl": 6.985932350158691, "objective/non_score_reward": -0.6985931396484375, "objective/rlhf_reward": 1.6056274041533474, "objective/scores": 1.1, "policy/approxkl_avg": 2.1479220390319824, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5162912011146545, "step": 928, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0008466243743896 }, { "episode": 14880, "epoch": 0.2674623431714419, "loss/policy_avg": 0.305203914642334, "lr": 2.8218941717791413e-06, "objective/entropy": 52.752532958984375, "objective/kl": 16.012250900268555, "objective/non_score_reward": -1.6012248992919922, "objective/rlhf_reward": -6.0048998355865475, "objective/scores": 0.1, "policy/approxkl_avg": 33.087364196777344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8244567513465881, "step": 929, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9968031644821167 }, { "episode": 14896, "epoch": 0.26774993708883055, "loss/policy_avg": 0.3882259726524353, "lr": 2.82170245398773e-06, "objective/entropy": 170.99806213378906, "objective/kl": 11.023755073547363, "objective/non_score_reward": -1.1023752689361572, "objective/rlhf_reward": -0.009501358866691234, "objective/scores": 1.1, "policy/approxkl_avg": 47.82780456542969, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.68819260597229, "step": 930, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000180721282959 }, { "episode": 14912, "epoch": 0.26803753100621924, "loss/policy_avg": 0.3398761451244354, "lr": 2.821510736196319e-06, "objective/entropy": -18.394920349121094, "objective/kl": 10.919998168945312, "objective/non_score_reward": -1.0919996500015259, "objective/rlhf_reward": -1.9679986894130705, "objective/scores": 0.6, "policy/approxkl_avg": 50.97767639160156, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.736298680305481, "step": 931, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9992265701293945 }, { "episode": 14928, "epoch": 0.2683251249236079, "loss/policy_avg": -0.4279845058917999, "lr": 2.821319018404908e-06, "objective/entropy": 86.6572494506836, "objective/kl": 7.83455228805542, "objective/non_score_reward": -0.7834553718566895, "objective/rlhf_reward": -1.4719620398884876, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 18.167985916137695, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.49258583784103394, "step": 932, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.011922836303711 }, { "episode": 14944, "epoch": 0.2686127188409965, "loss/policy_avg": 0.23957113921642303, "lr": 2.821127300613497e-06, "objective/entropy": 121.09419250488281, "objective/kl": 11.997109413146973, "objective/non_score_reward": -1.1997110843658447, "objective/rlhf_reward": -6.798844337463379, "objective/scores": -0.5, "policy/approxkl_avg": 29.186599731445312, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5409206748008728, "step": 933, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.995774269104004 }, { "episode": 14960, "epoch": 0.26890031275838516, "loss/policy_avg": 0.44504815340042114, "lr": 2.8209355828220858e-06, "objective/entropy": 85.28959655761719, "objective/kl": 9.80569076538086, "objective/non_score_reward": -0.9805691242218018, "objective/rlhf_reward": 0.47772344350814855, "objective/scores": 1.1, "policy/approxkl_avg": 14.007376670837402, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6362345218658447, "step": 934, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.998863697052002 }, { "episode": 14976, "epoch": 0.2691879066757738, "loss/policy_avg": 0.06488144397735596, "lr": 2.820743865030675e-06, "objective/entropy": -71.18653869628906, "objective/kl": 11.959724426269531, "objective/non_score_reward": -1.1959723234176636, "objective/rlhf_reward": -6.783889293670654, "objective/scores": -0.5, "policy/approxkl_avg": 9.0035400390625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6700109243392944, "step": 935, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0012855529785156 }, { "episode": 14992, "epoch": 0.26947550059316244, "loss/policy_avg": 0.00518820621073246, "lr": 2.820552147239264e-06, "objective/entropy": 10.628082275390625, "objective/kl": 19.038652420043945, "objective/non_score_reward": -1.9038654565811157, "objective/rlhf_reward": -4.691742573620054, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 43.91889190673828, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.9487606287002563, "step": 936, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.997607707977295 }, { "episode": 15008, "epoch": 0.2697630945105511, "loss/policy_avg": 0.1454022079706192, "lr": 2.820360429447853e-06, "objective/entropy": 179.77040100097656, "objective/kl": 12.530265808105469, "objective/non_score_reward": -1.2530266046524048, "objective/rlhf_reward": -4.612106418609619, "objective/scores": 0.1, "policy/approxkl_avg": 82.92576599121094, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6685892343521118, "step": 937, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9975636005401611 }, { "episode": 15024, "epoch": 0.2700506884279397, "loss/policy_avg": 0.2995688319206238, "lr": 2.820168711656442e-06, "objective/entropy": 133.7378692626953, "objective/kl": 10.791794776916504, "objective/non_score_reward": -1.0791795253753662, "objective/rlhf_reward": -1.9167180418968202, "objective/scores": 0.6, "policy/approxkl_avg": 52.06595230102539, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4817456007003784, "step": 938, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000180721282959 }, { "episode": 15040, "epoch": 0.2703382823453284, "loss/policy_avg": 0.34660422801971436, "lr": 2.8199769938650307e-06, "objective/entropy": 19.058616638183594, "objective/kl": 14.038864135742188, "objective/non_score_reward": -1.4038866758346558, "objective/rlhf_reward": -1.215546733140945, "objective/scores": 1.1, "policy/approxkl_avg": 48.796592712402344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.41182541847229004, "step": 939, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.997514009475708 }, { "episode": 15056, "epoch": 0.27062587626271706, "loss/policy_avg": 0.2968835234642029, "lr": 2.81978527607362e-06, "objective/entropy": 76.23985290527344, "objective/kl": 13.028081893920898, "objective/non_score_reward": -1.3028082847595215, "objective/rlhf_reward": -4.811233407258987, "objective/scores": 0.1, "policy/approxkl_avg": 115.7067642211914, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6411737203598022, "step": 940, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.997145414352417 }, { "episode": 15072, "epoch": 0.2709134701801057, "loss/policy_avg": 0.38955211639404297, "lr": 2.8195935582822087e-06, "objective/entropy": -10.788116455078125, "objective/kl": 14.27804946899414, "objective/non_score_reward": -1.4278050661087036, "objective/rlhf_reward": -1.3112202048301693, "objective/scores": 1.1, "policy/approxkl_avg": 11.869421005249023, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6856441497802734, "step": 941, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9979138374328613 }, { "episode": 15088, "epoch": 0.27120106409749434, "loss/policy_avg": -0.04374265670776367, "lr": 2.819401840490798e-06, "objective/entropy": 46.83854675292969, "objective/kl": 8.191838264465332, "objective/non_score_reward": -0.8191839456558228, "objective/rlhf_reward": -2.8767357528209683, "objective/scores": 0.1, "policy/approxkl_avg": 13.673277854919434, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4365406632423401, "step": 942, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9992234706878662 }, { "episode": 15104, "epoch": 0.271488658014883, "loss/policy_avg": 0.019855128601193428, "lr": 2.8192101226993868e-06, "objective/entropy": 146.18865966796875, "objective/kl": 11.691083908081055, "objective/non_score_reward": -1.169108271598816, "objective/rlhf_reward": -4.276433093845844, "objective/scores": 0.1, "policy/approxkl_avg": 49.19375228881836, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.438481867313385, "step": 943, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0008602142333984 }, { "episode": 15120, "epoch": 0.2717762519322716, "loss/policy_avg": -0.07478684931993484, "lr": 2.8190184049079756e-06, "objective/entropy": 203.56137084960938, "objective/kl": 14.393468856811523, "objective/non_score_reward": -1.4393467903137207, "objective/rlhf_reward": -1.357387429475784, "objective/scores": 1.1, "policy/approxkl_avg": 32.7160758972168, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7971817255020142, "step": 944, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.00042462348938 }, { "episode": 15136, "epoch": 0.27206384584966026, "loss/policy_avg": 0.2446022629737854, "lr": 2.8188266871165644e-06, "objective/entropy": 31.180438995361328, "objective/kl": 16.74203109741211, "objective/non_score_reward": -1.6742032766342163, "objective/rlhf_reward": -8.696813583374023, "objective/scores": -0.5, "policy/approxkl_avg": 105.12753295898438, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7867506742477417, "step": 945, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9979796409606934 }, { "episode": 15152, "epoch": 0.27235143976704895, "loss/policy_avg": 0.1629078984260559, "lr": 2.818634969325153e-06, "objective/entropy": -197.06622314453125, "objective/kl": 13.794787406921387, "objective/non_score_reward": -1.3794788122177124, "objective/rlhf_reward": -7.51791524887085, "objective/scores": -0.5, "policy/approxkl_avg": 115.64588928222656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6810861825942993, "step": 946, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9969407320022583 }, { "episode": 15168, "epoch": 0.2726390336844376, "loss/policy_avg": -0.03024168312549591, "lr": 2.8184432515337424e-06, "objective/entropy": 149.4989776611328, "objective/kl": 16.816232681274414, "objective/non_score_reward": -1.6816232204437256, "objective/rlhf_reward": -8.726492881774902, "objective/scores": -0.5, "policy/approxkl_avg": 36.17455291748047, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.48948463797569275, "step": 947, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.001532554626465 }, { "episode": 15184, "epoch": 0.27292662760182623, "loss/policy_avg": 0.34567373991012573, "lr": 2.8182515337423312e-06, "objective/entropy": 90.28936767578125, "objective/kl": 19.398473739624023, "objective/non_score_reward": -1.939847469329834, "objective/rlhf_reward": -7.359389877319336, "objective/scores": 0.1, "policy/approxkl_avg": 41.141029357910156, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7335373163223267, "step": 948, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9995067119598389 }, { "episode": 15200, "epoch": 0.2732142215192149, "loss/policy_avg": 0.4489978551864624, "lr": 2.81805981595092e-06, "objective/entropy": 38.101829528808594, "objective/kl": 14.68608283996582, "objective/non_score_reward": -1.4686082601547241, "objective/rlhf_reward": -7.8744330406188965, "objective/scores": -0.5, "policy/approxkl_avg": 115.49784088134766, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5973610877990723, "step": 949, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9989502429962158 }, { "episode": 15216, "epoch": 0.2735018154366035, "loss/policy_avg": 0.4765210449695587, "lr": 2.8178680981595093e-06, "objective/entropy": 234.66183471679688, "objective/kl": 17.86334228515625, "objective/non_score_reward": -1.7863342761993408, "objective/rlhf_reward": -6.745336925983429, "objective/scores": 0.1, "policy/approxkl_avg": 46.356414794921875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7054615616798401, "step": 950, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9985949993133545 }, { "episode": 15232, "epoch": 0.27378940935399215, "loss/policy_avg": 0.0288299061357975, "lr": 2.817676380368098e-06, "objective/entropy": 156.8824462890625, "objective/kl": 9.917464256286621, "objective/non_score_reward": -0.9917463660240173, "objective/rlhf_reward": -5.966985702514648, "objective/scores": -0.5, "policy/approxkl_avg": 38.51423645019531, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5643774271011353, "step": 951, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000211238861084 }, { "episode": 15248, "epoch": 0.2740770032713808, "loss/policy_avg": 0.11630159616470337, "lr": 2.8174846625766873e-06, "objective/entropy": 73.06771087646484, "objective/kl": 13.975484848022461, "objective/non_score_reward": -1.3975484371185303, "objective/rlhf_reward": -5.190193688869476, "objective/scores": 0.1, "policy/approxkl_avg": 40.41292190551758, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5209547281265259, "step": 952, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0003228187561035 }, { "episode": 15264, "epoch": 0.27436459718876943, "loss/policy_avg": 0.4185836911201477, "lr": 2.817292944785276e-06, "objective/entropy": 172.16983032226562, "objective/kl": 18.557403564453125, "objective/non_score_reward": -1.8557404279708862, "objective/rlhf_reward": -7.022961831092834, "objective/scores": 0.1, "policy/approxkl_avg": 90.51202392578125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.751570463180542, "step": 953, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9965605735778809 }, { "episode": 15280, "epoch": 0.27465219110615813, "loss/policy_avg": -0.0021925121545791626, "lr": 2.817101226993865e-06, "objective/entropy": -46.05889892578125, "objective/kl": 13.408881187438965, "objective/non_score_reward": -1.3408881425857544, "objective/rlhf_reward": -2.439833794475767, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 17.623706817626953, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7430280447006226, "step": 954, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0016512870788574 }, { "episode": 15296, "epoch": 0.27493978502354677, "loss/policy_avg": -0.014723315834999084, "lr": 2.816909509202454e-06, "objective/entropy": 288.378173828125, "objective/kl": 11.871005058288574, "objective/non_score_reward": -1.1871004104614258, "objective/rlhf_reward": -6.748402118682861, "objective/scores": -0.5, "policy/approxkl_avg": 3.3176779747009277, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6996443271636963, "step": 955, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.000519037246704 }, { "episode": 15312, "epoch": 0.2752273789409354, "loss/policy_avg": 0.6079083681106567, "lr": 2.816717791411043e-06, "objective/entropy": 31.44274139404297, "objective/kl": 20.894535064697266, "objective/non_score_reward": -2.08945369720459, "objective/rlhf_reward": -7.957814311981202, "objective/scores": 0.1, "policy/approxkl_avg": 17.899011611938477, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.3588119149208069, "step": 956, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9980523586273193 }, { "episode": 15328, "epoch": 0.27551497285832405, "loss/policy_avg": 0.41200828552246094, "lr": 2.816526073619632e-06, "objective/entropy": -4.771537780761719, "objective/kl": 15.709589004516602, "objective/non_score_reward": -1.5709590911865234, "objective/rlhf_reward": -4.161129685417686, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 90.92428588867188, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5645814538002014, "step": 957, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.995368242263794 }, { "episode": 15344, "epoch": 0.2758025667757127, "loss/policy_avg": 0.45684581995010376, "lr": 2.816334355828221e-06, "objective/entropy": -4.6809844970703125, "objective/kl": 9.97734260559082, "objective/non_score_reward": -0.9977341890335083, "objective/rlhf_reward": -3.5909366667270657, "objective/scores": 0.1, "policy/approxkl_avg": 56.411354064941406, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6887657642364502, "step": 958, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.002199172973633 }, { "episode": 15360, "epoch": 0.2760901606931013, "loss/policy_avg": -0.45410293340682983, "lr": 2.81614263803681e-06, "objective/entropy": -121.51223754882812, "objective/kl": 7.737212657928467, "objective/non_score_reward": -0.7737212777137756, "objective/rlhf_reward": -1.2700562730160465, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 17.588157653808594, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.4520346224308014, "step": 959, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.006819725036621 }, { "episode": 15376, "epoch": 0.27637775461048997, "loss/policy_avg": 0.30216163396835327, "lr": 2.815950920245399e-06, "objective/entropy": 151.243896484375, "objective/kl": 17.64258575439453, "objective/non_score_reward": -1.764258623123169, "objective/rlhf_reward": -4.133315478206846, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 200.840087890625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5193691253662109, "step": 960, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9965643882751465 }, { "episode": 15392, "epoch": 0.27666534852787866, "loss/policy_avg": 0.04060244560241699, "lr": 2.815759202453988e-06, "objective/entropy": 231.11839294433594, "objective/kl": 14.362661361694336, "objective/non_score_reward": -1.4362661838531494, "objective/rlhf_reward": -5.345064407587051, "objective/scores": 0.1, "policy/approxkl_avg": 47.88323974609375, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7988565564155579, "step": 961, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9993538856506348 }, { "episode": 15408, "epoch": 0.2769529424452673, "loss/policy_avg": 0.3130953907966614, "lr": 2.8155674846625767e-06, "objective/entropy": 120.5262680053711, "objective/kl": 15.489297866821289, "objective/non_score_reward": -1.5489299297332764, "objective/rlhf_reward": -5.795719510316848, "objective/scores": 0.1, "policy/approxkl_avg": 145.9391326904297, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6065146923065186, "step": 962, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9990172386169434 }, { "episode": 15424, "epoch": 0.27724053636265594, "loss/policy_avg": 0.4497438967227936, "lr": 2.815375766871166e-06, "objective/entropy": 103.47833251953125, "objective/kl": 18.042163848876953, "objective/non_score_reward": -1.8042165040969849, "objective/rlhf_reward": -9.216865539550781, "objective/scores": -0.5, "policy/approxkl_avg": 38.347259521484375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5049797296524048, "step": 963, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0005576610565186 }, { "episode": 15440, "epoch": 0.2775281302800446, "loss/policy_avg": 0.35638368129730225, "lr": 2.8151840490797547e-06, "objective/entropy": 91.05720520019531, "objective/kl": 11.770478248596191, "objective/non_score_reward": -1.177047848701477, "objective/rlhf_reward": -1.7844724997293677, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 71.71107482910156, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8685045838356018, "step": 964, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999760627746582 }, { "episode": 15456, "epoch": 0.2778157241974332, "loss/policy_avg": 0.463620662689209, "lr": 2.814992331288344e-06, "objective/entropy": 75.50821685791016, "objective/kl": 18.123964309692383, "objective/non_score_reward": -1.8123962879180908, "objective/rlhf_reward": -5.693326204028681, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 125.98820495605469, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6453250050544739, "step": 965, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9984214305877686 }, { "episode": 15472, "epoch": 0.27810331811482186, "loss/policy_avg": 0.29372507333755493, "lr": 2.8148006134969328e-06, "objective/entropy": 249.2400360107422, "objective/kl": 21.19771385192871, "objective/non_score_reward": -2.1197714805603027, "objective/rlhf_reward": -6.079085892438888, "objective/scores": 0.6, "policy/approxkl_avg": 248.2655487060547, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8577245473861694, "step": 966, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.999601125717163 }, { "episode": 15488, "epoch": 0.2783909120322105, "loss/policy_avg": 0.06916552782058716, "lr": 2.8146088957055216e-06, "objective/entropy": -68.20138549804688, "objective/kl": 7.964428901672363, "objective/non_score_reward": -0.7964429259300232, "objective/rlhf_reward": -0.7857717037200929, "objective/scores": 0.6, "policy/approxkl_avg": 1.7321834564208984, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.44984665513038635, "step": 967, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999240756034851 }, { "episode": 15504, "epoch": 0.27867850594959914, "loss/policy_avg": 0.2512040436267853, "lr": 2.8144171779141104e-06, "objective/entropy": 11.576576232910156, "objective/kl": 7.048787593841553, "objective/non_score_reward": -0.7048788070678711, "objective/rlhf_reward": -0.9946863904324283, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 0.4532914459705353, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5414634943008423, "step": 968, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.003035545349121 }, { "episode": 15520, "epoch": 0.27896609986698784, "loss/policy_avg": 0.6474967002868652, "lr": 2.814225460122699e-06, "objective/entropy": -189.89291381835938, "objective/kl": 6.612698078155518, "objective/non_score_reward": -0.6612698435783386, "objective/rlhf_reward": -0.24507925510406492, "objective/scores": 0.6, "policy/approxkl_avg": 31.995384216308594, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7192500233650208, "step": 969, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.001279830932617 }, { "episode": 15536, "epoch": 0.2792536937843765, "loss/policy_avg": 0.03323252499103546, "lr": 2.8140337423312884e-06, "objective/entropy": -8.374664306640625, "objective/kl": 7.630832672119141, "objective/non_score_reward": -0.7630833387374878, "objective/rlhf_reward": -0.12861440026876592, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 5.505527019500732, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7015961408615112, "step": 970, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000675678253174 }, { "episode": 15552, "epoch": 0.2795412877017651, "loss/policy_avg": 0.21652743220329285, "lr": 2.8138420245398772e-06, "objective/entropy": 166.86929321289062, "objective/kl": 10.748126983642578, "objective/non_score_reward": -1.0748127698898315, "objective/rlhf_reward": -3.899251019954681, "objective/scores": 0.1, "policy/approxkl_avg": 27.194313049316406, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4750038683414459, "step": 971, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9973280429840088 }, { "episode": 15568, "epoch": 0.27982888161915376, "loss/policy_avg": 0.49266356229782104, "lr": 2.813650306748466e-06, "objective/entropy": 118.69342041015625, "objective/kl": 12.942707061767578, "objective/non_score_reward": -1.2942707538604736, "objective/rlhf_reward": -3.5152232996827233, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 94.40614318847656, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7927144169807434, "step": 972, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0002317428588867 }, { "episode": 15584, "epoch": 0.2801164755365424, "loss/policy_avg": -0.02213054895401001, "lr": 2.8134585889570553e-06, "objective/entropy": 187.91323852539062, "objective/kl": 18.41681671142578, "objective/non_score_reward": -1.841681718826294, "objective/rlhf_reward": -9.366726875305176, "objective/scores": -0.5, "policy/approxkl_avg": 153.68333435058594, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7450737953186035, "step": 973, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.002140760421753 }, { "episode": 15600, "epoch": 0.28040406945393104, "loss/policy_avg": -0.016699761152267456, "lr": 2.813266871165644e-06, "objective/entropy": 20.254638671875, "objective/kl": 5.66358757019043, "objective/non_score_reward": -0.566358745098114, "objective/rlhf_reward": -4.265435218811035, "objective/scores": -0.5, "policy/approxkl_avg": 22.735843658447266, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.626510739326477, "step": 974, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.003321886062622 }, { "episode": 15616, "epoch": 0.2806916633713197, "loss/policy_avg": 0.5562024712562561, "lr": 2.8130751533742333e-06, "objective/entropy": 83.20606994628906, "objective/kl": 12.872986793518066, "objective/non_score_reward": -1.2872986793518066, "objective/rlhf_reward": -7.149194717407227, "objective/scores": -0.5, "policy/approxkl_avg": 62.93476486206055, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6668212413787842, "step": 975, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9978232383728027 }, { "episode": 15632, "epoch": 0.2809792572887083, "loss/policy_avg": 0.11060173809528351, "lr": 2.812883435582822e-06, "objective/entropy": -130.44863891601562, "objective/kl": 12.669355392456055, "objective/non_score_reward": -1.2669358253479004, "objective/rlhf_reward": -2.1440238698732585, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 30.233688354492188, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5940582752227783, "step": 976, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.00014328956604 }, { "episode": 15648, "epoch": 0.281266851206097, "loss/policy_avg": 0.1823975145816803, "lr": 2.812691717791411e-06, "objective/entropy": 94.65828704833984, "objective/kl": 12.634793281555176, "objective/non_score_reward": -1.2634793519973755, "objective/rlhf_reward": -2.130198453308317, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 86.40443420410156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5936030149459839, "step": 977, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0006115436553955 }, { "episode": 15664, "epoch": 0.28155444512348565, "loss/policy_avg": 0.2329886257648468, "lr": 2.8125e-06, "objective/entropy": 214.52359008789062, "objective/kl": 14.259061813354492, "objective/non_score_reward": -1.4259061813354492, "objective/rlhf_reward": -1.3036247253417965, "objective/scores": 1.1, "policy/approxkl_avg": 51.973575592041016, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7015668153762817, "step": 978, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9960739612579346 }, { "episode": 15680, "epoch": 0.2818420390408743, "loss/policy_avg": 0.1662026047706604, "lr": 2.812308282208589e-06, "objective/entropy": 140.4540252685547, "objective/kl": 18.81513786315918, "objective/non_score_reward": -1.881514072418213, "objective/rlhf_reward": -5.578645418362553, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 54.464908599853516, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5715263485908508, "step": 979, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9991223812103271 }, { "episode": 15696, "epoch": 0.28212963295826293, "loss/policy_avg": 0.5212792754173279, "lr": 2.8121165644171782e-06, "objective/entropy": 271.5147705078125, "objective/kl": 11.757984161376953, "objective/non_score_reward": -1.1757985353469849, "objective/rlhf_reward": -2.580488087908302, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 99.44505310058594, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6800845265388489, "step": 980, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9972641468048096 }, { "episode": 15712, "epoch": 0.2824172268756516, "loss/policy_avg": 0.10810688138008118, "lr": 2.811924846625767e-06, "objective/entropy": 220.07534790039062, "objective/kl": 9.518316268920898, "objective/non_score_reward": -0.9518316984176636, "objective/rlhf_reward": -0.8836077495825019, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 9.45026969909668, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6736278533935547, "step": 981, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0032143592834473 }, { "episode": 15728, "epoch": 0.2827048207930402, "loss/policy_avg": 0.1366291642189026, "lr": 2.811733128834356e-06, "objective/entropy": -55.597900390625, "objective/kl": 9.75802230834961, "objective/non_score_reward": -0.9758022427558899, "objective/rlhf_reward": -5.9032087326049805, "objective/scores": -0.5, "policy/approxkl_avg": 4.722705841064453, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4849565923213959, "step": 982, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9987678527832031 }, { "episode": 15744, "epoch": 0.28299241471042885, "loss/policy_avg": 0.29598093032836914, "lr": 2.811541411042945e-06, "objective/entropy": 65.98103332519531, "objective/kl": 8.494199752807617, "objective/non_score_reward": -0.8494198322296143, "objective/rlhf_reward": 1.002320730686188, "objective/scores": 1.1, "policy/approxkl_avg": 16.425743103027344, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.38723868131637573, "step": 983, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9991505146026611 }, { "episode": 15760, "epoch": 0.28328000862781755, "loss/policy_avg": 0.44190719723701477, "lr": 2.811349693251534e-06, "objective/entropy": -63.0709114074707, "objective/kl": 13.403665542602539, "objective/non_score_reward": -1.3403666019439697, "objective/rlhf_reward": -2.437747453094694, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 51.38234329223633, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.630331814289093, "step": 984, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9976906776428223 }, { "episode": 15776, "epoch": 0.2835676025452062, "loss/policy_avg": 0.08249235153198242, "lr": 2.8111579754601227e-06, "objective/entropy": -40.74168395996094, "objective/kl": 17.953168869018555, "objective/non_score_reward": -1.7953169345855713, "objective/rlhf_reward": -4.7812679469585415, "objective/scores": 0.6, "policy/approxkl_avg": 120.02989196777344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5399852991104126, "step": 985, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0002918243408203 }, { "episode": 15792, "epoch": 0.2838551964625948, "loss/policy_avg": 0.2005055695772171, "lr": 2.810966257668712e-06, "objective/entropy": -191.1624755859375, "objective/kl": 9.120893478393555, "objective/non_score_reward": -0.9120894074440002, "objective/rlhf_reward": -5.648357391357422, "objective/scores": -0.5, "policy/approxkl_avg": 9.947835922241211, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.609139084815979, "step": 986, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9988343715667725 }, { "episode": 15808, "epoch": 0.28414279037998347, "loss/policy_avg": 0.7648462057113647, "lr": 2.8107745398773007e-06, "objective/entropy": 359.6585693359375, "objective/kl": 18.952003479003906, "objective/non_score_reward": -1.895200490951538, "objective/rlhf_reward": -5.458095910326515, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 54.210479736328125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8480129837989807, "step": 987, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.999436616897583 }, { "episode": 15824, "epoch": 0.2844303842973721, "loss/policy_avg": 0.6691151857376099, "lr": 2.81058282208589e-06, "objective/entropy": 73.57808685302734, "objective/kl": 18.581069946289062, "objective/non_score_reward": -1.858107089996338, "objective/rlhf_reward": -5.309722246901069, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 32.08354949951172, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4837074875831604, "step": 988, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9994486570358276 }, { "episode": 15840, "epoch": 0.28471797821476075, "loss/policy_avg": 0.2605787217617035, "lr": 2.8103911042944788e-06, "objective/entropy": -190.95657348632812, "objective/kl": 11.05725383758545, "objective/non_score_reward": -1.1057254076004028, "objective/rlhf_reward": -2.022901570796966, "objective/scores": 0.6, "policy/approxkl_avg": 29.785179138183594, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.74052894115448, "step": 989, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0007266998291016 }, { "episode": 15856, "epoch": 0.2850055721321494, "loss/policy_avg": 0.17699474096298218, "lr": 2.8101993865030676e-06, "objective/entropy": 202.82345581054688, "objective/kl": 10.571136474609375, "objective/non_score_reward": -1.0571134090423584, "objective/rlhf_reward": -3.8284538000822064, "objective/scores": 0.1, "policy/approxkl_avg": 30.342830657958984, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5505605340003967, "step": 990, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9986047744750977 }, { "episode": 15872, "epoch": 0.285293166049538, "loss/policy_avg": 0.2762322425842285, "lr": 2.8100076687116564e-06, "objective/entropy": -215.3885498046875, "objective/kl": 11.097982406616211, "objective/non_score_reward": -1.1097981929779053, "objective/rlhf_reward": -4.039192607998848, "objective/scores": 0.1, "policy/approxkl_avg": 24.58441162109375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.554857611656189, "step": 991, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000056743621826 }, { "episode": 15888, "epoch": 0.2855807599669267, "loss/policy_avg": 0.8083657026290894, "lr": 2.809815950920245e-06, "objective/entropy": 136.85971069335938, "objective/kl": 14.077508926391602, "objective/non_score_reward": -1.4077508449554443, "objective/rlhf_reward": -1.2310034692287442, "objective/scores": 1.1, "policy/approxkl_avg": 50.88481903076172, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6447016596794128, "step": 992, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9999983310699463 }, { "episode": 15904, "epoch": 0.28586835388431536, "loss/policy_avg": 0.09076584875583649, "lr": 2.8096242331288344e-06, "objective/entropy": -15.63296127319336, "objective/kl": 17.013904571533203, "objective/non_score_reward": -1.701390266418457, "objective/rlhf_reward": -6.405561363697052, "objective/scores": 0.1, "policy/approxkl_avg": 94.94831848144531, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5415375828742981, "step": 993, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9992749691009521 }, { "episode": 15920, "epoch": 0.286155947801704, "loss/policy_avg": 0.30634552240371704, "lr": 2.8094325153374232e-06, "objective/entropy": 146.68658447265625, "objective/kl": 11.60175895690918, "objective/non_score_reward": -1.1601760387420654, "objective/rlhf_reward": -6.640704154968262, "objective/scores": -0.5, "policy/approxkl_avg": 34.54106521606445, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.577959418296814, "step": 994, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.996502161026001 }, { "episode": 15936, "epoch": 0.28644354171909264, "loss/policy_avg": -0.13943040370941162, "lr": 2.809240797546012e-06, "objective/entropy": 84.48971557617188, "objective/kl": 15.685787200927734, "objective/non_score_reward": -1.5685787200927734, "objective/rlhf_reward": -4.449486131939004, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 35.03499221801758, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4448893070220947, "step": 995, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0012125968933105 }, { "episode": 15952, "epoch": 0.2867311356364813, "loss/policy_avg": 0.5280898213386536, "lr": 2.8090490797546013e-06, "objective/entropy": 270.3984375, "objective/kl": 17.52050018310547, "objective/non_score_reward": -1.7520501613616943, "objective/rlhf_reward": -6.608200347423553, "objective/scores": 0.1, "policy/approxkl_avg": 20.95287322998047, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6844844818115234, "step": 996, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.999497413635254 }, { "episode": 15968, "epoch": 0.2870187295538699, "loss/policy_avg": 0.7680552005767822, "lr": 2.80885736196319e-06, "objective/entropy": 83.93873596191406, "objective/kl": 15.629266738891602, "objective/non_score_reward": -1.5629265308380127, "objective/rlhf_reward": -5.8517063617706295, "objective/scores": 0.1, "policy/approxkl_avg": 84.26701354980469, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6365024447441101, "step": 997, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9980711936950684 }, { "episode": 15984, "epoch": 0.28730632347125856, "loss/policy_avg": 0.031346116214990616, "lr": 2.8086656441717793e-06, "objective/entropy": 55.36757278442383, "objective/kl": 5.684802055358887, "objective/non_score_reward": -0.5684801936149597, "objective/rlhf_reward": -1.873920848965645, "objective/scores": 0.1, "policy/approxkl_avg": 0.7184413075447083, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.48540106415748596, "step": 998, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0035548210144043 }, { "episode": 16000, "epoch": 0.28759391738864726, "loss/policy_avg": 0.12601952254772186, "lr": 2.808473926380368e-06, "objective/entropy": -26.834529876708984, "objective/kl": 9.361145973205566, "objective/non_score_reward": -0.9361146688461304, "objective/rlhf_reward": -0.8207396909010141, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 71.52543640136719, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6758915185928345, "step": 999, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9973740577697754 }, { "episode": 16016, "epoch": 0.2878815113060359, "loss/policy_avg": 0.9146034121513367, "lr": 2.808282208588957e-06, "objective/entropy": 227.4399871826172, "objective/kl": 11.205362319946289, "objective/non_score_reward": -1.120536208152771, "objective/rlhf_reward": -4.082144802808761, "objective/scores": 0.1, "policy/approxkl_avg": 6.882693290710449, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5677628517150879, "step": 1000, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9989690780639648 }, { "episode": 16032, "epoch": 0.28816910522342454, "loss/policy_avg": 0.17238189280033112, "lr": 2.808090490797546e-06, "objective/entropy": 33.76301193237305, "objective/kl": 22.955963134765625, "objective/non_score_reward": -2.295596122741699, "objective/rlhf_reward": -8.782385206222534, "objective/scores": 0.1, "policy/approxkl_avg": 231.3272705078125, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6772900223731995, "step": 1001, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9999369382858276 }, { "episode": 16048, "epoch": 0.2884566991408132, "loss/policy_avg": 0.2179027795791626, "lr": 2.807898773006135e-06, "objective/entropy": 165.42457580566406, "objective/kl": 12.760942459106445, "objective/non_score_reward": -1.2760944366455078, "objective/rlhf_reward": -0.7043775081634518, "objective/scores": 1.1, "policy/approxkl_avg": 23.380008697509766, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6448920965194702, "step": 1002, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0009689331054688 }, { "episode": 16064, "epoch": 0.2887442930582018, "loss/policy_avg": 0.21163496375083923, "lr": 2.8077070552147242e-06, "objective/entropy": 219.0919647216797, "objective/kl": 17.551136016845703, "objective/non_score_reward": -1.7551138401031494, "objective/rlhf_reward": -6.620455002784729, "objective/scores": 0.1, "policy/approxkl_avg": 38.19704055786133, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5705841779708862, "step": 1003, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9992287158966064 }, { "episode": 16080, "epoch": 0.28903188697559046, "loss/policy_avg": 0.2837230861186981, "lr": 2.807515337423313e-06, "objective/entropy": -219.6112060546875, "objective/kl": 18.70673942565918, "objective/non_score_reward": -1.8706741333007812, "objective/rlhf_reward": -7.082696056365966, "objective/scores": 0.1, "policy/approxkl_avg": 109.35348510742188, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.60931396484375, "step": 1004, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9979100227355957 }, { "episode": 16096, "epoch": 0.2893194808929791, "loss/policy_avg": -0.19084030389785767, "lr": 2.807323619631902e-06, "objective/entropy": -65.57774353027344, "objective/kl": 7.905247211456299, "objective/non_score_reward": -0.7905246019363403, "objective/rlhf_reward": -5.162098407745361, "objective/scores": -0.5, "policy/approxkl_avg": 21.45587921142578, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5015524625778198, "step": 1005, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0000932216644287 }, { "episode": 16112, "epoch": 0.28960707481036774, "loss/policy_avg": 0.08700132369995117, "lr": 2.807131901840491e-06, "objective/entropy": 273.9281311035156, "objective/kl": 13.745208740234375, "objective/non_score_reward": -1.3745208978652954, "objective/rlhf_reward": -3.375377314464126, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 66.57199096679688, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6697818040847778, "step": 1006, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9990549087524414 }, { "episode": 16128, "epoch": 0.28989466872775643, "loss/policy_avg": 0.39911019802093506, "lr": 2.80694018404908e-06, "objective/entropy": 123.5094223022461, "objective/kl": 19.111557006835938, "objective/non_score_reward": -1.9111559391021729, "objective/rlhf_reward": -5.8197947695580226, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 223.53636169433594, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5573745965957642, "step": 1007, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9973094463348389 }, { "episode": 16144, "epoch": 0.2901822626451451, "loss/policy_avg": 0.4091383218765259, "lr": 2.8067484662576687e-06, "objective/entropy": -59.11369323730469, "objective/kl": 10.690498352050781, "objective/non_score_reward": -1.0690498352050781, "objective/rlhf_reward": -3.8761992067098614, "objective/scores": 0.1, "policy/approxkl_avg": 16.191486358642578, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5697863101959229, "step": 1008, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.996948480606079 }, { "episode": 16160, "epoch": 0.2904698565625337, "loss/policy_avg": 0.144743874669075, "lr": 2.806556748466258e-06, "objective/entropy": 78.24107360839844, "objective/kl": 16.85831642150879, "objective/non_score_reward": -1.6858315467834473, "objective/rlhf_reward": -6.343326365947723, "objective/scores": 0.1, "policy/approxkl_avg": 2.000248908996582, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8502017855644226, "step": 1009, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.001063346862793 }, { "episode": 16176, "epoch": 0.29075745047992235, "loss/policy_avg": 0.14420843124389648, "lr": 2.8063650306748467e-06, "objective/entropy": 98.99222564697266, "objective/kl": 15.894691467285156, "objective/non_score_reward": -1.589469075202942, "objective/rlhf_reward": -5.957876390218734, "objective/scores": 0.1, "policy/approxkl_avg": 23.941476821899414, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7167706489562988, "step": 1010, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9974000453948975 }, { "episode": 16192, "epoch": 0.291045044397311, "loss/policy_avg": 0.11632807552814484, "lr": 2.806173312883436e-06, "objective/entropy": 55.578338623046875, "objective/kl": 10.901174545288086, "objective/non_score_reward": -1.0901174545288086, "objective/rlhf_reward": -6.360469818115234, "objective/scores": -0.5, "policy/approxkl_avg": 22.527301788330078, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7306192517280579, "step": 1011, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9974496364593506 }, { "episode": 16208, "epoch": 0.29133263831469963, "loss/policy_avg": 0.48613467812538147, "lr": 2.8059815950920244e-06, "objective/entropy": 96.17320251464844, "objective/kl": 11.01476001739502, "objective/non_score_reward": -1.1014759540557861, "objective/rlhf_reward": -4.0059039950370785, "objective/scores": 0.1, "policy/approxkl_avg": 23.446365356445312, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6368293762207031, "step": 1012, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9981093406677246 }, { "episode": 16224, "epoch": 0.2916202322320883, "loss/policy_avg": 0.8086050748825073, "lr": 2.8057898773006136e-06, "objective/entropy": 50.43046188354492, "objective/kl": 15.851388931274414, "objective/non_score_reward": -1.5851387977600098, "objective/rlhf_reward": -1.9405554294586178, "objective/scores": 1.1, "policy/approxkl_avg": 146.61557006835938, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6533379554748535, "step": 1013, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.996522068977356 }, { "episode": 16240, "epoch": 0.2919078261494769, "loss/policy_avg": 0.5354580879211426, "lr": 2.8055981595092024e-06, "objective/entropy": -83.87467193603516, "objective/kl": 12.357803344726562, "objective/non_score_reward": -1.2357802391052246, "objective/rlhf_reward": -2.5431207478046414, "objective/scores": 0.6, "policy/approxkl_avg": 74.94430541992188, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6065086722373962, "step": 1014, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9968059062957764 }, { "episode": 16256, "epoch": 0.2921954200668656, "loss/policy_avg": 0.10914095491170883, "lr": 2.805406441717791e-06, "objective/entropy": 68.0996322631836, "objective/kl": 20.283634185791016, "objective/non_score_reward": -2.0283632278442383, "objective/rlhf_reward": -7.713453269004821, "objective/scores": 0.1, "policy/approxkl_avg": 97.01466369628906, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.47718697786331177, "step": 1015, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000540256500244 }, { "episode": 16272, "epoch": 0.29248301398425425, "loss/policy_avg": 0.41233736276626587, "lr": 2.8052147239263804e-06, "objective/entropy": 203.62808227539062, "objective/kl": 19.823139190673828, "objective/non_score_reward": -1.982313632965088, "objective/rlhf_reward": -9.929254531860352, "objective/scores": -0.5, "policy/approxkl_avg": 127.52713775634766, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8571972250938416, "step": 1016, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9971632957458496 }, { "episode": 16288, "epoch": 0.2927706079016429, "loss/policy_avg": 0.33414748311042786, "lr": 2.8050230061349693e-06, "objective/entropy": -66.33137512207031, "objective/kl": 16.306615829467773, "objective/non_score_reward": -1.6306617259979248, "objective/rlhf_reward": -2.122646844387054, "objective/scores": 1.1, "policy/approxkl_avg": 23.996356964111328, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6071459054946899, "step": 1017, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0003702640533447 }, { "episode": 16304, "epoch": 0.2930582018190315, "loss/policy_avg": 0.2442750632762909, "lr": 2.8048312883435585e-06, "objective/entropy": 91.87252807617188, "objective/kl": 14.24521541595459, "objective/non_score_reward": -1.4245214462280273, "objective/rlhf_reward": -3.575379910246406, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 26.111114501953125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.9083739519119263, "step": 1018, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9987156391143799 }, { "episode": 16320, "epoch": 0.29334579573642017, "loss/policy_avg": 0.755403995513916, "lr": 2.8046395705521473e-06, "objective/entropy": -104.68330383300781, "objective/kl": 8.859827041625977, "objective/non_score_reward": -0.8859825134277344, "objective/rlhf_reward": -5.5439300537109375, "objective/scores": -0.5, "policy/approxkl_avg": 12.549600601196289, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7196685671806335, "step": 1019, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999251127243042 }, { "episode": 16336, "epoch": 0.2936333896538088, "loss/policy_avg": 0.1244896948337555, "lr": 2.804447852760736e-06, "objective/entropy": 156.35601806640625, "objective/kl": 9.711006164550781, "objective/non_score_reward": -0.9711006879806519, "objective/rlhf_reward": -1.4844029307365418, "objective/scores": 0.6, "policy/approxkl_avg": 23.540321350097656, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5550477504730225, "step": 1020, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9976718425750732 }, { "episode": 16352, "epoch": 0.29392098357119745, "loss/policy_avg": 0.37241601943969727, "lr": 2.8042561349693253e-06, "objective/entropy": 68.94697570800781, "objective/kl": 13.95817756652832, "objective/non_score_reward": -1.395817756652832, "objective/rlhf_reward": -7.583271026611328, "objective/scores": -0.5, "policy/approxkl_avg": 20.297168731689453, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6176844835281372, "step": 1021, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9974250793457031 }, { "episode": 16368, "epoch": 0.29420857748858614, "loss/policy_avg": 0.48266342282295227, "lr": 2.804064417177914e-06, "objective/entropy": 129.90074157714844, "objective/kl": 19.586383819580078, "objective/non_score_reward": -1.9586385488510132, "objective/rlhf_reward": -4.9108348234903545, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 28.53821563720703, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5636917352676392, "step": 1022, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.997184157371521 }, { "episode": 16384, "epoch": 0.2944961714059748, "loss/policy_avg": 0.39663034677505493, "lr": 2.803872699386503e-06, "objective/entropy": -57.39234924316406, "objective/kl": 14.80410385131836, "objective/non_score_reward": -1.4804103374481201, "objective/rlhf_reward": -3.521641409397125, "objective/scores": 0.6, "policy/approxkl_avg": 97.82295227050781, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7238129377365112, "step": 1023, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998586893081665 }, { "episode": 16400, "epoch": 0.2947837653233634, "loss/policy_avg": 0.39789485931396484, "lr": 2.803680981595092e-06, "objective/entropy": 19.74584197998047, "objective/kl": 13.682332992553711, "objective/non_score_reward": -1.368233323097229, "objective/rlhf_reward": -5.072933173179626, "objective/scores": 0.1, "policy/approxkl_avg": 40.714088439941406, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7020970582962036, "step": 1024, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9982075691223145 }, { "episode": 16416, "epoch": 0.29507135924075206, "loss/policy_avg": -0.1076928898692131, "lr": 2.803489263803681e-06, "objective/entropy": -72.29051208496094, "objective/kl": 19.678255081176758, "objective/non_score_reward": -1.9678255319595337, "objective/rlhf_reward": -3.4713020682334896, "objective/scores": 1.1, "policy/approxkl_avg": 158.48939514160156, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6535936594009399, "step": 1025, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9994299411773682 }, { "episode": 16432, "epoch": 0.2953589531581407, "loss/policy_avg": 0.2257157564163208, "lr": 2.8032975460122702e-06, "objective/entropy": 12.529336929321289, "objective/kl": 9.745655059814453, "objective/non_score_reward": -0.9745655059814453, "objective/rlhf_reward": 0.5017380058765415, "objective/scores": 1.1, "policy/approxkl_avg": 35.45285415649414, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6599619388580322, "step": 1026, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0049285888671875 }, { "episode": 16448, "epoch": 0.29564654707552934, "loss/policy_avg": 0.3417929410934448, "lr": 2.803105828220859e-06, "objective/entropy": 29.500675201416016, "objective/kl": 17.312862396240234, "objective/non_score_reward": -1.7312862873077393, "objective/rlhf_reward": -2.525145328044891, "objective/scores": 1.1, "policy/approxkl_avg": 118.51178741455078, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4447431266307831, "step": 1027, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.997718095779419 }, { "episode": 16464, "epoch": 0.295934140992918, "loss/policy_avg": 0.7444425821304321, "lr": 2.802914110429448e-06, "objective/entropy": -81.61296844482422, "objective/kl": 15.141607284545898, "objective/non_score_reward": -1.5141608715057373, "objective/rlhf_reward": -3.6566432178020474, "objective/scores": 0.6, "policy/approxkl_avg": 52.98291015625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7052822113037109, "step": 1028, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9991610050201416 }, { "episode": 16480, "epoch": 0.2962217349103066, "loss/policy_avg": 0.7192173004150391, "lr": 2.802722392638037e-06, "objective/entropy": 68.40217590332031, "objective/kl": 19.928909301757812, "objective/non_score_reward": -1.9928908348083496, "objective/rlhf_reward": -3.571563637256622, "objective/scores": 1.1, "policy/approxkl_avg": 217.04953002929688, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5880913734436035, "step": 1029, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001051664352417 }, { "episode": 16496, "epoch": 0.2965093288276953, "loss/policy_avg": 0.36680111289024353, "lr": 2.802530674846626e-06, "objective/entropy": 77.72987365722656, "objective/kl": 8.90180492401123, "objective/non_score_reward": -0.8901805877685547, "objective/rlhf_reward": 0.8392777085304264, "objective/scores": 1.1, "policy/approxkl_avg": 16.994407653808594, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5440889596939087, "step": 1030, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000692129135132 }, { "episode": 16512, "epoch": 0.29679692274508396, "loss/policy_avg": -0.17457365989685059, "lr": 2.802338957055215e-06, "objective/entropy": 81.9105224609375, "objective/kl": 9.55565071105957, "objective/non_score_reward": -0.9555650949478149, "objective/rlhf_reward": -3.422260200977325, "objective/scores": 0.1, "policy/approxkl_avg": 17.63282585144043, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6269766092300415, "step": 1031, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.007657527923584 }, { "episode": 16528, "epoch": 0.2970845166624726, "loss/policy_avg": 0.24074256420135498, "lr": 2.802147239263804e-06, "objective/entropy": 210.78358459472656, "objective/kl": 16.9814510345459, "objective/non_score_reward": -1.6981453895568848, "objective/rlhf_reward": -6.392581349611282, "objective/scores": 0.1, "policy/approxkl_avg": 152.18362426757812, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5589526295661926, "step": 1032, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.999018669128418 }, { "episode": 16544, "epoch": 0.29737211057986124, "loss/policy_avg": 0.18762660026550293, "lr": 2.8019555214723927e-06, "objective/entropy": 71.11669921875, "objective/kl": 13.297914505004883, "objective/non_score_reward": -1.32979154586792, "objective/rlhf_reward": -7.31916618347168, "objective/scores": -0.5, "policy/approxkl_avg": 57.81365966796875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.690626859664917, "step": 1033, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998163104057312 }, { "episode": 16560, "epoch": 0.2976597044972499, "loss/policy_avg": 0.18001145124435425, "lr": 2.8017638036809816e-06, "objective/entropy": 168.59170532226562, "objective/kl": 16.74738121032715, "objective/non_score_reward": -1.6747379302978516, "objective/rlhf_reward": -2.29895190000534, "objective/scores": 1.1, "policy/approxkl_avg": 25.452075958251953, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5927215218544006, "step": 1034, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000150203704834 }, { "episode": 16576, "epoch": 0.2979472984146385, "loss/policy_avg": 0.14382623136043549, "lr": 2.8015720858895704e-06, "objective/entropy": 6.768333435058594, "objective/kl": 10.177921295166016, "objective/non_score_reward": -1.0177922248840332, "objective/rlhf_reward": -1.1474501236688819, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 7.347600936889648, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7635913491249084, "step": 1035, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9986321926116943 }, { "episode": 16592, "epoch": 0.29823489233202716, "loss/policy_avg": -0.041412293910980225, "lr": 2.8013803680981596e-06, "objective/entropy": 208.7352752685547, "objective/kl": 12.503351211547852, "objective/non_score_reward": -1.2503352165222168, "objective/rlhf_reward": -7.001340866088867, "objective/scores": -0.5, "policy/approxkl_avg": 7.922041893005371, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5301596522331238, "step": 1036, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0028204917907715 }, { "episode": 16608, "epoch": 0.2985224862494158, "loss/policy_avg": 0.2591552138328552, "lr": 2.8011886503067484e-06, "objective/entropy": 192.43026733398438, "objective/kl": 16.139490127563477, "objective/non_score_reward": -1.6139490604400635, "objective/rlhf_reward": -4.793936526001083, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 114.85481262207031, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7047553062438965, "step": 1037, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9972035884857178 }, { "episode": 16624, "epoch": 0.2988100801668045, "loss/policy_avg": 0.7857614755630493, "lr": 2.8009969325153372e-06, "objective/entropy": -56.200775146484375, "objective/kl": 15.523076057434082, "objective/non_score_reward": -1.5523076057434082, "objective/rlhf_reward": -1.80923039317131, "objective/scores": 1.1, "policy/approxkl_avg": 18.877765655517578, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6590103507041931, "step": 1038, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9990099668502808 }, { "episode": 16640, "epoch": 0.29909767408419313, "loss/policy_avg": 0.22429285943508148, "lr": 2.8008052147239264e-06, "objective/entropy": -137.72122192382812, "objective/kl": 10.073450088500977, "objective/non_score_reward": -1.0073450803756714, "objective/rlhf_reward": -3.629380321502685, "objective/scores": 0.1, "policy/approxkl_avg": 17.942007064819336, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5248731374740601, "step": 1039, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9995999336242676 }, { "episode": 16656, "epoch": 0.2993852680015818, "loss/policy_avg": 0.329068124294281, "lr": 2.8006134969325153e-06, "objective/entropy": -55.219940185546875, "objective/kl": 17.680944442749023, "objective/non_score_reward": -1.7680946588516235, "objective/rlhf_reward": -2.672378635406494, "objective/scores": 1.1, "policy/approxkl_avg": 83.21047973632812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5964963436126709, "step": 1040, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9971622228622437 }, { "episode": 16672, "epoch": 0.2996728619189704, "loss/policy_avg": 0.15215516090393066, "lr": 2.8004217791411045e-06, "objective/entropy": 75.79779815673828, "objective/kl": 19.695858001708984, "objective/non_score_reward": -1.9695857763290405, "objective/rlhf_reward": -3.4783432245254513, "objective/scores": 1.1, "policy/approxkl_avg": 33.504329681396484, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6170982122421265, "step": 1041, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998176097869873 }, { "episode": 16688, "epoch": 0.29996045583635905, "loss/policy_avg": 0.41010716557502747, "lr": 2.8002300613496933e-06, "objective/entropy": 16.160049438476562, "objective/kl": 10.406661033630371, "objective/non_score_reward": -1.040666103363037, "objective/rlhf_reward": -2.5008046976929768, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 4.847574234008789, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6609134078025818, "step": 1042, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0026755332946777 }, { "episode": 16704, "epoch": 0.3002480497537477, "loss/policy_avg": 0.26580554246902466, "lr": 2.800038343558282e-06, "objective/entropy": 27.727663040161133, "objective/kl": 11.889737129211426, "objective/non_score_reward": -1.1889736652374268, "objective/rlhf_reward": -4.35589433312416, "objective/scores": 0.1, "policy/approxkl_avg": 135.2098388671875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7928782105445862, "step": 1043, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9992873668670654 }, { "episode": 16720, "epoch": 0.30053564367113633, "loss/policy_avg": 0.2747136950492859, "lr": 2.7998466257668713e-06, "objective/entropy": 174.1541290283203, "objective/kl": 14.657130241394043, "objective/non_score_reward": -1.4657130241394043, "objective/rlhf_reward": -5.462852334976196, "objective/scores": 0.1, "policy/approxkl_avg": 56.979190826416016, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.9884006977081299, "step": 1044, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0000054836273193 }, { "episode": 16736, "epoch": 0.300823237588525, "loss/policy_avg": 0.5620021820068359, "lr": 2.79965490797546e-06, "objective/entropy": -178.74172973632812, "objective/kl": 18.4420108795166, "objective/non_score_reward": -1.8442012071609497, "objective/rlhf_reward": -5.254098551646743, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 104.42015075683594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6059384346008301, "step": 1045, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9990811347961426 }, { "episode": 16752, "epoch": 0.30111083150591367, "loss/policy_avg": 0.24908019602298737, "lr": 2.799463190184049e-06, "objective/entropy": 147.04888916015625, "objective/kl": 13.451667785644531, "objective/non_score_reward": -1.3451666831970215, "objective/rlhf_reward": -3.433255563454564, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 20.122337341308594, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6942051649093628, "step": 1046, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9999792575836182 }, { "episode": 16768, "epoch": 0.3013984254233023, "loss/policy_avg": 0.45103228092193604, "lr": 2.799271472392638e-06, "objective/entropy": -139.34446716308594, "objective/kl": 10.701043128967285, "objective/non_score_reward": -1.0701043605804443, "objective/rlhf_reward": 0.11958267688751256, "objective/scores": 1.1, "policy/approxkl_avg": 8.23202133178711, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6052207946777344, "step": 1047, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9994077682495117 }, { "episode": 16784, "epoch": 0.30168601934069095, "loss/policy_avg": 0.12558560073375702, "lr": 2.799079754601227e-06, "objective/entropy": 68.8547134399414, "objective/kl": 11.65601634979248, "objective/non_score_reward": -1.1656014919281006, "objective/rlhf_reward": -2.2624060869216915, "objective/scores": 0.6, "policy/approxkl_avg": 29.906028747558594, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5991178750991821, "step": 1048, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000406265258789 }, { "episode": 16800, "epoch": 0.3019736132580796, "loss/policy_avg": -0.44990408420562744, "lr": 2.7988880368098162e-06, "objective/entropy": 197.53802490234375, "objective/kl": 11.517388343811035, "objective/non_score_reward": -1.1517388820648193, "objective/rlhf_reward": -1.6832366927873819, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 103.94009399414062, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.7234064340591431, "step": 1049, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0036420822143555 }, { "episode": 16816, "epoch": 0.3022612071754682, "loss/policy_avg": -0.47146177291870117, "lr": 2.798696319018405e-06, "objective/entropy": 12.529426574707031, "objective/kl": 8.463263511657715, "objective/non_score_reward": -0.846326470375061, "objective/rlhf_reward": -1.651972428957621, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 23.17457389831543, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6993383169174194, "step": 1050, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000576972961426 }, { "episode": 16832, "epoch": 0.30254880109285687, "loss/policy_avg": 0.27615439891815186, "lr": 2.798504601226994e-06, "objective/entropy": 75.7374496459961, "objective/kl": 13.605602264404297, "objective/non_score_reward": -1.3605601787567139, "objective/rlhf_reward": -7.442241191864014, "objective/scores": -0.5, "policy/approxkl_avg": 184.4613800048828, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6710618138313293, "step": 1051, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9975073337554932 }, { "episode": 16848, "epoch": 0.3028363950102455, "loss/policy_avg": 0.2937107980251312, "lr": 2.798312883435583e-06, "objective/entropy": 61.35673141479492, "objective/kl": 18.245712280273438, "objective/non_score_reward": -1.8245713710784912, "objective/rlhf_reward": -4.374566708446714, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 35.17201614379883, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6811769008636475, "step": 1052, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998936653137207 }, { "episode": 16864, "epoch": 0.3031239889276342, "loss/policy_avg": 0.3230166733264923, "lr": 2.798121165644172e-06, "objective/entropy": -3.2104339599609375, "objective/kl": 12.775344848632812, "objective/non_score_reward": -1.2775344848632812, "objective/rlhf_reward": -2.987431766764198, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 53.64230728149414, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6872749328613281, "step": 1053, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.995277762413025 }, { "episode": 16880, "epoch": 0.30341158284502284, "loss/policy_avg": 0.0927608460187912, "lr": 2.797929447852761e-06, "objective/entropy": 85.36604309082031, "objective/kl": 15.402597427368164, "objective/non_score_reward": -1.5402597188949585, "objective/rlhf_reward": -5.761038935184478, "objective/scores": 0.1, "policy/approxkl_avg": 74.3502426147461, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.605860710144043, "step": 1054, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9997683763504028 }, { "episode": 16896, "epoch": 0.3036991767624115, "loss/policy_avg": 0.44006651639938354, "lr": 2.79773773006135e-06, "objective/entropy": -40.1391487121582, "objective/kl": 17.43438720703125, "objective/non_score_reward": -1.743438720703125, "objective/rlhf_reward": -6.573754703998565, "objective/scores": 0.1, "policy/approxkl_avg": 32.484825134277344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5174025297164917, "step": 1055, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0002403259277344 }, { "episode": 16912, "epoch": 0.3039867706798001, "loss/policy_avg": 0.014260075986385345, "lr": 2.7975460122699388e-06, "objective/entropy": -85.97489929199219, "objective/kl": 8.647266387939453, "objective/non_score_reward": -0.8647266626358032, "objective/rlhf_reward": -5.458906173706055, "objective/scores": -0.5, "policy/approxkl_avg": 8.572721481323242, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4460605978965759, "step": 1056, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998361349105835 }, { "episode": 16928, "epoch": 0.30427436459718876, "loss/policy_avg": 0.047507576644420624, "lr": 2.7973542944785276e-06, "objective/entropy": 52.07126235961914, "objective/kl": 15.33387565612793, "objective/non_score_reward": -1.5333876609802246, "objective/rlhf_reward": -5.73355042040348, "objective/scores": 0.1, "policy/approxkl_avg": 77.14869689941406, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.703601598739624, "step": 1057, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9998363256454468 }, { "episode": 16944, "epoch": 0.3045619585145774, "loss/policy_avg": 0.8822598457336426, "lr": 2.7971625766871164e-06, "objective/entropy": 198.6245880126953, "objective/kl": 13.298235893249512, "objective/non_score_reward": -1.329823613166809, "objective/rlhf_reward": -7.319294452667236, "objective/scores": -0.5, "policy/approxkl_avg": 109.0306396484375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6059063673019409, "step": 1058, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9981026649475098 }, { "episode": 16960, "epoch": 0.30484955243196604, "loss/policy_avg": 0.1904626488685608, "lr": 2.7969708588957056e-06, "objective/entropy": 218.47293090820312, "objective/kl": 12.414791107177734, "objective/non_score_reward": -1.2414791584014893, "objective/rlhf_reward": -4.565916454792022, "objective/scores": 0.1, "policy/approxkl_avg": 7.559885501861572, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5881438255310059, "step": 1059, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9983572959899902 }, { "episode": 16976, "epoch": 0.30513714634935474, "loss/policy_avg": 0.29011303186416626, "lr": 2.7967791411042944e-06, "objective/entropy": 30.57851791381836, "objective/kl": 10.664090156555176, "objective/non_score_reward": -1.0664091110229492, "objective/rlhf_reward": -6.265636444091797, "objective/scores": -0.5, "policy/approxkl_avg": 52.796512603759766, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7438642978668213, "step": 1060, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9986371994018555 }, { "episode": 16992, "epoch": 0.3054247402667434, "loss/policy_avg": 0.49531346559524536, "lr": 2.7965874233128832e-06, "objective/entropy": 66.80797576904297, "objective/kl": 18.846420288085938, "objective/non_score_reward": -1.8846420049667358, "objective/rlhf_reward": -9.538568496704102, "objective/scores": -0.5, "policy/approxkl_avg": 144.69943237304688, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.48985937237739563, "step": 1061, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9985793828964233 }, { "episode": 17008, "epoch": 0.305712334184132, "loss/policy_avg": 0.423782616853714, "lr": 2.7963957055214725e-06, "objective/entropy": 36.09656524658203, "objective/kl": 16.3355655670166, "objective/non_score_reward": -1.633556604385376, "objective/rlhf_reward": -8.534226417541504, "objective/scores": -0.5, "policy/approxkl_avg": 97.47111511230469, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8207142353057861, "step": 1062, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9972548484802246 }, { "episode": 17024, "epoch": 0.30599992810152066, "loss/policy_avg": 0.569042980670929, "lr": 2.7962039877300613e-06, "objective/entropy": -27.293312072753906, "objective/kl": 17.702495574951172, "objective/non_score_reward": -1.770249605178833, "objective/rlhf_reward": -5.256169851097177, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 163.7218475341797, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.38791435956954956, "step": 1063, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9989525079727173 }, { "episode": 17040, "epoch": 0.3062875220189093, "loss/policy_avg": 0.45904070138931274, "lr": 2.7960122699386505e-06, "objective/entropy": 110.01829528808594, "objective/kl": 17.024635314941406, "objective/non_score_reward": -1.7024635076522827, "objective/rlhf_reward": -6.40985426902771, "objective/scores": 0.1, "policy/approxkl_avg": 31.4630126953125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5167896747589111, "step": 1064, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9971375465393066 }, { "episode": 17056, "epoch": 0.30657511593629794, "loss/policy_avg": 0.446601927280426, "lr": 2.7958205521472393e-06, "objective/entropy": 173.10923767089844, "objective/kl": 11.851961135864258, "objective/non_score_reward": -1.1851961612701416, "objective/rlhf_reward": -4.340784585475921, "objective/scores": 0.1, "policy/approxkl_avg": 11.621267318725586, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.4809800982475281, "step": 1065, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0021729469299316 }, { "episode": 17072, "epoch": 0.3068627098536866, "loss/policy_avg": -0.12707829475402832, "lr": 2.795628834355828e-06, "objective/entropy": 107.12103271484375, "objective/kl": 15.33704662322998, "objective/non_score_reward": -1.5337047576904297, "objective/rlhf_reward": -8.134819030761719, "objective/scores": -0.5, "policy/approxkl_avg": 81.38722229003906, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6840176582336426, "step": 1066, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0004630088806152 }, { "episode": 17088, "epoch": 0.3071503037710752, "loss/policy_avg": 0.6813977360725403, "lr": 2.7954371165644174e-06, "objective/entropy": 69.50862121582031, "objective/kl": 11.214311599731445, "objective/non_score_reward": -1.1214311122894287, "objective/rlhf_reward": -4.085724315047264, "objective/scores": 0.1, "policy/approxkl_avg": 110.50018310546875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.43184494972229004, "step": 1067, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000887155532837 }, { "episode": 17104, "epoch": 0.3074378976884639, "loss/policy_avg": -0.03066210076212883, "lr": 2.795245398773006e-06, "objective/entropy": 105.03985595703125, "objective/kl": 13.49374008178711, "objective/non_score_reward": -1.3493739366531372, "objective/rlhf_reward": -7.397495746612549, "objective/scores": -0.5, "policy/approxkl_avg": 32.43345642089844, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.4681611657142639, "step": 1068, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0003468990325928 }, { "episode": 17120, "epoch": 0.30772549160585255, "loss/policy_avg": 0.0021042972803115845, "lr": 2.7950536809815954e-06, "objective/entropy": 9.283809661865234, "objective/kl": 17.915706634521484, "objective/non_score_reward": -1.791570782661438, "objective/rlhf_reward": -6.766283369064331, "objective/scores": 0.1, "policy/approxkl_avg": 19.99706268310547, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7062077522277832, "step": 1069, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.001181125640869 }, { "episode": 17136, "epoch": 0.3080130855232412, "loss/policy_avg": 0.6168322563171387, "lr": 2.794861963190184e-06, "objective/entropy": 141.08819580078125, "objective/kl": 11.856430053710938, "objective/non_score_reward": -1.185642957687378, "objective/rlhf_reward": -4.34257218837738, "objective/scores": 0.1, "policy/approxkl_avg": 39.94817352294922, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4839014708995819, "step": 1070, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9996006488800049 }, { "episode": 17152, "epoch": 0.30830067944062983, "loss/policy_avg": 0.4380769431591034, "lr": 2.794670245398773e-06, "objective/entropy": 152.61181640625, "objective/kl": 7.851347923278809, "objective/non_score_reward": -0.7851347923278809, "objective/rlhf_reward": -5.140539169311523, "objective/scores": -0.5, "policy/approxkl_avg": 21.251985549926758, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6197154521942139, "step": 1071, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.996599793434143 }, { "episode": 17168, "epoch": 0.3085882733580185, "loss/policy_avg": 0.3455006182193756, "lr": 2.7944785276073622e-06, "objective/entropy": -161.29962158203125, "objective/kl": 14.089064598083496, "objective/non_score_reward": -1.4089064598083496, "objective/rlhf_reward": -7.635625839233398, "objective/scores": -0.5, "policy/approxkl_avg": 83.3675537109375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6021291613578796, "step": 1072, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0006675720214844 }, { "episode": 17184, "epoch": 0.3088758672754071, "loss/policy_avg": -0.06648188829421997, "lr": 2.794286809815951e-06, "objective/entropy": 6.721179962158203, "objective/kl": 12.68543815612793, "objective/non_score_reward": -1.2685437202453613, "objective/rlhf_reward": -7.074174880981445, "objective/scores": -0.5, "policy/approxkl_avg": 27.025358200073242, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.9224163293838501, "step": 1073, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.99924635887146 }, { "episode": 17200, "epoch": 0.30916346119279575, "loss/policy_avg": 0.10548116266727448, "lr": 2.79409509202454e-06, "objective/entropy": -63.38032531738281, "objective/kl": 14.34760570526123, "objective/non_score_reward": -1.434760570526123, "objective/rlhf_reward": -3.7916312915849044, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 5.972915172576904, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5976482033729553, "step": 1074, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9976732730865479 }, { "episode": 17216, "epoch": 0.3094510551101844, "loss/policy_avg": 0.31662654876708984, "lr": 2.793903374233129e-06, "objective/entropy": 161.00772094726562, "objective/kl": 8.747349739074707, "objective/non_score_reward": -0.8747349381446838, "objective/rlhf_reward": -3.0989399313926693, "objective/scores": 0.1, "policy/approxkl_avg": 59.20728302001953, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4308844804763794, "step": 1075, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.99920654296875 }, { "episode": 17232, "epoch": 0.3097386490275731, "loss/policy_avg": 0.5542811751365662, "lr": 2.793711656441718e-06, "objective/entropy": 210.3388671875, "objective/kl": 10.123682975769043, "objective/non_score_reward": -1.0123682022094727, "objective/rlhf_reward": -3.649472868442535, "objective/scores": 0.1, "policy/approxkl_avg": 20.41395378112793, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8292099237442017, "step": 1076, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9961988925933838 }, { "episode": 17248, "epoch": 0.3100262429449617, "loss/policy_avg": 0.8028037548065186, "lr": 2.793519938650307e-06, "objective/entropy": 0.7303447723388672, "objective/kl": 17.22289276123047, "objective/non_score_reward": -1.7222893238067627, "objective/rlhf_reward": -8.88915729522705, "objective/scores": -0.5, "policy/approxkl_avg": 74.21015167236328, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5208621621131897, "step": 1077, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9992649555206299 }, { "episode": 17264, "epoch": 0.31031383686235037, "loss/policy_avg": 0.3188959062099457, "lr": 2.793328220858896e-06, "objective/entropy": -157.26812744140625, "objective/kl": 9.934246063232422, "objective/non_score_reward": -0.993424654006958, "objective/rlhf_reward": -5.973698616027832, "objective/scores": -0.5, "policy/approxkl_avg": 56.13273620605469, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.831667423248291, "step": 1078, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.998191475868225 }, { "episode": 17280, "epoch": 0.310601430779739, "loss/policy_avg": 0.7122023105621338, "lr": 2.7931365030674848e-06, "objective/entropy": -39.97633361816406, "objective/kl": 20.920761108398438, "objective/non_score_reward": -2.092076301574707, "objective/rlhf_reward": -10.368305206298828, "objective/scores": -0.5, "policy/approxkl_avg": 220.3436279296875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6899431943893433, "step": 1079, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9980072975158691 }, { "episode": 17296, "epoch": 0.31088902469712765, "loss/policy_avg": 0.22102391719818115, "lr": 2.7929447852760736e-06, "objective/entropy": 64.91043090820312, "objective/kl": 12.331470489501953, "objective/non_score_reward": -1.233147144317627, "objective/rlhf_reward": -4.532588696479797, "objective/scores": 0.1, "policy/approxkl_avg": 14.989774703979492, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6464887857437134, "step": 1080, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0013198852539062 }, { "episode": 17312, "epoch": 0.3111766186145163, "loss/policy_avg": 0.10684894025325775, "lr": 2.7927530674846624e-06, "objective/entropy": 99.40476989746094, "objective/kl": 13.806692123413086, "objective/non_score_reward": -1.380669116973877, "objective/rlhf_reward": -3.7893431643644964, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 28.995933532714844, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5542625188827515, "step": 1081, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0000174045562744 }, { "episode": 17328, "epoch": 0.3114642125319049, "loss/policy_avg": 0.30782192945480347, "lr": 2.7925613496932516e-06, "objective/entropy": 6.308849334716797, "objective/kl": 8.870867729187012, "objective/non_score_reward": -0.8870867490768433, "objective/rlhf_reward": -1.4256407044091561, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 30.72797203063965, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6196576952934265, "step": 1082, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000112533569336 }, { "episode": 17344, "epoch": 0.3117518064492936, "loss/policy_avg": 0.3716828227043152, "lr": 2.7923696319018404e-06, "objective/entropy": 238.0005340576172, "objective/kl": 11.961174011230469, "objective/non_score_reward": -1.1961175203323364, "objective/rlhf_reward": -0.3844699472188946, "objective/scores": 1.1, "policy/approxkl_avg": 157.48333740234375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7168400287628174, "step": 1083, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9966548681259155 }, { "episode": 17360, "epoch": 0.31203940036668226, "loss/policy_avg": 0.11649422347545624, "lr": 2.7921779141104297e-06, "objective/entropy": 103.03892517089844, "objective/kl": 14.831886291503906, "objective/non_score_reward": -1.4831887483596802, "objective/rlhf_reward": -3.009036113263342, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 112.13179779052734, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.9329808950424194, "step": 1084, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0000717639923096 }, { "episode": 17376, "epoch": 0.3123269942840709, "loss/policy_avg": 0.050925977528095245, "lr": 2.7919861963190185e-06, "objective/entropy": 285.0555419921875, "objective/kl": 12.683483123779297, "objective/non_score_reward": -1.2683483362197876, "objective/rlhf_reward": -7.07339334487915, "objective/scores": -0.5, "policy/approxkl_avg": 1.9751580953598022, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.731339156627655, "step": 1085, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.00105881690979 }, { "episode": 17392, "epoch": 0.31261458820145954, "loss/policy_avg": 0.2870730757713318, "lr": 2.7917944785276073e-06, "objective/entropy": 179.18856811523438, "objective/kl": 21.071481704711914, "objective/non_score_reward": -2.1071481704711914, "objective/rlhf_reward": -8.028593218326568, "objective/scores": 0.1, "policy/approxkl_avg": 2.975924491882324, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.758672833442688, "step": 1086, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9997131824493408 }, { "episode": 17408, "epoch": 0.3129021821188482, "loss/policy_avg": 0.15941983461380005, "lr": 2.7916027607361965e-06, "objective/entropy": 121.9920425415039, "objective/kl": 12.823822975158691, "objective/non_score_reward": -1.2823822498321533, "objective/rlhf_reward": -4.729529297351837, "objective/scores": 0.1, "policy/approxkl_avg": 19.64611053466797, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6388263702392578, "step": 1087, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.00076961517334 }, { "episode": 17424, "epoch": 0.3131897760362368, "loss/policy_avg": -0.06212315335869789, "lr": 2.7914110429447853e-06, "objective/entropy": 38.291786193847656, "objective/kl": 4.688940525054932, "objective/non_score_reward": -0.4688940942287445, "objective/rlhf_reward": -1.4755764067173003, "objective/scores": 0.1, "policy/approxkl_avg": 3.398519515991211, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6102602481842041, "step": 1088, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.00457501411438 }, { "episode": 17440, "epoch": 0.31347736995362546, "loss/policy_avg": 0.9298348426818848, "lr": 2.791219325153374e-06, "objective/entropy": 100.21686553955078, "objective/kl": 13.85842227935791, "objective/non_score_reward": -1.3858420848846436, "objective/rlhf_reward": -3.1433682352304455, "objective/scores": 0.6, "policy/approxkl_avg": 33.18985366821289, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6664258241653442, "step": 1089, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9988486766815186 }, { "episode": 17456, "epoch": 0.3137649638710141, "loss/policy_avg": 0.44839587807655334, "lr": 2.7910276073619634e-06, "objective/entropy": -5.302301406860352, "objective/kl": 11.534767150878906, "objective/non_score_reward": -1.1534767150878906, "objective/rlhf_reward": -2.789078231128763, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 26.897872924804688, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6113318204879761, "step": 1090, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999198079109192 }, { "episode": 17472, "epoch": 0.3140525577884028, "loss/policy_avg": -0.0263579823076725, "lr": 2.790835889570552e-06, "objective/entropy": 53.606441497802734, "objective/kl": 11.902129173278809, "objective/non_score_reward": -1.1902129650115967, "objective/rlhf_reward": -6.760851860046387, "objective/scores": -0.5, "policy/approxkl_avg": 17.736373901367188, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.505375862121582, "step": 1091, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9991669654846191 }, { "episode": 17488, "epoch": 0.31434015170579144, "loss/policy_avg": 0.02518617734313011, "lr": 2.7906441717791414e-06, "objective/entropy": 146.1632843017578, "objective/kl": 13.355687141418457, "objective/non_score_reward": -1.335568904876709, "objective/rlhf_reward": -0.9422759175300595, "objective/scores": 1.1, "policy/approxkl_avg": 39.9644889831543, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6409592032432556, "step": 1092, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9996291399002075 }, { "episode": 17504, "epoch": 0.3146277456231801, "loss/policy_avg": 0.06718438118696213, "lr": 2.79045245398773e-06, "objective/entropy": 31.78778076171875, "objective/kl": 16.474475860595703, "objective/non_score_reward": -1.6474474668502808, "objective/rlhf_reward": -6.189789867401123, "objective/scores": 0.1, "policy/approxkl_avg": 38.23661804199219, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5104482769966125, "step": 1093, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9979634284973145 }, { "episode": 17520, "epoch": 0.3149153395405687, "loss/policy_avg": 0.013752680271863937, "lr": 2.790260736196319e-06, "objective/entropy": -115.1087646484375, "objective/kl": 15.564956665039062, "objective/non_score_reward": -1.5564956665039062, "objective/rlhf_reward": -5.825982904434204, "objective/scores": 0.1, "policy/approxkl_avg": 84.01404571533203, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8566933274269104, "step": 1094, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998520851135254 }, { "episode": 17536, "epoch": 0.31520293345795736, "loss/policy_avg": 0.4633193016052246, "lr": 2.7900690184049083e-06, "objective/entropy": 209.13702392578125, "objective/kl": 13.725728034973145, "objective/non_score_reward": -1.372572898864746, "objective/rlhf_reward": -5.090291237831115, "objective/scores": 0.1, "policy/approxkl_avg": 41.772090911865234, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7493543028831482, "step": 1095, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9992456436157227 }, { "episode": 17552, "epoch": 0.315490527375346, "loss/policy_avg": -0.17638610303401947, "lr": 2.789877300613497e-06, "objective/entropy": 143.67054748535156, "objective/kl": 10.347640037536621, "objective/non_score_reward": -1.0347639322280884, "objective/rlhf_reward": 0.2609441518783573, "objective/scores": 1.1, "policy/approxkl_avg": 35.974178314208984, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6519808173179626, "step": 1096, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.00136399269104 }, { "episode": 17568, "epoch": 0.31577812129273464, "loss/policy_avg": 0.25082990527153015, "lr": 2.789685582822086e-06, "objective/entropy": 135.237060546875, "objective/kl": 11.404836654663086, "objective/non_score_reward": -1.1404837369918823, "objective/rlhf_reward": -6.5619354248046875, "objective/scores": -0.5, "policy/approxkl_avg": 8.249817848205566, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.39255958795547485, "step": 1097, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9993515014648438 }, { "episode": 17584, "epoch": 0.31606571521012333, "loss/policy_avg": 0.24139416217803955, "lr": 2.789493865030675e-06, "objective/entropy": 52.1706428527832, "objective/kl": 11.25928783416748, "objective/non_score_reward": -1.1259288787841797, "objective/rlhf_reward": -4.103715336322784, "objective/scores": 0.1, "policy/approxkl_avg": 21.120098114013672, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.49053245782852173, "step": 1098, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.00132417678833 }, { "episode": 17600, "epoch": 0.316353309127512, "loss/policy_avg": 0.09163187444210052, "lr": 2.789302147239264e-06, "objective/entropy": -32.953826904296875, "objective/kl": 13.274989128112793, "objective/non_score_reward": -1.3274990320205688, "objective/rlhf_reward": -7.309996128082275, "objective/scores": -0.5, "policy/approxkl_avg": 21.395530700683594, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6640526652336121, "step": 1099, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.997539758682251 }, { "episode": 17616, "epoch": 0.3166409030449006, "loss/policy_avg": 0.04822006821632385, "lr": 2.789110429447853e-06, "objective/entropy": 137.935791015625, "objective/kl": 16.86005973815918, "objective/non_score_reward": -1.6860061883926392, "objective/rlhf_reward": -8.744024276733398, "objective/scores": -0.5, "policy/approxkl_avg": 82.54655456542969, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.9151659607887268, "step": 1100, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.996995449066162 }, { "episode": 17632, "epoch": 0.31692849696228925, "loss/policy_avg": 0.3273594379425049, "lr": 2.7889187116564415e-06, "objective/entropy": -49.96745300292969, "objective/kl": 14.063299179077148, "objective/non_score_reward": -1.4063299894332886, "objective/rlhf_reward": -5.225319838523864, "objective/scores": 0.1, "policy/approxkl_avg": 52.512229919433594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7904208898544312, "step": 1101, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9996654987335205 }, { "episode": 17648, "epoch": 0.3172160908796779, "loss/policy_avg": 4.8567938804626465, "lr": 2.7887269938650308e-06, "objective/entropy": 26.933815002441406, "objective/kl": 13.435043334960938, "objective/non_score_reward": -1.3435043096542358, "objective/rlhf_reward": -0.974017313122749, "objective/scores": 1.1, "policy/approxkl_avg": 59.44342041015625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7176166772842407, "step": 1102, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0019140243530273 }, { "episode": 17664, "epoch": 0.31750368479706653, "loss/policy_avg": 0.5443607568740845, "lr": 2.7885352760736196e-06, "objective/entropy": 148.4956817626953, "objective/kl": 14.069759368896484, "objective/non_score_reward": -1.4069759845733643, "objective/rlhf_reward": -3.9660444311505425, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 58.521888732910156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6067029237747192, "step": 1103, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9998931884765625 }, { "episode": 17680, "epoch": 0.31779127871445517, "loss/policy_avg": 0.1637672483921051, "lr": 2.7883435582822084e-06, "objective/entropy": 151.56597900390625, "objective/kl": 12.996841430664062, "objective/non_score_reward": -1.2996841669082642, "objective/rlhf_reward": -7.198736667633057, "objective/scores": -0.5, "policy/approxkl_avg": 35.401649475097656, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6609679460525513, "step": 1104, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0000152587890625 }, { "episode": 17696, "epoch": 0.3180788726318438, "loss/policy_avg": 0.23036092519760132, "lr": 2.7881518404907976e-06, "objective/entropy": -163.6939697265625, "objective/kl": 11.472090721130371, "objective/non_score_reward": -1.1472091674804688, "objective/rlhf_reward": -4.188836640119552, "objective/scores": 0.1, "policy/approxkl_avg": 43.7734489440918, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7076303362846375, "step": 1105, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.997105598449707 }, { "episode": 17712, "epoch": 0.3183664665492325, "loss/policy_avg": 0.19628867506980896, "lr": 2.7879601226993864e-06, "objective/entropy": 42.15092468261719, "objective/kl": 12.80676555633545, "objective/non_score_reward": -1.2806766033172607, "objective/rlhf_reward": -7.122706413269043, "objective/scores": -0.5, "policy/approxkl_avg": 41.839134216308594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6349719762802124, "step": 1106, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9971110820770264 }, { "episode": 17728, "epoch": 0.31865406046662115, "loss/policy_avg": 0.08596844226121902, "lr": 2.7877684049079757e-06, "objective/entropy": 103.33952331542969, "objective/kl": 14.068815231323242, "objective/non_score_reward": -1.4068814516067505, "objective/rlhf_reward": -5.227526044845581, "objective/scores": 0.1, "policy/approxkl_avg": 44.43631362915039, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5943961143493652, "step": 1107, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9989540576934814 }, { "episode": 17744, "epoch": 0.3189416543840098, "loss/policy_avg": -0.21316379308700562, "lr": 2.7875766871165645e-06, "objective/entropy": 90.92996978759766, "objective/kl": 9.894368171691895, "objective/non_score_reward": -0.9894368052482605, "objective/rlhf_reward": -1.557747161388397, "objective/scores": 0.6, "policy/approxkl_avg": 8.975800514221191, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5788801908493042, "step": 1108, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.013200521469116 }, { "episode": 17760, "epoch": 0.3192292483013984, "loss/policy_avg": 0.11199735105037689, "lr": 2.7873849693251533e-06, "objective/entropy": 234.6155548095703, "objective/kl": 15.355046272277832, "objective/non_score_reward": -1.5355048179626465, "objective/rlhf_reward": -4.317190463813851, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 10.193365097045898, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7720480561256409, "step": 1109, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9996470212936401 }, { "episode": 17776, "epoch": 0.31951684221878707, "loss/policy_avg": 0.39977481961250305, "lr": 2.7871932515337425e-06, "objective/entropy": -96.6143569946289, "objective/kl": 21.307180404663086, "objective/non_score_reward": -2.1307179927825928, "objective/rlhf_reward": -6.96661278506811, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 83.29231262207031, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5110878944396973, "step": 1110, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0002777576446533 }, { "episode": 17792, "epoch": 0.3198044361361757, "loss/policy_avg": 0.21299278736114502, "lr": 2.7870015337423313e-06, "objective/entropy": -37.447601318359375, "objective/kl": 13.953254699707031, "objective/non_score_reward": -1.3953255414962769, "objective/rlhf_reward": -7.581302165985107, "objective/scores": -0.5, "policy/approxkl_avg": 49.7848014831543, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.45726099610328674, "step": 1111, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9998581409454346 }, { "episode": 17808, "epoch": 0.32009203005356435, "loss/policy_avg": 0.8188914656639099, "lr": 2.78680981595092e-06, "objective/entropy": 115.01608276367188, "objective/kl": 13.266737937927246, "objective/non_score_reward": -1.3266738653182983, "objective/rlhf_reward": -4.906695282459259, "objective/scores": 0.1, "policy/approxkl_avg": 30.617708206176758, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6811967492103577, "step": 1112, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.996246099472046 }, { "episode": 17824, "epoch": 0.320379623970953, "loss/policy_avg": 0.4930746555328369, "lr": 2.7866180981595094e-06, "objective/entropy": -53.15104675292969, "objective/kl": 19.036401748657227, "objective/non_score_reward": -1.9036401510238647, "objective/rlhf_reward": -5.491854207889114, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 156.63845825195312, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6161982417106628, "step": 1113, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9972939491271973 }, { "episode": 17840, "epoch": 0.3206672178883417, "loss/policy_avg": -0.006604592781513929, "lr": 2.786426380368098e-06, "objective/entropy": 11.049800872802734, "objective/kl": 15.964569091796875, "objective/non_score_reward": -1.59645676612854, "objective/rlhf_reward": -5.985827451944351, "objective/scores": 0.1, "policy/approxkl_avg": 111.54328155517578, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6989386081695557, "step": 1114, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9984581470489502 }, { "episode": 17856, "epoch": 0.3209548118057303, "loss/policy_avg": 0.34559106826782227, "lr": 2.7862346625766874e-06, "objective/entropy": 62.411766052246094, "objective/kl": 14.781805992126465, "objective/non_score_reward": -1.4781804084777832, "objective/rlhf_reward": -7.912721633911133, "objective/scores": -0.5, "policy/approxkl_avg": 61.101905822753906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7949924468994141, "step": 1115, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9982895851135254 }, { "episode": 17872, "epoch": 0.32124240572311896, "loss/policy_avg": 0.17752394080162048, "lr": 2.7860429447852762e-06, "objective/entropy": -58.06376647949219, "objective/kl": 11.518682479858398, "objective/non_score_reward": -1.151868224143982, "objective/rlhf_reward": -4.20747292637825, "objective/scores": 0.1, "policy/approxkl_avg": 13.080097198486328, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6485680937767029, "step": 1116, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0006484985351562 }, { "episode": 17888, "epoch": 0.3215299996405076, "loss/policy_avg": -0.11512506008148193, "lr": 2.785851226993865e-06, "objective/entropy": 34.327606201171875, "objective/kl": 6.476920127868652, "objective/non_score_reward": -0.6476920247077942, "objective/rlhf_reward": -4.590767860412598, "objective/scores": -0.5, "policy/approxkl_avg": 10.352293014526367, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6617291569709778, "step": 1117, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9993085861206055 }, { "episode": 17904, "epoch": 0.32181759355789624, "loss/policy_avg": 0.160440132021904, "lr": 2.7856595092024543e-06, "objective/entropy": 60.79460906982422, "objective/kl": 10.265054702758789, "objective/non_score_reward": -1.0265053510665894, "objective/rlhf_reward": -3.706021463871002, "objective/scores": 0.1, "policy/approxkl_avg": 32.06240463256836, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6272501945495605, "step": 1118, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9996984004974365 }, { "episode": 17920, "epoch": 0.3221051874752849, "loss/policy_avg": 0.043839361518621445, "lr": 2.785467791411043e-06, "objective/entropy": 38.43065643310547, "objective/kl": 5.596960067749023, "objective/non_score_reward": -0.5596960783004761, "objective/rlhf_reward": -4.238784313201904, "objective/scores": -0.5, "policy/approxkl_avg": 11.959665298461914, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7662742137908936, "step": 1119, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9981067180633545 }, { "episode": 17936, "epoch": 0.3223927813926735, "loss/policy_avg": 0.08080266416072845, "lr": 2.7852760736196323e-06, "objective/entropy": -86.41799926757812, "objective/kl": 9.186336517333984, "objective/non_score_reward": -0.9186336398124695, "objective/rlhf_reward": -0.7508156045686928, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 47.581703186035156, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.609947681427002, "step": 1120, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9973444938659668 }, { "episode": 17952, "epoch": 0.3226803753100622, "loss/policy_avg": 0.4055587947368622, "lr": 2.785084355828221e-06, "objective/entropy": 73.900390625, "objective/kl": 15.857680320739746, "objective/non_score_reward": -1.5857681035995483, "objective/rlhf_reward": -1.9430725932121273, "objective/scores": 1.1, "policy/approxkl_avg": 56.220314025878906, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.5025008916854858, "step": 1121, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0059289932250977 }, { "episode": 17968, "epoch": 0.32296796922745086, "loss/policy_avg": -0.31560468673706055, "lr": 2.78489263803681e-06, "objective/entropy": 112.73280334472656, "objective/kl": 10.532523155212402, "objective/non_score_reward": -1.0532522201538086, "objective/rlhf_reward": 0.18699111938476598, "objective/scores": 1.1, "policy/approxkl_avg": 13.733489990234375, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.8091140985488892, "step": 1122, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0048389434814453 }, { "episode": 17984, "epoch": 0.3232555631448395, "loss/policy_avg": 0.011030721478164196, "lr": 2.7847009202453987e-06, "objective/entropy": 20.075477600097656, "objective/kl": 13.247042655944824, "objective/non_score_reward": -1.3247044086456299, "objective/rlhf_reward": -4.898817485570907, "objective/scores": 0.1, "policy/approxkl_avg": 54.85459899902344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5287940502166748, "step": 1123, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.001558303833008 }, { "episode": 18000, "epoch": 0.32354315706222814, "loss/policy_avg": 0.34385043382644653, "lr": 2.7845092024539875e-06, "objective/entropy": 26.090667724609375, "objective/kl": 16.442066192626953, "objective/non_score_reward": -1.6442067623138428, "objective/rlhf_reward": -8.576827049255371, "objective/scores": -0.5, "policy/approxkl_avg": 29.108585357666016, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6301190853118896, "step": 1124, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9967900514602661 }, { "episode": 18016, "epoch": 0.3238307509796168, "loss/policy_avg": 0.019408032298088074, "lr": 2.7843174846625768e-06, "objective/entropy": 56.218666076660156, "objective/kl": 21.929851531982422, "objective/non_score_reward": -2.1929850578308105, "objective/rlhf_reward": -10.771940231323242, "objective/scores": -0.5, "policy/approxkl_avg": 70.08192443847656, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7020080089569092, "step": 1125, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9998396635055542 }, { "episode": 18032, "epoch": 0.3241183448970054, "loss/policy_avg": -0.09649811685085297, "lr": 2.7841257668711656e-06, "objective/entropy": 111.22513580322266, "objective/kl": 7.8993682861328125, "objective/non_score_reward": -0.7899367809295654, "objective/rlhf_reward": 1.2402529209852222, "objective/scores": 1.1, "policy/approxkl_avg": 15.892783164978027, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5796664357185364, "step": 1126, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0133345127105713 }, { "episode": 18048, "epoch": 0.32440593881439406, "loss/policy_avg": 0.012353288009762764, "lr": 2.7839340490797544e-06, "objective/entropy": 35.718894958496094, "objective/kl": 12.086095809936523, "objective/non_score_reward": -1.2086095809936523, "objective/rlhf_reward": -3.2303187585511974, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 25.718326568603516, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6076189279556274, "step": 1127, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0017600059509277 }, { "episode": 18064, "epoch": 0.3246935327317827, "loss/policy_avg": 0.3458269238471985, "lr": 2.7837423312883436e-06, "objective/entropy": 187.83889770507812, "objective/kl": 11.952428817749023, "objective/non_score_reward": -1.1952428817749023, "objective/rlhf_reward": -3.0476382980744043, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 6.217028617858887, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5002806186676025, "step": 1128, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9992327690124512 }, { "episode": 18080, "epoch": 0.3249811266491714, "loss/policy_avg": 0.142012819647789, "lr": 2.7835506134969324e-06, "objective/entropy": -29.551471710205078, "objective/kl": 11.834535598754883, "objective/non_score_reward": -1.1834536790847778, "objective/rlhf_reward": -2.7864033979939773, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 20.23397445678711, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5848058462142944, "step": 1129, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9986717700958252 }, { "episode": 18096, "epoch": 0.32526872056656003, "loss/policy_avg": 0.004172764718532562, "lr": 2.7833588957055217e-06, "objective/entropy": -160.0207977294922, "objective/kl": 14.571106910705566, "objective/non_score_reward": -1.4571107625961304, "objective/rlhf_reward": -4.095109657446543, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 59.20903015136719, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7981671094894409, "step": 1130, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9990029335021973 }, { "episode": 18112, "epoch": 0.32555631448394867, "loss/policy_avg": 0.15084603428840637, "lr": 2.7831671779141105e-06, "objective/entropy": 96.68032836914062, "objective/kl": 13.1805419921875, "objective/non_score_reward": -1.318053960800171, "objective/rlhf_reward": -3.1495099685349803, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 25.39088249206543, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7604236602783203, "step": 1131, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000643730163574 }, { "episode": 18128, "epoch": 0.3258439084013373, "loss/policy_avg": 0.19803990423679352, "lr": 2.7829754601226993e-06, "objective/entropy": 13.389362335205078, "objective/kl": 16.05734634399414, "objective/non_score_reward": -1.6057347059249878, "objective/rlhf_reward": -8.42293930053711, "objective/scores": -0.5, "policy/approxkl_avg": 133.68670654296875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7763709425926208, "step": 1132, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9998838901519775 }, { "episode": 18144, "epoch": 0.32613150231872595, "loss/policy_avg": -0.009128901176154613, "lr": 2.7827837423312885e-06, "objective/entropy": -21.005226135253906, "objective/kl": 17.77182388305664, "objective/non_score_reward": -1.7771823406219482, "objective/rlhf_reward": -4.1850105866205425, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 100.30049896240234, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5907350778579712, "step": 1133, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9987881183624268 }, { "episode": 18160, "epoch": 0.3264190962361146, "loss/policy_avg": 0.21724221110343933, "lr": 2.7825920245398773e-06, "objective/entropy": -26.32037353515625, "objective/kl": 18.74835205078125, "objective/non_score_reward": -1.8748352527618408, "objective/rlhf_reward": -7.099340951442718, "objective/scores": 0.1, "policy/approxkl_avg": 106.53759765625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.843207836151123, "step": 1134, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9977933168411255 }, { "episode": 18176, "epoch": 0.32670669015350323, "loss/policy_avg": 0.2329787313938141, "lr": 2.7824003067484666e-06, "objective/entropy": 129.7427978515625, "objective/kl": 12.383695602416992, "objective/non_score_reward": -1.2383698225021362, "objective/rlhf_reward": -2.830773057714973, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 47.80897521972656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6198921203613281, "step": 1135, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9971883296966553 }, { "episode": 18192, "epoch": 0.3269942840708919, "loss/policy_avg": 0.40199702978134155, "lr": 2.7822085889570554e-06, "objective/entropy": 108.27337646484375, "objective/kl": 10.543985366821289, "objective/non_score_reward": -1.0543984174728394, "objective/rlhf_reward": -6.217593669891357, "objective/scores": -0.5, "policy/approxkl_avg": 57.99873352050781, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5256673097610474, "step": 1136, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000418186187744 }, { "episode": 18208, "epoch": 0.32728187798828057, "loss/policy_avg": 0.06313613057136536, "lr": 2.782016871165644e-06, "objective/entropy": 143.59881591796875, "objective/kl": 13.319164276123047, "objective/non_score_reward": -1.3319165706634521, "objective/rlhf_reward": -7.327666282653809, "objective/scores": -0.5, "policy/approxkl_avg": 18.44135284423828, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5660614967346191, "step": 1137, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9981001615524292 }, { "episode": 18224, "epoch": 0.3275694719056692, "loss/policy_avg": 0.302053838968277, "lr": 2.7818251533742334e-06, "objective/entropy": 220.9147186279297, "objective/kl": 15.305615425109863, "objective/non_score_reward": -1.5305616855621338, "objective/rlhf_reward": -4.518126699987965, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 81.97419738769531, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8646837472915649, "step": 1138, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9971197843551636 }, { "episode": 18240, "epoch": 0.32785706582305785, "loss/policy_avg": 1.9834920167922974, "lr": 2.7816334355828222e-06, "objective/entropy": 120.74797821044922, "objective/kl": 13.918863296508789, "objective/non_score_reward": -1.3918863534927368, "objective/rlhf_reward": -5.167545443773269, "objective/scores": 0.1, "policy/approxkl_avg": 63.093605041503906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6175976991653442, "step": 1139, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0009703636169434 }, { "episode": 18256, "epoch": 0.3281446597404465, "loss/policy_avg": 0.7380589842796326, "lr": 2.781441717791411e-06, "objective/entropy": -91.15982055664062, "objective/kl": 11.536355018615723, "objective/non_score_reward": -1.1536355018615723, "objective/rlhf_reward": -3.010421965185719, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 42.84178161621094, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5434945821762085, "step": 1140, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000872850418091 }, { "episode": 18272, "epoch": 0.3284322536578351, "loss/policy_avg": 0.3735503554344177, "lr": 2.7812500000000003e-06, "objective/entropy": 37.28156280517578, "objective/kl": 13.715288162231445, "objective/non_score_reward": -1.3715288639068604, "objective/rlhf_reward": -7.486115455627441, "objective/scores": -0.5, "policy/approxkl_avg": 58.20576095581055, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7024419903755188, "step": 1141, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9980452060699463 }, { "episode": 18288, "epoch": 0.32871984757522377, "loss/policy_avg": 0.10260803997516632, "lr": 2.781058282208589e-06, "objective/entropy": -110.40155029296875, "objective/kl": 10.552820205688477, "objective/non_score_reward": -1.0552821159362793, "objective/rlhf_reward": -3.8211283445358273, "objective/scores": 0.1, "policy/approxkl_avg": 24.53842544555664, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.618804931640625, "step": 1142, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0004189014434814 }, { "episode": 18304, "epoch": 0.3290074414926124, "loss/policy_avg": -0.2922723889350891, "lr": 2.7808665644171783e-06, "objective/entropy": 43.667179107666016, "objective/kl": 15.767911911010742, "objective/non_score_reward": -1.576791524887085, "objective/rlhf_reward": -5.907165741920471, "objective/scores": 0.1, "policy/approxkl_avg": 17.228885650634766, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7186756134033203, "step": 1143, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0082926750183105 }, { "episode": 18320, "epoch": 0.3292950354100011, "loss/policy_avg": 0.15897995233535767, "lr": 2.780674846625767e-06, "objective/entropy": 28.125289916992188, "objective/kl": 16.386962890625, "objective/non_score_reward": -1.638696312904358, "objective/rlhf_reward": -8.554784774780273, "objective/scores": -0.5, "policy/approxkl_avg": 72.38021850585938, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.663409948348999, "step": 1144, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9980506896972656 }, { "episode": 18336, "epoch": 0.32958262932738974, "loss/policy_avg": 0.10999254137277603, "lr": 2.780483128834356e-06, "objective/entropy": -168.7968292236328, "objective/kl": 4.2082414627075195, "objective/non_score_reward": -0.4208241403102875, "objective/rlhf_reward": -1.28329656124115, "objective/scores": 0.1, "policy/approxkl_avg": 6.389283180236816, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.515932559967041, "step": 1145, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000765085220337 }, { "episode": 18352, "epoch": 0.3298702232447784, "loss/policy_avg": 0.08765261620283127, "lr": 2.7802914110429447e-06, "objective/entropy": -20.226409912109375, "objective/kl": 21.756072998046875, "objective/non_score_reward": -2.175607442855835, "objective/rlhf_reward": -8.302429950237274, "objective/scores": 0.1, "policy/approxkl_avg": 67.168212890625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5186696648597717, "step": 1146, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9991587400436401 }, { "episode": 18368, "epoch": 0.330157817162167, "loss/policy_avg": 0.15273040533065796, "lr": 2.7800996932515335e-06, "objective/entropy": 31.48741912841797, "objective/kl": 19.75265884399414, "objective/non_score_reward": -1.975265622138977, "objective/rlhf_reward": -3.5010626077651974, "objective/scores": 1.1, "policy/approxkl_avg": 20.612037658691406, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4049757719039917, "step": 1147, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9988133907318115 }, { "episode": 18384, "epoch": 0.33044541107955566, "loss/policy_avg": -0.058190956711769104, "lr": 2.7799079754601228e-06, "objective/entropy": -115.31792449951172, "objective/kl": 17.150239944458008, "objective/non_score_reward": -1.7150239944458008, "objective/rlhf_reward": -2.4600957393646237, "objective/scores": 1.1, "policy/approxkl_avg": 95.23487854003906, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.754326343536377, "step": 1148, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9976162910461426 }, { "episode": 18400, "epoch": 0.3307330049969443, "loss/policy_avg": 0.23397204279899597, "lr": 2.7797162576687116e-06, "objective/entropy": 135.87664794921875, "objective/kl": 21.47640609741211, "objective/non_score_reward": -2.1476407051086426, "objective/rlhf_reward": -8.190562522411346, "objective/scores": 0.1, "policy/approxkl_avg": 22.630273818969727, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7489320039749146, "step": 1149, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9979724884033203 }, { "episode": 18416, "epoch": 0.33102059891433294, "loss/policy_avg": 0.2109401971101761, "lr": 2.7795245398773004e-06, "objective/entropy": -96.35713195800781, "objective/kl": 7.339359283447266, "objective/non_score_reward": -0.7339359521865845, "objective/rlhf_reward": 1.4642561316490177, "objective/scores": 1.1, "policy/approxkl_avg": 16.709714889526367, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5943701863288879, "step": 1150, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000924587249756 }, { "episode": 18432, "epoch": 0.3313081928317216, "loss/policy_avg": 0.27022865414619446, "lr": 2.7793328220858896e-06, "objective/entropy": -70.71237182617188, "objective/kl": 12.857504844665527, "objective/non_score_reward": -1.2857506275177002, "objective/rlhf_reward": -4.7430025100708, "objective/scores": 0.1, "policy/approxkl_avg": 84.81177520751953, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8109536170959473, "step": 1151, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.997032880783081 }, { "episode": 18448, "epoch": 0.3315957867491103, "loss/policy_avg": -0.1951054185628891, "lr": 2.7791411042944784e-06, "objective/entropy": -112.49846649169922, "objective/kl": 10.639480590820312, "objective/non_score_reward": -1.0639480352401733, "objective/rlhf_reward": -2.6995326717763692, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 1.8408970832824707, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4948251247406006, "step": 1152, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0016403198242188 }, { "episode": 18464, "epoch": 0.3318833806664989, "loss/policy_avg": 0.6007785797119141, "lr": 2.7789493865030677e-06, "objective/entropy": 2.571552276611328, "objective/kl": 17.329076766967773, "objective/non_score_reward": -1.732907772064209, "objective/rlhf_reward": -2.5316311478614804, "objective/scores": 1.1, "policy/approxkl_avg": 14.923664093017578, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6116815805435181, "step": 1153, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9988734722137451 }, { "episode": 18480, "epoch": 0.33217097458388756, "loss/policy_avg": 1.0791575908660889, "lr": 2.7787576687116565e-06, "objective/entropy": 186.5494384765625, "objective/kl": 21.728107452392578, "objective/non_score_reward": -2.1728107929229736, "objective/rlhf_reward": -6.291242933273315, "objective/scores": 0.6, "policy/approxkl_avg": 129.72695922851562, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6914939284324646, "step": 1154, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9973831176757812 }, { "episode": 18496, "epoch": 0.3324585685012762, "loss/policy_avg": 0.3817293345928192, "lr": 2.7785659509202453e-06, "objective/entropy": -144.4022216796875, "objective/kl": 14.357304573059082, "objective/non_score_reward": -1.4357304573059082, "objective/rlhf_reward": -7.742921829223633, "objective/scores": -0.5, "policy/approxkl_avg": 1.8433094024658203, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6484241485595703, "step": 1155, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.001634120941162 }, { "episode": 18512, "epoch": 0.33274616241866484, "loss/policy_avg": 0.34997937083244324, "lr": 2.7783742331288345e-06, "objective/entropy": -131.33566284179688, "objective/kl": 13.435831069946289, "objective/non_score_reward": -1.343583345413208, "objective/rlhf_reward": -4.974333143234253, "objective/scores": 0.1, "policy/approxkl_avg": 34.9140739440918, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6474661231040955, "step": 1156, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9971636533737183 }, { "episode": 18528, "epoch": 0.3330337563360535, "loss/policy_avg": 0.39980053901672363, "lr": 2.7781825153374233e-06, "objective/entropy": -103.27327728271484, "objective/kl": 17.875957489013672, "objective/non_score_reward": -1.787595510482788, "objective/rlhf_reward": -6.750381997227668, "objective/scores": 0.1, "policy/approxkl_avg": 52.51301574707031, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6814613938331604, "step": 1157, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9991803169250488 }, { "episode": 18544, "epoch": 0.3333213502534421, "loss/policy_avg": 0.3572099804878235, "lr": 2.7779907975460126e-06, "objective/entropy": -19.225521087646484, "objective/kl": 13.611352920532227, "objective/non_score_reward": -1.3611352443695068, "objective/rlhf_reward": -5.044541215896606, "objective/scores": 0.1, "policy/approxkl_avg": 31.89737892150879, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7011643648147583, "step": 1158, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9996830224990845 }, { "episode": 18560, "epoch": 0.3336089441708308, "loss/policy_avg": 0.35091787576675415, "lr": 2.7777990797546014e-06, "objective/entropy": -124.19099426269531, "objective/kl": 8.701774597167969, "objective/non_score_reward": -0.8701775074005127, "objective/rlhf_reward": -3.080709910392761, "objective/scores": 0.1, "policy/approxkl_avg": 71.30574035644531, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5850265026092529, "step": 1159, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9987276792526245 }, { "episode": 18576, "epoch": 0.33389653808821945, "loss/policy_avg": 0.10446971654891968, "lr": 2.77760736196319e-06, "objective/entropy": 303.48541259765625, "objective/kl": 14.655348777770996, "objective/non_score_reward": -1.4655349254608154, "objective/rlhf_reward": -7.862139701843262, "objective/scores": -0.5, "policy/approxkl_avg": 22.810016632080078, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.9821838736534119, "step": 1160, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9996795654296875 }, { "episode": 18592, "epoch": 0.3341841320056081, "loss/policy_avg": 0.10929669439792633, "lr": 2.7774156441717794e-06, "objective/entropy": -19.155555725097656, "objective/kl": 12.892605781555176, "objective/non_score_reward": -1.2892606258392334, "objective/rlhf_reward": -3.209631155209477, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 19.19078826904297, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5724284648895264, "step": 1161, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9966273307800293 }, { "episode": 18608, "epoch": 0.33447172592299673, "loss/policy_avg": 0.31029993295669556, "lr": 2.7772239263803682e-06, "objective/entropy": 104.80690002441406, "objective/kl": 18.291303634643555, "objective/non_score_reward": -1.8291301727294922, "objective/rlhf_reward": -4.392801736236784, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 215.850830078125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6571725606918335, "step": 1162, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998605728149414 }, { "episode": 18624, "epoch": 0.33475931984038537, "loss/policy_avg": -0.5111748576164246, "lr": 2.777032208588957e-06, "objective/entropy": -37.404815673828125, "objective/kl": 12.505797386169434, "objective/non_score_reward": -1.250579833984375, "objective/rlhf_reward": -4.60231921672821, "objective/scores": 0.1, "policy/approxkl_avg": 10.684650421142578, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.628068208694458, "step": 1163, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0031986236572266 }, { "episode": 18640, "epoch": 0.335046913757774, "loss/policy_avg": 0.5985330939292908, "lr": 2.7768404907975463e-06, "objective/entropy": 85.2934799194336, "objective/kl": 21.97635269165039, "objective/non_score_reward": -2.1976354122161865, "objective/rlhf_reward": -8.39054157435894, "objective/scores": 0.1, "policy/approxkl_avg": 46.09657669067383, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8425604104995728, "step": 1164, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9965444803237915 }, { "episode": 18656, "epoch": 0.33533450767516265, "loss/policy_avg": -0.33150577545166016, "lr": 2.776648773006135e-06, "objective/entropy": 17.754005432128906, "objective/kl": 11.162395477294922, "objective/non_score_reward": -1.1162395477294922, "objective/rlhf_reward": -6.464958667755127, "objective/scores": -0.5, "policy/approxkl_avg": 37.122459411621094, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8440073132514954, "step": 1165, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0027763843536377 }, { "episode": 18672, "epoch": 0.3356221015925513, "loss/policy_avg": 0.5681189298629761, "lr": 2.7764570552147243e-06, "objective/entropy": 174.1182861328125, "objective/kl": 13.990375518798828, "objective/non_score_reward": -1.399037480354309, "objective/rlhf_reward": -7.596149921417236, "objective/scores": -0.5, "policy/approxkl_avg": 37.488868713378906, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4357028007507324, "step": 1166, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.000075340270996 }, { "episode": 18688, "epoch": 0.33590969550994, "loss/policy_avg": 0.6947311162948608, "lr": 2.7762653374233127e-06, "objective/entropy": -161.28807067871094, "objective/kl": 13.998380661010742, "objective/non_score_reward": -1.3998382091522217, "objective/rlhf_reward": -1.199352583289146, "objective/scores": 1.1, "policy/approxkl_avg": 57.86552429199219, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6250740885734558, "step": 1167, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9996944665908813 }, { "episode": 18704, "epoch": 0.3361972894273286, "loss/policy_avg": 0.1502753645181656, "lr": 2.776073619631902e-06, "objective/entropy": 189.06851196289062, "objective/kl": 6.054288864135742, "objective/non_score_reward": -0.6054288148880005, "objective/rlhf_reward": -2.021715438365936, "objective/scores": 0.1, "policy/approxkl_avg": 3.8485565185546875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.48803550004959106, "step": 1168, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.998919129371643 }, { "episode": 18720, "epoch": 0.33648488334471727, "loss/policy_avg": 0.42749345302581787, "lr": 2.7758819018404907e-06, "objective/entropy": -281.3139953613281, "objective/kl": 12.804277420043945, "objective/non_score_reward": -1.2804276943206787, "objective/rlhf_reward": -4.721710836887359, "objective/scores": 0.1, "policy/approxkl_avg": 67.71830749511719, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7152395844459534, "step": 1169, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9967622756958008 }, { "episode": 18736, "epoch": 0.3367724772621059, "loss/policy_avg": 0.06842047721147537, "lr": 2.7756901840490796e-06, "objective/entropy": -40.159446716308594, "objective/kl": 17.861766815185547, "objective/non_score_reward": -1.7861766815185547, "objective/rlhf_reward": -6.744707024097442, "objective/scores": 0.1, "policy/approxkl_avg": 21.40418243408203, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5496320724487305, "step": 1170, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001437187194824 }, { "episode": 18752, "epoch": 0.33706007117949455, "loss/policy_avg": 0.7477625608444214, "lr": 2.7754984662576688e-06, "objective/entropy": -32.50873565673828, "objective/kl": 17.935222625732422, "objective/non_score_reward": -1.7935223579406738, "objective/rlhf_reward": -4.774089819192886, "objective/scores": 0.6, "policy/approxkl_avg": 59.91670227050781, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.615089476108551, "step": 1171, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9974488019943237 }, { "episode": 18768, "epoch": 0.3373476650968832, "loss/policy_avg": -0.10740236937999725, "lr": 2.7753067484662576e-06, "objective/entropy": -91.79651641845703, "objective/kl": 12.429764747619629, "objective/non_score_reward": -1.242976427078247, "objective/rlhf_reward": -4.571905589103698, "objective/scores": 0.1, "policy/approxkl_avg": 12.65214729309082, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6722214221954346, "step": 1172, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0004234313964844 }, { "episode": 18784, "epoch": 0.3376352590142718, "loss/policy_avg": -0.14357039332389832, "lr": 2.775115030674847e-06, "objective/entropy": -119.25468444824219, "objective/kl": 11.105037689208984, "objective/non_score_reward": -1.110503911972046, "objective/rlhf_reward": -2.885756283011988, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 1.2598841190338135, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.7479334473609924, "step": 1173, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0019779205322266 }, { "episode": 18800, "epoch": 0.3379228529316605, "loss/policy_avg": 0.3568786382675171, "lr": 2.7749233128834356e-06, "objective/entropy": 276.316650390625, "objective/kl": 14.202279090881348, "objective/non_score_reward": -1.4202280044555664, "objective/rlhf_reward": -3.5582059047379833, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 66.28675842285156, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8677693605422974, "step": 1174, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9986944198608398 }, { "episode": 18816, "epoch": 0.33821044684904916, "loss/policy_avg": 0.37396079301834106, "lr": 2.7747315950920244e-06, "objective/entropy": 104.71287536621094, "objective/kl": 18.649734497070312, "objective/non_score_reward": -1.8649733066558838, "objective/rlhf_reward": -7.059893345832824, "objective/scores": 0.1, "policy/approxkl_avg": 59.60235595703125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.42154908180236816, "step": 1175, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9960224628448486 }, { "episode": 18832, "epoch": 0.3384980407664378, "loss/policy_avg": -0.2894097566604614, "lr": 2.7745398773006137e-06, "objective/entropy": -84.91337585449219, "objective/kl": 12.799016952514648, "objective/non_score_reward": -1.2799016237258911, "objective/rlhf_reward": -0.7196066290140148, "objective/scores": 1.1, "policy/approxkl_avg": 67.74818420410156, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6464543342590332, "step": 1176, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.00795316696167 }, { "episode": 18848, "epoch": 0.33878563468382644, "loss/policy_avg": 0.3645285367965698, "lr": 2.7743481595092025e-06, "objective/entropy": 168.74581909179688, "objective/kl": 17.867380142211914, "objective/non_score_reward": -1.786738395690918, "objective/rlhf_reward": -2.746953225135803, "objective/scores": 1.1, "policy/approxkl_avg": 77.44456481933594, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7084821462631226, "step": 1177, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9979579448699951 }, { "episode": 18864, "epoch": 0.3390732286012151, "loss/policy_avg": 0.4001834988594055, "lr": 2.7741564417177913e-06, "objective/entropy": 99.16302490234375, "objective/kl": 13.607667922973633, "objective/non_score_reward": -1.360766887664795, "objective/rlhf_reward": -1.0430678486824032, "objective/scores": 1.1, "policy/approxkl_avg": 28.074718475341797, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5342350006103516, "step": 1178, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9996448755264282 }, { "episode": 18880, "epoch": 0.3393608225186037, "loss/policy_avg": 0.10984847694635391, "lr": 2.7739647239263805e-06, "objective/entropy": 180.45806884765625, "objective/kl": 11.570047378540039, "objective/non_score_reward": -1.157004952430725, "objective/rlhf_reward": -4.228019899129867, "objective/scores": 0.1, "policy/approxkl_avg": 43.99121856689453, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5318220257759094, "step": 1179, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9975459575653076 }, { "episode": 18896, "epoch": 0.33964841643599236, "loss/policy_avg": 0.3910417854785919, "lr": 2.7737730061349693e-06, "objective/entropy": -8.10009765625, "objective/kl": 8.290020942687988, "objective/non_score_reward": -0.8290020823478699, "objective/rlhf_reward": 1.0839917749166492, "objective/scores": 1.1, "policy/approxkl_avg": 37.283531188964844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5130699872970581, "step": 1180, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9991108179092407 }, { "episode": 18912, "epoch": 0.339936010353381, "loss/policy_avg": 0.014388211071491241, "lr": 2.7735812883435586e-06, "objective/entropy": 175.02603149414062, "objective/kl": 15.754423141479492, "objective/non_score_reward": -1.5754423141479492, "objective/rlhf_reward": -8.301769256591797, "objective/scores": -0.5, "policy/approxkl_avg": 7.509084701538086, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7703262567520142, "step": 1181, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9978232383728027 }, { "episode": 18928, "epoch": 0.3402236042707697, "loss/policy_avg": 0.055365175008773804, "lr": 2.7733895705521474e-06, "objective/entropy": -334.13519287109375, "objective/kl": 8.783041000366211, "objective/non_score_reward": -0.878304123878479, "objective/rlhf_reward": -3.1132164955139157, "objective/scores": 0.1, "policy/approxkl_avg": 40.43748474121094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6499872207641602, "step": 1182, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.999503493309021 }, { "episode": 18944, "epoch": 0.34051119818815834, "loss/policy_avg": 0.07409346848726273, "lr": 2.773197852760736e-06, "objective/entropy": -83.13888549804688, "objective/kl": 14.876424789428711, "objective/non_score_reward": -1.487642526626587, "objective/rlhf_reward": -5.550569987297058, "objective/scores": 0.1, "policy/approxkl_avg": 71.0294189453125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8472854495048523, "step": 1183, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9958550930023193 }, { "episode": 18960, "epoch": 0.340798792105547, "loss/policy_avg": 0.9990078806877136, "lr": 2.7730061349693254e-06, "objective/entropy": 20.914791107177734, "objective/kl": 14.88058853149414, "objective/non_score_reward": -1.4880588054656982, "objective/rlhf_reward": -5.552235579490661, "objective/scores": 0.1, "policy/approxkl_avg": 18.125185012817383, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4836152493953705, "step": 1184, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9983336925506592 }, { "episode": 18976, "epoch": 0.3410863860229356, "loss/policy_avg": 0.048895176500082016, "lr": 2.7728144171779142e-06, "objective/entropy": 89.80201721191406, "objective/kl": 12.485555648803711, "objective/non_score_reward": -1.2485556602478027, "objective/rlhf_reward": -4.594222491979599, "objective/scores": 0.1, "policy/approxkl_avg": 1.1111341714859009, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4978290796279907, "step": 1185, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0011367797851562 }, { "episode": 18992, "epoch": 0.34137397994032426, "loss/policy_avg": 0.6291418075561523, "lr": 2.7726226993865035e-06, "objective/entropy": 154.68283081054688, "objective/kl": 14.19253921508789, "objective/non_score_reward": -1.419253945350647, "objective/rlhf_reward": -3.5543094298997264, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 38.20229721069336, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7595288753509521, "step": 1186, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.998978614807129 }, { "episode": 19008, "epoch": 0.3416615738577129, "loss/policy_avg": 0.3727290630340576, "lr": 2.7724309815950923e-06, "objective/entropy": 136.96429443359375, "objective/kl": 8.332364082336426, "objective/non_score_reward": -0.8332364559173584, "objective/rlhf_reward": -5.332945823669434, "objective/scores": -0.5, "policy/approxkl_avg": 17.576475143432617, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7260855436325073, "step": 1187, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9986281394958496 }, { "episode": 19024, "epoch": 0.34194916777510154, "loss/policy_avg": 0.2877521514892578, "lr": 2.772239263803681e-06, "objective/entropy": -1.9418220520019531, "objective/kl": 14.981258392333984, "objective/non_score_reward": -1.4981257915496826, "objective/rlhf_reward": -3.8697967848935466, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 115.05579376220703, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6772172451019287, "step": 1188, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9973249435424805 }, { "episode": 19040, "epoch": 0.3422367616924902, "loss/policy_avg": 0.3203403949737549, "lr": 2.7720475460122703e-06, "objective/entropy": 126.06346893310547, "objective/kl": 14.789588928222656, "objective/non_score_reward": -1.4789589643478394, "objective/rlhf_reward": -5.515835797786712, "objective/scores": 0.1, "policy/approxkl_avg": 37.32659149169922, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7956616878509521, "step": 1189, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.996840238571167 }, { "episode": 19056, "epoch": 0.34252435560987887, "loss/policy_avg": 0.1269657164812088, "lr": 2.7718558282208587e-06, "objective/entropy": 7.045299530029297, "objective/kl": 11.909355163574219, "objective/non_score_reward": -1.1909356117248535, "objective/rlhf_reward": -4.363742417097091, "objective/scores": 0.1, "policy/approxkl_avg": 51.526981353759766, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6353417634963989, "step": 1190, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998396635055542 }, { "episode": 19072, "epoch": 0.3428119495272675, "loss/policy_avg": 0.10352663695812225, "lr": 2.771664110429448e-06, "objective/entropy": 18.426589965820312, "objective/kl": 17.08945655822754, "objective/non_score_reward": -1.7089455127716064, "objective/rlhf_reward": -6.43578211069107, "objective/scores": 0.1, "policy/approxkl_avg": 130.82119750976562, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8962185382843018, "step": 1191, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9980072975158691 }, { "episode": 19088, "epoch": 0.34309954344465615, "loss/policy_avg": 0.2877599000930786, "lr": 2.7714723926380367e-06, "objective/entropy": 67.25830841064453, "objective/kl": 13.41183090209961, "objective/non_score_reward": -1.341183066368103, "objective/rlhf_reward": -7.364732265472412, "objective/scores": -0.5, "policy/approxkl_avg": 136.07571411132812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.66959547996521, "step": 1192, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.997651219367981 }, { "episode": 19104, "epoch": 0.3433871373620448, "loss/policy_avg": 0.3532646894454956, "lr": 2.7712806748466256e-06, "objective/entropy": 232.93739318847656, "objective/kl": 11.582132339477539, "objective/non_score_reward": -1.1582132577896118, "objective/rlhf_reward": -4.232852792739868, "objective/scores": 0.1, "policy/approxkl_avg": 66.96092224121094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.9691135883331299, "step": 1193, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9989712238311768 }, { "episode": 19120, "epoch": 0.34367473127943343, "loss/policy_avg": 0.08443383872509003, "lr": 2.771088957055215e-06, "objective/entropy": 12.634071350097656, "objective/kl": 14.32248592376709, "objective/non_score_reward": -1.432248592376709, "objective/rlhf_reward": -7.728994369506836, "objective/scores": -0.5, "policy/approxkl_avg": 155.06851196289062, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6068894863128662, "step": 1194, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.997370958328247 }, { "episode": 19136, "epoch": 0.34396232519682207, "loss/policy_avg": 0.05074826255440712, "lr": 2.7708972392638036e-06, "objective/entropy": -49.18777847290039, "objective/kl": 16.202213287353516, "objective/non_score_reward": -1.6202213764190674, "objective/rlhf_reward": -4.3581790349641185, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 74.01683044433594, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6107933521270752, "step": 1195, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9966356754302979 }, { "episode": 19152, "epoch": 0.3442499191142107, "loss/policy_avg": 1.3108307123184204, "lr": 2.770705521472393e-06, "objective/entropy": 54.5289421081543, "objective/kl": 15.158395767211914, "objective/non_score_reward": -1.5158395767211914, "objective/rlhf_reward": -5.6633586347103115, "objective/scores": 0.1, "policy/approxkl_avg": 2.8598058223724365, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.6213253736495972, "step": 1196, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0031495094299316 }, { "episode": 19168, "epoch": 0.3445375130315994, "loss/policy_avg": 0.09720651805400848, "lr": 2.7705138036809816e-06, "objective/entropy": 94.39827728271484, "objective/kl": 7.398892402648926, "objective/non_score_reward": -0.7398892641067505, "objective/rlhf_reward": -0.5595570117235185, "objective/scores": 0.6, "policy/approxkl_avg": 35.56739044189453, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7716015577316284, "step": 1197, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9994008541107178 }, { "episode": 19184, "epoch": 0.34482510694898805, "loss/policy_avg": 0.4629075527191162, "lr": 2.7703220858895705e-06, "objective/entropy": 152.28797912597656, "objective/kl": 12.815742492675781, "objective/non_score_reward": -1.2815742492675781, "objective/rlhf_reward": -4.726297056674957, "objective/scores": 0.1, "policy/approxkl_avg": 32.886993408203125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.603091299533844, "step": 1198, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9996881484985352 }, { "episode": 19200, "epoch": 0.3451127008663767, "loss/policy_avg": 0.381091833114624, "lr": 2.7701303680981597e-06, "objective/entropy": -184.68792724609375, "objective/kl": 19.192184448242188, "objective/non_score_reward": -1.9192183017730713, "objective/rlhf_reward": -5.852044816288064, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 53.215721130371094, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.672134518623352, "step": 1199, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9985284805297852 }, { "episode": 19216, "epoch": 0.3454002947837653, "loss/policy_avg": 0.5557818412780762, "lr": 2.7699386503067485e-06, "objective/entropy": 74.49488830566406, "objective/kl": 16.349365234375, "objective/non_score_reward": -1.6349365711212158, "objective/rlhf_reward": -3.6160276278269023, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 71.15203857421875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5768604278564453, "step": 1200, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.997605800628662 }, { "episode": 19232, "epoch": 0.34568788870115397, "loss/policy_avg": 0.3999622166156769, "lr": 2.7697469325153373e-06, "objective/entropy": 29.93677520751953, "objective/kl": 15.555572509765625, "objective/non_score_reward": -1.5555572509765625, "objective/rlhf_reward": -4.618109259668904, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 138.052978515625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6741936206817627, "step": 1201, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9976664781570435 }, { "episode": 19248, "epoch": 0.3459754826185426, "loss/policy_avg": -0.13153797388076782, "lr": 2.7695552147239265e-06, "objective/entropy": -79.21650695800781, "objective/kl": 17.083938598632812, "objective/non_score_reward": -1.7083938121795654, "objective/rlhf_reward": -2.433575308322906, "objective/scores": 1.1, "policy/approxkl_avg": 41.158538818359375, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5853614807128906, "step": 1202, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9991123676300049 }, { "episode": 19264, "epoch": 0.34626307653593125, "loss/policy_avg": 0.5117297172546387, "lr": 2.7693634969325153e-06, "objective/entropy": 2.7270851135253906, "objective/kl": 17.102258682250977, "objective/non_score_reward": -1.7102259397506714, "objective/rlhf_reward": -8.840904235839844, "objective/scores": -0.5, "policy/approxkl_avg": 23.606922149658203, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8073578476905823, "step": 1203, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9986017942428589 }, { "episode": 19280, "epoch": 0.3465506704533199, "loss/policy_avg": 0.020893381908535957, "lr": 2.7691717791411046e-06, "objective/entropy": -169.71463012695312, "objective/kl": 7.1462202072143555, "objective/non_score_reward": -0.7146221399307251, "objective/rlhf_reward": -2.4584885299205776, "objective/scores": 0.1, "policy/approxkl_avg": 39.70250701904297, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5930685997009277, "step": 1204, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0004806518554688 }, { "episode": 19296, "epoch": 0.3468382643707086, "loss/policy_avg": 0.5275117754936218, "lr": 2.7689800613496934e-06, "objective/entropy": 21.12427520751953, "objective/kl": 20.648849487304688, "objective/non_score_reward": -2.064885139465332, "objective/rlhf_reward": -5.859540140628814, "objective/scores": 0.6, "policy/approxkl_avg": 26.942766189575195, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6685649156570435, "step": 1205, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.997446894645691 }, { "episode": 19312, "epoch": 0.3471258582880972, "loss/policy_avg": -0.19722406566143036, "lr": 2.768788343558282e-06, "objective/entropy": 255.74984741210938, "objective/kl": 14.213126182556152, "objective/non_score_reward": -1.4213125705718994, "objective/rlhf_reward": -2.761531566024992, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 18.114723205566406, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.7719753980636597, "step": 1206, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0006258487701416 }, { "episode": 19328, "epoch": 0.34741345220548586, "loss/policy_avg": 0.0459427610039711, "lr": 2.7685966257668714e-06, "objective/entropy": -24.662124633789062, "objective/kl": 11.61572265625, "objective/non_score_reward": -1.1615723371505737, "objective/rlhf_reward": -2.9129559258619944, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 10.064481735229492, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6453742384910583, "step": 1207, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9978961944580078 }, { "episode": 19344, "epoch": 0.3477010461228745, "loss/policy_avg": 0.03587307780981064, "lr": 2.7684049079754602e-06, "objective/entropy": 79.69007110595703, "objective/kl": 15.860215187072754, "objective/non_score_reward": -1.5860217809677124, "objective/rlhf_reward": -3.9440871238708493, "objective/scores": 0.6, "policy/approxkl_avg": 59.39844512939453, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7233803272247314, "step": 1208, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9987972974777222 }, { "episode": 19360, "epoch": 0.34798864004026314, "loss/policy_avg": 0.338050901889801, "lr": 2.7682131901840495e-06, "objective/entropy": 172.87771606445312, "objective/kl": 15.573002815246582, "objective/non_score_reward": -1.557300329208374, "objective/rlhf_reward": -5.829201197624206, "objective/scores": 0.1, "policy/approxkl_avg": 84.08805084228516, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.425936222076416, "step": 1209, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.99677574634552 }, { "episode": 19376, "epoch": 0.3482762339576518, "loss/policy_avg": 0.1206553503870964, "lr": 2.7680214723926383e-06, "objective/entropy": 82.61885070800781, "objective/kl": 18.71231460571289, "objective/non_score_reward": -1.8712315559387207, "objective/rlhf_reward": -9.484926223754883, "objective/scores": -0.5, "policy/approxkl_avg": 79.68077087402344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6749487519264221, "step": 1210, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9997036457061768 }, { "episode": 19392, "epoch": 0.3485638278750404, "loss/policy_avg": -0.030552543699741364, "lr": 2.767829754601227e-06, "objective/entropy": 258.60968017578125, "objective/kl": 12.804272651672363, "objective/non_score_reward": -1.28042733669281, "objective/rlhf_reward": -4.72170922756195, "objective/scores": 0.1, "policy/approxkl_avg": 3.8165931701660156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6673685908317566, "step": 1211, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0015029907226562 }, { "episode": 19408, "epoch": 0.3488514217924291, "loss/policy_avg": 0.26328012347221375, "lr": 2.767638036809816e-06, "objective/entropy": 81.95811462402344, "objective/kl": 10.576614379882812, "objective/non_score_reward": -1.0576614141464233, "objective/rlhf_reward": -6.230645656585693, "objective/scores": -0.5, "policy/approxkl_avg": 53.74596405029297, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5103533267974854, "step": 1212, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0021772384643555 }, { "episode": 19424, "epoch": 0.34913901570981776, "loss/policy_avg": 0.7071826457977295, "lr": 2.7674463190184047e-06, "objective/entropy": 284.74810791015625, "objective/kl": 14.454414367675781, "objective/non_score_reward": -1.4454416036605835, "objective/rlhf_reward": -1.3817664146423336, "objective/scores": 1.1, "policy/approxkl_avg": 162.52574157714844, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.698977530002594, "step": 1213, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9988505840301514 }, { "episode": 19440, "epoch": 0.3494266096272064, "loss/policy_avg": 0.5319777131080627, "lr": 2.767254601226994e-06, "objective/entropy": 7.809246063232422, "objective/kl": 18.88573455810547, "objective/non_score_reward": -1.8885735273361206, "objective/rlhf_reward": -7.154294228553772, "objective/scores": 0.1, "policy/approxkl_avg": 23.536239624023438, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5789269804954529, "step": 1214, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9986400604248047 }, { "episode": 19456, "epoch": 0.34971420354459504, "loss/policy_avg": 0.086372971534729, "lr": 2.7670628834355828e-06, "objective/entropy": -125.92728424072266, "objective/kl": 6.277002334594727, "objective/non_score_reward": -0.6277002692222595, "objective/rlhf_reward": 0.41291793739679195, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 32.177825927734375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5431735515594482, "step": 1215, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9989644289016724 }, { "episode": 19472, "epoch": 0.3500017974619837, "loss/policy_avg": 0.684249997138977, "lr": 2.7668711656441716e-06, "objective/entropy": 105.52658081054688, "objective/kl": 14.267902374267578, "objective/non_score_reward": -1.4267902374267578, "objective/rlhf_reward": -5.307160621881485, "objective/scores": 0.1, "policy/approxkl_avg": 172.38299560546875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.49907970428466797, "step": 1216, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9985556602478027 }, { "episode": 19488, "epoch": 0.3502893913793723, "loss/policy_avg": 0.1681925654411316, "lr": 2.766679447852761e-06, "objective/entropy": -46.325313568115234, "objective/kl": 10.954778671264648, "objective/non_score_reward": -1.095477819442749, "objective/rlhf_reward": -1.458192323089811, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 19.582189559936523, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5834884643554688, "step": 1217, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000382423400879 }, { "episode": 19504, "epoch": 0.35057698529676096, "loss/policy_avg": 0.17251908779144287, "lr": 2.7664877300613496e-06, "objective/entropy": -134.26455688476562, "objective/kl": 12.687049865722656, "objective/non_score_reward": -1.268705129623413, "objective/rlhf_reward": -7.074820518493652, "objective/scores": -0.5, "policy/approxkl_avg": 28.018173217773438, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4345284700393677, "step": 1218, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9991281032562256 }, { "episode": 19520, "epoch": 0.3508645792141496, "loss/policy_avg": 0.17196732759475708, "lr": 2.766296012269939e-06, "objective/entropy": -33.67222595214844, "objective/kl": 13.162768363952637, "objective/non_score_reward": -1.3162769079208374, "objective/rlhf_reward": -0.8651078552007672, "objective/scores": 1.1, "policy/approxkl_avg": 27.336265563964844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5979089736938477, "step": 1219, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9985971450805664 }, { "episode": 19536, "epoch": 0.3511521731315383, "loss/policy_avg": 0.09419011324644089, "lr": 2.7661042944785276e-06, "objective/entropy": 13.70315170288086, "objective/kl": 11.582527160644531, "objective/non_score_reward": -1.1582528352737427, "objective/rlhf_reward": -6.633011341094971, "objective/scores": -0.5, "policy/approxkl_avg": 2.180685043334961, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6442111730575562, "step": 1220, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0029656887054443 }, { "episode": 19552, "epoch": 0.35143976704892693, "loss/policy_avg": 0.2863805294036865, "lr": 2.7659125766871165e-06, "objective/entropy": 245.30551147460938, "objective/kl": 16.196739196777344, "objective/non_score_reward": -1.6196739673614502, "objective/rlhf_reward": -8.4786958694458, "objective/scores": -0.5, "policy/approxkl_avg": 65.83010864257812, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7952322959899902, "step": 1221, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9992876052856445 }, { "episode": 19568, "epoch": 0.35172736096631557, "loss/policy_avg": 0.6682702302932739, "lr": 2.7657208588957057e-06, "objective/entropy": -27.104393005371094, "objective/kl": 8.179853439331055, "objective/non_score_reward": -0.8179854154586792, "objective/rlhf_reward": -2.8719416022300717, "objective/scores": 0.1, "policy/approxkl_avg": 17.878311157226562, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4288652837276459, "step": 1222, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9993765354156494 }, { "episode": 19584, "epoch": 0.3520149548837042, "loss/policy_avg": 0.33719485998153687, "lr": 2.7655291411042945e-06, "objective/entropy": -138.77658081054688, "objective/kl": 15.182317733764648, "objective/non_score_reward": -1.5182318687438965, "objective/rlhf_reward": -8.072927474975586, "objective/scores": -0.5, "policy/approxkl_avg": 30.492324829101562, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5747877359390259, "step": 1223, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.997852087020874 }, { "episode": 19600, "epoch": 0.35230254880109285, "loss/policy_avg": 0.4515780210494995, "lr": 2.7653374233128837e-06, "objective/entropy": 64.33354949951172, "objective/kl": 20.334157943725586, "objective/non_score_reward": -2.0334157943725586, "objective/rlhf_reward": -10.133663177490234, "objective/scores": -0.5, "policy/approxkl_avg": 121.74520874023438, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7705545425415039, "step": 1224, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9969508647918701 }, { "episode": 19616, "epoch": 0.3525901427184815, "loss/policy_avg": 0.470808744430542, "lr": 2.7651457055214725e-06, "objective/entropy": 24.672637939453125, "objective/kl": 17.567148208618164, "objective/non_score_reward": -1.7567150592803955, "objective/rlhf_reward": -6.626860237121582, "objective/scores": 0.1, "policy/approxkl_avg": 145.18096923828125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.571911096572876, "step": 1225, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998140573501587 }, { "episode": 19632, "epoch": 0.35287773663587013, "loss/policy_avg": 0.35610437393188477, "lr": 2.7649539877300614e-06, "objective/entropy": 52.07497024536133, "objective/kl": 25.290016174316406, "objective/non_score_reward": -2.5290017127990723, "objective/rlhf_reward": -9.716007149219514, "objective/scores": 0.1, "policy/approxkl_avg": 150.7601318359375, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6791574954986572, "step": 1226, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0010275840759277 }, { "episode": 19648, "epoch": 0.35316533055325877, "loss/policy_avg": 0.5662277936935425, "lr": 2.7647622699386506e-06, "objective/entropy": 201.03201293945312, "objective/kl": 13.793638229370117, "objective/non_score_reward": -1.379363775253296, "objective/rlhf_reward": -1.117455160617828, "objective/scores": 1.1, "policy/approxkl_avg": 86.15937805175781, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6101793050765991, "step": 1227, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9986412525177002 }, { "episode": 19664, "epoch": 0.35345292447064747, "loss/policy_avg": 0.14119097590446472, "lr": 2.7645705521472394e-06, "objective/entropy": 205.80929565429688, "objective/kl": 14.83749008178711, "objective/non_score_reward": -1.4837491512298584, "objective/rlhf_reward": -1.5349966049194332, "objective/scores": 1.1, "policy/approxkl_avg": 63.156280517578125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7360595464706421, "step": 1228, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998273253440857 }, { "episode": 19680, "epoch": 0.3537405183880361, "loss/policy_avg": 0.270801305770874, "lr": 2.764378834355828e-06, "objective/entropy": 201.06890869140625, "objective/kl": 21.808286666870117, "objective/non_score_reward": -2.180828809738159, "objective/rlhf_reward": -8.323315000534057, "objective/scores": 0.1, "policy/approxkl_avg": 96.15478515625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7318389415740967, "step": 1229, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9991538524627686 }, { "episode": 19696, "epoch": 0.35402811230542475, "loss/policy_avg": -0.37473565340042114, "lr": 2.7641871165644174e-06, "objective/entropy": 160.452880859375, "objective/kl": 10.46506118774414, "objective/non_score_reward": -1.0465059280395508, "objective/rlhf_reward": -1.262304966093275, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 75.31881713867188, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7993210554122925, "step": 1230, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0051486492156982 }, { "episode": 19712, "epoch": 0.3543157062228134, "loss/policy_avg": 0.08775840699672699, "lr": 2.7639953987730062e-06, "objective/entropy": -58.0557861328125, "objective/kl": 16.303316116333008, "objective/non_score_reward": -1.6303316354751587, "objective/rlhf_reward": -6.121326541900634, "objective/scores": 0.1, "policy/approxkl_avg": 36.74671173095703, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6570830345153809, "step": 1231, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998000979423523 }, { "episode": 19728, "epoch": 0.354603300140202, "loss/policy_avg": 0.24697911739349365, "lr": 2.7638036809815955e-06, "objective/entropy": -103.66670227050781, "objective/kl": 13.675052642822266, "objective/non_score_reward": -1.3675053119659424, "objective/rlhf_reward": -3.645192380222391, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 8.908821105957031, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5298190712928772, "step": 1232, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9975241422653198 }, { "episode": 19744, "epoch": 0.35489089405759067, "loss/policy_avg": 0.0736452266573906, "lr": 2.7636119631901843e-06, "objective/entropy": -149.5491943359375, "objective/kl": 13.649198532104492, "objective/non_score_reward": -1.3649197816848755, "objective/rlhf_reward": -3.3369730732598644, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 97.76475524902344, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6775115132331848, "step": 1233, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9988733530044556 }, { "episode": 19760, "epoch": 0.3551784879749793, "loss/policy_avg": 0.30038145184516907, "lr": 2.763420245398773e-06, "objective/entropy": 9.354202270507812, "objective/kl": 13.509121894836426, "objective/non_score_reward": -1.350912094116211, "objective/rlhf_reward": -5.003648361563682, "objective/scores": 0.1, "policy/approxkl_avg": 48.25217819213867, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.3075612783432007, "step": 1234, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0003974437713623 }, { "episode": 19776, "epoch": 0.355466081892368, "loss/policy_avg": 0.41265279054641724, "lr": 2.763228527607362e-06, "objective/entropy": -128.67822265625, "objective/kl": 13.989595413208008, "objective/non_score_reward": -1.3989596366882324, "objective/rlhf_reward": -4.039579230305508, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 38.396392822265625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5274858474731445, "step": 1235, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000598907470703 }, { "episode": 19792, "epoch": 0.35575367580975664, "loss/policy_avg": 0.16135510802268982, "lr": 2.7630368098159507e-06, "objective/entropy": -96.63204956054688, "objective/kl": 12.73376178741455, "objective/non_score_reward": -1.273376226425171, "objective/rlhf_reward": -0.6935049951076504, "objective/scores": 1.1, "policy/approxkl_avg": 39.10956573486328, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.670026421546936, "step": 1236, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0001089572906494 }, { "episode": 19808, "epoch": 0.3560412697271453, "loss/policy_avg": 0.7625874280929565, "lr": 2.76284509202454e-06, "objective/entropy": -326.073974609375, "objective/kl": 7.750607967376709, "objective/non_score_reward": -0.775060772895813, "objective/rlhf_reward": -2.700243210792541, "objective/scores": 0.1, "policy/approxkl_avg": 22.49315643310547, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7181180715560913, "step": 1237, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.999666690826416 }, { "episode": 19824, "epoch": 0.3563288636445339, "loss/policy_avg": 0.008348610252141953, "lr": 2.7626533742331288e-06, "objective/entropy": -314.941650390625, "objective/kl": 10.41222095489502, "objective/non_score_reward": -1.0412222146987915, "objective/rlhf_reward": -3.7648888289928433, "objective/scores": 0.1, "policy/approxkl_avg": 41.16736602783203, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6663084626197815, "step": 1238, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.00093936920166 }, { "episode": 19840, "epoch": 0.35661645756192256, "loss/policy_avg": 0.36450040340423584, "lr": 2.7624616564417176e-06, "objective/entropy": -93.8783950805664, "objective/kl": 10.081415176391602, "objective/non_score_reward": -1.0081413984298706, "objective/rlhf_reward": -3.632565504312515, "objective/scores": 0.1, "policy/approxkl_avg": 43.1667366027832, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6830465197563171, "step": 1239, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9981257915496826 }, { "episode": 19856, "epoch": 0.3569040514793112, "loss/policy_avg": 0.29206255078315735, "lr": 2.762269938650307e-06, "objective/entropy": 194.64642333984375, "objective/kl": 20.66879653930664, "objective/non_score_reward": -2.0668797492980957, "objective/rlhf_reward": -7.867519116401672, "objective/scores": 0.1, "policy/approxkl_avg": 11.687646865844727, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.741154670715332, "step": 1240, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9989938735961914 }, { "episode": 19872, "epoch": 0.35719164539669984, "loss/policy_avg": 0.3017123341560364, "lr": 2.7620782208588956e-06, "objective/entropy": 5.207828521728516, "objective/kl": 17.089153289794922, "objective/non_score_reward": -1.70891535282135, "objective/rlhf_reward": -8.835660934448242, "objective/scores": -0.5, "policy/approxkl_avg": 90.6629638671875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5502313375473022, "step": 1241, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000894069671631 }, { "episode": 19888, "epoch": 0.3574792393140885, "loss/policy_avg": 0.44059765338897705, "lr": 2.761886503067485e-06, "objective/entropy": 93.31766510009766, "objective/kl": 17.86589241027832, "objective/non_score_reward": -1.7865893840789795, "objective/rlhf_reward": -2.7463575363159176, "objective/scores": 1.1, "policy/approxkl_avg": 94.84536743164062, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7133783102035522, "step": 1242, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.99619722366333 }, { "episode": 19904, "epoch": 0.3577668332314772, "loss/policy_avg": 0.29421958327293396, "lr": 2.7616947852760737e-06, "objective/entropy": 1.1859626770019531, "objective/kl": 19.047836303710938, "objective/non_score_reward": -1.9047834873199463, "objective/rlhf_reward": -7.2191343069076535, "objective/scores": 0.1, "policy/approxkl_avg": 83.07100677490234, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4693220257759094, "step": 1243, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9991942644119263 }, { "episode": 19920, "epoch": 0.3580544271488658, "loss/policy_avg": 0.2835289239883423, "lr": 2.7615030674846625e-06, "objective/entropy": 215.6124267578125, "objective/kl": 13.682714462280273, "objective/non_score_reward": -1.3682713508605957, "objective/rlhf_reward": -2.549366746784422, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 10.411121368408203, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8226298093795776, "step": 1244, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.999145269393921 }, { "episode": 19936, "epoch": 0.35834202106625446, "loss/policy_avg": 0.4547170400619507, "lr": 2.7613113496932517e-06, "objective/entropy": 76.82644653320312, "objective/kl": 22.442930221557617, "objective/non_score_reward": -2.244292736053467, "objective/rlhf_reward": -8.577171301841737, "objective/scores": 0.1, "policy/approxkl_avg": 93.25566101074219, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7442919015884399, "step": 1245, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.995746374130249 }, { "episode": 19952, "epoch": 0.3586296149836431, "loss/policy_avg": 0.33403638005256653, "lr": 2.7611196319018405e-06, "objective/entropy": 315.484130859375, "objective/kl": 18.028207778930664, "objective/non_score_reward": -1.8028206825256348, "objective/rlhf_reward": -4.811282730102539, "objective/scores": 0.6, "policy/approxkl_avg": 71.87936401367188, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7787157297134399, "step": 1246, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.999906301498413 }, { "episode": 19968, "epoch": 0.35891720890103174, "loss/policy_avg": 0.4515129625797272, "lr": 2.7609279141104297e-06, "objective/entropy": 49.19495391845703, "objective/kl": 16.58286476135254, "objective/non_score_reward": -1.658286452293396, "objective/rlhf_reward": -4.808317120346139, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 22.461732864379883, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5894609689712524, "step": 1247, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9996263980865479 }, { "episode": 19984, "epoch": 0.3592048028184204, "loss/policy_avg": 0.3973531723022461, "lr": 2.7607361963190186e-06, "objective/entropy": -21.238182067871094, "objective/kl": 18.508586883544922, "objective/non_score_reward": -1.8508589267730713, "objective/rlhf_reward": -5.456024329142506, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 67.94461822509766, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7065160274505615, "step": 1248, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000941514968872 }, { "episode": 20000, "epoch": 0.359492396735809, "loss/policy_avg": 0.23100371658802032, "lr": 2.7605444785276074e-06, "objective/entropy": -163.1217041015625, "objective/kl": 14.045398712158203, "objective/non_score_reward": -1.4045398235321045, "objective/rlhf_reward": -5.218158936500549, "objective/scores": 0.1, "policy/approxkl_avg": 31.335323333740234, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8941419720649719, "step": 1249, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0009543895721436 }, { "episode": 20016, "epoch": 0.3597799906531977, "loss/policy_avg": 0.26674309372901917, "lr": 2.7603527607361966e-06, "objective/entropy": 37.34232711791992, "objective/kl": 18.331321716308594, "objective/non_score_reward": -1.833132028579712, "objective/rlhf_reward": -2.932528471946716, "objective/scores": 1.1, "policy/approxkl_avg": 19.882667541503906, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7180185317993164, "step": 1250, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0010123252868652 }, { "episode": 20032, "epoch": 0.36006758457058635, "loss/policy_avg": -0.03628786280751228, "lr": 2.7601610429447854e-06, "objective/entropy": 8.678840637207031, "objective/kl": 13.700248718261719, "objective/non_score_reward": -1.3700249195098877, "objective/rlhf_reward": -7.480099678039551, "objective/scores": -0.5, "policy/approxkl_avg": 9.891830444335938, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6439998149871826, "step": 1251, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.004136085510254 }, { "episode": 20048, "epoch": 0.360355178487975, "loss/policy_avg": 0.038509830832481384, "lr": 2.759969325153374e-06, "objective/entropy": -88.04098510742188, "objective/kl": 11.22274112701416, "objective/non_score_reward": -1.1222741603851318, "objective/rlhf_reward": -2.089096477627754, "objective/scores": 0.6, "policy/approxkl_avg": 32.534996032714844, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.532057523727417, "step": 1252, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9985415935516357 }, { "episode": 20064, "epoch": 0.36064277240536363, "loss/policy_avg": 0.3246498703956604, "lr": 2.7597776073619634e-06, "objective/entropy": -265.6895751953125, "objective/kl": 17.194293975830078, "objective/non_score_reward": -1.71942937374115, "objective/rlhf_reward": -6.47771737575531, "objective/scores": 0.1, "policy/approxkl_avg": 23.17576026916504, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6558340787887573, "step": 1253, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9985039234161377 }, { "episode": 20080, "epoch": 0.36093036632275227, "loss/policy_avg": 0.34922367334365845, "lr": 2.7595858895705523e-06, "objective/entropy": 323.5899658203125, "objective/kl": 16.424396514892578, "objective/non_score_reward": -1.642439842224121, "objective/rlhf_reward": -8.569759368896484, "objective/scores": -0.5, "policy/approxkl_avg": 119.47270965576172, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8957875967025757, "step": 1254, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0001938343048096 }, { "episode": 20096, "epoch": 0.3612179602401409, "loss/policy_avg": 0.1072012335062027, "lr": 2.7593941717791415e-06, "objective/entropy": -119.50285339355469, "objective/kl": 15.749418258666992, "objective/non_score_reward": -1.574941873550415, "objective/rlhf_reward": -5.899767941236496, "objective/scores": 0.1, "policy/approxkl_avg": 87.79750061035156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.41685575246810913, "step": 1255, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9994564056396484 }, { "episode": 20112, "epoch": 0.36150555415752955, "loss/policy_avg": -0.09395378828048706, "lr": 2.75920245398773e-06, "objective/entropy": 104.20881652832031, "objective/kl": 14.162569046020508, "objective/non_score_reward": -1.4162569046020508, "objective/rlhf_reward": -7.665027618408203, "objective/scores": -0.5, "policy/approxkl_avg": 5.374798774719238, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7825880646705627, "step": 1256, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0023365020751953 }, { "episode": 20128, "epoch": 0.3617931480749182, "loss/policy_avg": 0.028538435697555542, "lr": 2.759010736196319e-06, "objective/entropy": -46.99930953979492, "objective/kl": 12.82414436340332, "objective/non_score_reward": -1.282414436340332, "objective/rlhf_reward": -7.129657745361328, "objective/scores": -0.5, "policy/approxkl_avg": 23.753807067871094, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7124722599983215, "step": 1257, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0020642280578613 }, { "episode": 20144, "epoch": 0.3620807419923069, "loss/policy_avg": -0.021636590361595154, "lr": 2.758819018404908e-06, "objective/entropy": -221.2742919921875, "objective/kl": 9.566751480102539, "objective/non_score_reward": -0.956675112247467, "objective/rlhf_reward": 0.573299536108971, "objective/scores": 1.1, "policy/approxkl_avg": 18.7915096282959, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5591281652450562, "step": 1258, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.001438617706299 }, { "episode": 20160, "epoch": 0.3623683359096955, "loss/policy_avg": -0.04480976611375809, "lr": 2.7586273006134967e-06, "objective/entropy": 240.30905151367188, "objective/kl": 14.567556381225586, "objective/non_score_reward": -1.4567558765411377, "objective/rlhf_reward": -7.827023506164551, "objective/scores": -0.5, "policy/approxkl_avg": 54.074058532714844, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6273489594459534, "step": 1259, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9984817504882812 }, { "episode": 20176, "epoch": 0.36265592982708417, "loss/policy_avg": 0.4374186396598816, "lr": 2.758435582822086e-06, "objective/entropy": 188.3015594482422, "objective/kl": 17.31353759765625, "objective/non_score_reward": -1.731353998184204, "objective/rlhf_reward": -4.802709641233955, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 31.519737243652344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8211510181427002, "step": 1260, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0011744499206543 }, { "episode": 20192, "epoch": 0.3629435237444728, "loss/policy_avg": 0.40651851892471313, "lr": 2.7582438650306748e-06, "objective/entropy": -213.75115966796875, "objective/kl": 13.607375144958496, "objective/non_score_reward": -1.3607374429702759, "objective/rlhf_reward": -5.042950010299682, "objective/scores": 0.1, "policy/approxkl_avg": 35.90991973876953, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5784170031547546, "step": 1261, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9970366954803467 }, { "episode": 20208, "epoch": 0.36323111766186145, "loss/policy_avg": 0.08956024795770645, "lr": 2.758052147239264e-06, "objective/entropy": 154.9608612060547, "objective/kl": 15.312253952026367, "objective/non_score_reward": -1.5312254428863525, "objective/rlhf_reward": -1.724901711940765, "objective/scores": 1.1, "policy/approxkl_avg": 106.27772521972656, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4395660161972046, "step": 1262, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9995371103286743 }, { "episode": 20224, "epoch": 0.3635187115792501, "loss/policy_avg": 0.33477357029914856, "lr": 2.757860429447853e-06, "objective/entropy": -70.99191284179688, "objective/kl": 12.952332496643066, "objective/non_score_reward": -1.2952332496643066, "objective/rlhf_reward": -3.447599486509959, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 37.827144622802734, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7374582290649414, "step": 1263, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.995866298675537 }, { "episode": 20240, "epoch": 0.3638063054966387, "loss/policy_avg": 0.6385763883590698, "lr": 2.7576687116564416e-06, "objective/entropy": -83.57589721679688, "objective/kl": 14.134538650512695, "objective/non_score_reward": -1.4134538173675537, "objective/rlhf_reward": -5.253815388679504, "objective/scores": 0.1, "policy/approxkl_avg": 41.28148651123047, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5673624873161316, "step": 1264, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9980382919311523 }, { "episode": 20256, "epoch": 0.36409389941402737, "loss/policy_avg": 0.1510678082704544, "lr": 2.757476993865031e-06, "objective/entropy": -1.0954818725585938, "objective/kl": 13.511443138122559, "objective/non_score_reward": -1.3511443138122559, "objective/rlhf_reward": -1.0045771360397335, "objective/scores": 1.1, "policy/approxkl_avg": 36.97186279296875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.557723879814148, "step": 1265, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000786304473877 }, { "episode": 20272, "epoch": 0.36438149333141606, "loss/policy_avg": -0.15208236873149872, "lr": 2.7572852760736197e-06, "objective/entropy": -118.6660385131836, "objective/kl": 16.32196807861328, "objective/non_score_reward": -1.6321969032287598, "objective/rlhf_reward": -8.528787612915039, "objective/scores": -0.5, "policy/approxkl_avg": 12.030231475830078, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7827368974685669, "step": 1266, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0016098022460938 }, { "episode": 20288, "epoch": 0.3646690872488047, "loss/policy_avg": 0.3386310338973999, "lr": 2.7570935582822085e-06, "objective/entropy": 249.75567626953125, "objective/kl": 14.743573188781738, "objective/non_score_reward": -1.4743572473526, "objective/rlhf_reward": -7.8974289894104, "objective/scores": -0.5, "policy/approxkl_avg": 85.64590454101562, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6248792409896851, "step": 1267, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9980359077453613 }, { "episode": 20304, "epoch": 0.36495668116619334, "loss/policy_avg": 0.6923346519470215, "lr": 2.7569018404907977e-06, "objective/entropy": 117.10751342773438, "objective/kl": 14.273673057556152, "objective/non_score_reward": -1.4273674488067627, "objective/rlhf_reward": -3.884640704068254, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 126.95195770263672, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.622350811958313, "step": 1268, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9966614246368408 }, { "episode": 20320, "epoch": 0.365244275083582, "loss/policy_avg": 0.08828192949295044, "lr": 2.7567101226993865e-06, "objective/entropy": 95.71147155761719, "objective/kl": 22.682233810424805, "objective/non_score_reward": -2.268223285675049, "objective/rlhf_reward": -7.468773577276783, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 78.45034790039062, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7277878522872925, "step": 1269, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9989908933639526 }, { "episode": 20336, "epoch": 0.3655318690009706, "loss/policy_avg": 0.41136306524276733, "lr": 2.7565184049079757e-06, "objective/entropy": -162.8468780517578, "objective/kl": 17.89332389831543, "objective/non_score_reward": -1.7893322706222534, "objective/rlhf_reward": -2.7573293209075924, "objective/scores": 1.1, "policy/approxkl_avg": 37.13008117675781, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6133675575256348, "step": 1270, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.000051975250244 }, { "episode": 20352, "epoch": 0.36581946291835926, "loss/policy_avg": 0.019866986200213432, "lr": 2.7563266871165646e-06, "objective/entropy": 217.34902954101562, "objective/kl": 12.45542049407959, "objective/non_score_reward": -1.245542049407959, "objective/rlhf_reward": -4.58216837644577, "objective/scores": 0.1, "policy/approxkl_avg": 13.521160125732422, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.826188862323761, "step": 1271, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0006699562072754 }, { "episode": 20368, "epoch": 0.3661070568357479, "loss/policy_avg": 0.07603908330202103, "lr": 2.7561349693251534e-06, "objective/entropy": 21.53125762939453, "objective/kl": 12.73741340637207, "objective/non_score_reward": -1.2737412452697754, "objective/rlhf_reward": -0.6949649810791012, "objective/scores": 1.1, "policy/approxkl_avg": 49.010589599609375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5798752307891846, "step": 1272, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9964756965637207 }, { "episode": 20384, "epoch": 0.3663946507531366, "loss/policy_avg": -0.3879421353340149, "lr": 2.7559432515337426e-06, "objective/entropy": 135.53102111816406, "objective/kl": 15.746997833251953, "objective/non_score_reward": -1.5746999979019165, "objective/rlhf_reward": -8.298799514770508, "objective/scores": -0.5, "policy/approxkl_avg": 8.827058792114258, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6375302076339722, "step": 1273, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0074105262756348 }, { "episode": 20400, "epoch": 0.36668224467052524, "loss/policy_avg": 0.15085294842720032, "lr": 2.7557515337423314e-06, "objective/entropy": 300.15081787109375, "objective/kl": 21.906736373901367, "objective/non_score_reward": -2.190673828125, "objective/rlhf_reward": -10.7626953125, "objective/scores": -0.5, "policy/approxkl_avg": 292.7366943359375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7200959920883179, "step": 1274, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9972238540649414 }, { "episode": 20416, "epoch": 0.3669698385879139, "loss/policy_avg": 0.47925299406051636, "lr": 2.7555598159509206e-06, "objective/entropy": 147.05970764160156, "objective/kl": 12.889491081237793, "objective/non_score_reward": -1.2889491319656372, "objective/rlhf_reward": -2.2320775135767192, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 8.79387378692627, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7686591148376465, "step": 1275, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9991161823272705 }, { "episode": 20432, "epoch": 0.3672574325053025, "loss/policy_avg": 0.5244907140731812, "lr": 2.7553680981595095e-06, "objective/entropy": -89.96923828125, "objective/kl": 9.46074390411377, "objective/non_score_reward": -0.946074366569519, "objective/rlhf_reward": -1.3842974662780758, "objective/scores": 0.6, "policy/approxkl_avg": 65.30824279785156, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.698806881904602, "step": 1276, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000295400619507 }, { "episode": 20448, "epoch": 0.36754502642269116, "loss/policy_avg": 0.6136975288391113, "lr": 2.7551763803680983e-06, "objective/entropy": 7.9844207763671875, "objective/kl": 11.841961860656738, "objective/non_score_reward": -1.1841962337493896, "objective/rlhf_reward": -0.3367847561836239, "objective/scores": 1.1, "policy/approxkl_avg": 87.70883178710938, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.643621027469635, "step": 1277, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9997437000274658 }, { "episode": 20464, "epoch": 0.3678326203400798, "loss/policy_avg": -0.00877484492957592, "lr": 2.7549846625766875e-06, "objective/entropy": -142.05413818359375, "objective/kl": 13.480876922607422, "objective/non_score_reward": -1.3480876684188843, "objective/rlhf_reward": -2.992350524663925, "objective/scores": 0.6, "policy/approxkl_avg": 101.64769744873047, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.591066300868988, "step": 1278, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9975664615631104 }, { "episode": 20480, "epoch": 0.36812021425746844, "loss/policy_avg": 0.04989251866936684, "lr": 2.754792944785276e-06, "objective/entropy": 3.4066810607910156, "objective/kl": 14.480069160461426, "objective/non_score_reward": -1.4480068683624268, "objective/rlhf_reward": -7.792027473449707, "objective/scores": -0.5, "policy/approxkl_avg": 29.06435775756836, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7045605182647705, "step": 1279, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9972350597381592 }, { "episode": 20496, "epoch": 0.3684078081748571, "loss/policy_avg": 1.8392304182052612, "lr": 2.754601226993865e-06, "objective/entropy": 46.341854095458984, "objective/kl": 17.002796173095703, "objective/non_score_reward": -1.700279712677002, "objective/rlhf_reward": -2.401118969917297, "objective/scores": 1.1, "policy/approxkl_avg": 13.771295547485352, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7492488026618958, "step": 1280, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.002211570739746 }, { "episode": 20512, "epoch": 0.36869540209224577, "loss/policy_avg": 0.062071219086647034, "lr": 2.754409509202454e-06, "objective/entropy": -166.6836700439453, "objective/kl": 15.558152198791504, "objective/non_score_reward": -1.5558152198791504, "objective/rlhf_reward": -4.398432280096124, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 12.791590690612793, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5931823253631592, "step": 1281, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000124454498291 }, { "episode": 20528, "epoch": 0.3689829960096344, "loss/policy_avg": 0.6587561368942261, "lr": 2.7542177914110427e-06, "objective/entropy": 28.02899932861328, "objective/kl": 11.468122482299805, "objective/non_score_reward": -1.1468123197555542, "objective/rlhf_reward": -2.7624205901947727, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 50.253570556640625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7729257345199585, "step": 1282, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9982600212097168 }, { "episode": 20544, "epoch": 0.36927058992702305, "loss/policy_avg": 0.16267871856689453, "lr": 2.754026073619632e-06, "objective/entropy": 167.5796661376953, "objective/kl": 12.056804656982422, "objective/non_score_reward": -1.2056803703308105, "objective/rlhf_reward": -4.4227216899394985, "objective/scores": 0.1, "policy/approxkl_avg": 14.177530288696289, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.665571391582489, "step": 1283, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9999282360076904 }, { "episode": 20560, "epoch": 0.3695581838444117, "loss/policy_avg": 0.10843977332115173, "lr": 2.7538343558282208e-06, "objective/entropy": -65.32721710205078, "objective/kl": 9.67951774597168, "objective/non_score_reward": -0.9679518938064575, "objective/rlhf_reward": 0.5281925439834598, "objective/scores": 1.1, "policy/approxkl_avg": 28.627899169921875, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5553978085517883, "step": 1284, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0003318786621094 }, { "episode": 20576, "epoch": 0.36984577776180033, "loss/policy_avg": 0.09218505769968033, "lr": 2.75364263803681e-06, "objective/entropy": -287.59014892578125, "objective/kl": 10.535503387451172, "objective/non_score_reward": -1.0535502433776855, "objective/rlhf_reward": 0.18579889237880742, "objective/scores": 1.1, "policy/approxkl_avg": 33.40215301513672, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6041494607925415, "step": 1285, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.9981060028076172 }, { "episode": 20592, "epoch": 0.37013337167918897, "loss/policy_avg": 0.4806681275367737, "lr": 2.753450920245399e-06, "objective/entropy": 114.24024200439453, "objective/kl": 12.809671401977539, "objective/non_score_reward": -1.2809672355651855, "objective/rlhf_reward": -3.001162292734657, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 103.79825592041016, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7868355512619019, "step": 1286, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9987444877624512 }, { "episode": 20608, "epoch": 0.3704209655965776, "loss/policy_avg": 0.4517222046852112, "lr": 2.7532592024539876e-06, "objective/entropy": -82.73650360107422, "objective/kl": 16.96005630493164, "objective/non_score_reward": -1.6960057020187378, "objective/rlhf_reward": -8.78402328491211, "objective/scores": -0.5, "policy/approxkl_avg": 49.851829528808594, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7056453824043274, "step": 1287, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9979089498519897 }, { "episode": 20624, "epoch": 0.3707085595139663, "loss/policy_avg": 0.3327828645706177, "lr": 2.753067484662577e-06, "objective/entropy": -133.07638549804688, "objective/kl": 13.776689529418945, "objective/non_score_reward": -1.377668857574463, "objective/rlhf_reward": -5.1106756091117855, "objective/scores": 0.1, "policy/approxkl_avg": 28.329500198364258, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6216986179351807, "step": 1288, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000491142272949 }, { "episode": 20640, "epoch": 0.37099615343135495, "loss/policy_avg": 0.5435598492622375, "lr": 2.7528757668711657e-06, "objective/entropy": -170.30508422851562, "objective/kl": 17.714038848876953, "objective/non_score_reward": -1.7714040279388428, "objective/rlhf_reward": -6.685615873336792, "objective/scores": 0.1, "policy/approxkl_avg": 24.18598747253418, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7448534965515137, "step": 1289, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9992847442626953 }, { "episode": 20656, "epoch": 0.3712837473487436, "loss/policy_avg": 0.24354924261569977, "lr": 2.7526840490797545e-06, "objective/entropy": 23.606029510498047, "objective/kl": 26.219905853271484, "objective/non_score_reward": -2.62199068069458, "objective/rlhf_reward": -6.087962484359741, "objective/scores": 1.1, "policy/approxkl_avg": 57.348243713378906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5495198965072632, "step": 1290, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9979891777038574 }, { "episode": 20672, "epoch": 0.3715713412661322, "loss/policy_avg": 1.0543159246444702, "lr": 2.7524923312883437e-06, "objective/entropy": 263.5877685546875, "objective/kl": 7.638702392578125, "objective/non_score_reward": -0.7638702392578125, "objective/rlhf_reward": 1.3445191025733951, "objective/scores": 1.1, "policy/approxkl_avg": 3.8037867546081543, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6338490843772888, "step": 1291, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0179636478424072 }, { "episode": 20688, "epoch": 0.37185893518352087, "loss/policy_avg": 0.7852039933204651, "lr": 2.7523006134969325e-06, "objective/entropy": 304.3499450683594, "objective/kl": 19.973011016845703, "objective/non_score_reward": -1.9973011016845703, "objective/rlhf_reward": -7.589204466342926, "objective/scores": 0.1, "policy/approxkl_avg": 71.57891845703125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.9033545255661011, "step": 1292, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9971482753753662 }, { "episode": 20704, "epoch": 0.3721465291009095, "loss/policy_avg": 0.7660449743270874, "lr": 2.7521088957055218e-06, "objective/entropy": 363.7418212890625, "objective/kl": 17.045547485351562, "objective/non_score_reward": -1.7045549154281616, "objective/rlhf_reward": -8.818220138549805, "objective/scores": -0.5, "policy/approxkl_avg": 165.32534790039062, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.941403329372406, "step": 1293, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.998640537261963 }, { "episode": 20720, "epoch": 0.37243412301829815, "loss/policy_avg": 0.18262270092964172, "lr": 2.7519171779141106e-06, "objective/entropy": 340.91082763671875, "objective/kl": 15.803611755371094, "objective/non_score_reward": -1.580361247062683, "objective/rlhf_reward": -4.765185623374537, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 74.37960815429688, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8930048942565918, "step": 1294, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.99721360206604 }, { "episode": 20736, "epoch": 0.3727217169356868, "loss/policy_avg": 0.5075985193252563, "lr": 2.7517254601226994e-06, "objective/entropy": 66.2529296875, "objective/kl": 10.169906616210938, "objective/non_score_reward": -1.0169909000396729, "objective/rlhf_reward": -3.6679632425308224, "objective/scores": 0.1, "policy/approxkl_avg": 34.16536331176758, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5018965005874634, "step": 1295, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9997565746307373 }, { "episode": 20752, "epoch": 0.3730093108530755, "loss/policy_avg": 0.4298272132873535, "lr": 2.7515337423312886e-06, "objective/entropy": 20.0439453125, "objective/kl": 13.036293029785156, "objective/non_score_reward": -1.3036293983459473, "objective/rlhf_reward": -7.214517116546631, "objective/scores": -0.5, "policy/approxkl_avg": 15.878606796264648, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6570239067077637, "step": 1296, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9991276264190674 }, { "episode": 20768, "epoch": 0.3732969047704641, "loss/policy_avg": 0.5323415994644165, "lr": 2.7513420245398774e-06, "objective/entropy": 100.38456726074219, "objective/kl": 8.541244506835938, "objective/non_score_reward": -0.8541244268417358, "objective/rlhf_reward": -3.016497766971588, "objective/scores": 0.1, "policy/approxkl_avg": 1.5799808502197266, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.544098973274231, "step": 1297, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0030770301818848 }, { "episode": 20784, "epoch": 0.37358449868785276, "loss/policy_avg": 0.0821521133184433, "lr": 2.7511503067484666e-06, "objective/entropy": -208.57977294921875, "objective/kl": 12.208963394165039, "objective/non_score_reward": -1.2208964824676514, "objective/rlhf_reward": -6.8835859298706055, "objective/scores": -0.5, "policy/approxkl_avg": 8.187795639038086, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6201414465904236, "step": 1298, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.003359317779541 }, { "episode": 20800, "epoch": 0.3738720926052414, "loss/policy_avg": 0.21889813244342804, "lr": 2.7509585889570555e-06, "objective/entropy": -46.03602600097656, "objective/kl": 12.725289344787598, "objective/non_score_reward": -1.272528886795044, "objective/rlhf_reward": -4.690115666389465, "objective/scores": 0.1, "policy/approxkl_avg": 17.959117889404297, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.619624674320221, "step": 1299, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9985125064849854 }, { "episode": 20816, "epoch": 0.37415968652263004, "loss/policy_avg": -0.2133423089981079, "lr": 2.7507668711656443e-06, "objective/entropy": 17.793853759765625, "objective/kl": 12.961200714111328, "objective/non_score_reward": -1.2961199283599854, "objective/rlhf_reward": -0.784479862451553, "objective/scores": 1.1, "policy/approxkl_avg": 15.770217895507812, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5697649717330933, "step": 1300, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999387502670288 }, { "episode": 20832, "epoch": 0.3744472804400187, "loss/policy_avg": 0.21750691533088684, "lr": 2.750575153374233e-06, "objective/entropy": -119.40364074707031, "objective/kl": 9.480939865112305, "objective/non_score_reward": -0.9480940103530884, "objective/rlhf_reward": -3.392376041412353, "objective/scores": 0.1, "policy/approxkl_avg": 5.330104827880859, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.612727165222168, "step": 1301, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.009127616882324 }, { "episode": 20848, "epoch": 0.3747348743574073, "loss/policy_avg": 0.4480114281177521, "lr": 2.750383435582822e-06, "objective/entropy": 16.765975952148438, "objective/kl": 16.303150177001953, "objective/non_score_reward": -1.6303151845932007, "objective/rlhf_reward": -4.398554409221683, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 132.9036865234375, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6376628279685974, "step": 1302, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9977257251739502 }, { "episode": 20864, "epoch": 0.37502246827479596, "loss/policy_avg": -0.39135733246803284, "lr": 2.750191717791411e-06, "objective/entropy": -138.0423126220703, "objective/kl": 9.426933288574219, "objective/non_score_reward": -0.9426934719085693, "objective/rlhf_reward": -2.037440554300944, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 17.444637298583984, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.5734092593193054, "step": 1303, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.2076222896575928 }, { "episode": 20880, "epoch": 0.37531006219218466, "loss/policy_avg": 0.4854779541492462, "lr": 2.75e-06, "objective/entropy": 92.62399291992188, "objective/kl": 16.287330627441406, "objective/non_score_reward": -1.6287332773208618, "objective/rlhf_reward": -3.591213975788328, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 11.808944702148438, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6390659213066101, "step": 1304, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0000646114349365 }, { "episode": 20896, "epoch": 0.3755976561095733, "loss/policy_avg": -0.24735336005687714, "lr": 2.7498082822085887e-06, "objective/entropy": -53.366703033447266, "objective/kl": 10.406715393066406, "objective/non_score_reward": -1.040671467781067, "objective/rlhf_reward": -6.162686347961426, "objective/scores": -0.5, "policy/approxkl_avg": 34.323829650878906, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5506607294082642, "step": 1305, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0012283325195312 }, { "episode": 20912, "epoch": 0.37588525002696194, "loss/policy_avg": 0.5731884241104126, "lr": 2.749616564417178e-06, "objective/entropy": 111.93737030029297, "objective/kl": 17.37570571899414, "objective/non_score_reward": -1.7375702857971191, "objective/rlhf_reward": -8.950281143188477, "objective/scores": -0.5, "policy/approxkl_avg": 77.21607208251953, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7403782606124878, "step": 1306, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9988470077514648 }, { "episode": 20928, "epoch": 0.3761728439443506, "loss/policy_avg": 0.2895098328590393, "lr": 2.7494248466257668e-06, "objective/entropy": 31.602371215820312, "objective/kl": 13.965235710144043, "objective/non_score_reward": -1.3965235948562622, "objective/rlhf_reward": -3.6386830312775924, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 33.449668884277344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6721315383911133, "step": 1307, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998502492904663 }, { "episode": 20944, "epoch": 0.3764604378617392, "loss/policy_avg": -0.17103518545627594, "lr": 2.749233128834356e-06, "objective/entropy": 132.74185180664062, "objective/kl": 20.359312057495117, "objective/non_score_reward": -2.03593111038208, "objective/rlhf_reward": -3.7437247991561886, "objective/scores": 1.1, "policy/approxkl_avg": 118.31222534179688, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.718961775302887, "step": 1308, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.002476692199707 }, { "episode": 20960, "epoch": 0.37674803177912786, "loss/policy_avg": 0.14908567070960999, "lr": 2.749041411042945e-06, "objective/entropy": 244.53402709960938, "objective/kl": 15.694480895996094, "objective/non_score_reward": -1.5694482326507568, "objective/rlhf_reward": -8.277792930603027, "objective/scores": -0.5, "policy/approxkl_avg": 63.194114685058594, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6035882234573364, "step": 1309, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.997267723083496 }, { "episode": 20976, "epoch": 0.3770356256965165, "loss/policy_avg": 0.05589284002780914, "lr": 2.7488496932515336e-06, "objective/entropy": 42.838871002197266, "objective/kl": 14.986385345458984, "objective/non_score_reward": -1.49863862991333, "objective/rlhf_reward": -5.5945545271039006, "objective/scores": 0.1, "policy/approxkl_avg": 78.51327514648438, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.670206606388092, "step": 1310, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9988725185394287 }, { "episode": 20992, "epoch": 0.3773232196139052, "loss/policy_avg": 0.5007076263427734, "lr": 2.748657975460123e-06, "objective/entropy": 176.76974487304688, "objective/kl": 15.967851638793945, "objective/non_score_reward": -1.5967851877212524, "objective/rlhf_reward": -8.387140274047852, "objective/scores": -0.5, "policy/approxkl_avg": 53.00636672973633, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6999363899230957, "step": 1311, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.997910737991333 }, { "episode": 21008, "epoch": 0.37761081353129383, "loss/policy_avg": 1.6480071544647217, "lr": 2.7484662576687117e-06, "objective/entropy": 79.01959228515625, "objective/kl": 17.435348510742188, "objective/non_score_reward": -1.743534803390503, "objective/rlhf_reward": -2.5741390645503994, "objective/scores": 1.1, "policy/approxkl_avg": 99.76893615722656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7191579341888428, "step": 1312, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9991645812988281 }, { "episode": 21024, "epoch": 0.37789840744868247, "loss/policy_avg": 0.16946262121200562, "lr": 2.748274539877301e-06, "objective/entropy": 133.11578369140625, "objective/kl": 9.329315185546875, "objective/non_score_reward": -0.9329314827919006, "objective/rlhf_reward": 0.6682741880416874, "objective/scores": 1.1, "policy/approxkl_avg": 12.421842575073242, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5224270224571228, "step": 1313, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0036096572875977 }, { "episode": 21040, "epoch": 0.3781860013660711, "loss/policy_avg": 0.32205283641815186, "lr": 2.7480828220858897e-06, "objective/entropy": -96.55888366699219, "objective/kl": 11.596528053283691, "objective/non_score_reward": -1.1596527099609375, "objective/rlhf_reward": -0.23861107826232875, "objective/scores": 1.1, "policy/approxkl_avg": 6.196664810180664, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5269286632537842, "step": 1314, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9993153810501099 }, { "episode": 21056, "epoch": 0.37847359528345975, "loss/policy_avg": 0.060142651200294495, "lr": 2.7478911042944785e-06, "objective/entropy": 106.45033264160156, "objective/kl": 9.987686157226562, "objective/non_score_reward": -0.99876868724823, "objective/rlhf_reward": -5.995074272155762, "objective/scores": -0.5, "policy/approxkl_avg": 0.6358002424240112, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6303056478500366, "step": 1315, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.00164794921875 }, { "episode": 21072, "epoch": 0.3787611892008484, "loss/policy_avg": 0.5722357034683228, "lr": 2.7476993865030678e-06, "objective/entropy": 36.441009521484375, "objective/kl": 14.859726905822754, "objective/non_score_reward": -1.4859726428985596, "objective/rlhf_reward": -1.5438909776508805, "objective/scores": 1.1, "policy/approxkl_avg": 233.2808837890625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.736956000328064, "step": 1316, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9973156452178955 }, { "episode": 21088, "epoch": 0.37904878311823703, "loss/policy_avg": 0.14421963691711426, "lr": 2.7475076687116566e-06, "objective/entropy": -57.41680908203125, "objective/kl": 15.156774520874023, "objective/non_score_reward": -1.5156774520874023, "objective/rlhf_reward": -8.06270980834961, "objective/scores": -0.5, "policy/approxkl_avg": 22.829402923583984, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.795988917350769, "step": 1317, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 2.0047948360443115 }, { "episode": 21104, "epoch": 0.37933637703562567, "loss/policy_avg": 0.6621519327163696, "lr": 2.7473159509202454e-06, "objective/entropy": 89.67889404296875, "objective/kl": 16.89303207397461, "objective/non_score_reward": -1.689302921295166, "objective/rlhf_reward": -4.634505877570186, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 178.07943725585938, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8290786147117615, "step": 1318, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0020265579223633 }, { "episode": 21120, "epoch": 0.37962397095301437, "loss/policy_avg": 0.5616532564163208, "lr": 2.7471242331288346e-06, "objective/entropy": 228.52713012695312, "objective/kl": 11.308792114257812, "objective/non_score_reward": -1.1308794021606445, "objective/rlhf_reward": -6.523517608642578, "objective/scores": -0.5, "policy/approxkl_avg": 72.7155532836914, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7285025119781494, "step": 1319, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0004825592041016 }, { "episode": 21136, "epoch": 0.379911564870403, "loss/policy_avg": 0.4463692903518677, "lr": 2.7469325153374234e-06, "objective/entropy": 33.118324279785156, "objective/kl": 16.940200805664062, "objective/non_score_reward": -1.6940197944641113, "objective/rlhf_reward": -2.3760792374610897, "objective/scores": 1.1, "policy/approxkl_avg": 26.495033264160156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.708628237247467, "step": 1320, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0003607273101807 }, { "episode": 21152, "epoch": 0.38019915878779165, "loss/policy_avg": 0.033870283514261246, "lr": 2.7467407975460127e-06, "objective/entropy": 138.80300903320312, "objective/kl": 18.500614166259766, "objective/non_score_reward": -1.8500614166259766, "objective/rlhf_reward": -9.400245666503906, "objective/scores": -0.5, "policy/approxkl_avg": 27.69375991821289, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6441357731819153, "step": 1321, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9995644092559814 }, { "episode": 21168, "epoch": 0.3804867527051803, "loss/policy_avg": 0.3302859663963318, "lr": 2.7465490797546015e-06, "objective/entropy": 294.7465515136719, "objective/kl": 21.378875732421875, "objective/non_score_reward": -2.137887716293335, "objective/rlhf_reward": -8.15155074596405, "objective/scores": 0.1, "policy/approxkl_avg": 118.91703796386719, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.9274790287017822, "step": 1322, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9957250356674194 }, { "episode": 21184, "epoch": 0.3807743466225689, "loss/policy_avg": 0.0874224305152893, "lr": 2.7463573619631903e-06, "objective/entropy": -143.14297485351562, "objective/kl": 10.797538757324219, "objective/non_score_reward": -1.0797538757324219, "objective/rlhf_reward": -6.3190155029296875, "objective/scores": -0.5, "policy/approxkl_avg": 4.874238014221191, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7316051125526428, "step": 1323, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9993560314178467 }, { "episode": 21200, "epoch": 0.38106194053995757, "loss/policy_avg": 0.5921223163604736, "lr": 2.746165644171779e-06, "objective/entropy": 154.961181640625, "objective/kl": 20.293582916259766, "objective/non_score_reward": -2.029358386993408, "objective/rlhf_reward": -10.117433547973633, "objective/scores": -0.5, "policy/approxkl_avg": 93.22239685058594, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6468446254730225, "step": 1324, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9983704090118408 }, { "episode": 21216, "epoch": 0.3813495344573462, "loss/policy_avg": 0.6764448881149292, "lr": 2.745973926380368e-06, "objective/entropy": 45.973533630371094, "objective/kl": 15.151253700256348, "objective/non_score_reward": -1.5151253938674927, "objective/rlhf_reward": -8.060501098632812, "objective/scores": -0.5, "policy/approxkl_avg": 46.96009063720703, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7670845985412598, "step": 1325, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9982659816741943 }, { "episode": 21232, "epoch": 0.3816371283747349, "loss/policy_avg": 0.3867417871952057, "lr": 2.745782208588957e-06, "objective/entropy": -59.67474365234375, "objective/kl": 18.129138946533203, "objective/non_score_reward": -1.8129138946533203, "objective/rlhf_reward": -2.8516556978225704, "objective/scores": 1.1, "policy/approxkl_avg": 18.081439971923828, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5321407318115234, "step": 1326, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0017099380493164 }, { "episode": 21248, "epoch": 0.38192472229212354, "loss/policy_avg": -0.04611814394593239, "lr": 2.745590490797546e-06, "objective/entropy": 61.81705856323242, "objective/kl": 19.817153930664062, "objective/non_score_reward": -1.981715440750122, "objective/rlhf_reward": -7.526861643791198, "objective/scores": 0.1, "policy/approxkl_avg": 112.93687438964844, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7623246908187866, "step": 1327, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9971072673797607 }, { "episode": 21264, "epoch": 0.3822123162095122, "loss/policy_avg": 0.07851407676935196, "lr": 2.7453987730061347e-06, "objective/entropy": -244.60308837890625, "objective/kl": 8.617375373840332, "objective/non_score_reward": -0.8617374897003174, "objective/rlhf_reward": -3.046949928998947, "objective/scores": 0.1, "policy/approxkl_avg": 29.10788345336914, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6370751857757568, "step": 1328, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0000083446502686 }, { "episode": 21280, "epoch": 0.3824999101269008, "loss/policy_avg": 0.13727867603302002, "lr": 2.745207055214724e-06, "objective/entropy": 233.06227111816406, "objective/kl": 12.714912414550781, "objective/non_score_reward": -1.271491289138794, "objective/rlhf_reward": -4.685965275764465, "objective/scores": 0.1, "policy/approxkl_avg": 42.4720458984375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5807084441184998, "step": 1329, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9980757236480713 }, { "episode": 21296, "epoch": 0.38278750404428946, "loss/policy_avg": 0.28195735812187195, "lr": 2.7450153374233128e-06, "objective/entropy": 45.676937103271484, "objective/kl": 8.602317810058594, "objective/non_score_reward": -0.8602317571640015, "objective/rlhf_reward": -3.0409270286560055, "objective/scores": 0.1, "policy/approxkl_avg": 9.06165885925293, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5428619384765625, "step": 1330, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9992284774780273 }, { "episode": 21312, "epoch": 0.3830750979616781, "loss/policy_avg": 0.47195160388946533, "lr": 2.744823619631902e-06, "objective/entropy": -45.77192687988281, "objective/kl": 14.907909393310547, "objective/non_score_reward": -1.490790843963623, "objective/rlhf_reward": -7.963163375854492, "objective/scores": -0.5, "policy/approxkl_avg": 100.64746856689453, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6922006011009216, "step": 1331, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9971425533294678 }, { "episode": 21328, "epoch": 0.38336269187906674, "loss/policy_avg": 0.019790709018707275, "lr": 2.744631901840491e-06, "objective/entropy": 102.41281127929688, "objective/kl": 8.887635231018066, "objective/non_score_reward": -0.8887635469436646, "objective/rlhf_reward": -3.1550543963909146, "objective/scores": 0.1, "policy/approxkl_avg": 1.5302138328552246, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.8031924962997437, "step": 1332, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0033862590789795 }, { "episode": 21344, "epoch": 0.3836502857964554, "loss/policy_avg": 0.20752736926078796, "lr": 2.7444401840490796e-06, "objective/entropy": 373.9801940917969, "objective/kl": 22.242820739746094, "objective/non_score_reward": -2.2242817878723145, "objective/rlhf_reward": -10.897127151489258, "objective/scores": -0.5, "policy/approxkl_avg": 39.84983825683594, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.9056296348571777, "step": 1333, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.998758316040039 }, { "episode": 21360, "epoch": 0.3839378797138441, "loss/policy_avg": 0.4480108618736267, "lr": 2.744248466257669e-06, "objective/entropy": -94.64065551757812, "objective/kl": 17.10127067565918, "objective/non_score_reward": -1.7101271152496338, "objective/rlhf_reward": -5.1786488346463315, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 49.86717224121094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7233811616897583, "step": 1334, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.996772289276123 }, { "episode": 21376, "epoch": 0.3842254736312327, "loss/policy_avg": 0.5475999712944031, "lr": 2.7440567484662577e-06, "objective/entropy": 106.49105834960938, "objective/kl": 11.232261657714844, "objective/non_score_reward": -1.1232261657714844, "objective/rlhf_reward": -6.4929046630859375, "objective/scores": -0.5, "policy/approxkl_avg": 26.136554718017578, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7535625696182251, "step": 1335, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9989811182022095 }, { "episode": 21392, "epoch": 0.38451306754862136, "loss/policy_avg": 0.2780382037162781, "lr": 2.743865030674847e-06, "objective/entropy": 141.95111083984375, "objective/kl": 15.009653091430664, "objective/non_score_reward": -1.5009653568267822, "objective/rlhf_reward": -1.6038616657257077, "objective/scores": 1.1, "policy/approxkl_avg": 40.69712829589844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.46323397755622864, "step": 1336, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.997363805770874 }, { "episode": 21408, "epoch": 0.38480066146601, "loss/policy_avg": 0.5840069055557251, "lr": 2.7436733128834357e-06, "objective/entropy": 66.1385726928711, "objective/kl": 12.383220672607422, "objective/non_score_reward": -1.2383220195770264, "objective/rlhf_reward": -4.55328813791275, "objective/scores": 0.1, "policy/approxkl_avg": 9.937700271606445, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.67143714427948, "step": 1337, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9998342990875244 }, { "episode": 21424, "epoch": 0.38508825538339864, "loss/policy_avg": 0.47393912076950073, "lr": 2.7434815950920245e-06, "objective/entropy": 149.76553344726562, "objective/kl": 22.620647430419922, "objective/non_score_reward": -2.2620649337768555, "objective/rlhf_reward": -8.648259377479553, "objective/scores": 0.1, "policy/approxkl_avg": 125.28185272216797, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.47826266288757324, "step": 1338, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9998018741607666 }, { "episode": 21440, "epoch": 0.3853758493007873, "loss/policy_avg": 0.561253011226654, "lr": 2.7432898773006138e-06, "objective/entropy": 165.69529724121094, "objective/kl": 16.98709487915039, "objective/non_score_reward": -1.6987093687057495, "objective/rlhf_reward": -8.794837951660156, "objective/scores": -0.5, "policy/approxkl_avg": 104.10726928710938, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7110261917114258, "step": 1339, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9979422092437744 }, { "episode": 21456, "epoch": 0.3856634432181759, "loss/policy_avg": 0.08664319664239883, "lr": 2.7430981595092026e-06, "objective/entropy": 135.85597229003906, "objective/kl": 13.391534805297852, "objective/non_score_reward": -1.3391534090042114, "objective/rlhf_reward": -4.956613807380199, "objective/scores": 0.1, "policy/approxkl_avg": 53.051788330078125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5849297046661377, "step": 1340, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9986178874969482 }, { "episode": 21472, "epoch": 0.38595103713556456, "loss/policy_avg": 0.07530111074447632, "lr": 2.7429064417177914e-06, "objective/entropy": 94.53955078125, "objective/kl": 9.170591354370117, "objective/non_score_reward": -0.9170591831207275, "objective/rlhf_reward": 0.7317632079124454, "objective/scores": 1.1, "policy/approxkl_avg": 5.455111980438232, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5276771783828735, "step": 1341, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9987722635269165 }, { "episode": 21488, "epoch": 0.38623863105295325, "loss/policy_avg": -0.18977919220924377, "lr": 2.7427147239263806e-06, "objective/entropy": 84.14942932128906, "objective/kl": 13.785886764526367, "objective/non_score_reward": -1.3785889148712158, "objective/rlhf_reward": -1.1143553912639614, "objective/scores": 1.1, "policy/approxkl_avg": 18.839641571044922, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.7620354294776917, "step": 1342, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.005643129348755 }, { "episode": 21504, "epoch": 0.3865262249703419, "loss/policy_avg": 0.051979582756757736, "lr": 2.7425230061349694e-06, "objective/entropy": 144.39056396484375, "objective/kl": 13.885366439819336, "objective/non_score_reward": -1.3885365724563599, "objective/rlhf_reward": -2.6304271861326427, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 29.781160354614258, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.3581441342830658, "step": 1343, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0010387897491455 }, { "episode": 21520, "epoch": 0.38681381888773053, "loss/policy_avg": 0.032890960574150085, "lr": 2.7423312883435587e-06, "objective/entropy": 148.615478515625, "objective/kl": 7.770719051361084, "objective/non_score_reward": -0.7770719528198242, "objective/rlhf_reward": -5.108287811279297, "objective/scores": -0.5, "policy/approxkl_avg": 29.95543098449707, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6966961026191711, "step": 1344, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0212697982788086 }, { "episode": 21536, "epoch": 0.38710141280511917, "loss/policy_avg": 0.13797160983085632, "lr": 2.742139570552147e-06, "objective/entropy": 287.2220764160156, "objective/kl": 19.492271423339844, "objective/non_score_reward": -1.9492273330688477, "objective/rlhf_reward": -3.396909093856811, "objective/scores": 1.1, "policy/approxkl_avg": 48.7330207824707, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.713855504989624, "step": 1345, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9992468357086182 }, { "episode": 21552, "epoch": 0.3873890067225078, "loss/policy_avg": 0.26439833641052246, "lr": 2.7419478527607363e-06, "objective/entropy": 21.02935791015625, "objective/kl": 14.434762954711914, "objective/non_score_reward": -1.4434764385223389, "objective/rlhf_reward": -7.7739057540893555, "objective/scores": -0.5, "policy/approxkl_avg": 56.537498474121094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7992854118347168, "step": 1346, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9975281953811646 }, { "episode": 21568, "epoch": 0.38767660063989645, "loss/policy_avg": 0.5610612034797668, "lr": 2.741756134969325e-06, "objective/entropy": 51.866172790527344, "objective/kl": 14.561103820800781, "objective/non_score_reward": -1.4561104774475098, "objective/rlhf_reward": -5.42444167137146, "objective/scores": 0.1, "policy/approxkl_avg": 13.282157897949219, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.777137279510498, "step": 1347, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9995806217193604 }, { "episode": 21584, "epoch": 0.3879641945572851, "loss/policy_avg": 0.2028767466545105, "lr": 2.741564417177914e-06, "objective/entropy": 42.17268371582031, "objective/kl": 16.39310073852539, "objective/non_score_reward": -1.6393101215362549, "objective/rlhf_reward": -4.609829376416142, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 51.239471435546875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7365516424179077, "step": 1348, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000267505645752 }, { "episode": 21600, "epoch": 0.3882517884746738, "loss/policy_avg": 0.13323874771595, "lr": 2.741372699386503e-06, "objective/entropy": 181.5269012451172, "objective/kl": 8.048508644104004, "objective/non_score_reward": -0.8048508167266846, "objective/rlhf_reward": -2.819403341412544, "objective/scores": 0.1, "policy/approxkl_avg": 3.953498363494873, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7375788688659668, "step": 1349, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000728130340576 }, { "episode": 21616, "epoch": 0.3885393823920624, "loss/policy_avg": 0.32980287075042725, "lr": 2.741180981595092e-06, "objective/entropy": -87.61666107177734, "objective/kl": 13.81655216217041, "objective/non_score_reward": -1.3816550970077515, "objective/rlhf_reward": -3.701791579994272, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 58.40357971191406, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8026307821273804, "step": 1350, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.001197576522827 }, { "episode": 21632, "epoch": 0.38882697630945107, "loss/policy_avg": 1.0421653985977173, "lr": 2.740989263803681e-06, "objective/entropy": 295.99005126953125, "objective/kl": 13.983240127563477, "objective/non_score_reward": -1.3983240127563477, "objective/rlhf_reward": -3.193296259641647, "objective/scores": 0.6, "policy/approxkl_avg": 161.0973358154297, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6929864883422852, "step": 1351, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9990979433059692 }, { "episode": 21648, "epoch": 0.3891145702268397, "loss/policy_avg": -0.03795725852251053, "lr": 2.74079754601227e-06, "objective/entropy": 205.16946411132812, "objective/kl": 16.07244873046875, "objective/non_score_reward": -1.607244610786438, "objective/rlhf_reward": -6.028978353738784, "objective/scores": 0.1, "policy/approxkl_avg": 34.50408172607422, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8304120302200317, "step": 1352, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9998217821121216 }, { "episode": 21664, "epoch": 0.38940216414422835, "loss/policy_avg": 0.558219850063324, "lr": 2.740605828220859e-06, "objective/entropy": -41.869117736816406, "objective/kl": 12.831666946411133, "objective/non_score_reward": -1.2831666469573975, "objective/rlhf_reward": -4.732666528224945, "objective/scores": 0.1, "policy/approxkl_avg": 51.029197692871094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8420255780220032, "step": 1353, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9978086948394775 }, { "episode": 21680, "epoch": 0.389689758061617, "loss/policy_avg": 0.07326959073543549, "lr": 2.740414110429448e-06, "objective/entropy": 153.06631469726562, "objective/kl": 15.971595764160156, "objective/non_score_reward": -1.597159743309021, "objective/rlhf_reward": -3.4649201377641887, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 31.896522521972656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7210527658462524, "step": 1354, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9981770515441895 }, { "episode": 21696, "epoch": 0.3899773519790056, "loss/policy_avg": 0.4767560064792633, "lr": 2.740222392638037e-06, "objective/entropy": 231.67868041992188, "objective/kl": 16.846141815185547, "objective/non_score_reward": -1.6846144199371338, "objective/rlhf_reward": -5.005124107996622, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 137.12591552734375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.764703094959259, "step": 1355, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9972622394561768 }, { "episode": 21712, "epoch": 0.39026494589639427, "loss/policy_avg": 0.11291810870170593, "lr": 2.7400306748466256e-06, "objective/entropy": 142.6898956298828, "objective/kl": 17.51624298095703, "objective/non_score_reward": -1.751624345779419, "objective/rlhf_reward": -6.606497681140899, "objective/scores": 0.1, "policy/approxkl_avg": 38.98486328125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6645491123199463, "step": 1356, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9972184896469116 }, { "episode": 21728, "epoch": 0.39055253981378296, "loss/policy_avg": 0.4059602618217468, "lr": 2.739838957055215e-06, "objective/entropy": 133.1064453125, "objective/kl": 14.388803482055664, "objective/non_score_reward": -1.438880443572998, "objective/rlhf_reward": -2.8318024619829387, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 34.9724006652832, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7247166633605957, "step": 1357, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998490810394287 }, { "episode": 21744, "epoch": 0.3908401337311716, "loss/policy_avg": 0.1599498689174652, "lr": 2.7396472392638037e-06, "objective/entropy": 46.026268005371094, "objective/kl": 7.916892051696777, "objective/non_score_reward": -0.7916892766952515, "objective/rlhf_reward": -5.166757106781006, "objective/scores": -0.5, "policy/approxkl_avg": 10.668688774108887, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6974193453788757, "step": 1358, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0124077796936035 }, { "episode": 21760, "epoch": 0.39112772764856024, "loss/policy_avg": 0.11326786130666733, "lr": 2.739455521472393e-06, "objective/entropy": 139.10223388671875, "objective/kl": 18.496702194213867, "objective/non_score_reward": -1.84967041015625, "objective/rlhf_reward": -2.9986812531948086, "objective/scores": 1.1, "policy/approxkl_avg": 94.51484680175781, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8132376670837402, "step": 1359, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9987239837646484 }, { "episode": 21776, "epoch": 0.3914153215659489, "loss/policy_avg": 0.12141874432563782, "lr": 2.7392638036809817e-06, "objective/entropy": 65.53262329101562, "objective/kl": 17.54939842224121, "objective/non_score_reward": -1.7549399137496948, "objective/rlhf_reward": -6.619759654998779, "objective/scores": 0.1, "policy/approxkl_avg": 71.7486572265625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6637197732925415, "step": 1360, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0016891956329346 }, { "episode": 21792, "epoch": 0.3917029154833375, "loss/policy_avg": 0.07374750077724457, "lr": 2.7390720858895705e-06, "objective/entropy": -212.7354736328125, "objective/kl": 8.046335220336914, "objective/non_score_reward": -0.8046334981918335, "objective/rlhf_reward": -2.818534111976623, "objective/scores": 0.1, "policy/approxkl_avg": 10.865436553955078, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7911792993545532, "step": 1361, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9985687732696533 }, { "episode": 21808, "epoch": 0.39199050940072616, "loss/policy_avg": 0.7190690040588379, "lr": 2.7388803680981598e-06, "objective/entropy": 244.15914916992188, "objective/kl": 7.830224990844727, "objective/non_score_reward": -0.7830224633216858, "objective/rlhf_reward": -1.1846786839532215, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 8.115243911743164, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6153068542480469, "step": 1362, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.000927209854126 }, { "episode": 21824, "epoch": 0.3922781033181148, "loss/policy_avg": 0.2109440267086029, "lr": 2.7386886503067486e-06, "objective/entropy": -58.729270935058594, "objective/kl": 21.156822204589844, "objective/non_score_reward": -2.1156821250915527, "objective/rlhf_reward": -4.062729096412658, "objective/scores": 1.1, "policy/approxkl_avg": 108.97077941894531, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6496853828430176, "step": 1363, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0006513595581055 }, { "episode": 21840, "epoch": 0.3925656972355035, "loss/policy_avg": -0.1141444593667984, "lr": 2.738496932515338e-06, "objective/entropy": 30.81436538696289, "objective/kl": 17.293773651123047, "objective/non_score_reward": -1.7293777465820312, "objective/rlhf_reward": -5.09268179831178, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 40.24468994140625, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5824557542800903, "step": 1364, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9977633953094482 }, { "episode": 21856, "epoch": 0.39285329115289214, "loss/policy_avg": 0.20132741332054138, "lr": 2.7383052147239266e-06, "objective/entropy": 103.67611694335938, "objective/kl": 13.289691925048828, "objective/non_score_reward": -1.3289692401885986, "objective/rlhf_reward": -4.915877020359039, "objective/scores": 0.1, "policy/approxkl_avg": 5.196710586547852, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.62409508228302, "step": 1365, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9994707107543945 }, { "episode": 21872, "epoch": 0.3931408850702808, "loss/policy_avg": 0.20989450812339783, "lr": 2.7381134969325154e-06, "objective/entropy": 99.08970642089844, "objective/kl": 14.397649765014648, "objective/non_score_reward": -1.4397649765014648, "objective/rlhf_reward": -5.359060144424438, "objective/scores": 0.1, "policy/approxkl_avg": 47.96294403076172, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7722165584564209, "step": 1366, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.997646689414978 }, { "episode": 21888, "epoch": 0.3934284789876694, "loss/policy_avg": 0.17023280262947083, "lr": 2.7379217791411042e-06, "objective/entropy": 146.130615234375, "objective/kl": 14.226318359375, "objective/non_score_reward": -1.422631859779358, "objective/rlhf_reward": -4.134268193450525, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 30.81444549560547, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5660938024520874, "step": 1367, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0002405643463135 }, { "episode": 21904, "epoch": 0.39371607290505806, "loss/policy_avg": 0.38309454917907715, "lr": 2.737730061349693e-06, "objective/entropy": 90.76017761230469, "objective/kl": 13.75072956085205, "objective/non_score_reward": -1.375072956085205, "objective/rlhf_reward": -1.1002921894192692, "objective/scores": 1.1, "policy/approxkl_avg": 34.77094268798828, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8311518430709839, "step": 1368, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0005173683166504 }, { "episode": 21920, "epoch": 0.3940036668224467, "loss/policy_avg": 0.24230515956878662, "lr": 2.7375383435582823e-06, "objective/entropy": 248.60150146484375, "objective/kl": 15.170378684997559, "objective/non_score_reward": -1.5170379877090454, "objective/rlhf_reward": -3.1444328769457073, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 2.941917657852173, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6488876938819885, "step": 1369, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.001039743423462 }, { "episode": 21936, "epoch": 0.39429126073983534, "loss/policy_avg": 1.0242424011230469, "lr": 2.737346625766871e-06, "objective/entropy": 74.42791748046875, "objective/kl": 14.777740478515625, "objective/non_score_reward": -1.4777741432189941, "objective/rlhf_reward": -3.5110965132713314, "objective/scores": 0.6, "policy/approxkl_avg": 65.65885162353516, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.595373272895813, "step": 1370, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9968748092651367 }, { "episode": 21952, "epoch": 0.394578854657224, "loss/policy_avg": 0.04835711419582367, "lr": 2.73715490797546e-06, "objective/entropy": 134.174560546875, "objective/kl": 13.336694717407227, "objective/non_score_reward": -1.3336694240570068, "objective/rlhf_reward": -4.9346778154373165, "objective/scores": 0.1, "policy/approxkl_avg": 29.234766006469727, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7185621857643127, "step": 1371, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0065627098083496 }, { "episode": 21968, "epoch": 0.39486644857461267, "loss/policy_avg": 0.2938240170478821, "lr": 2.736963190184049e-06, "objective/entropy": -30.205530166625977, "objective/kl": 17.3317928314209, "objective/non_score_reward": -1.7331793308258057, "objective/rlhf_reward": -5.1993836919466645, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 119.18181610107422, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6755663156509399, "step": 1372, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999345302581787 }, { "episode": 21984, "epoch": 0.3951540424920013, "loss/policy_avg": 0.3882046937942505, "lr": 2.736771472392638e-06, "objective/entropy": 97.84891510009766, "objective/kl": 13.398181915283203, "objective/non_score_reward": -1.3398182392120361, "objective/rlhf_reward": -3.4118618769215896, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 45.98151779174805, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5983450412750244, "step": 1373, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9974939823150635 }, { "episode": 22000, "epoch": 0.39544163640938995, "loss/policy_avg": 0.3626946210861206, "lr": 2.736579754601227e-06, "objective/entropy": 151.31686401367188, "objective/kl": 14.419286727905273, "objective/non_score_reward": -1.4419286251068115, "objective/rlhf_reward": -1.3677145004272457, "objective/scores": 1.1, "policy/approxkl_avg": 9.240350723266602, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4013226628303528, "step": 1374, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.00012469291687 }, { "episode": 22016, "epoch": 0.3957292303267786, "loss/policy_avg": -0.1479598432779312, "lr": 2.736388036809816e-06, "objective/entropy": -0.02170562744140625, "objective/kl": 12.707273483276367, "objective/non_score_reward": -1.2707273960113525, "objective/rlhf_reward": -2.1591904580008716, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 17.358325958251953, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6573648452758789, "step": 1375, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9988689422607422 }, { "episode": 22032, "epoch": 0.39601682424416723, "loss/policy_avg": -0.3523472249507904, "lr": 2.736196319018405e-06, "objective/entropy": 113.51665496826172, "objective/kl": 12.600326538085938, "objective/non_score_reward": -1.2600325345993042, "objective/rlhf_reward": -2.1164110049020977, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 37.17249298095703, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5286760926246643, "step": 1376, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.002790927886963 }, { "episode": 22048, "epoch": 0.39630441816155587, "loss/policy_avg": 0.06261734664440155, "lr": 2.736004601226994e-06, "objective/entropy": 236.24249267578125, "objective/kl": 8.30074691772461, "objective/non_score_reward": -0.830074667930603, "objective/rlhf_reward": -2.920298492908478, "objective/scores": 0.1, "policy/approxkl_avg": 3.030731439590454, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.586647629737854, "step": 1377, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0025887489318848 }, { "episode": 22064, "epoch": 0.3965920120789445, "loss/policy_avg": -0.276256799697876, "lr": 2.735812883435583e-06, "objective/entropy": 147.0058135986328, "objective/kl": 14.008570671081543, "objective/non_score_reward": -1.4008569717407227, "objective/rlhf_reward": -3.203427946567535, "objective/scores": 0.6, "policy/approxkl_avg": 46.215850830078125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6285009980201721, "step": 1378, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.008394479751587 }, { "episode": 22080, "epoch": 0.39687960599633315, "loss/policy_avg": 0.13082417845726013, "lr": 2.7356211656441717e-06, "objective/entropy": -202.0, "objective/kl": 11.788710594177246, "objective/non_score_reward": -1.1788710355758667, "objective/rlhf_reward": -0.31548414230346644, "objective/scores": 1.1, "policy/approxkl_avg": 3.524956703186035, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8542511463165283, "step": 1379, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9987741708755493 }, { "episode": 22096, "epoch": 0.39716719991372185, "loss/policy_avg": 0.095906101167202, "lr": 2.735429447852761e-06, "objective/entropy": 80.37876892089844, "objective/kl": 14.221534729003906, "objective/non_score_reward": -1.4221534729003906, "objective/rlhf_reward": -5.288614249229431, "objective/scores": 0.1, "policy/approxkl_avg": 3.4239659309387207, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5438545942306519, "step": 1380, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.001100778579712 }, { "episode": 22112, "epoch": 0.3974547938311105, "loss/policy_avg": 0.7796217203140259, "lr": 2.7352377300613497e-06, "objective/entropy": -26.782432556152344, "objective/kl": 10.952417373657227, "objective/non_score_reward": -1.0952417850494385, "objective/rlhf_reward": -1.4572480961096015, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 52.988441467285156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8282052278518677, "step": 1381, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9960274696350098 }, { "episode": 22128, "epoch": 0.3977423877484991, "loss/policy_avg": 0.33992427587509155, "lr": 2.735046012269939e-06, "objective/entropy": 25.139671325683594, "objective/kl": 12.910887718200684, "objective/non_score_reward": -1.2910888195037842, "objective/rlhf_reward": -7.164355278015137, "objective/scores": -0.5, "policy/approxkl_avg": 67.68551635742188, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.633919894695282, "step": 1382, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0026700496673584 }, { "episode": 22144, "epoch": 0.39802998166588777, "loss/policy_avg": 0.20044741034507751, "lr": 2.7348542944785277e-06, "objective/entropy": -16.74477767944336, "objective/kl": 15.490436553955078, "objective/non_score_reward": -1.5490437746047974, "objective/rlhf_reward": -5.796175158023834, "objective/scores": 0.1, "policy/approxkl_avg": 13.735712051391602, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.637913703918457, "step": 1383, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9990465641021729 }, { "episode": 22160, "epoch": 0.3983175755832764, "loss/policy_avg": -0.0019893571734428406, "lr": 2.7346625766871165e-06, "objective/entropy": -15.93057632446289, "objective/kl": 14.116677284240723, "objective/non_score_reward": -1.4116679430007935, "objective/rlhf_reward": -7.646671772003174, "objective/scores": -0.5, "policy/approxkl_avg": 22.67679214477539, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7569680213928223, "step": 1384, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9990018606185913 }, { "episode": 22176, "epoch": 0.39860516950066505, "loss/policy_avg": 0.8097996115684509, "lr": 2.7344708588957058e-06, "objective/entropy": 65.41546630859375, "objective/kl": 14.531415939331055, "objective/non_score_reward": -1.453141689300537, "objective/rlhf_reward": -7.812566757202148, "objective/scores": -0.5, "policy/approxkl_avg": 40.36127853393555, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6726914048194885, "step": 1385, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9987540245056152 }, { "episode": 22192, "epoch": 0.3988927634180537, "loss/policy_avg": 0.2525538206100464, "lr": 2.7342791411042946e-06, "objective/entropy": -36.927734375, "objective/kl": 15.043611526489258, "objective/non_score_reward": -1.5043611526489258, "objective/rlhf_reward": -8.017444610595703, "objective/scores": -0.5, "policy/approxkl_avg": 10.599006652832031, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.804946780204773, "step": 1386, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9973516464233398 }, { "episode": 22208, "epoch": 0.3991803573354424, "loss/policy_avg": 0.04792652279138565, "lr": 2.734087423312884e-06, "objective/entropy": 257.07891845703125, "objective/kl": 17.709779739379883, "objective/non_score_reward": -1.7709778547286987, "objective/rlhf_reward": -6.683911597728729, "objective/scores": 0.1, "policy/approxkl_avg": 89.76461029052734, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6373671889305115, "step": 1387, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9984499216079712 }, { "episode": 22224, "epoch": 0.399467951252831, "loss/policy_avg": 0.22276929020881653, "lr": 2.7338957055214726e-06, "objective/entropy": 262.33367919921875, "objective/kl": 20.37179946899414, "objective/non_score_reward": -2.037179946899414, "objective/rlhf_reward": -3.748719549179077, "objective/scores": 1.1, "policy/approxkl_avg": 57.362403869628906, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7482947707176208, "step": 1388, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9991297721862793 }, { "episode": 22240, "epoch": 0.39975554517021966, "loss/policy_avg": 0.013198129832744598, "lr": 2.7337039877300614e-06, "objective/entropy": 56.792144775390625, "objective/kl": 17.05535888671875, "objective/non_score_reward": -1.7055360078811646, "objective/rlhf_reward": -3.898425136448118, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 2.213428258895874, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5146496295928955, "step": 1389, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000297784805298 }, { "episode": 22256, "epoch": 0.4000431390876083, "loss/policy_avg": 0.7818821668624878, "lr": 2.7335122699386503e-06, "objective/entropy": 14.627386093139648, "objective/kl": 11.377643585205078, "objective/non_score_reward": -1.1377642154693604, "objective/rlhf_reward": -1.6273382350218024, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 53.005706787109375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6652359962463379, "step": 1390, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.997227668762207 }, { "episode": 22272, "epoch": 0.40033073300499694, "loss/policy_avg": 0.6206638813018799, "lr": 2.733320552147239e-06, "objective/entropy": 60.79871368408203, "objective/kl": 15.99708366394043, "objective/non_score_reward": -1.5997084379196167, "objective/rlhf_reward": -5.998833811283111, "objective/scores": 0.1, "policy/approxkl_avg": 117.21906280517578, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6628750562667847, "step": 1391, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9984533786773682 }, { "episode": 22288, "epoch": 0.4006183269223856, "loss/policy_avg": -0.016587669029831886, "lr": 2.7331288343558283e-06, "objective/entropy": -194.74586486816406, "objective/kl": 10.784317016601562, "objective/non_score_reward": -1.0784317255020142, "objective/rlhf_reward": -2.757467686143473, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 32.546348571777344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7876214385032654, "step": 1392, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9989862442016602 }, { "episode": 22304, "epoch": 0.4009059208397742, "loss/policy_avg": 0.2942638099193573, "lr": 2.732937116564417e-06, "objective/entropy": -21.264427185058594, "objective/kl": 14.427265167236328, "objective/non_score_reward": -1.4427263736724854, "objective/rlhf_reward": -3.3709056735038754, "objective/scores": 0.6, "policy/approxkl_avg": 108.87052154541016, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8786087036132812, "step": 1393, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.996511459350586 }, { "episode": 22320, "epoch": 0.40119351475716286, "loss/policy_avg": 0.4861958622932434, "lr": 2.732745398773006e-06, "objective/entropy": -2.8598480224609375, "objective/kl": 17.84203338623047, "objective/non_score_reward": -1.784203290939331, "objective/rlhf_reward": -6.736813253164291, "objective/scores": 0.1, "policy/approxkl_avg": 70.02932739257812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.599122166633606, "step": 1394, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9993877410888672 }, { "episode": 22336, "epoch": 0.40148110867455156, "loss/policy_avg": 0.0992339476943016, "lr": 2.732553680981595e-06, "objective/entropy": -72.45336151123047, "objective/kl": 12.332319259643555, "objective/non_score_reward": -1.2332319021224976, "objective/rlhf_reward": -2.532927787303924, "objective/scores": 0.6, "policy/approxkl_avg": 45.31598663330078, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7159440517425537, "step": 1395, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9985222816467285 }, { "episode": 22352, "epoch": 0.4017687025919402, "loss/policy_avg": -0.0790301039814949, "lr": 2.732361963190184e-06, "objective/entropy": 32.95172119140625, "objective/kl": 16.841075897216797, "objective/non_score_reward": -1.6841075420379639, "objective/rlhf_reward": -6.336429929733276, "objective/scores": 0.1, "policy/approxkl_avg": 49.57403564453125, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.7559831142425537, "step": 1396, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.001506805419922 }, { "episode": 22368, "epoch": 0.40205629650932884, "loss/policy_avg": 0.03451812267303467, "lr": 2.732170245398773e-06, "objective/entropy": -18.881187438964844, "objective/kl": 15.064776420593262, "objective/non_score_reward": -1.5064775943756104, "objective/rlhf_reward": -8.025910377502441, "objective/scores": -0.5, "policy/approxkl_avg": 56.9893798828125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4877445101737976, "step": 1397, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9996527433395386 }, { "episode": 22384, "epoch": 0.4023438904267175, "loss/policy_avg": 0.05204878747463226, "lr": 2.731978527607362e-06, "objective/entropy": 200.4969482421875, "objective/kl": 14.600479125976562, "objective/non_score_reward": -1.460047960281372, "objective/rlhf_reward": -7.840191841125488, "objective/scores": -0.5, "policy/approxkl_avg": 7.002072334289551, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7866439819335938, "step": 1398, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.002009868621826 }, { "episode": 22400, "epoch": 0.4026314843441061, "loss/policy_avg": -0.12847991287708282, "lr": 2.731786809815951e-06, "objective/entropy": 79.1689224243164, "objective/kl": 18.852502822875977, "objective/non_score_reward": -1.8852503299713135, "objective/rlhf_reward": -7.141001439094543, "objective/scores": 0.1, "policy/approxkl_avg": 13.583232879638672, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.833438515663147, "step": 1399, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000863552093506 }, { "episode": 22416, "epoch": 0.40291907826149476, "loss/policy_avg": 0.2065950632095337, "lr": 2.73159509202454e-06, "objective/entropy": 3.9622802734375, "objective/kl": 11.335432052612305, "objective/non_score_reward": -1.1335433721542358, "objective/rlhf_reward": -6.534173488616943, "objective/scores": -0.5, "policy/approxkl_avg": 42.01820755004883, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.81622713804245, "step": 1400, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9971067905426025 }, { "episode": 22432, "epoch": 0.4032066721788834, "loss/policy_avg": 0.13497845828533173, "lr": 2.731403374233129e-06, "objective/entropy": -37.79541015625, "objective/kl": 17.951873779296875, "objective/non_score_reward": -1.7951874732971191, "objective/rlhf_reward": -9.180749893188477, "objective/scores": -0.5, "policy/approxkl_avg": 58.593788146972656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5796679854393005, "step": 1401, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.996742844581604 }, { "episode": 22448, "epoch": 0.40349426609627204, "loss/policy_avg": 0.27193281054496765, "lr": 2.731211656441718e-06, "objective/entropy": 174.13641357421875, "objective/kl": 13.39457893371582, "objective/non_score_reward": -1.3394579887390137, "objective/rlhf_reward": -3.2351258418717723, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 26.27016830444336, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7780320644378662, "step": 1402, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9996938705444336 }, { "episode": 22464, "epoch": 0.40378186001366073, "loss/policy_avg": 2.09384822845459, "lr": 2.731019938650307e-06, "objective/entropy": -58.64219665527344, "objective/kl": 10.139740943908691, "objective/non_score_reward": -1.0139741897583008, "objective/rlhf_reward": -2.451776567761021, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 29.046207427978516, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6864455938339233, "step": 1403, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0002877712249756 }, { "episode": 22480, "epoch": 0.40406945393104937, "loss/policy_avg": -0.19483403861522675, "lr": 2.7308282208588957e-06, "objective/entropy": -197.337158203125, "objective/kl": 15.854147911071777, "objective/non_score_reward": -1.5854146480560303, "objective/rlhf_reward": -5.94165871143341, "objective/scores": 0.1, "policy/approxkl_avg": 154.90176391601562, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.47332537174224854, "step": 1404, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9990125894546509 }, { "episode": 22496, "epoch": 0.404357047848438, "loss/policy_avg": 0.23371820151805878, "lr": 2.730636503067485e-06, "objective/entropy": 301.93804931640625, "objective/kl": 21.32901382446289, "objective/non_score_reward": -2.132901668548584, "objective/rlhf_reward": -10.531606674194336, "objective/scores": -0.5, "policy/approxkl_avg": 10.739236831665039, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.8294570446014404, "step": 1405, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.999563217163086 }, { "episode": 22512, "epoch": 0.40464464176582665, "loss/policy_avg": 0.16459359228610992, "lr": 2.7304447852760737e-06, "objective/entropy": 308.935791015625, "objective/kl": 19.416046142578125, "objective/non_score_reward": -1.941604733467102, "objective/rlhf_reward": -9.76641845703125, "objective/scores": -0.5, "policy/approxkl_avg": 69.8115234375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.786815881729126, "step": 1406, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.998669147491455 }, { "episode": 22528, "epoch": 0.4049322356832153, "loss/policy_avg": 0.26910632848739624, "lr": 2.7302530674846626e-06, "objective/entropy": 21.283084869384766, "objective/kl": 11.796088218688965, "objective/non_score_reward": -1.1796088218688965, "objective/rlhf_reward": -3.0565758399373157, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 6.656482219696045, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6182963848114014, "step": 1407, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9966087341308594 }, { "episode": 22544, "epoch": 0.40521982960060393, "loss/policy_avg": 0.7190654873847961, "lr": 2.7300613496932518e-06, "objective/entropy": -6.668407440185547, "objective/kl": 18.76657485961914, "objective/non_score_reward": -1.8766577243804932, "objective/rlhf_reward": -7.106630867719651, "objective/scores": 0.1, "policy/approxkl_avg": 211.33731079101562, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6118903756141663, "step": 1408, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998753547668457 }, { "episode": 22560, "epoch": 0.40550742351799257, "loss/policy_avg": 1.4786778688430786, "lr": 2.7298696319018406e-06, "objective/entropy": -25.83483123779297, "objective/kl": 18.464750289916992, "objective/non_score_reward": -1.8464751243591309, "objective/rlhf_reward": -6.985900437831878, "objective/scores": 0.1, "policy/approxkl_avg": 110.1044692993164, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5393742322921753, "step": 1409, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9991658926010132 }, { "episode": 22576, "epoch": 0.40579501743538127, "loss/policy_avg": 0.15572980046272278, "lr": 2.72967791411043e-06, "objective/entropy": -167.088134765625, "objective/kl": 13.024639129638672, "objective/non_score_reward": -1.3024640083312988, "objective/rlhf_reward": -2.286137257457945, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 51.253448486328125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6638354659080505, "step": 1410, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000250816345215 }, { "episode": 22592, "epoch": 0.4060826113527699, "loss/policy_avg": -0.009990356862545013, "lr": 2.7294861963190186e-06, "objective/entropy": -264.7129821777344, "objective/kl": 14.578704833984375, "objective/non_score_reward": -1.4578704833984375, "objective/rlhf_reward": -5.431481993198394, "objective/scores": 0.1, "policy/approxkl_avg": 1.5859309434890747, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7239521741867065, "step": 1411, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0019497871398926 }, { "episode": 22608, "epoch": 0.40637020527015855, "loss/policy_avg": 0.5303527116775513, "lr": 2.7292944785276074e-06, "objective/entropy": -19.109874725341797, "objective/kl": 13.917287826538086, "objective/non_score_reward": -1.3917287588119507, "objective/rlhf_reward": -5.166914796829223, "objective/scores": 0.1, "policy/approxkl_avg": 28.013973236083984, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.47340190410614014, "step": 1412, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0010433197021484 }, { "episode": 22624, "epoch": 0.4066577991875472, "loss/policy_avg": 0.1528145968914032, "lr": 2.7291027607361963e-06, "objective/entropy": 87.03872680664062, "objective/kl": 13.098251342773438, "objective/non_score_reward": -1.30982506275177, "objective/rlhf_reward": -2.8393002212047573, "objective/scores": 0.6, "policy/approxkl_avg": 130.48788452148438, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.508557915687561, "step": 1413, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9990100860595703 }, { "episode": 22640, "epoch": 0.4069453931049358, "loss/policy_avg": -0.11118362098932266, "lr": 2.728911042944785e-06, "objective/entropy": 36.70410919189453, "objective/kl": 12.045846939086914, "objective/non_score_reward": -1.2045847177505493, "objective/rlhf_reward": -6.818338871002197, "objective/scores": -0.5, "policy/approxkl_avg": 22.107789993286133, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.723143458366394, "step": 1414, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0014891624450684 }, { "episode": 22656, "epoch": 0.40723298702232447, "loss/policy_avg": 0.1664956957101822, "lr": 2.7287193251533743e-06, "objective/entropy": 195.51280212402344, "objective/kl": 16.488208770751953, "objective/non_score_reward": -1.6488208770751953, "objective/rlhf_reward": -8.595283508300781, "objective/scores": -0.5, "policy/approxkl_avg": 62.559993743896484, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.9423944354057312, "step": 1415, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9988129138946533 }, { "episode": 22672, "epoch": 0.4075205809397131, "loss/policy_avg": 0.18345528841018677, "lr": 2.728527607361963e-06, "objective/entropy": -121.7177734375, "objective/kl": 11.121731758117676, "objective/non_score_reward": -1.1121731996536255, "objective/rlhf_reward": -0.04869285821914637, "objective/scores": 1.1, "policy/approxkl_avg": 20.33164405822754, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6241345405578613, "step": 1416, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9980192184448242 }, { "episode": 22688, "epoch": 0.40780817485710175, "loss/policy_avg": -0.056718356907367706, "lr": 2.7283358895705523e-06, "objective/entropy": 34.303070068359375, "objective/kl": 12.5802583694458, "objective/non_score_reward": -1.258025884628296, "objective/rlhf_reward": -2.6321037769317623, "objective/scores": 0.6, "policy/approxkl_avg": 6.837141036987305, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.74076908826828, "step": 1417, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.003502368927002 }, { "episode": 22704, "epoch": 0.40809576877449044, "loss/policy_avg": 0.34876778721809387, "lr": 2.728144171779141e-06, "objective/entropy": 141.2511749267578, "objective/kl": 16.881202697753906, "objective/non_score_reward": -1.6881201267242432, "objective/rlhf_reward": -4.927651996883462, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 30.830556869506836, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5224217176437378, "step": 1418, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000737190246582 }, { "episode": 22720, "epoch": 0.4083833626918791, "loss/policy_avg": 0.1370847225189209, "lr": 2.72795245398773e-06, "objective/entropy": 131.04425048828125, "objective/kl": 12.592981338500977, "objective/non_score_reward": -1.259298324584961, "objective/rlhf_reward": -7.037193298339844, "objective/scores": -0.5, "policy/approxkl_avg": 79.68668365478516, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7526917457580566, "step": 1419, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9970409870147705 }, { "episode": 22736, "epoch": 0.4086709566092677, "loss/policy_avg": 0.7799844741821289, "lr": 2.727760736196319e-06, "objective/entropy": 110.47119140625, "objective/kl": 18.634571075439453, "objective/non_score_reward": -1.8634570837020874, "objective/rlhf_reward": -9.453828811645508, "objective/scores": -0.5, "policy/approxkl_avg": 91.78942108154297, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5299524068832397, "step": 1420, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.996536135673523 }, { "episode": 22752, "epoch": 0.40895855052665636, "loss/policy_avg": 0.27093034982681274, "lr": 2.727569018404908e-06, "objective/entropy": 98.92936706542969, "objective/kl": 6.821386814117432, "objective/non_score_reward": -0.6821386814117432, "objective/rlhf_reward": -0.3285546362400056, "objective/scores": 0.6, "policy/approxkl_avg": 32.1151123046875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5731009244918823, "step": 1421, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9969843626022339 }, { "episode": 22768, "epoch": 0.409246144444045, "loss/policy_avg": 0.10417535901069641, "lr": 2.727377300613497e-06, "objective/entropy": -63.33757781982422, "objective/kl": 14.846542358398438, "objective/non_score_reward": -1.484654188156128, "objective/rlhf_reward": -7.9386162757873535, "objective/scores": -0.5, "policy/approxkl_avg": 85.31866455078125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5758928060531616, "step": 1422, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9978985786437988 }, { "episode": 22784, "epoch": 0.40953373836143364, "loss/policy_avg": 0.028347529470920563, "lr": 2.727185582822086e-06, "objective/entropy": 2.5661659240722656, "objective/kl": 21.124889373779297, "objective/non_score_reward": -2.1124889850616455, "objective/rlhf_reward": -4.049956119060516, "objective/scores": 1.1, "policy/approxkl_avg": 97.22557067871094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8526643514633179, "step": 1423, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998213768005371 }, { "episode": 22800, "epoch": 0.4098213322788223, "loss/policy_avg": -0.12718230485916138, "lr": 2.726993865030675e-06, "objective/entropy": 271.2698059082031, "objective/kl": 15.589737892150879, "objective/non_score_reward": -1.558973789215088, "objective/rlhf_reward": -8.235895156860352, "objective/scores": -0.5, "policy/approxkl_avg": 34.46259307861328, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7287672758102417, "step": 1424, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.00382661819458 }, { "episode": 22816, "epoch": 0.410108926196211, "loss/policy_avg": 0.017169015482068062, "lr": 2.726802147239264e-06, "objective/entropy": 83.00675201416016, "objective/kl": 19.48290252685547, "objective/non_score_reward": -1.9482901096343994, "objective/rlhf_reward": -3.393160572648048, "objective/scores": 1.1, "policy/approxkl_avg": 7.972476959228516, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5477663278579712, "step": 1425, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.99813711643219 }, { "episode": 22832, "epoch": 0.4103965201135996, "loss/policy_avg": 0.08301222324371338, "lr": 2.726610429447853e-06, "objective/entropy": 29.15322494506836, "objective/kl": 16.26679801940918, "objective/non_score_reward": -1.62667977809906, "objective/rlhf_reward": -8.506719589233398, "objective/scores": -0.5, "policy/approxkl_avg": 80.79837799072266, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6539818048477173, "step": 1426, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0011935234069824 }, { "episode": 22848, "epoch": 0.41068411403098826, "loss/policy_avg": 0.18329089879989624, "lr": 2.7264187116564417e-06, "objective/entropy": -10.258651733398438, "objective/kl": 12.080020904541016, "objective/non_score_reward": -1.2080020904541016, "objective/rlhf_reward": -3.0071794643727054, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 25.368099212646484, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.47818294167518616, "step": 1427, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9991447925567627 }, { "episode": 22864, "epoch": 0.4109717079483769, "loss/policy_avg": 1.2192856073379517, "lr": 2.726226993865031e-06, "objective/entropy": 100.44667053222656, "objective/kl": 11.624716758728027, "objective/non_score_reward": -1.1624715328216553, "objective/rlhf_reward": -4.249886131286621, "objective/scores": 0.1, "policy/approxkl_avg": 85.25991821289062, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7524755001068115, "step": 1428, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0001072883605957 }, { "episode": 22880, "epoch": 0.41125930186576554, "loss/policy_avg": 0.07868831604719162, "lr": 2.7260352760736197e-06, "objective/entropy": -73.34170532226562, "objective/kl": 16.806835174560547, "objective/non_score_reward": -1.6806833744049072, "objective/rlhf_reward": -2.3227337360382077, "objective/scores": 1.1, "policy/approxkl_avg": 62.879486083984375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5852820873260498, "step": 1429, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9978783130645752 }, { "episode": 22896, "epoch": 0.4115468957831542, "loss/policy_avg": 0.5681151151657104, "lr": 2.7258435582822086e-06, "objective/entropy": 109.1818618774414, "objective/kl": 12.72823429107666, "objective/non_score_reward": -1.2728233337402344, "objective/rlhf_reward": -0.6912934094667431, "objective/scores": 1.1, "policy/approxkl_avg": 60.83876037597656, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.51080322265625, "step": 1430, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.997774600982666 }, { "episode": 22912, "epoch": 0.4118344897005428, "loss/policy_avg": 0.3500422239303589, "lr": 2.725651840490798e-06, "objective/entropy": 66.81685638427734, "objective/kl": 14.291481971740723, "objective/non_score_reward": -1.4291484355926514, "objective/rlhf_reward": -7.7165937423706055, "objective/scores": -0.5, "policy/approxkl_avg": 5.064986228942871, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7346335649490356, "step": 1431, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0009124279022217 }, { "episode": 22928, "epoch": 0.41212208361793146, "loss/policy_avg": 0.2891772985458374, "lr": 2.7254601226993866e-06, "objective/entropy": 113.6800537109375, "objective/kl": 19.13040542602539, "objective/non_score_reward": -1.9130408763885498, "objective/rlhf_reward": -9.6521635055542, "objective/scores": -0.5, "policy/approxkl_avg": 36.885581970214844, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5598267316818237, "step": 1432, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9999432563781738 }, { "episode": 22944, "epoch": 0.41240967753532015, "loss/policy_avg": 0.08044654130935669, "lr": 2.725268404907976e-06, "objective/entropy": 64.72499084472656, "objective/kl": 20.169479370117188, "objective/non_score_reward": -2.0169477462768555, "objective/rlhf_reward": -7.667791402339935, "objective/scores": 0.1, "policy/approxkl_avg": 11.26875114440918, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5961747169494629, "step": 1433, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.996532678604126 }, { "episode": 22960, "epoch": 0.4126972714527088, "loss/policy_avg": 0.3121386170387268, "lr": 2.7250766871165642e-06, "objective/entropy": -75.4444580078125, "objective/kl": 23.821794509887695, "objective/non_score_reward": -2.3821797370910645, "objective/rlhf_reward": -5.128718531131744, "objective/scores": 1.1, "policy/approxkl_avg": 27.88954734802246, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.77299964427948, "step": 1434, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9982830286026 }, { "episode": 22976, "epoch": 0.41298486537009743, "loss/policy_avg": 0.24237871170043945, "lr": 2.7248849693251535e-06, "objective/entropy": -32.58727264404297, "objective/kl": 14.388275146484375, "objective/non_score_reward": -1.4388275146484375, "objective/rlhf_reward": -5.355309879779815, "objective/scores": 0.1, "policy/approxkl_avg": 20.626853942871094, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.8990691900253296, "step": 1435, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9984760284423828 }, { "episode": 22992, "epoch": 0.41327245928748607, "loss/policy_avg": 0.24794508516788483, "lr": 2.7246932515337423e-06, "objective/entropy": 143.39877319335938, "objective/kl": 16.412979125976562, "objective/non_score_reward": -1.6412980556488037, "objective/rlhf_reward": -6.16519216299057, "objective/scores": 0.1, "policy/approxkl_avg": 54.83576202392578, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6131792068481445, "step": 1436, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999814510345459 }, { "episode": 23008, "epoch": 0.4135600532048747, "loss/policy_avg": 0.19519871473312378, "lr": 2.724501533742331e-06, "objective/entropy": -59.90544891357422, "objective/kl": 17.853824615478516, "objective/non_score_reward": -1.7853825092315674, "objective/rlhf_reward": -5.4796707682019345, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 17.840316772460938, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4706451892852783, "step": 1437, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0002455711364746 }, { "episode": 23024, "epoch": 0.41384764712226335, "loss/policy_avg": 1.0160012245178223, "lr": 2.7243098159509203e-06, "objective/entropy": 211.19354248046875, "objective/kl": 17.34637451171875, "objective/non_score_reward": -1.7346374988555908, "objective/rlhf_reward": -8.93855094909668, "objective/scores": -0.5, "policy/approxkl_avg": 252.24220275878906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7842428684234619, "step": 1438, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9965323209762573 }, { "episode": 23040, "epoch": 0.414135241039652, "loss/policy_avg": 0.31407612562179565, "lr": 2.724118098159509e-06, "objective/entropy": 184.55502319335938, "objective/kl": 18.949657440185547, "objective/non_score_reward": -1.8949657678604126, "objective/rlhf_reward": -7.1798629522323605, "objective/scores": 0.1, "policy/approxkl_avg": 145.09152221679688, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6857846975326538, "step": 1439, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9969500303268433 }, { "episode": 23056, "epoch": 0.41442283495704063, "loss/policy_avg": 0.11179807037115097, "lr": 2.7239263803680983e-06, "objective/entropy": -39.264190673828125, "objective/kl": 12.129672050476074, "objective/non_score_reward": -1.2129671573638916, "objective/rlhf_reward": -2.904457728342946, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 18.177127838134766, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.660858154296875, "step": 1440, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9972420930862427 }, { "episode": 23072, "epoch": 0.4147104288744293, "loss/policy_avg": -0.29350343346595764, "lr": 2.723734662576687e-06, "objective/entropy": 143.82705688476562, "objective/kl": 8.183101654052734, "objective/non_score_reward": -0.8183101415634155, "objective/rlhf_reward": -0.34952164137479935, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 9.663167953491211, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6625618934631348, "step": 1441, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.023545742034912 }, { "episode": 23088, "epoch": 0.41499802279181797, "loss/policy_avg": 0.12284829467535019, "lr": 2.723542944785276e-06, "objective/entropy": 57.83909225463867, "objective/kl": 11.188475608825684, "objective/non_score_reward": -1.1188476085662842, "objective/rlhf_reward": -4.075390315055847, "objective/scores": 0.1, "policy/approxkl_avg": 19.067108154296875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6835089921951294, "step": 1442, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000497817993164 }, { "episode": 23104, "epoch": 0.4152856167092066, "loss/policy_avg": -0.24080030620098114, "lr": 2.723351226993865e-06, "objective/entropy": -229.4406280517578, "objective/kl": 13.096569061279297, "objective/non_score_reward": -1.309657096862793, "objective/rlhf_reward": -4.8386283874511715, "objective/scores": 0.1, "policy/approxkl_avg": 33.96006774902344, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.798072338104248, "step": 1443, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 2.0064001083374023 }, { "episode": 23120, "epoch": 0.41557321062659525, "loss/policy_avg": 0.3806346356868744, "lr": 2.723159509202454e-06, "objective/entropy": -28.990707397460938, "objective/kl": 21.558467864990234, "objective/non_score_reward": -2.15584659576416, "objective/rlhf_reward": -5.699667845607969, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 111.77049255371094, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5212620496749878, "step": 1444, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998953104019165 }, { "episode": 23136, "epoch": 0.4158608045439839, "loss/policy_avg": 0.2565423250198364, "lr": 2.722967791411043e-06, "objective/entropy": 42.08885955810547, "objective/kl": 15.505270004272461, "objective/non_score_reward": -1.5505270957946777, "objective/rlhf_reward": -5.802108591794967, "objective/scores": 0.1, "policy/approxkl_avg": 72.5589828491211, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5759496688842773, "step": 1445, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9979947805404663 }, { "episode": 23152, "epoch": 0.4161483984613725, "loss/policy_avg": 0.37869054079055786, "lr": 2.722776073619632e-06, "objective/entropy": -158.08641052246094, "objective/kl": 16.30980682373047, "objective/non_score_reward": -1.6309807300567627, "objective/rlhf_reward": -6.12392292022705, "objective/scores": 0.1, "policy/approxkl_avg": 221.66566467285156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6714792251586914, "step": 1446, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9963035583496094 }, { "episode": 23168, "epoch": 0.41643599237876117, "loss/policy_avg": 0.44854480028152466, "lr": 2.722584355828221e-06, "objective/entropy": -37.4086799621582, "objective/kl": 15.159543991088867, "objective/non_score_reward": -1.5159544944763184, "objective/rlhf_reward": -1.6638178586959835, "objective/scores": 1.1, "policy/approxkl_avg": 70.89303588867188, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6588014364242554, "step": 1447, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998877763748169 }, { "episode": 23184, "epoch": 0.41672358629614986, "loss/policy_avg": 0.4342793822288513, "lr": 2.72239263803681e-06, "objective/entropy": 81.91819763183594, "objective/kl": 11.294848442077637, "objective/non_score_reward": -1.1294848918914795, "objective/rlhf_reward": -6.517939567565918, "objective/scores": -0.5, "policy/approxkl_avg": 39.474754333496094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7826581597328186, "step": 1448, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9975943565368652 }, { "episode": 23200, "epoch": 0.4170111802135385, "loss/policy_avg": 0.44055402278900146, "lr": 2.722200920245399e-06, "objective/entropy": 179.12945556640625, "objective/kl": 16.172334671020508, "objective/non_score_reward": -1.6172332763671875, "objective/rlhf_reward": -3.5452140911829204, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 45.85475540161133, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6948776841163635, "step": 1449, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9994070529937744 }, { "episode": 23216, "epoch": 0.41729877413092714, "loss/policy_avg": 0.2199755609035492, "lr": 2.7220092024539877e-06, "objective/entropy": -139.42800903320312, "objective/kl": 12.28195858001709, "objective/non_score_reward": -1.2281959056854248, "objective/rlhf_reward": -4.512783309817314, "objective/scores": 0.1, "policy/approxkl_avg": 86.36170196533203, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5656028389930725, "step": 1450, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.996934413909912 }, { "episode": 23232, "epoch": 0.4175863680483158, "loss/policy_avg": 0.104374460875988, "lr": 2.721817484662577e-06, "objective/entropy": 19.990764617919922, "objective/kl": 18.923419952392578, "objective/non_score_reward": -1.8923419713974, "objective/rlhf_reward": -9.569368362426758, "objective/scores": -0.5, "policy/approxkl_avg": 6.039336204528809, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5476473569869995, "step": 1451, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9971189498901367 }, { "episode": 23248, "epoch": 0.4178739619657044, "loss/policy_avg": 0.32264718413352966, "lr": 2.7216257668711658e-06, "objective/entropy": 41.33949279785156, "objective/kl": 14.19123649597168, "objective/non_score_reward": -1.4191235303878784, "objective/rlhf_reward": -7.676494121551514, "objective/scores": -0.5, "policy/approxkl_avg": 64.27468872070312, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4869697690010071, "step": 1452, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.996262788772583 }, { "episode": 23264, "epoch": 0.41816155588309306, "loss/policy_avg": 0.1430419385433197, "lr": 2.721434049079755e-06, "objective/entropy": 135.11158752441406, "objective/kl": 16.908254623413086, "objective/non_score_reward": -1.6908254623413086, "objective/rlhf_reward": -6.363301908969879, "objective/scores": 0.1, "policy/approxkl_avg": 11.750733375549316, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6986678242683411, "step": 1453, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0003662109375 }, { "episode": 23280, "epoch": 0.4184491498004817, "loss/policy_avg": 0.05973121523857117, "lr": 2.721242331288344e-06, "objective/entropy": 148.2733917236328, "objective/kl": 18.87240219116211, "objective/non_score_reward": -1.8872401714324951, "objective/rlhf_reward": -4.625241969467375, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 25.233966827392578, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.545336902141571, "step": 1454, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000401258468628 }, { "episode": 23296, "epoch": 0.41873674371787034, "loss/policy_avg": -0.26827386021614075, "lr": 2.7210506134969326e-06, "objective/entropy": -35.343963623046875, "objective/kl": 12.835941314697266, "objective/non_score_reward": -1.2835942506790161, "objective/rlhf_reward": -0.7343767940998074, "objective/scores": 1.1, "policy/approxkl_avg": 27.88837432861328, "policy/clipfrac_avg": 2.0, "policy/entropy_avg": 0.719089150428772, "step": 1455, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0176620483398438 }, { "episode": 23312, "epoch": 0.41902433763525904, "loss/policy_avg": 0.4007697105407715, "lr": 2.7208588957055214e-06, "objective/entropy": 38.718055725097656, "objective/kl": 16.693111419677734, "objective/non_score_reward": -1.669311285018921, "objective/rlhf_reward": -3.75352620029566, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 90.98106384277344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7188234925270081, "step": 1456, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.997847080230713 }, { "episode": 23328, "epoch": 0.4193119315526477, "loss/policy_avg": 0.2741047441959381, "lr": 2.7206671779141102e-06, "objective/entropy": 131.7677459716797, "objective/kl": 13.466203689575195, "objective/non_score_reward": -1.3466205596923828, "objective/rlhf_reward": -3.2637757084527355, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 6.06925630569458, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5739939212799072, "step": 1457, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9990949630737305 }, { "episode": 23344, "epoch": 0.4195995254700363, "loss/policy_avg": -0.20607559382915497, "lr": 2.7204754601226995e-06, "objective/entropy": 140.73605346679688, "objective/kl": 12.64559555053711, "objective/non_score_reward": -1.2645595073699951, "objective/rlhf_reward": -3.4541179276147655, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 25.566354751586914, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7669584155082703, "step": 1458, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.010098934173584 }, { "episode": 23360, "epoch": 0.41988711938742496, "loss/policy_avg": 0.08116651326417923, "lr": 2.7202837423312883e-06, "objective/entropy": -56.271209716796875, "objective/kl": 11.172001838684082, "objective/non_score_reward": -1.1172001361846924, "objective/rlhf_reward": -4.068800783157348, "objective/scores": 0.1, "policy/approxkl_avg": 5.867913246154785, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7661799192428589, "step": 1459, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.99923837184906 }, { "episode": 23376, "epoch": 0.4201747133048136, "loss/policy_avg": 0.1444312185049057, "lr": 2.720092024539877e-06, "objective/entropy": 27.851055145263672, "objective/kl": 14.869014739990234, "objective/non_score_reward": -1.4869015216827393, "objective/rlhf_reward": -1.5476059079170223, "objective/scores": 1.1, "policy/approxkl_avg": 118.09273529052734, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6213133335113525, "step": 1460, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0028345584869385 }, { "episode": 23392, "epoch": 0.42046230722220224, "loss/policy_avg": 0.24942255020141602, "lr": 2.7199003067484663e-06, "objective/entropy": -101.20877838134766, "objective/kl": 15.96616268157959, "objective/non_score_reward": -1.5966161489486694, "objective/rlhf_reward": -8.386465072631836, "objective/scores": -0.5, "policy/approxkl_avg": 29.288822174072266, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7658300399780273, "step": 1461, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9989122152328491 }, { "episode": 23408, "epoch": 0.4207499011395909, "loss/policy_avg": -0.02906278893351555, "lr": 2.719708588957055e-06, "objective/entropy": 89.44154357910156, "objective/kl": 11.966876983642578, "objective/non_score_reward": -1.1966878175735474, "objective/rlhf_reward": -4.38675103187561, "objective/scores": 0.1, "policy/approxkl_avg": 33.13001251220703, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5389198660850525, "step": 1462, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0009779930114746 }, { "episode": 23424, "epoch": 0.42103749505697957, "loss/policy_avg": 0.4894261062145233, "lr": 2.7195168711656444e-06, "objective/entropy": -4.174308776855469, "objective/kl": 11.774065971374512, "objective/non_score_reward": -1.1774065494537354, "objective/rlhf_reward": -0.3096263170242306, "objective/scores": 1.1, "policy/approxkl_avg": 32.79606628417969, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5737270712852478, "step": 1463, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9989784955978394 }, { "episode": 23440, "epoch": 0.4213250889743682, "loss/policy_avg": 0.28955715894699097, "lr": 2.719325153374233e-06, "objective/entropy": -63.83994674682617, "objective/kl": 5.191441059112549, "objective/non_score_reward": -0.5191440582275391, "objective/rlhf_reward": -1.676576367020607, "objective/scores": 0.1, "policy/approxkl_avg": 3.2088193893432617, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4842286705970764, "step": 1464, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9986271858215332 }, { "episode": 23456, "epoch": 0.42161268289175685, "loss/policy_avg": 0.3400211036205292, "lr": 2.719133435582822e-06, "objective/entropy": -41.42976379394531, "objective/kl": 14.391012191772461, "objective/non_score_reward": -1.439101219177246, "objective/rlhf_reward": -7.756404876708984, "objective/scores": -0.5, "policy/approxkl_avg": 39.38445281982422, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5957543849945068, "step": 1465, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9980661869049072 }, { "episode": 23472, "epoch": 0.4219002768091455, "loss/policy_avg": 0.30436578392982483, "lr": 2.718941717791411e-06, "objective/entropy": 191.9390869140625, "objective/kl": 12.309860229492188, "objective/non_score_reward": -1.2309861183166504, "objective/rlhf_reward": -6.923944473266602, "objective/scores": -0.5, "policy/approxkl_avg": 6.813614845275879, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6054030656814575, "step": 1466, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.998063325881958 }, { "episode": 23488, "epoch": 0.42218787072653413, "loss/policy_avg": -0.024539276957511902, "lr": 2.71875e-06, "objective/entropy": 69.8360366821289, "objective/kl": 10.700786590576172, "objective/non_score_reward": -1.0700786113739014, "objective/rlhf_reward": 0.11968519687652623, "objective/scores": 1.1, "policy/approxkl_avg": 58.47478103637695, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.46354907751083374, "step": 1467, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0008950233459473 }, { "episode": 23504, "epoch": 0.42247546464392277, "loss/policy_avg": -0.04982258379459381, "lr": 2.7185582822085892e-06, "objective/entropy": 105.5312271118164, "objective/kl": 11.204243659973145, "objective/non_score_reward": -1.1204243898391724, "objective/rlhf_reward": -4.081697633862495, "objective/scores": 0.1, "policy/approxkl_avg": 56.498252868652344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7991576790809631, "step": 1468, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999009370803833 }, { "episode": 23520, "epoch": 0.4227630585613114, "loss/policy_avg": 0.1773151159286499, "lr": 2.718366564417178e-06, "objective/entropy": -217.1636199951172, "objective/kl": 11.214568138122559, "objective/non_score_reward": -1.1214568614959717, "objective/rlhf_reward": -6.485827445983887, "objective/scores": -0.5, "policy/approxkl_avg": 1.7668520212173462, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6018310189247131, "step": 1469, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0021896362304688 }, { "episode": 23536, "epoch": 0.42305065247870005, "loss/policy_avg": 0.2942320704460144, "lr": 2.718174846625767e-06, "objective/entropy": 195.77435302734375, "objective/kl": 12.295269966125488, "objective/non_score_reward": -1.2295269966125488, "objective/rlhf_reward": -4.518107733130455, "objective/scores": 0.1, "policy/approxkl_avg": 6.731227874755859, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6320974826812744, "step": 1470, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9986553192138672 }, { "episode": 23552, "epoch": 0.42333824639608875, "loss/policy_avg": 0.5762624740600586, "lr": 2.717983128834356e-06, "objective/entropy": 11.472053527832031, "objective/kl": 15.345174789428711, "objective/non_score_reward": -1.5345174074172974, "objective/rlhf_reward": -5.738069570064544, "objective/scores": 0.1, "policy/approxkl_avg": 76.61418151855469, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.8648232817649841, "step": 1471, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0015358924865723 }, { "episode": 23568, "epoch": 0.4236258403134774, "loss/policy_avg": 0.24686360359191895, "lr": 2.717791411042945e-06, "objective/entropy": -29.269935607910156, "objective/kl": 18.786380767822266, "objective/non_score_reward": -1.8786380290985107, "objective/rlhf_reward": -9.514552116394043, "objective/scores": -0.5, "policy/approxkl_avg": 123.64360046386719, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8638693690299988, "step": 1472, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.996600866317749 }, { "episode": 23584, "epoch": 0.423913434230866, "loss/policy_avg": 0.6834498643875122, "lr": 2.7175996932515337e-06, "objective/entropy": 117.84262084960938, "objective/kl": 9.692749977111816, "objective/non_score_reward": -0.9692749977111816, "objective/rlhf_reward": -5.877099990844727, "objective/scores": -0.5, "policy/approxkl_avg": 13.469287872314453, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4936543405056, "step": 1473, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9992587566375732 }, { "episode": 23600, "epoch": 0.42420102814825467, "loss/policy_avg": 0.2896210253238678, "lr": 2.717407975460123e-06, "objective/entropy": -165.11666870117188, "objective/kl": 15.991458892822266, "objective/non_score_reward": -1.5991458892822266, "objective/rlhf_reward": -1.9965835869312283, "objective/scores": 1.1, "policy/approxkl_avg": 32.42799377441406, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5854696035385132, "step": 1474, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000058650970459 }, { "episode": 23616, "epoch": 0.4244886220656433, "loss/policy_avg": 0.5737141370773315, "lr": 2.7172162576687118e-06, "objective/entropy": 166.11766052246094, "objective/kl": 23.459354400634766, "objective/non_score_reward": -2.345935106277466, "objective/rlhf_reward": -11.383740425109863, "objective/scores": -0.5, "policy/approxkl_avg": 12.438879013061523, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6819442510604858, "step": 1475, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9964954853057861 }, { "episode": 23632, "epoch": 0.42477621598303195, "loss/policy_avg": 0.355984628200531, "lr": 2.717024539877301e-06, "objective/entropy": 10.719039916992188, "objective/kl": 13.877435684204102, "objective/non_score_reward": -1.3877434730529785, "objective/rlhf_reward": -1.1509741157293316, "objective/scores": 1.1, "policy/approxkl_avg": 46.079010009765625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.697611391544342, "step": 1476, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9987831115722656 }, { "episode": 23648, "epoch": 0.4250638099004206, "loss/policy_avg": 0.3038805425167084, "lr": 2.71683282208589e-06, "objective/entropy": -127.39871978759766, "objective/kl": 10.339272499084473, "objective/non_score_reward": -1.0339272022247314, "objective/rlhf_reward": -6.135708808898926, "objective/scores": -0.5, "policy/approxkl_avg": 48.258731842041016, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7669675350189209, "step": 1477, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9979398250579834 }, { "episode": 23664, "epoch": 0.4253514038178092, "loss/policy_avg": 3.8968005180358887, "lr": 2.7166411042944786e-06, "objective/entropy": -48.2403564453125, "objective/kl": 18.57742691040039, "objective/non_score_reward": -1.8577427864074707, "objective/rlhf_reward": -3.0309712052345272, "objective/scores": 1.1, "policy/approxkl_avg": 22.280670166015625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7676411271095276, "step": 1478, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9987635612487793 }, { "episode": 23680, "epoch": 0.4256389977351979, "loss/policy_avg": -0.27429449558258057, "lr": 2.7164493865030674e-06, "objective/entropy": -148.14480590820312, "objective/kl": 12.99055290222168, "objective/non_score_reward": -1.2990553379058838, "objective/rlhf_reward": -4.796221590042114, "objective/scores": 0.1, "policy/approxkl_avg": 7.844132423400879, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.795985758304596, "step": 1479, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.00437593460083 }, { "episode": 23696, "epoch": 0.42592659165258656, "loss/policy_avg": 0.10123275220394135, "lr": 2.7162576687116562e-06, "objective/entropy": -3.9328994750976562, "objective/kl": 10.527267456054688, "objective/non_score_reward": -1.0527267456054688, "objective/rlhf_reward": -6.210907459259033, "objective/scores": -0.5, "policy/approxkl_avg": 2.531219482421875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6534985303878784, "step": 1480, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000793933868408 }, { "episode": 23712, "epoch": 0.4262141855699752, "loss/policy_avg": 0.8577756881713867, "lr": 2.7160659509202455e-06, "objective/entropy": -65.08586883544922, "objective/kl": 21.022594451904297, "objective/non_score_reward": -2.102259635925293, "objective/rlhf_reward": -10.409038543701172, "objective/scores": -0.5, "policy/approxkl_avg": 177.66314697265625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7775266170501709, "step": 1481, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9983553886413574 }, { "episode": 23728, "epoch": 0.42650177948736384, "loss/policy_avg": 0.17644314467906952, "lr": 2.7158742331288343e-06, "objective/entropy": -16.106605529785156, "objective/kl": 14.382020950317383, "objective/non_score_reward": -1.438201904296875, "objective/rlhf_reward": -4.019474522272746, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 14.692214012145996, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6867858171463013, "step": 1482, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9983866214752197 }, { "episode": 23744, "epoch": 0.4267893734047525, "loss/policy_avg": -0.17333167791366577, "lr": 2.715682515337423e-06, "objective/entropy": -296.7401123046875, "objective/kl": 10.11327075958252, "objective/non_score_reward": -1.0113270282745361, "objective/rlhf_reward": -1.1215893223297326, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 16.041187286376953, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7763004899024963, "step": 1483, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0011274814605713 }, { "episode": 23760, "epoch": 0.4270769673221411, "loss/policy_avg": 0.5083790421485901, "lr": 2.7154907975460123e-06, "objective/entropy": 127.42313385009766, "objective/kl": 15.439505577087402, "objective/non_score_reward": -1.5439507961273193, "objective/rlhf_reward": -1.7758030056953427, "objective/scores": 1.1, "policy/approxkl_avg": 22.564075469970703, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6805832386016846, "step": 1484, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9985697269439697 }, { "episode": 23776, "epoch": 0.42736456123952976, "loss/policy_avg": 0.6594638228416443, "lr": 2.715299079754601e-06, "objective/entropy": 0.7167205810546875, "objective/kl": 13.968058586120605, "objective/non_score_reward": -1.396805763244629, "objective/rlhf_reward": -3.8538898686567937, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 29.08931541442871, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6674610376358032, "step": 1485, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9992890357971191 }, { "episode": 23792, "epoch": 0.42765215515691846, "loss/policy_avg": 0.20646658539772034, "lr": 2.7151073619631904e-06, "objective/entropy": -69.60407257080078, "objective/kl": 10.60425090789795, "objective/non_score_reward": -1.0604252815246582, "objective/rlhf_reward": -3.8417008280754086, "objective/scores": 0.1, "policy/approxkl_avg": 5.102411270141602, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6721906661987305, "step": 1486, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9982298612594604 }, { "episode": 23808, "epoch": 0.4279397490743071, "loss/policy_avg": 0.15155388414859772, "lr": 2.714915644171779e-06, "objective/entropy": 194.2589874267578, "objective/kl": 16.754682540893555, "objective/non_score_reward": -1.6754682064056396, "objective/rlhf_reward": -6.301872944831848, "objective/scores": 0.1, "policy/approxkl_avg": 21.726747512817383, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6672933101654053, "step": 1487, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9999563694000244 }, { "episode": 23824, "epoch": 0.42822734299169574, "loss/policy_avg": 0.7662161588668823, "lr": 2.714723926380368e-06, "objective/entropy": -74.04181671142578, "objective/kl": 18.99214744567871, "objective/non_score_reward": -1.899214744567871, "objective/rlhf_reward": -9.596858978271484, "objective/scores": -0.5, "policy/approxkl_avg": 98.49945831298828, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6449931263923645, "step": 1488, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000901937484741 }, { "episode": 23840, "epoch": 0.4285149369090844, "loss/policy_avg": 0.04193422198295593, "lr": 2.7145322085889572e-06, "objective/entropy": 109.29447937011719, "objective/kl": 14.153707504272461, "objective/non_score_reward": -1.4153707027435303, "objective/rlhf_reward": -5.2614830493927, "objective/scores": 0.1, "policy/approxkl_avg": 0.5321764945983887, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7183086276054382, "step": 1489, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0033793449401855 }, { "episode": 23856, "epoch": 0.428802530826473, "loss/policy_avg": 0.1599797010421753, "lr": 2.714340490797546e-06, "objective/entropy": 304.7250671386719, "objective/kl": 16.353168487548828, "objective/non_score_reward": -1.6353168487548828, "objective/rlhf_reward": -4.593856344895299, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 11.707456588745117, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 1.0101954936981201, "step": 1490, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9995477199554443 }, { "episode": 23872, "epoch": 0.42909012474386166, "loss/policy_avg": 0.4436057209968567, "lr": 2.7141487730061353e-06, "objective/entropy": 181.70814514160156, "objective/kl": 10.047649383544922, "objective/non_score_reward": -1.0047650337219238, "objective/rlhf_reward": 0.3809399843215946, "objective/scores": 1.1, "policy/approxkl_avg": 38.075225830078125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6739720106124878, "step": 1491, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0002894401550293 }, { "episode": 23888, "epoch": 0.4293777186612503, "loss/policy_avg": 0.15693873167037964, "lr": 2.713957055214724e-06, "objective/entropy": 226.33489990234375, "objective/kl": 11.626256942749023, "objective/non_score_reward": -1.162625789642334, "objective/rlhf_reward": -6.650503158569336, "objective/scores": -0.5, "policy/approxkl_avg": 36.06602096557617, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5729613900184631, "step": 1492, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.001974105834961 }, { "episode": 23904, "epoch": 0.42966531257863894, "loss/policy_avg": 3.0012454986572266, "lr": 2.713765337423313e-06, "objective/entropy": 87.70625305175781, "objective/kl": 11.87493896484375, "objective/non_score_reward": -1.1874938011169434, "objective/rlhf_reward": -6.749975204467773, "objective/scores": -0.5, "policy/approxkl_avg": 16.785877227783203, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7042635679244995, "step": 1493, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0035343170166016 }, { "episode": 23920, "epoch": 0.42995290649602763, "loss/policy_avg": 0.29552215337753296, "lr": 2.713573619631902e-06, "objective/entropy": 100.45034790039062, "objective/kl": 15.548515319824219, "objective/non_score_reward": -1.5548515319824219, "objective/rlhf_reward": -5.819406008720398, "objective/scores": 0.1, "policy/approxkl_avg": 34.74082565307617, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7816206812858582, "step": 1494, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000070810317993 }, { "episode": 23936, "epoch": 0.43024050041341627, "loss/policy_avg": 0.138979971408844, "lr": 2.713381901840491e-06, "objective/entropy": 2.042755126953125, "objective/kl": 16.496444702148438, "objective/non_score_reward": -1.6496446132659912, "objective/rlhf_reward": -8.598578453063965, "objective/scores": -0.5, "policy/approxkl_avg": 104.73353576660156, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.853604793548584, "step": 1495, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9992599487304688 }, { "episode": 23952, "epoch": 0.4305280943308049, "loss/policy_avg": -0.35569775104522705, "lr": 2.7131901840490797e-06, "objective/entropy": 47.663047790527344, "objective/kl": 15.499086380004883, "objective/non_score_reward": -1.5499086380004883, "objective/rlhf_reward": -8.199634552001953, "objective/scores": -0.5, "policy/approxkl_avg": 43.7884521484375, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5710984468460083, "step": 1496, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.003573417663574 }, { "episode": 23968, "epoch": 0.43081568824819355, "loss/policy_avg": 0.4634855389595032, "lr": 2.712998466257669e-06, "objective/entropy": 186.1064453125, "objective/kl": 13.683874130249023, "objective/non_score_reward": -1.3683874607086182, "objective/rlhf_reward": -5.073549798130989, "objective/scores": 0.1, "policy/approxkl_avg": 31.9293155670166, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6978639364242554, "step": 1497, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9984076023101807 }, { "episode": 23984, "epoch": 0.4311032821655822, "loss/policy_avg": 0.36279696226119995, "lr": 2.7128067484662578e-06, "objective/entropy": -4.6014556884765625, "objective/kl": 12.281639099121094, "objective/non_score_reward": -1.2281639575958252, "objective/rlhf_reward": -0.5126561433076855, "objective/scores": 1.1, "policy/approxkl_avg": 24.407772064208984, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5443528294563293, "step": 1498, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9997795820236206 }, { "episode": 24000, "epoch": 0.43139087608297083, "loss/policy_avg": 0.6602043509483337, "lr": 2.712615030674847e-06, "objective/entropy": -24.860595703125, "objective/kl": 12.323899269104004, "objective/non_score_reward": -1.2323899269104004, "objective/rlhf_reward": -6.929559707641602, "objective/scores": -0.5, "policy/approxkl_avg": 35.667274475097656, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7064457535743713, "step": 1499, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.004753589630127 }, { "episode": 24016, "epoch": 0.43167847000035947, "loss/policy_avg": 0.25143861770629883, "lr": 2.712423312883436e-06, "objective/entropy": 293.12713623046875, "objective/kl": 12.063253402709961, "objective/non_score_reward": -1.2063254117965698, "objective/rlhf_reward": -4.425301736593246, "objective/scores": 0.1, "policy/approxkl_avg": 60.569305419921875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7328671813011169, "step": 1500, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9970157146453857 }, { "episode": 24032, "epoch": 0.43196606391774817, "loss/policy_avg": 0.2839478850364685, "lr": 2.7122315950920246e-06, "objective/entropy": 110.01084899902344, "objective/kl": 13.495948791503906, "objective/non_score_reward": -1.3495948314666748, "objective/rlhf_reward": -0.9983793929219242, "objective/scores": 1.1, "policy/approxkl_avg": 12.78272819519043, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7883875370025635, "step": 1501, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0003154277801514 }, { "episode": 24048, "epoch": 0.4322536578351368, "loss/policy_avg": 0.20969723165035248, "lr": 2.7120398773006134e-06, "objective/entropy": 4.47552490234375, "objective/kl": 11.441368103027344, "objective/non_score_reward": -1.144136905670166, "objective/rlhf_reward": -4.176547622680664, "objective/scores": 0.1, "policy/approxkl_avg": 35.5608024597168, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5309766530990601, "step": 1502, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0012364387512207 }, { "episode": 24064, "epoch": 0.43254125175252545, "loss/policy_avg": 0.19796213507652283, "lr": 2.7118481595092022e-06, "objective/entropy": 78.621337890625, "objective/kl": 17.16468048095703, "objective/non_score_reward": -1.716468095779419, "objective/rlhf_reward": -8.865872383117676, "objective/scores": -0.5, "policy/approxkl_avg": 159.43646240234375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6347424387931824, "step": 1503, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9969336986541748 }, { "episode": 24080, "epoch": 0.4328288456699141, "loss/policy_avg": 0.8738093376159668, "lr": 2.7116564417177915e-06, "objective/entropy": 82.19612884521484, "objective/kl": 28.559629440307617, "objective/non_score_reward": -2.8559629917144775, "objective/rlhf_reward": -11.02385220527649, "objective/scores": 0.1, "policy/approxkl_avg": 123.68681335449219, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8077266216278076, "step": 1504, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9979246854782104 }, { "episode": 24096, "epoch": 0.4331164395873027, "loss/policy_avg": -0.05104995146393776, "lr": 2.7114647239263803e-06, "objective/entropy": -74.74847412109375, "objective/kl": 7.449967384338379, "objective/non_score_reward": -0.7449966669082642, "objective/rlhf_reward": -2.5799867570400234, "objective/scores": 0.1, "policy/approxkl_avg": 1.6437416076660156, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5547415614128113, "step": 1505, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9997807741165161 }, { "episode": 24112, "epoch": 0.43340403350469137, "loss/policy_avg": 0.2574435770511627, "lr": 2.7112730061349695e-06, "objective/entropy": -191.8342742919922, "objective/kl": 11.918403625488281, "objective/non_score_reward": -1.191840410232544, "objective/rlhf_reward": -4.367361432313919, "objective/scores": 0.1, "policy/approxkl_avg": 42.6358528137207, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5687357187271118, "step": 1506, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9998598098754883 }, { "episode": 24128, "epoch": 0.43369162742208, "loss/policy_avg": -0.061624474823474884, "lr": 2.7110812883435583e-06, "objective/entropy": 209.62362670898438, "objective/kl": 12.062373161315918, "objective/non_score_reward": -1.2062373161315918, "objective/rlhf_reward": -6.824949264526367, "objective/scores": -0.5, "policy/approxkl_avg": 7.045953750610352, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.5013295412063599, "step": 1507, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0000643730163574 }, { "episode": 24144, "epoch": 0.43397922133946865, "loss/policy_avg": -0.24623996019363403, "lr": 2.710889570552147e-06, "objective/entropy": -16.262832641601562, "objective/kl": 18.696651458740234, "objective/non_score_reward": -1.8696651458740234, "objective/rlhf_reward": -7.078660643100738, "objective/scores": 0.1, "policy/approxkl_avg": 24.216144561767578, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7861824631690979, "step": 1508, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.002674102783203 }, { "episode": 24160, "epoch": 0.43426681525685734, "loss/policy_avg": 0.1839936077594757, "lr": 2.7106978527607364e-06, "objective/entropy": -86.09889221191406, "objective/kl": 16.891660690307617, "objective/non_score_reward": -1.6891660690307617, "objective/rlhf_reward": -8.756664276123047, "objective/scores": -0.5, "policy/approxkl_avg": 95.92230224609375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.591369092464447, "step": 1509, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9982521533966064 }, { "episode": 24176, "epoch": 0.434554409174246, "loss/policy_avg": 0.09178348630666733, "lr": 2.710506134969325e-06, "objective/entropy": 319.3994445800781, "objective/kl": 13.149795532226562, "objective/non_score_reward": -1.3149795532226562, "objective/rlhf_reward": -4.859918212890625, "objective/scores": 0.1, "policy/approxkl_avg": 7.748076438903809, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8005496263504028, "step": 1510, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9979579448699951 }, { "episode": 24192, "epoch": 0.4348420030916346, "loss/policy_avg": -0.19410985708236694, "lr": 2.710314417177914e-06, "objective/entropy": -26.6497802734375, "objective/kl": 13.471776962280273, "objective/non_score_reward": -1.3471777439117432, "objective/rlhf_reward": -2.4649919017564983, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 31.211530685424805, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6536291241645813, "step": 1511, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0035223960876465 }, { "episode": 24208, "epoch": 0.43512959700902326, "loss/policy_avg": 0.14305329322814941, "lr": 2.7101226993865032e-06, "objective/entropy": 41.977569580078125, "objective/kl": 10.400999069213867, "objective/non_score_reward": -1.040099859237671, "objective/rlhf_reward": -3.760399496555328, "objective/scores": 0.1, "policy/approxkl_avg": 10.730424880981445, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6075326204299927, "step": 1512, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0010409355163574 }, { "episode": 24224, "epoch": 0.4354171909264119, "loss/policy_avg": 0.19263537228107452, "lr": 2.709930981595092e-06, "objective/entropy": 149.12783813476562, "objective/kl": 14.2966947555542, "objective/non_score_reward": -1.4296694993972778, "objective/rlhf_reward": -1.3186778783798214, "objective/scores": 1.1, "policy/approxkl_avg": 33.09269714355469, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.35392051935195923, "step": 1513, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0018491744995117 }, { "episode": 24240, "epoch": 0.43570478484380054, "loss/policy_avg": -0.43073225021362305, "lr": 2.7097392638036813e-06, "objective/entropy": -185.1280517578125, "objective/kl": 14.71350383758545, "objective/non_score_reward": -1.4713503122329712, "objective/rlhf_reward": -3.937990019993718, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 21.07646369934082, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.5609403848648071, "step": 1514, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.015136241912842 }, { "episode": 24256, "epoch": 0.4359923787611892, "loss/policy_avg": 0.4406461715698242, "lr": 2.70954754601227e-06, "objective/entropy": 164.08778381347656, "objective/kl": 20.165145874023438, "objective/non_score_reward": -2.016514539718628, "objective/rlhf_reward": -10.066058158874512, "objective/scores": -0.5, "policy/approxkl_avg": 56.16340255737305, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6640625, "step": 1515, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9969031810760498 }, { "episode": 24272, "epoch": 0.4362799726785778, "loss/policy_avg": 0.09469935297966003, "lr": 2.709355828220859e-06, "objective/entropy": -48.509239196777344, "objective/kl": 20.4273738861084, "objective/non_score_reward": -2.0427374839782715, "objective/rlhf_reward": -7.7709495484828945, "objective/scores": 0.1, "policy/approxkl_avg": 220.02273559570312, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6581880450248718, "step": 1516, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.996927261352539 }, { "episode": 24288, "epoch": 0.4365675665959665, "loss/policy_avg": 1.0518330335617065, "lr": 2.709164110429448e-06, "objective/entropy": -73.3015365600586, "objective/kl": 13.697656631469727, "objective/non_score_reward": -1.3697656393051147, "objective/rlhf_reward": -5.079062661528587, "objective/scores": 0.1, "policy/approxkl_avg": 94.2430191040039, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7663182020187378, "step": 1517, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0014703273773193 }, { "episode": 24304, "epoch": 0.43685516051335516, "loss/policy_avg": 0.2166292667388916, "lr": 2.708972392638037e-06, "objective/entropy": 80.52146911621094, "objective/kl": 9.666765213012695, "objective/non_score_reward": -0.9666764736175537, "objective/rlhf_reward": -3.466705864667892, "objective/scores": 0.1, "policy/approxkl_avg": 31.77708625793457, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8598181009292603, "step": 1518, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0006589889526367 }, { "episode": 24320, "epoch": 0.4371427544307438, "loss/policy_avg": 1.0463180541992188, "lr": 2.708780674846626e-06, "objective/entropy": 104.51371765136719, "objective/kl": 10.208975791931152, "objective/non_score_reward": -1.020897626876831, "objective/rlhf_reward": 0.31640946269035375, "objective/scores": 1.1, "policy/approxkl_avg": 71.09988403320312, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6160992383956909, "step": 1519, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9966857433319092 }, { "episode": 24336, "epoch": 0.43743034834813244, "loss/policy_avg": 0.49003443121910095, "lr": 2.708588957055215e-06, "objective/entropy": 98.09468078613281, "objective/kl": 16.203685760498047, "objective/non_score_reward": -1.6203685998916626, "objective/rlhf_reward": -4.925215094295099, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 63.16912841796875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7153348922729492, "step": 1520, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998132586479187 }, { "episode": 24352, "epoch": 0.4377179422655211, "loss/policy_avg": 0.07383514940738678, "lr": 2.7083972392638038e-06, "objective/entropy": 91.20425415039062, "objective/kl": 10.100933074951172, "objective/non_score_reward": -1.0100932121276855, "objective/rlhf_reward": -2.43625302976759, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 32.343509674072266, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5615959167480469, "step": 1521, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9986927509307861 }, { "episode": 24368, "epoch": 0.4380055361829097, "loss/policy_avg": 0.615394115447998, "lr": 2.708205521472393e-06, "objective/entropy": 34.11743927001953, "objective/kl": 13.834301948547363, "objective/non_score_reward": -1.3834302425384521, "objective/rlhf_reward": -7.533720970153809, "objective/scores": -0.5, "policy/approxkl_avg": 36.0637321472168, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7028712630271912, "step": 1522, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9989628791809082 }, { "episode": 24384, "epoch": 0.43829313010029836, "loss/policy_avg": 0.42487022280693054, "lr": 2.7080138036809814e-06, "objective/entropy": 291.3263854980469, "objective/kl": 16.128992080688477, "objective/non_score_reward": -1.6128990650177002, "objective/rlhf_reward": -2.0515966624021527, "objective/scores": 1.1, "policy/approxkl_avg": 12.953594207763672, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7052067518234253, "step": 1523, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9995543956756592 }, { "episode": 24400, "epoch": 0.43858072401768705, "loss/policy_avg": 0.9943918585777283, "lr": 2.7078220858895706e-06, "objective/entropy": 183.32113647460938, "objective/kl": 15.212042808532715, "objective/non_score_reward": -1.5212042331695557, "objective/rlhf_reward": -1.6848166942596432, "objective/scores": 1.1, "policy/approxkl_avg": 233.8773193359375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7408942580223083, "step": 1524, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.999128818511963 }, { "episode": 24416, "epoch": 0.4388683179350757, "loss/policy_avg": 0.9949908256530762, "lr": 2.7076303680981594e-06, "objective/entropy": -187.033203125, "objective/kl": 17.688919067382812, "objective/non_score_reward": -1.7688918113708496, "objective/rlhf_reward": -6.675567245483398, "objective/scores": 0.1, "policy/approxkl_avg": 29.05734634399414, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.8625385761260986, "step": 1525, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9983234405517578 }, { "episode": 24432, "epoch": 0.43915591185246433, "loss/policy_avg": 0.8484746813774109, "lr": 2.7074386503067482e-06, "objective/entropy": 174.75379943847656, "objective/kl": 20.135005950927734, "objective/non_score_reward": -2.013500690460205, "objective/rlhf_reward": -3.654003000259399, "objective/scores": 1.1, "policy/approxkl_avg": 134.73599243164062, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.664713442325592, "step": 1526, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9998801946640015 }, { "episode": 24448, "epoch": 0.43944350576985297, "loss/policy_avg": 1.8293222188949585, "lr": 2.7072469325153375e-06, "objective/entropy": 57.493255615234375, "objective/kl": 8.005315780639648, "objective/non_score_reward": -0.8005315661430359, "objective/rlhf_reward": 1.1978736609220508, "objective/scores": 1.1, "policy/approxkl_avg": 17.183082580566406, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7343942523002625, "step": 1527, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.00285005569458 }, { "episode": 24464, "epoch": 0.4397310996872416, "loss/policy_avg": 0.6055887937545776, "lr": 2.7070552147239263e-06, "objective/entropy": 205.1093292236328, "objective/kl": 15.435603141784668, "objective/non_score_reward": -1.5435603857040405, "objective/rlhf_reward": -1.7742413640022274, "objective/scores": 1.1, "policy/approxkl_avg": 60.15907287597656, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5648759603500366, "step": 1528, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9996998310089111 }, { "episode": 24480, "epoch": 0.44001869360463025, "loss/policy_avg": 1.3451311588287354, "lr": 2.7068634969325155e-06, "objective/entropy": 190.40625, "objective/kl": 13.39560317993164, "objective/non_score_reward": -1.3395602703094482, "objective/rlhf_reward": -0.9582409471273419, "objective/scores": 1.1, "policy/approxkl_avg": 67.11732482910156, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8093500137329102, "step": 1529, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9993016719818115 }, { "episode": 24496, "epoch": 0.4403062875220189, "loss/policy_avg": 0.3937966227531433, "lr": 2.7066717791411043e-06, "objective/entropy": 48.01210021972656, "objective/kl": 15.38280200958252, "objective/non_score_reward": -1.5382802486419678, "objective/rlhf_reward": -8.153120994567871, "objective/scores": -0.5, "policy/approxkl_avg": 135.72549438476562, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7022747993469238, "step": 1530, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.995867133140564 }, { "episode": 24512, "epoch": 0.44059388143940753, "loss/policy_avg": 0.4086163341999054, "lr": 2.706480061349693e-06, "objective/entropy": 29.64532470703125, "objective/kl": 11.605493545532227, "objective/non_score_reward": -1.160549283027649, "objective/rlhf_reward": -0.2421972215175625, "objective/scores": 1.1, "policy/approxkl_avg": 110.24431610107422, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.48122888803482056, "step": 1531, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9973434209823608 }, { "episode": 24528, "epoch": 0.4408814753567962, "loss/policy_avg": 0.16401143372058868, "lr": 2.7062883435582824e-06, "objective/entropy": -106.093505859375, "objective/kl": 18.429615020751953, "objective/non_score_reward": -1.8429614305496216, "objective/rlhf_reward": -2.9718456029891964, "objective/scores": 1.1, "policy/approxkl_avg": 90.48030090332031, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8432134389877319, "step": 1532, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.999216914176941 }, { "episode": 24544, "epoch": 0.44116906927418487, "loss/policy_avg": 0.30233433842658997, "lr": 2.706096625766871e-06, "objective/entropy": -94.45288848876953, "objective/kl": 10.195063591003418, "objective/non_score_reward": -1.0195064544677734, "objective/rlhf_reward": -6.078025817871094, "objective/scores": -0.5, "policy/approxkl_avg": 3.0919206142425537, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6552790403366089, "step": 1533, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9992151260375977 }, { "episode": 24560, "epoch": 0.4414566631915735, "loss/policy_avg": 0.30027535557746887, "lr": 2.70590490797546e-06, "objective/entropy": 39.50469207763672, "objective/kl": 21.69940185546875, "objective/non_score_reward": -2.169940233230591, "objective/rlhf_reward": -8.279760932922363, "objective/scores": 0.1, "policy/approxkl_avg": 3.9611849784851074, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5584852695465088, "step": 1534, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0010085105895996 }, { "episode": 24576, "epoch": 0.44174425710896215, "loss/policy_avg": 0.1084059327840805, "lr": 2.7057131901840492e-06, "objective/entropy": 315.66522216796875, "objective/kl": 11.735557556152344, "objective/non_score_reward": -1.1735557317733765, "objective/rlhf_reward": -3.032363360345946, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 4.585368633270264, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 1.023706078529358, "step": 1535, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0004658699035645 }, { "episode": 24592, "epoch": 0.4420318510263508, "loss/policy_avg": 0.8311107158660889, "lr": 2.705521472392638e-06, "objective/entropy": 112.27359008789062, "objective/kl": 16.359058380126953, "objective/non_score_reward": -1.6359058618545532, "objective/rlhf_reward": -2.1436234623193737, "objective/scores": 1.1, "policy/approxkl_avg": 98.65286254882812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8164811134338379, "step": 1536, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9976773262023926 }, { "episode": 24608, "epoch": 0.4423194449437394, "loss/policy_avg": 0.3270980715751648, "lr": 2.7053297546012273e-06, "objective/entropy": 95.81588745117188, "objective/kl": 16.39149284362793, "objective/non_score_reward": -1.6391494274139404, "objective/rlhf_reward": -8.556597709655762, "objective/scores": -0.5, "policy/approxkl_avg": 36.173675537109375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8174101114273071, "step": 1537, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9993128776550293 }, { "episode": 24624, "epoch": 0.44260703886112807, "loss/policy_avg": 0.24724414944648743, "lr": 2.705138036809816e-06, "objective/entropy": 284.1396484375, "objective/kl": 15.634368896484375, "objective/non_score_reward": -1.5634369850158691, "objective/rlhf_reward": -8.253747940063477, "objective/scores": -0.5, "policy/approxkl_avg": 1.1526098251342773, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7458974123001099, "step": 1538, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.001605272293091 }, { "episode": 24640, "epoch": 0.44289463277851676, "loss/policy_avg": 0.14971446990966797, "lr": 2.704946319018405e-06, "objective/entropy": -12.085342407226562, "objective/kl": 18.393085479736328, "objective/non_score_reward": -1.8393086194992065, "objective/rlhf_reward": -5.409823308663304, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 53.39990234375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8056477308273315, "step": 1539, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9987678527832031 }, { "episode": 24656, "epoch": 0.4431822266959054, "loss/policy_avg": 0.3267354965209961, "lr": 2.704754601226994e-06, "objective/entropy": 166.03140258789062, "objective/kl": 17.804903030395508, "objective/non_score_reward": -1.7804901599884033, "objective/rlhf_reward": -6.721960878372192, "objective/scores": 0.1, "policy/approxkl_avg": 61.347267150878906, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.827207624912262, "step": 1540, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9992976188659668 }, { "episode": 24672, "epoch": 0.44346982061329404, "loss/policy_avg": -0.25124239921569824, "lr": 2.704562883435583e-06, "objective/entropy": 106.05780029296875, "objective/kl": 18.72496795654297, "objective/non_score_reward": -1.8724967241287231, "objective/rlhf_reward": -7.089986896514892, "objective/scores": 0.1, "policy/approxkl_avg": 40.37659454345703, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5372323989868164, "step": 1541, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000049352645874 }, { "episode": 24688, "epoch": 0.4437574145306827, "loss/policy_avg": 0.31747931241989136, "lr": 2.704371165644172e-06, "objective/entropy": -78.14122772216797, "objective/kl": 10.030508041381836, "objective/non_score_reward": -1.0030508041381836, "objective/rlhf_reward": -3.612203514575958, "objective/scores": 0.1, "policy/approxkl_avg": 8.71822738647461, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5570470690727234, "step": 1542, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9980380535125732 }, { "episode": 24704, "epoch": 0.4440450084480713, "loss/policy_avg": -0.013954512774944305, "lr": 2.704179447852761e-06, "objective/entropy": 226.41941833496094, "objective/kl": 15.391945838928223, "objective/non_score_reward": -1.5391945838928223, "objective/rlhf_reward": -3.2330593212854595, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 33.7995491027832, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6941033005714417, "step": 1543, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0005383491516113 }, { "episode": 24720, "epoch": 0.44433260236545996, "loss/policy_avg": -0.052130766212940216, "lr": 2.7039877300613498e-06, "objective/entropy": -49.44288635253906, "objective/kl": 14.369674682617188, "objective/non_score_reward": -1.436967372894287, "objective/rlhf_reward": -5.347869789600372, "objective/scores": 0.1, "policy/approxkl_avg": 12.564598083496094, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6965159177780151, "step": 1544, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0005674362182617 }, { "episode": 24736, "epoch": 0.4446201962828486, "loss/policy_avg": 0.21237152814865112, "lr": 2.7037960122699386e-06, "objective/entropy": 195.41824340820312, "objective/kl": 13.13949203491211, "objective/non_score_reward": -1.3139491081237793, "objective/rlhf_reward": -7.255796432495117, "objective/scores": -0.5, "policy/approxkl_avg": 12.779157638549805, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8595893383026123, "step": 1545, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0007612705230713 }, { "episode": 24752, "epoch": 0.44490779020023724, "loss/policy_avg": 0.37602826952934265, "lr": 2.7036042944785274e-06, "objective/entropy": 217.9283905029297, "objective/kl": 16.096073150634766, "objective/non_score_reward": -1.6096073389053345, "objective/rlhf_reward": -8.43842887878418, "objective/scores": -0.5, "policy/approxkl_avg": 3.7367372512817383, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7021983861923218, "step": 1546, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9986357688903809 }, { "episode": 24768, "epoch": 0.44519538411762594, "loss/policy_avg": 0.2176741063594818, "lr": 2.7034125766871166e-06, "objective/entropy": 146.32022094726562, "objective/kl": 19.497966766357422, "objective/non_score_reward": -1.9497966766357422, "objective/rlhf_reward": -7.399186706542968, "objective/scores": 0.1, "policy/approxkl_avg": 13.904340744018555, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6778392791748047, "step": 1547, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9992797374725342 }, { "episode": 24784, "epoch": 0.4454829780350146, "loss/policy_avg": -0.03819188103079796, "lr": 2.7032208588957054e-06, "objective/entropy": 22.281753540039062, "objective/kl": 13.729183197021484, "objective/non_score_reward": -1.3729183673858643, "objective/rlhf_reward": -1.091673335433006, "objective/scores": 1.1, "policy/approxkl_avg": 45.58137512207031, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6482210159301758, "step": 1548, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000016689300537 }, { "episode": 24800, "epoch": 0.4457705719524032, "loss/policy_avg": -0.35607993602752686, "lr": 2.7030291411042943e-06, "objective/entropy": 161.14715576171875, "objective/kl": 18.698495864868164, "objective/non_score_reward": -1.869849443435669, "objective/rlhf_reward": -7.0793980121612545, "objective/scores": 0.1, "policy/approxkl_avg": 21.378082275390625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.9166164398193359, "step": 1549, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9974842071533203 }, { "episode": 24816, "epoch": 0.44605816586979186, "loss/policy_avg": 0.16531553864479065, "lr": 2.7028374233128835e-06, "objective/entropy": 16.770519256591797, "objective/kl": 21.444007873535156, "objective/non_score_reward": -2.1444008350372314, "objective/rlhf_reward": -8.177603578567505, "objective/scores": 0.1, "policy/approxkl_avg": 61.63994598388672, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.556962788105011, "step": 1550, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9987709522247314 }, { "episode": 24832, "epoch": 0.4463457597871805, "loss/policy_avg": -0.10354608297348022, "lr": 2.7026457055214723e-06, "objective/entropy": 42.6910514831543, "objective/kl": 17.690929412841797, "objective/non_score_reward": -1.7690927982330322, "objective/rlhf_reward": -6.676370894908905, "objective/scores": 0.1, "policy/approxkl_avg": 47.56449890136719, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6999937295913696, "step": 1551, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000419855117798 }, { "episode": 24848, "epoch": 0.44663335370456914, "loss/policy_avg": 0.22843822836875916, "lr": 2.7024539877300615e-06, "objective/entropy": 223.37420654296875, "objective/kl": 16.745403289794922, "objective/non_score_reward": -1.6745402812957764, "objective/rlhf_reward": -6.298160946369171, "objective/scores": 0.1, "policy/approxkl_avg": 9.113785743713379, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.688583493232727, "step": 1552, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0004663467407227 }, { "episode": 24864, "epoch": 0.4469209476219578, "loss/policy_avg": 0.5825582146644592, "lr": 2.7022622699386503e-06, "objective/entropy": -226.0932159423828, "objective/kl": 10.318975448608398, "objective/non_score_reward": -1.031897783279419, "objective/rlhf_reward": -6.127591133117676, "objective/scores": -0.5, "policy/approxkl_avg": 6.590059280395508, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6814849376678467, "step": 1553, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0015769004821777 }, { "episode": 24880, "epoch": 0.4472085415393464, "loss/policy_avg": 0.7355405688285828, "lr": 2.702070552147239e-06, "objective/entropy": -0.4903068542480469, "objective/kl": 11.162042617797852, "objective/non_score_reward": -1.1162042617797852, "objective/rlhf_reward": -1.5410977348100867, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 43.610984802246094, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.48920321464538574, "step": 1554, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000331163406372 }, { "episode": 24896, "epoch": 0.4474961354567351, "loss/policy_avg": 0.6175955533981323, "lr": 2.7018788343558284e-06, "objective/entropy": 27.28057861328125, "objective/kl": 14.743072509765625, "objective/non_score_reward": -1.4743072986602783, "objective/rlhf_reward": -7.897229194641113, "objective/scores": -0.5, "policy/approxkl_avg": 2.485166072845459, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6037850379943848, "step": 1555, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9994779825210571 }, { "episode": 24912, "epoch": 0.44778372937412375, "loss/policy_avg": 0.4660176634788513, "lr": 2.701687116564417e-06, "objective/entropy": 81.15247344970703, "objective/kl": 15.817729949951172, "objective/non_score_reward": -1.581773042678833, "objective/rlhf_reward": -3.9270920217037197, "objective/scores": 0.6, "policy/approxkl_avg": 70.5673599243164, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6252831220626831, "step": 1556, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998610258102417 }, { "episode": 24928, "epoch": 0.4480713232915124, "loss/policy_avg": 0.07333136349916458, "lr": 2.7014953987730064e-06, "objective/entropy": 144.30224609375, "objective/kl": 22.033649444580078, "objective/non_score_reward": -2.203364849090576, "objective/rlhf_reward": -10.813459396362305, "objective/scores": -0.5, "policy/approxkl_avg": 80.9249267578125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8037264347076416, "step": 1557, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9996355772018433 }, { "episode": 24944, "epoch": 0.44835891720890103, "loss/policy_avg": 0.256188303232193, "lr": 2.7013036809815952e-06, "objective/entropy": -67.07438659667969, "objective/kl": 7.778665065765381, "objective/non_score_reward": -0.7778664827346802, "objective/rlhf_reward": -5.111466407775879, "objective/scores": -0.5, "policy/approxkl_avg": 47.60340118408203, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6475851535797119, "step": 1558, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9994537830352783 }, { "episode": 24960, "epoch": 0.44864651112628967, "loss/policy_avg": 0.0924801379442215, "lr": 2.701111963190184e-06, "objective/entropy": 113.46358489990234, "objective/kl": 14.019509315490723, "objective/non_score_reward": -1.4019509553909302, "objective/rlhf_reward": -7.6078033447265625, "objective/scores": -0.5, "policy/approxkl_avg": 25.744773864746094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6623624563217163, "step": 1559, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9986884593963623 }, { "episode": 24976, "epoch": 0.4489341050436783, "loss/policy_avg": 0.1538938581943512, "lr": 2.7009202453987733e-06, "objective/entropy": 100.3080825805664, "objective/kl": 18.584598541259766, "objective/non_score_reward": -1.8584599494934082, "objective/rlhf_reward": -5.033839440345764, "objective/scores": 0.6, "policy/approxkl_avg": 68.59663391113281, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6090136170387268, "step": 1560, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9989771842956543 }, { "episode": 24992, "epoch": 0.44922169896106695, "loss/policy_avg": 0.485708087682724, "lr": 2.700728527607362e-06, "objective/entropy": -145.61587524414062, "objective/kl": 9.420523643493652, "objective/non_score_reward": -0.9420523643493652, "objective/rlhf_reward": 0.6317904084920887, "objective/scores": 1.1, "policy/approxkl_avg": 15.837213516235352, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.732647180557251, "step": 1561, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9991077184677124 }, { "episode": 25008, "epoch": 0.44950929287845565, "loss/policy_avg": -0.20601963996887207, "lr": 2.700536809815951e-06, "objective/entropy": 38.85050582885742, "objective/kl": 16.94233512878418, "objective/non_score_reward": -1.6942335367202759, "objective/rlhf_reward": -2.376934146881103, "objective/scores": 1.1, "policy/approxkl_avg": 44.557830810546875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6362634301185608, "step": 1562, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0128636360168457 }, { "episode": 25024, "epoch": 0.4497968867958443, "loss/policy_avg": 0.4293028712272644, "lr": 2.70034509202454e-06, "objective/entropy": -42.770118713378906, "objective/kl": 18.898033142089844, "objective/non_score_reward": -1.889803409576416, "objective/rlhf_reward": -7.159213399887085, "objective/scores": 0.1, "policy/approxkl_avg": 46.76361846923828, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8298807144165039, "step": 1563, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000500202178955 }, { "episode": 25040, "epoch": 0.4500844807132329, "loss/policy_avg": 0.29531529545783997, "lr": 2.700153374233129e-06, "objective/entropy": -97.281494140625, "objective/kl": 11.071046829223633, "objective/non_score_reward": -1.1071045398712158, "objective/rlhf_reward": -0.028418457508086803, "objective/scores": 1.1, "policy/approxkl_avg": 3.9526071548461914, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7094585299491882, "step": 1564, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9980638027191162 }, { "episode": 25056, "epoch": 0.45037207463062157, "loss/policy_avg": 0.046556033194065094, "lr": 2.699961656441718e-06, "objective/entropy": 107.26701354980469, "objective/kl": 19.248638153076172, "objective/non_score_reward": -1.9248639345169067, "objective/rlhf_reward": -7.299455797672271, "objective/scores": 0.1, "policy/approxkl_avg": 113.94715881347656, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.666810154914856, "step": 1565, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9988000392913818 }, { "episode": 25072, "epoch": 0.4506596685480102, "loss/policy_avg": 0.012369789183139801, "lr": 2.699769938650307e-06, "objective/entropy": 147.94244384765625, "objective/kl": 15.91103744506836, "objective/non_score_reward": -1.5911036729812622, "objective/rlhf_reward": -8.36441421508789, "objective/scores": -0.5, "policy/approxkl_avg": 21.165550231933594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7558436393737793, "step": 1566, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9973013401031494 }, { "episode": 25088, "epoch": 0.45094726246539885, "loss/policy_avg": -0.012968000024557114, "lr": 2.6995782208588958e-06, "objective/entropy": -157.74293518066406, "objective/kl": 16.778770446777344, "objective/non_score_reward": -1.6778768301010132, "objective/rlhf_reward": -6.311507439613342, "objective/scores": 0.1, "policy/approxkl_avg": 43.33392333984375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7485702633857727, "step": 1567, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000155448913574 }, { "episode": 25104, "epoch": 0.4512348563827875, "loss/policy_avg": 0.2825145125389099, "lr": 2.6993865030674846e-06, "objective/entropy": -26.431015014648438, "objective/kl": 15.26270866394043, "objective/non_score_reward": -1.5262709856033325, "objective/rlhf_reward": -4.5009638703504375, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 25.48444366455078, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.834632158279419, "step": 1568, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9994863271713257 }, { "episode": 25120, "epoch": 0.4515224503001761, "loss/policy_avg": -0.168730229139328, "lr": 2.6991947852760734e-06, "objective/entropy": 39.87802505493164, "objective/kl": 21.178451538085938, "objective/non_score_reward": -2.117845058441162, "objective/rlhf_reward": -5.547661755920622, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 5.897341728210449, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7191267013549805, "step": 1569, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0011539459228516 }, { "episode": 25136, "epoch": 0.4518100442175648, "loss/policy_avg": 0.13665719330310822, "lr": 2.6990030674846626e-06, "objective/entropy": 118.00531005859375, "objective/kl": 10.522141456604004, "objective/non_score_reward": -1.0522141456604004, "objective/rlhf_reward": -6.208856582641602, "objective/scores": -0.5, "policy/approxkl_avg": 1.5498545169830322, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6503363847732544, "step": 1570, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0017614364624023 }, { "episode": 25152, "epoch": 0.45209763813495346, "loss/policy_avg": 0.1307908147573471, "lr": 2.6988113496932514e-06, "objective/entropy": -69.06759643554688, "objective/kl": 17.492443084716797, "objective/non_score_reward": -1.7492443323135376, "objective/rlhf_reward": -8.996976852416992, "objective/scores": -0.5, "policy/approxkl_avg": 71.87448120117188, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.775144636631012, "step": 1571, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9978597164154053 }, { "episode": 25168, "epoch": 0.4523852320523421, "loss/policy_avg": 0.12863555550575256, "lr": 2.6986196319018403e-06, "objective/entropy": -15.985279083251953, "objective/kl": 20.260404586791992, "objective/non_score_reward": -2.0260403156280518, "objective/rlhf_reward": -10.104161262512207, "objective/scores": -0.5, "policy/approxkl_avg": 35.53135681152344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5794572830200195, "step": 1572, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0000314712524414 }, { "episode": 25184, "epoch": 0.45267282596973074, "loss/policy_avg": 0.39225825667381287, "lr": 2.6984279141104295e-06, "objective/entropy": -145.51882934570312, "objective/kl": 22.5490779876709, "objective/non_score_reward": -2.2549078464508057, "objective/rlhf_reward": -6.619630849361419, "objective/scores": 0.6, "policy/approxkl_avg": 99.43373107910156, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.733991801738739, "step": 1573, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.997790813446045 }, { "episode": 25200, "epoch": 0.4529604198871194, "loss/policy_avg": 0.23225891590118408, "lr": 2.6982361963190183e-06, "objective/entropy": 157.5059814453125, "objective/kl": 13.41703987121582, "objective/non_score_reward": -1.34170401096344, "objective/rlhf_reward": -4.966816163063049, "objective/scores": 0.1, "policy/approxkl_avg": 71.42198181152344, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5340787768363953, "step": 1574, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0022149085998535 }, { "episode": 25216, "epoch": 0.453248013804508, "loss/policy_avg": 0.09103409945964813, "lr": 2.6980444785276075e-06, "objective/entropy": 397.29486083984375, "objective/kl": 17.116559982299805, "objective/non_score_reward": -1.711656093597412, "objective/rlhf_reward": -3.9229054197084636, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 85.59910583496094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.9673483371734619, "step": 1575, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.997066855430603 }, { "episode": 25232, "epoch": 0.45353560772189666, "loss/policy_avg": 0.659239649772644, "lr": 2.6978527607361963e-06, "objective/entropy": -50.505401611328125, "objective/kl": 16.264190673828125, "objective/non_score_reward": -1.6264190673828125, "objective/rlhf_reward": -4.949417023864344, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 9.61915397644043, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.53434157371521, "step": 1576, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998645544052124 }, { "episode": 25248, "epoch": 0.45382320163928536, "loss/policy_avg": 0.2389046847820282, "lr": 2.697661042944785e-06, "objective/entropy": 12.503677368164062, "objective/kl": 15.69602108001709, "objective/non_score_reward": -1.569602131843567, "objective/rlhf_reward": -1.878408348560333, "objective/scores": 1.1, "policy/approxkl_avg": 9.81786060333252, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8174733519554138, "step": 1577, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.997225046157837 }, { "episode": 25264, "epoch": 0.454110795556674, "loss/policy_avg": 0.06463336944580078, "lr": 2.6974693251533744e-06, "objective/entropy": 14.687973022460938, "objective/kl": 16.26601791381836, "objective/non_score_reward": -1.626602053642273, "objective/rlhf_reward": -8.50640869140625, "objective/scores": -0.5, "policy/approxkl_avg": 52.611732482910156, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7420953512191772, "step": 1578, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9989168643951416 }, { "episode": 25280, "epoch": 0.45439838947406264, "loss/policy_avg": 0.3006048798561096, "lr": 2.697277607361963e-06, "objective/entropy": 121.62173461914062, "objective/kl": 17.033985137939453, "objective/non_score_reward": -1.7033987045288086, "objective/rlhf_reward": -6.413594579696655, "objective/scores": 0.1, "policy/approxkl_avg": 11.945178985595703, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.840303897857666, "step": 1579, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9979677200317383 }, { "episode": 25296, "epoch": 0.4546859833914513, "loss/policy_avg": -0.2543651759624481, "lr": 2.6970858895705524e-06, "objective/entropy": 23.071609497070312, "objective/kl": 16.931270599365234, "objective/non_score_reward": -1.6931270360946655, "objective/rlhf_reward": -6.372508069872856, "objective/scores": 0.1, "policy/approxkl_avg": 4.906126022338867, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.81708824634552, "step": 1580, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000901699066162 }, { "episode": 25312, "epoch": 0.4549735773088399, "loss/policy_avg": 0.41243085265159607, "lr": 2.6968941717791412e-06, "objective/entropy": 356.097412109375, "objective/kl": 16.318756103515625, "objective/non_score_reward": -1.6318755149841309, "objective/rlhf_reward": -3.603782948793146, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 52.568607330322266, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.9110679030418396, "step": 1581, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.000410318374634 }, { "episode": 25328, "epoch": 0.45526117122622856, "loss/policy_avg": 0.07954712212085724, "lr": 2.69670245398773e-06, "objective/entropy": 114.1132583618164, "objective/kl": 20.863964080810547, "objective/non_score_reward": -2.0863962173461914, "objective/rlhf_reward": -6.741465274159031, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 86.94965362548828, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6331667900085449, "step": 1582, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9974026679992676 }, { "episode": 25344, "epoch": 0.4555487651436172, "loss/policy_avg": 0.01582658290863037, "lr": 2.6965107361963193e-06, "objective/entropy": 195.87881469726562, "objective/kl": 17.72394561767578, "objective/non_score_reward": -1.7723946571350098, "objective/rlhf_reward": -9.089578628540039, "objective/scores": -0.5, "policy/approxkl_avg": 32.234439849853516, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.8881067633628845, "step": 1583, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999032735824585 }, { "episode": 25360, "epoch": 0.45583635906100584, "loss/policy_avg": 0.36229202151298523, "lr": 2.696319018404908e-06, "objective/entropy": 23.606246948242188, "objective/kl": 19.955965042114258, "objective/non_score_reward": -1.9955966472625732, "objective/rlhf_reward": -9.982385635375977, "objective/scores": -0.5, "policy/approxkl_avg": 53.62287139892578, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.33281606435775757, "step": 1584, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9996920824050903 }, { "episode": 25376, "epoch": 0.45612395297839453, "loss/policy_avg": 0.26224541664123535, "lr": 2.696127300613497e-06, "objective/entropy": -22.723011016845703, "objective/kl": 14.956771850585938, "objective/non_score_reward": -1.4956772327423096, "objective/rlhf_reward": -7.982708930969238, "objective/scores": -0.5, "policy/approxkl_avg": 3.4624900817871094, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5113921165466309, "step": 1585, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998342514038086 }, { "episode": 25392, "epoch": 0.45641154689578317, "loss/policy_avg": 0.4027129113674164, "lr": 2.695935582822086e-06, "objective/entropy": -29.9932861328125, "objective/kl": 14.198205947875977, "objective/non_score_reward": -1.4198204278945923, "objective/rlhf_reward": -3.5565754196801525, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 149.37881469726562, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7940120697021484, "step": 1586, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9996426105499268 }, { "episode": 25408, "epoch": 0.4566991408131718, "loss/policy_avg": 0.4442477226257324, "lr": 2.695743865030675e-06, "objective/entropy": 2.5545082092285156, "objective/kl": 12.904587745666504, "objective/non_score_reward": -1.2904589176177979, "objective/rlhf_reward": -4.761835730075836, "objective/scores": 0.1, "policy/approxkl_avg": 44.589908599853516, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.44873154163360596, "step": 1587, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9959895610809326 }, { "episode": 25424, "epoch": 0.45698673473056045, "loss/policy_avg": 0.2857421934604645, "lr": 2.695552147239264e-06, "objective/entropy": 133.09530639648438, "objective/kl": 17.154420852661133, "objective/non_score_reward": -1.715442180633545, "objective/rlhf_reward": -4.739062073008094, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 129.16445922851562, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7996494174003601, "step": 1588, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9953253269195557 }, { "episode": 25440, "epoch": 0.4572743286479491, "loss/policy_avg": 0.122508704662323, "lr": 2.695360429447853e-06, "objective/entropy": 76.19819641113281, "objective/kl": 12.06997299194336, "objective/non_score_reward": -1.206997275352478, "objective/rlhf_reward": -4.4279892727732655, "objective/scores": 0.1, "policy/approxkl_avg": 24.52509880065918, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.739288866519928, "step": 1589, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9986529350280762 }, { "episode": 25456, "epoch": 0.45756192256533773, "loss/policy_avg": 0.4644879102706909, "lr": 2.695168711656442e-06, "objective/entropy": 124.99449157714844, "objective/kl": 16.30022430419922, "objective/non_score_reward": -1.6300225257873535, "objective/rlhf_reward": -4.397383751646553, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 10.321215629577637, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6177806854248047, "step": 1590, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9980406761169434 }, { "episode": 25472, "epoch": 0.45784951648272637, "loss/policy_avg": 0.24397248029708862, "lr": 2.6949769938650306e-06, "objective/entropy": 34.64955139160156, "objective/kl": 14.790996551513672, "objective/non_score_reward": -1.4790997505187988, "objective/rlhf_reward": -5.5163992702960964, "objective/scores": 0.1, "policy/approxkl_avg": 120.2115707397461, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8772210478782654, "step": 1591, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9973468780517578 }, { "episode": 25488, "epoch": 0.458137110400115, "loss/policy_avg": 0.3860127925872803, "lr": 2.6947852760736194e-06, "objective/entropy": -19.67723846435547, "objective/kl": 14.165413856506348, "objective/non_score_reward": -1.4165414571762085, "objective/rlhf_reward": -5.266165888309478, "objective/scores": 0.1, "policy/approxkl_avg": 12.3641357421875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6394959688186646, "step": 1592, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9994362592697144 }, { "episode": 25504, "epoch": 0.4584247043175037, "loss/policy_avg": 0.06918300688266754, "lr": 2.6945935582822086e-06, "objective/entropy": 114.20576477050781, "objective/kl": 16.480592727661133, "objective/non_score_reward": -1.6480592489242554, "objective/rlhf_reward": -2.192236995697021, "objective/scores": 1.1, "policy/approxkl_avg": 68.93272399902344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5571388006210327, "step": 1593, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999291181564331 }, { "episode": 25520, "epoch": 0.45871229823489235, "loss/policy_avg": 0.029633134603500366, "lr": 2.6944018404907975e-06, "objective/entropy": 158.27218627929688, "objective/kl": 13.224921226501465, "objective/non_score_reward": -1.3224921226501465, "objective/rlhf_reward": -7.289968490600586, "objective/scores": -0.5, "policy/approxkl_avg": 12.776941299438477, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6619477272033691, "step": 1594, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0042223930358887 }, { "episode": 25536, "epoch": 0.458999892152281, "loss/policy_avg": 0.4518873989582062, "lr": 2.6942101226993867e-06, "objective/entropy": 130.73275756835938, "objective/kl": 12.00938606262207, "objective/non_score_reward": -1.2009387016296387, "objective/rlhf_reward": -2.403754478693008, "objective/scores": 0.6, "policy/approxkl_avg": 24.144432067871094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4357760548591614, "step": 1595, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9999908208847046 }, { "episode": 25552, "epoch": 0.4592874860696696, "loss/policy_avg": -0.008014561608433723, "lr": 2.6940184049079755e-06, "objective/entropy": -63.465999603271484, "objective/kl": 16.632591247558594, "objective/non_score_reward": -1.6632592678070068, "objective/rlhf_reward": -2.2530369520187374, "objective/scores": 1.1, "policy/approxkl_avg": 73.7203140258789, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6869769096374512, "step": 1596, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0008316040039062 }, { "episode": 25568, "epoch": 0.45957507998705827, "loss/policy_avg": 0.11674131453037262, "lr": 2.6938266871165643e-06, "objective/entropy": -240.97103881835938, "objective/kl": 13.844555854797363, "objective/non_score_reward": -1.384455680847168, "objective/rlhf_reward": -1.1378224849700924, "objective/scores": 1.1, "policy/approxkl_avg": 66.28314208984375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5940455198287964, "step": 1597, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9971486330032349 }, { "episode": 25584, "epoch": 0.4598626739044469, "loss/policy_avg": 0.22572100162506104, "lr": 2.6936349693251535e-06, "objective/entropy": 73.80177307128906, "objective/kl": 17.256254196166992, "objective/non_score_reward": -1.7256255149841309, "objective/rlhf_reward": -6.502502059936523, "objective/scores": 0.1, "policy/approxkl_avg": 10.520421981811523, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.44892245531082153, "step": 1598, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.999833583831787 }, { "episode": 25600, "epoch": 0.46015026782183555, "loss/policy_avg": 0.17625027894973755, "lr": 2.6934432515337424e-06, "objective/entropy": 64.5656967163086, "objective/kl": 15.18729019165039, "objective/non_score_reward": -1.5187289714813232, "objective/rlhf_reward": -5.674916064739227, "objective/scores": 0.1, "policy/approxkl_avg": 17.129301071166992, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.658516526222229, "step": 1599, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9983505010604858 }, { "episode": 25616, "epoch": 0.46043786173922424, "loss/policy_avg": -0.21334171295166016, "lr": 2.693251533742331e-06, "objective/entropy": -166.7024383544922, "objective/kl": 14.429752349853516, "objective/non_score_reward": -1.4429752826690674, "objective/rlhf_reward": -3.9470721438256016, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 10.913631439208984, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.5916569232940674, "step": 1600, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.005368709564209 }, { "episode": 25632, "epoch": 0.4607254556566129, "loss/policy_avg": 0.3519183099269867, "lr": 2.6930598159509204e-06, "objective/entropy": 203.21722412109375, "objective/kl": 9.83327579498291, "objective/non_score_reward": -0.9833276271820068, "objective/rlhf_reward": 0.4666895508766178, "objective/scores": 1.1, "policy/approxkl_avg": 6.112152099609375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7379715442657471, "step": 1601, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0004913806915283 }, { "episode": 25648, "epoch": 0.4610130495740015, "loss/policy_avg": -0.3812810778617859, "lr": 2.692868098159509e-06, "objective/entropy": -146.57159423828125, "objective/kl": 11.350961685180664, "objective/non_score_reward": -1.1350963115692139, "objective/rlhf_reward": -4.140384978055954, "objective/scores": 0.1, "policy/approxkl_avg": 46.152496337890625, "policy/clipfrac_avg": 2.0, "policy/entropy_avg": 0.651496171951294, "step": 1602, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0040106773376465 }, { "episode": 25664, "epoch": 0.46130064349139016, "loss/policy_avg": 0.1941705346107483, "lr": 2.6926763803680984e-06, "objective/entropy": 66.99221801757812, "objective/kl": 10.873661041259766, "objective/non_score_reward": -1.0873661041259766, "objective/rlhf_reward": -2.6876049838667972, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 23.330978393554688, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5211083292961121, "step": 1603, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9978305101394653 }, { "episode": 25680, "epoch": 0.4615882374087788, "loss/policy_avg": 0.5209361910820007, "lr": 2.6924846625766872e-06, "objective/entropy": -39.09912109375, "objective/kl": 7.698319435119629, "objective/non_score_reward": -0.7698320150375366, "objective/rlhf_reward": -0.9566216937461234, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 52.248313903808594, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.47457823157310486, "step": 1604, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9983742237091064 }, { "episode": 25696, "epoch": 0.46187583132616744, "loss/policy_avg": 0.35121509432792664, "lr": 2.692292944785276e-06, "objective/entropy": 164.5242919921875, "objective/kl": 22.50076675415039, "objective/non_score_reward": -2.2500767707824707, "objective/rlhf_reward": -11.000307083129883, "objective/scores": -0.5, "policy/approxkl_avg": 74.89537811279297, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.594908595085144, "step": 1605, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.997105598449707 }, { "episode": 25712, "epoch": 0.4621634252435561, "loss/policy_avg": 0.3918079733848572, "lr": 2.6921012269938653e-06, "objective/entropy": -41.62696838378906, "objective/kl": 12.502820014953613, "objective/non_score_reward": -1.2502820491790771, "objective/rlhf_reward": -0.6011278986930844, "objective/scores": 1.1, "policy/approxkl_avg": 9.170791625976562, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.73177570104599, "step": 1606, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9988858699798584 }, { "episode": 25728, "epoch": 0.4624510191609447, "loss/policy_avg": 0.12750157713890076, "lr": 2.691909509202454e-06, "objective/entropy": 116.52877807617188, "objective/kl": 14.242040634155273, "objective/non_score_reward": -1.4242041110992432, "objective/rlhf_reward": -5.296816742420196, "objective/scores": 0.1, "policy/approxkl_avg": 101.38616943359375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7711246013641357, "step": 1607, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9977200031280518 }, { "episode": 25744, "epoch": 0.4627386130783334, "loss/policy_avg": 0.4134465157985687, "lr": 2.6917177914110433e-06, "objective/entropy": 387.149658203125, "objective/kl": 15.058012008666992, "objective/non_score_reward": -1.5058010816574097, "objective/rlhf_reward": -1.6232041329145428, "objective/scores": 1.1, "policy/approxkl_avg": 60.06370544433594, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.9526330828666687, "step": 1608, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9982794523239136 }, { "episode": 25760, "epoch": 0.46302620699572206, "loss/policy_avg": 0.15410056710243225, "lr": 2.691526073619632e-06, "objective/entropy": 74.04141998291016, "objective/kl": 15.319889068603516, "objective/non_score_reward": -1.5319888591766357, "objective/rlhf_reward": -3.2042362138044567, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 130.09005737304688, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7068464756011963, "step": 1609, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0008046627044678 }, { "episode": 25776, "epoch": 0.4633138009131107, "loss/policy_avg": -0.26566749811172485, "lr": 2.691334355828221e-06, "objective/entropy": 124.87033081054688, "objective/kl": 12.6633939743042, "objective/non_score_reward": -1.2663395404815674, "objective/rlhf_reward": -2.9426516316094737, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 12.721351623535156, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.644338071346283, "step": 1610, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.003638982772827 }, { "episode": 25792, "epoch": 0.46360139483049934, "loss/policy_avg": 0.10031799972057343, "lr": 2.69114263803681e-06, "objective/entropy": 96.79659271240234, "objective/kl": 15.103389739990234, "objective/non_score_reward": -1.5103390216827393, "objective/rlhf_reward": -5.641355729103088, "objective/scores": 0.1, "policy/approxkl_avg": 40.584495544433594, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.633362889289856, "step": 1611, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000192165374756 }, { "episode": 25808, "epoch": 0.463888988747888, "loss/policy_avg": 0.839004635810852, "lr": 2.6909509202453986e-06, "objective/entropy": 9.612741470336914, "objective/kl": 17.089344024658203, "objective/non_score_reward": -1.7089345455169678, "objective/rlhf_reward": -2.4357380032539364, "objective/scores": 1.1, "policy/approxkl_avg": 26.429393768310547, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5412091016769409, "step": 1612, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9982448816299438 }, { "episode": 25824, "epoch": 0.4641765826652766, "loss/policy_avg": 0.4816775918006897, "lr": 2.690759202453988e-06, "objective/entropy": 28.11272430419922, "objective/kl": 15.321728706359863, "objective/non_score_reward": -1.5321729183197021, "objective/rlhf_reward": -5.728691792488098, "objective/scores": 0.1, "policy/approxkl_avg": 25.030738830566406, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.735893189907074, "step": 1613, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9991583824157715 }, { "episode": 25840, "epoch": 0.46446417658266526, "loss/policy_avg": -0.004038345068693161, "lr": 2.6905674846625766e-06, "objective/entropy": 103.99654388427734, "objective/kl": 11.92760944366455, "objective/non_score_reward": -1.192760944366455, "objective/rlhf_reward": -6.771043300628662, "objective/scores": -0.5, "policy/approxkl_avg": 26.085834503173828, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5473529100418091, "step": 1614, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000544309616089 }, { "episode": 25856, "epoch": 0.46475177050005395, "loss/policy_avg": 0.09000711143016815, "lr": 2.6903757668711654e-06, "objective/entropy": 76.68963623046875, "objective/kl": 12.239154815673828, "objective/non_score_reward": -1.2239153385162354, "objective/rlhf_reward": -4.495661205053329, "objective/scores": 0.1, "policy/approxkl_avg": 2.7319424152374268, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.648979663848877, "step": 1615, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0000762939453125 }, { "episode": 25872, "epoch": 0.4650393644174426, "loss/policy_avg": 0.6946203708648682, "lr": 2.6901840490797547e-06, "objective/entropy": 210.3482666015625, "objective/kl": 14.363799095153809, "objective/non_score_reward": -1.4363799095153809, "objective/rlhf_reward": -1.3455196976661679, "objective/scores": 1.1, "policy/approxkl_avg": 21.82388687133789, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7468467950820923, "step": 1616, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9972286224365234 }, { "episode": 25888, "epoch": 0.46532695833483123, "loss/policy_avg": 0.28084972500801086, "lr": 2.6899923312883435e-06, "objective/entropy": 183.54605102539062, "objective/kl": 19.414718627929688, "objective/non_score_reward": -1.9414719343185425, "objective/rlhf_reward": -7.36588761806488, "objective/scores": 0.1, "policy/approxkl_avg": 69.62510681152344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6482871770858765, "step": 1617, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9941153526306152 }, { "episode": 25904, "epoch": 0.46561455225221987, "loss/policy_avg": 0.06988838315010071, "lr": 2.6898006134969327e-06, "objective/entropy": -54.10508728027344, "objective/kl": 15.33044147491455, "objective/non_score_reward": -1.5330440998077393, "objective/rlhf_reward": -8.132176399230957, "objective/scores": -0.5, "policy/approxkl_avg": 17.571006774902344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7482419013977051, "step": 1618, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.00229549407959 }, { "episode": 25920, "epoch": 0.4659021461696085, "loss/policy_avg": 0.20038458704948425, "lr": 2.6896088957055215e-06, "objective/entropy": 23.637174606323242, "objective/kl": 20.115951538085938, "objective/non_score_reward": -2.0115952491760254, "objective/rlhf_reward": -10.046380996704102, "objective/scores": -0.5, "policy/approxkl_avg": 261.33624267578125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6856141090393066, "step": 1619, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9966861009597778 }, { "episode": 25936, "epoch": 0.46618974008699715, "loss/policy_avg": 0.03766409680247307, "lr": 2.6894171779141103e-06, "objective/entropy": 260.55914306640625, "objective/kl": 17.825664520263672, "objective/non_score_reward": -1.7825665473937988, "objective/rlhf_reward": -4.206547056080076, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 49.903568267822266, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7212458848953247, "step": 1620, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9986686706542969 }, { "episode": 25952, "epoch": 0.4664773340043858, "loss/policy_avg": 0.1514892280101776, "lr": 2.6892254601226995e-06, "objective/entropy": 49.057987213134766, "objective/kl": 18.460351943969727, "objective/non_score_reward": -1.8460350036621094, "objective/rlhf_reward": -5.7222807459241025, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 94.69145202636719, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6090638637542725, "step": 1621, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9974236488342285 }, { "episode": 25968, "epoch": 0.46676492792177443, "loss/policy_avg": 0.7067244052886963, "lr": 2.6890337423312884e-06, "objective/entropy": 25.550270080566406, "objective/kl": 17.252092361450195, "objective/non_score_reward": -1.7252092361450195, "objective/rlhf_reward": -8.900836944580078, "objective/scores": -0.5, "policy/approxkl_avg": 169.4148712158203, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7647451162338257, "step": 1622, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9988857507705688 }, { "episode": 25984, "epoch": 0.4670525218391631, "loss/policy_avg": 0.020154371857643127, "lr": 2.688842024539877e-06, "objective/entropy": -240.26010131835938, "objective/kl": 21.083240509033203, "objective/non_score_reward": -2.1083240509033203, "objective/rlhf_reward": -8.033295547962188, "objective/scores": 0.1, "policy/approxkl_avg": 47.80495071411133, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5152453184127808, "step": 1623, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998983383178711 }, { "episode": 26000, "epoch": 0.46734011575655177, "loss/policy_avg": 0.4304991364479065, "lr": 2.6886503067484664e-06, "objective/entropy": -155.2620849609375, "objective/kl": 16.325481414794922, "objective/non_score_reward": -1.6325483322143555, "objective/rlhf_reward": -6.130193388462066, "objective/scores": 0.1, "policy/approxkl_avg": 99.36180114746094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6268974542617798, "step": 1624, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9972355365753174 }, { "episode": 26016, "epoch": 0.4676277096739404, "loss/policy_avg": 0.20302683115005493, "lr": 2.688458588957055e-06, "objective/entropy": 189.250244140625, "objective/kl": 14.729907035827637, "objective/non_score_reward": -1.4729907512664795, "objective/rlhf_reward": -3.7692564151444774, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 34.47994613647461, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5767304301261902, "step": 1625, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9969828128814697 }, { "episode": 26032, "epoch": 0.46791530359132905, "loss/policy_avg": 0.1673620343208313, "lr": 2.6882668711656444e-06, "objective/entropy": 190.11880493164062, "objective/kl": 16.155166625976562, "objective/non_score_reward": -1.6155165433883667, "objective/rlhf_reward": -8.462066650390625, "objective/scores": -0.5, "policy/approxkl_avg": 25.304584503173828, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.9624060392379761, "step": 1626, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0002522468566895 }, { "episode": 26048, "epoch": 0.4682028975087177, "loss/policy_avg": 0.10353785753250122, "lr": 2.6880751533742333e-06, "objective/entropy": 204.98744201660156, "objective/kl": 15.436996459960938, "objective/non_score_reward": -1.5436995029449463, "objective/rlhf_reward": -4.34996944216163, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 142.20309448242188, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7310298681259155, "step": 1627, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000544786453247 }, { "episode": 26064, "epoch": 0.4684904914261063, "loss/policy_avg": 0.12273672968149185, "lr": 2.687883435582822e-06, "objective/entropy": -68.6159896850586, "objective/kl": 19.266712188720703, "objective/non_score_reward": -1.9266713857650757, "objective/rlhf_reward": -3.306685423851013, "objective/scores": 1.1, "policy/approxkl_avg": 71.09552001953125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8100576400756836, "step": 1628, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9961222410202026 }, { "episode": 26080, "epoch": 0.46877808534349497, "loss/policy_avg": 0.047170355916023254, "lr": 2.6876917177914113e-06, "objective/entropy": 78.24015808105469, "objective/kl": 13.48752212524414, "objective/non_score_reward": -1.3487520217895508, "objective/rlhf_reward": -4.995008414983749, "objective/scores": 0.1, "policy/approxkl_avg": 20.823776245117188, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8033530116081238, "step": 1629, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9998892545700073 }, { "episode": 26096, "epoch": 0.4690656792608836, "loss/policy_avg": 0.20813743770122528, "lr": 2.6875e-06, "objective/entropy": -0.6008510589599609, "objective/kl": 20.10837745666504, "objective/non_score_reward": -2.0108377933502197, "objective/rlhf_reward": -5.11963203990576, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 29.439212799072266, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.533843994140625, "step": 1630, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9998657703399658 }, { "episode": 26112, "epoch": 0.4693532731782723, "loss/policy_avg": 0.11692376434803009, "lr": 2.6873082822085893e-06, "objective/entropy": 47.902130126953125, "objective/kl": 17.010051727294922, "objective/non_score_reward": -1.7010051012039185, "objective/rlhf_reward": -8.804019927978516, "objective/scores": -0.5, "policy/approxkl_avg": 54.74475860595703, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.614177942276001, "step": 1631, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000200033187866 }, { "episode": 26128, "epoch": 0.46964086709566094, "loss/policy_avg": 0.28958654403686523, "lr": 2.687116564417178e-06, "objective/entropy": 1.1801567077636719, "objective/kl": 17.849510192871094, "objective/non_score_reward": -1.7849509716033936, "objective/rlhf_reward": -5.017097475306068, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 74.95404052734375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6543114185333252, "step": 1632, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001051902770996 }, { "episode": 26144, "epoch": 0.4699284610130496, "loss/policy_avg": 0.48122167587280273, "lr": 2.686924846625767e-06, "objective/entropy": 181.40182495117188, "objective/kl": 18.407989501953125, "objective/non_score_reward": -1.8407987356185913, "objective/rlhf_reward": -5.759075019423085, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 59.3737678527832, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6886003017425537, "step": 1633, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.996335506439209 }, { "episode": 26160, "epoch": 0.4702160549304382, "loss/policy_avg": 0.09453214704990387, "lr": 2.6867331288343558e-06, "objective/entropy": 144.73582458496094, "objective/kl": 15.524151802062988, "objective/non_score_reward": -1.552415370941162, "objective/rlhf_reward": -4.605541441504078, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 45.74516677856445, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.943570613861084, "step": 1634, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998361349105835 }, { "episode": 26176, "epoch": 0.47050364884782686, "loss/policy_avg": 0.6591281890869141, "lr": 2.6865414110429446e-06, "objective/entropy": -101.7740478515625, "objective/kl": 19.701892852783203, "objective/non_score_reward": -1.970189094543457, "objective/rlhf_reward": -7.480756616592407, "objective/scores": 0.1, "policy/approxkl_avg": 152.74351501464844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6178215742111206, "step": 1635, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9983309507369995 }, { "episode": 26192, "epoch": 0.4707912427652155, "loss/policy_avg": 0.203212171792984, "lr": 2.686349693251534e-06, "objective/entropy": -106.49153137207031, "objective/kl": 14.910431861877441, "objective/non_score_reward": -1.491043210029602, "objective/rlhf_reward": -4.0167617005872085, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 39.786354064941406, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8822001814842224, "step": 1636, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9977169036865234 }, { "episode": 26208, "epoch": 0.47107883668260414, "loss/policy_avg": -0.11033344268798828, "lr": 2.6861579754601226e-06, "objective/entropy": 131.6498565673828, "objective/kl": 16.115337371826172, "objective/non_score_reward": -1.6115338802337646, "objective/rlhf_reward": -8.446135520935059, "objective/scores": -0.5, "policy/approxkl_avg": 28.540287017822266, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6810669898986816, "step": 1637, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0009217262268066 }, { "episode": 26224, "epoch": 0.47136643059999284, "loss/policy_avg": 0.09941694140434265, "lr": 2.6859662576687114e-06, "objective/entropy": -60.27667236328125, "objective/kl": 13.570518493652344, "objective/non_score_reward": -1.357051968574524, "objective/rlhf_reward": -5.028207635879516, "objective/scores": 0.1, "policy/approxkl_avg": 20.635757446289062, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7173567414283752, "step": 1638, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9975781440734863 }, { "episode": 26240, "epoch": 0.4716540245173815, "loss/policy_avg": 0.6801759600639343, "lr": 2.6857745398773007e-06, "objective/entropy": 108.41455078125, "objective/kl": 12.158008575439453, "objective/non_score_reward": -1.2158007621765137, "objective/rlhf_reward": -6.863203048706055, "objective/scores": -0.5, "policy/approxkl_avg": 18.411354064941406, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6560934782028198, "step": 1639, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0010602474212646 }, { "episode": 26256, "epoch": 0.4719416184347701, "loss/policy_avg": 1.7853608131408691, "lr": 2.6855828220858895e-06, "objective/entropy": 137.3661346435547, "objective/kl": 11.159649848937988, "objective/non_score_reward": -1.1159650087356567, "objective/rlhf_reward": -0.06385996043682063, "objective/scores": 1.1, "policy/approxkl_avg": 1.4167832136154175, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5946769714355469, "step": 1640, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.006664276123047 }, { "episode": 26272, "epoch": 0.47222921235215876, "loss/policy_avg": 0.48221543431282043, "lr": 2.6853911042944787e-06, "objective/entropy": 208.98565673828125, "objective/kl": 11.308876037597656, "objective/non_score_reward": -1.130887746810913, "objective/rlhf_reward": -0.12355086803436244, "objective/scores": 1.1, "policy/approxkl_avg": 3.408721446990967, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7396516799926758, "step": 1641, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.003695249557495 }, { "episode": 26288, "epoch": 0.4725168062695474, "loss/policy_avg": 0.3023480176925659, "lr": 2.6851993865030675e-06, "objective/entropy": 349.9120178222656, "objective/kl": 21.112802505493164, "objective/non_score_reward": -2.1112802028656006, "objective/rlhf_reward": -6.045120692253112, "objective/scores": 0.6, "policy/approxkl_avg": 64.028564453125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.9350632429122925, "step": 1642, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9987595081329346 }, { "episode": 26304, "epoch": 0.47280440018693604, "loss/policy_avg": 0.23029407858848572, "lr": 2.6850076687116563e-06, "objective/entropy": 73.98474884033203, "objective/kl": 15.536746978759766, "objective/non_score_reward": -1.5536746978759766, "objective/rlhf_reward": -3.8146989107131954, "objective/scores": 0.6, "policy/approxkl_avg": 24.919830322265625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4860690236091614, "step": 1643, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9997631311416626 }, { "episode": 26320, "epoch": 0.4730919941043247, "loss/policy_avg": 0.10321778059005737, "lr": 2.6848159509202456e-06, "objective/entropy": -64.98163604736328, "objective/kl": 11.039989471435547, "objective/non_score_reward": -1.1039988994598389, "objective/rlhf_reward": -2.468584398703511, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 18.096717834472656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5158751010894775, "step": 1644, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.00130558013916 }, { "episode": 26336, "epoch": 0.4733795880217133, "loss/policy_avg": -0.002828633412718773, "lr": 2.6846242331288344e-06, "objective/entropy": -9.201019287109375, "objective/kl": 10.427934646606445, "objective/non_score_reward": -1.0427935123443604, "objective/rlhf_reward": -6.171174049377441, "objective/scores": -0.5, "policy/approxkl_avg": 16.601865768432617, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.750141978263855, "step": 1645, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9995884895324707 }, { "episode": 26352, "epoch": 0.473667181939102, "loss/policy_avg": 1.0678659677505493, "lr": 2.6844325153374236e-06, "objective/entropy": -81.63957977294922, "objective/kl": 19.900205612182617, "objective/non_score_reward": -1.9900203943252563, "objective/rlhf_reward": -5.036362592817518, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 49.511234283447266, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7680249214172363, "step": 1646, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9990290403366089 }, { "episode": 26368, "epoch": 0.47395477585649065, "loss/policy_avg": -0.299903929233551, "lr": 2.6842407975460124e-06, "objective/entropy": 132.30357360839844, "objective/kl": 10.713934898376465, "objective/non_score_reward": -1.071393609046936, "objective/rlhf_reward": -3.8855744212865826, "objective/scores": 0.1, "policy/approxkl_avg": 16.781431198120117, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5183275938034058, "step": 1647, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9997494220733643 }, { "episode": 26384, "epoch": 0.4742423697738793, "loss/policy_avg": 0.9550938606262207, "lr": 2.6840490797546012e-06, "objective/entropy": 50.97139358520508, "objective/kl": 15.559367179870605, "objective/non_score_reward": -1.5559368133544922, "objective/rlhf_reward": -3.300028239132139, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 66.05949401855469, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5880213975906372, "step": 1648, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9958624839782715 }, { "episode": 26400, "epoch": 0.47452996369126793, "loss/policy_avg": 0.3384305238723755, "lr": 2.6838573619631904e-06, "objective/entropy": -16.053207397460938, "objective/kl": 14.48770523071289, "objective/non_score_reward": -1.4487704038619995, "objective/rlhf_reward": -7.795081615447998, "objective/scores": -0.5, "policy/approxkl_avg": 18.371421813964844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.80098557472229, "step": 1649, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.996049165725708 }, { "episode": 26416, "epoch": 0.47481755760865657, "loss/policy_avg": 0.041066974401474, "lr": 2.6836656441717793e-06, "objective/entropy": 183.5389862060547, "objective/kl": 16.953853607177734, "objective/non_score_reward": -1.695385456085205, "objective/rlhf_reward": -8.78154182434082, "objective/scores": -0.5, "policy/approxkl_avg": 52.10731506347656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5555412173271179, "step": 1650, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.003539562225342 }, { "episode": 26432, "epoch": 0.4751051515260452, "loss/policy_avg": 0.15615364909172058, "lr": 2.683473926380368e-06, "objective/entropy": 14.035999298095703, "objective/kl": 16.663122177124023, "objective/non_score_reward": -1.666312336921692, "objective/rlhf_reward": -8.66524887084961, "objective/scores": -0.5, "policy/approxkl_avg": 40.927711486816406, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5586593151092529, "step": 1651, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9988031387329102 }, { "episode": 26448, "epoch": 0.47539274544343385, "loss/policy_avg": 0.8067106008529663, "lr": 2.6832822085889573e-06, "objective/entropy": 110.69371032714844, "objective/kl": 17.34006118774414, "objective/non_score_reward": -1.7340062856674194, "objective/rlhf_reward": -6.536025023460388, "objective/scores": 0.1, "policy/approxkl_avg": 23.030296325683594, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.843829870223999, "step": 1652, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000535726547241 }, { "episode": 26464, "epoch": 0.47568033936082255, "loss/policy_avg": 0.4142029881477356, "lr": 2.683090490797546e-06, "objective/entropy": 182.6699676513672, "objective/kl": 17.451894760131836, "objective/non_score_reward": -1.7451895475387573, "objective/rlhf_reward": -6.580758041143417, "objective/scores": 0.1, "policy/approxkl_avg": 92.45771789550781, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4756516218185425, "step": 1653, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.000260353088379 }, { "episode": 26480, "epoch": 0.4759679332782112, "loss/policy_avg": 0.33923524618148804, "lr": 2.6828987730061353e-06, "objective/entropy": -156.19644165039062, "objective/kl": 20.641033172607422, "objective/non_score_reward": -2.064103364944458, "objective/rlhf_reward": -3.8564135193824765, "objective/scores": 1.1, "policy/approxkl_avg": 145.35055541992188, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6929056644439697, "step": 1654, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000690221786499 }, { "episode": 26496, "epoch": 0.4762555271955998, "loss/policy_avg": 0.0668339729309082, "lr": 2.682707055214724e-06, "objective/entropy": 119.09803771972656, "objective/kl": 16.56161117553711, "objective/non_score_reward": -1.6561613082885742, "objective/rlhf_reward": -2.224645113945007, "objective/scores": 1.1, "policy/approxkl_avg": 88.94144439697266, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7851405739784241, "step": 1655, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9987611770629883 }, { "episode": 26512, "epoch": 0.47654312111298847, "loss/policy_avg": 0.4231521487236023, "lr": 2.682515337423313e-06, "objective/entropy": 198.75799560546875, "objective/kl": 14.148205757141113, "objective/non_score_reward": -1.4148205518722534, "objective/rlhf_reward": -7.659282207489014, "objective/scores": -0.5, "policy/approxkl_avg": 27.799705505371094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7525266408920288, "step": 1656, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0012173652648926 }, { "episode": 26528, "epoch": 0.4768307150303771, "loss/policy_avg": 0.37918904423713684, "lr": 2.6823236196319018e-06, "objective/entropy": 42.97578430175781, "objective/kl": 13.691777229309082, "objective/non_score_reward": -1.3691778182983398, "objective/rlhf_reward": -7.476710796356201, "objective/scores": -0.5, "policy/approxkl_avg": 61.79126739501953, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7020362615585327, "step": 1657, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9970271587371826 }, { "episode": 26544, "epoch": 0.47711830894776575, "loss/policy_avg": 0.13542841374874115, "lr": 2.6821319018404906e-06, "objective/entropy": 26.955015182495117, "objective/kl": 7.511106491088867, "objective/non_score_reward": -0.7511106729507446, "objective/rlhf_reward": 1.3955574050545696, "objective/scores": 1.1, "policy/approxkl_avg": 14.479230880737305, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8626604080200195, "step": 1658, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9989900588989258 }, { "episode": 26560, "epoch": 0.4774059028651544, "loss/policy_avg": 0.02864725887775421, "lr": 2.68194018404908e-06, "objective/entropy": 10.317928314208984, "objective/kl": 15.39538288116455, "objective/non_score_reward": -1.5395383834838867, "objective/rlhf_reward": -4.496293788374053, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 14.582573890686035, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.60040283203125, "step": 1659, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9991271495819092 }, { "episode": 26576, "epoch": 0.477693496782543, "loss/policy_avg": 0.2558051347732544, "lr": 2.6817484662576686e-06, "objective/entropy": 127.64970397949219, "objective/kl": 14.231189727783203, "objective/non_score_reward": -1.423119068145752, "objective/rlhf_reward": -2.7687569602739543, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 7.797764778137207, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6011125445365906, "step": 1660, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9995901584625244 }, { "episode": 26592, "epoch": 0.4779810906999317, "loss/policy_avg": -0.2182144820690155, "lr": 2.681556748466258e-06, "objective/entropy": 2.858919143676758, "objective/kl": 7.278576850891113, "objective/non_score_reward": -0.7278577089309692, "objective/rlhf_reward": -4.911430835723877, "objective/scores": -0.5, "policy/approxkl_avg": 1.9155890941619873, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.541759729385376, "step": 1661, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0009493827819824 }, { "episode": 26608, "epoch": 0.47826868461732036, "loss/policy_avg": 0.8636528253555298, "lr": 2.6813650306748467e-06, "objective/entropy": 140.26402282714844, "objective/kl": 19.363082885742188, "objective/non_score_reward": -1.9363083839416504, "objective/rlhf_reward": -7.345234012603759, "objective/scores": 0.1, "policy/approxkl_avg": 14.906473159790039, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7830764055252075, "step": 1662, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0002543926239014 }, { "episode": 26624, "epoch": 0.478556278534709, "loss/policy_avg": 0.09733457863330841, "lr": 2.6811733128834355e-06, "objective/entropy": 70.89553833007812, "objective/kl": 16.588911056518555, "objective/non_score_reward": -1.658891201019287, "objective/rlhf_reward": -8.635564804077148, "objective/scores": -0.5, "policy/approxkl_avg": 129.03024291992188, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7068811655044556, "step": 1663, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9981067180633545 }, { "episode": 26640, "epoch": 0.47884387245209764, "loss/policy_avg": 0.3696031868457794, "lr": 2.6809815950920247e-06, "objective/entropy": -80.44916534423828, "objective/kl": 16.037981033325195, "objective/non_score_reward": -1.6037981510162354, "objective/rlhf_reward": -2.01519258916378, "objective/scores": 1.1, "policy/approxkl_avg": 122.46257019042969, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5848633050918579, "step": 1664, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999934434890747 }, { "episode": 26656, "epoch": 0.4791314663694863, "loss/policy_avg": 0.05235806852579117, "lr": 2.6807898773006135e-06, "objective/entropy": 279.45391845703125, "objective/kl": 12.546588897705078, "objective/non_score_reward": -1.2546589374542236, "objective/rlhf_reward": -3.1938070013848057, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 45.496795654296875, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7179256081581116, "step": 1665, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.999608039855957 }, { "episode": 26672, "epoch": 0.4794190602868749, "loss/policy_avg": 0.2440473437309265, "lr": 2.6805981595092023e-06, "objective/entropy": 230.3323974609375, "objective/kl": 18.421417236328125, "objective/non_score_reward": -1.8421416282653809, "objective/rlhf_reward": -6.968566811084747, "objective/scores": 0.1, "policy/approxkl_avg": 113.02169799804688, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7931129932403564, "step": 1666, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0004630088806152 }, { "episode": 26688, "epoch": 0.47970665420426356, "loss/policy_avg": 0.4596465826034546, "lr": 2.6804064417177916e-06, "objective/entropy": -190.0703887939453, "objective/kl": 13.094562530517578, "objective/non_score_reward": -1.3094563484191895, "objective/rlhf_reward": -3.5044923583666483, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 18.68999671936035, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6400429010391235, "step": 1667, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9988033771514893 }, { "episode": 26704, "epoch": 0.4799942481216522, "loss/policy_avg": 0.04195621609687805, "lr": 2.6802147239263804e-06, "objective/entropy": 54.4300537109375, "objective/kl": 15.151268005371094, "objective/non_score_reward": -1.5151269435882568, "objective/rlhf_reward": -5.660508012771606, "objective/scores": 0.1, "policy/approxkl_avg": 70.76380920410156, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6737993955612183, "step": 1668, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9995094537734985 }, { "episode": 26720, "epoch": 0.4802818420390409, "loss/policy_avg": 0.8512704372406006, "lr": 2.6800230061349696e-06, "objective/entropy": 10.23702621459961, "objective/kl": 16.816810607910156, "objective/non_score_reward": -1.6816810369491577, "objective/rlhf_reward": -3.8030053719293804, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 7.483461380004883, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7293576002120972, "step": 1669, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9989674091339111 }, { "episode": 26736, "epoch": 0.48056943595642954, "loss/policy_avg": 0.09557580947875977, "lr": 2.6798312883435584e-06, "objective/entropy": 216.87448120117188, "objective/kl": 14.237401962280273, "objective/non_score_reward": -1.4237401485443115, "objective/rlhf_reward": -1.2949605941772457, "objective/scores": 1.1, "policy/approxkl_avg": 11.813751220703125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5397988557815552, "step": 1670, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9995009899139404 }, { "episode": 26752, "epoch": 0.4808570298738182, "loss/policy_avg": 0.8583983182907104, "lr": 2.6796395705521472e-06, "objective/entropy": 325.3941650390625, "objective/kl": 17.281902313232422, "objective/non_score_reward": -1.7281899452209473, "objective/rlhf_reward": -8.912759780883789, "objective/scores": -0.5, "policy/approxkl_avg": 68.82689666748047, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8322206735610962, "step": 1671, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.000537395477295 }, { "episode": 26768, "epoch": 0.4811446237912068, "loss/policy_avg": -0.03962177410721779, "lr": 2.6794478527607365e-06, "objective/entropy": 144.37335205078125, "objective/kl": 16.263277053833008, "objective/non_score_reward": -1.6263277530670166, "objective/rlhf_reward": -4.105310714244842, "objective/scores": 0.6, "policy/approxkl_avg": 8.10549545288086, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6529961824417114, "step": 1672, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9979302883148193 }, { "episode": 26784, "epoch": 0.48143221770859546, "loss/policy_avg": 0.8202505707740784, "lr": 2.6792561349693253e-06, "objective/entropy": 257.78668212890625, "objective/kl": 18.381481170654297, "objective/non_score_reward": -1.8381482362747192, "objective/rlhf_reward": -5.40518171616071, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 4.968506813049316, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6601094603538513, "step": 1673, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0014734268188477 }, { "episode": 26800, "epoch": 0.4817198116259841, "loss/policy_avg": 0.2722570300102234, "lr": 2.679064417177914e-06, "objective/entropy": 159.26776123046875, "objective/kl": 23.311752319335938, "objective/non_score_reward": -2.3311753273010254, "objective/rlhf_reward": -4.9247020244598385, "objective/scores": 1.1, "policy/approxkl_avg": 23.478086471557617, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6764966249465942, "step": 1674, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9992938041687012 }, { "episode": 26816, "epoch": 0.48200740554337274, "loss/policy_avg": 1.5214738845825195, "lr": 2.6788726993865033e-06, "objective/entropy": 53.97813415527344, "objective/kl": 11.97580337524414, "objective/non_score_reward": -1.197580337524414, "objective/rlhf_reward": -4.390321350097656, "objective/scores": 0.1, "policy/approxkl_avg": 2.5720791816711426, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5206632614135742, "step": 1675, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.003241777420044 }, { "episode": 26832, "epoch": 0.48229499946076143, "loss/policy_avg": 0.0932961255311966, "lr": 2.678680981595092e-06, "objective/entropy": -24.687271118164062, "objective/kl": 15.590835571289062, "objective/non_score_reward": -1.5590835809707642, "objective/rlhf_reward": -3.8363342046737667, "objective/scores": 0.6, "policy/approxkl_avg": 31.776020050048828, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6759655475616455, "step": 1676, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9985594749450684 }, { "episode": 26848, "epoch": 0.48258259337815007, "loss/policy_avg": 0.1449345350265503, "lr": 2.6784892638036813e-06, "objective/entropy": -87.6327133178711, "objective/kl": 12.795013427734375, "objective/non_score_reward": -1.2795013189315796, "objective/rlhf_reward": -3.170593927578862, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 17.413463592529297, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7654553055763245, "step": 1677, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999047040939331 }, { "episode": 26864, "epoch": 0.4828701872955387, "loss/policy_avg": 0.39904987812042236, "lr": 2.67829754601227e-06, "objective/entropy": -42.12534713745117, "objective/kl": 15.66563606262207, "objective/non_score_reward": -1.5665637254714966, "objective/rlhf_reward": -8.266254425048828, "objective/scores": -0.5, "policy/approxkl_avg": 3.999363899230957, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5445783138275146, "step": 1678, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9977316856384277 }, { "episode": 26880, "epoch": 0.48315778121292735, "loss/policy_avg": 0.033817049115896225, "lr": 2.678105828220859e-06, "objective/entropy": 347.9168701171875, "objective/kl": 13.055137634277344, "objective/non_score_reward": -1.305513858795166, "objective/rlhf_reward": -3.6657959361940176, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 37.63822555541992, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8259381055831909, "step": 1679, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9978172779083252 }, { "episode": 26896, "epoch": 0.483445375130316, "loss/policy_avg": 0.09545890986919403, "lr": 2.6779141104294478e-06, "objective/entropy": 118.7197494506836, "objective/kl": 12.683245658874512, "objective/non_score_reward": -1.268324613571167, "objective/rlhf_reward": -2.149579022766325, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 16.969579696655273, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5533679723739624, "step": 1680, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9996938705444336 }, { "episode": 26912, "epoch": 0.48373296904770463, "loss/policy_avg": 0.3336024284362793, "lr": 2.6777223926380366e-06, "objective/entropy": 150.80264282226562, "objective/kl": 17.18734359741211, "objective/non_score_reward": -1.7187345027923584, "objective/rlhf_reward": -6.474938011169433, "objective/scores": 0.1, "policy/approxkl_avg": 31.541332244873047, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.647294819355011, "step": 1681, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9961645603179932 }, { "episode": 26928, "epoch": 0.48402056296509327, "loss/policy_avg": 0.11377117037773132, "lr": 2.677530674846626e-06, "objective/entropy": 260.295654296875, "objective/kl": 19.10438346862793, "objective/non_score_reward": -1.9104382991790771, "objective/rlhf_reward": -5.979893451154815, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 70.76204681396484, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6796458959579468, "step": 1682, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9962824583053589 }, { "episode": 26944, "epoch": 0.4843081568824819, "loss/policy_avg": 0.5521122217178345, "lr": 2.6773389570552146e-06, "objective/entropy": 172.6796417236328, "objective/kl": 17.577075958251953, "objective/non_score_reward": -1.7577075958251953, "objective/rlhf_reward": -4.1071117415439815, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 23.999298095703125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6038773059844971, "step": 1683, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9993399381637573 }, { "episode": 26960, "epoch": 0.4845957507998706, "loss/policy_avg": 0.6839799880981445, "lr": 2.677147239263804e-06, "objective/entropy": -137.83380126953125, "objective/kl": 18.686176300048828, "objective/non_score_reward": -1.8686177730560303, "objective/rlhf_reward": -3.0744709134101864, "objective/scores": 1.1, "policy/approxkl_avg": 51.80305099487305, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.39423924684524536, "step": 1684, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998917818069458 }, { "episode": 26976, "epoch": 0.48488334471725925, "loss/policy_avg": 0.10252314805984497, "lr": 2.6769555214723927e-06, "objective/entropy": 50.94281005859375, "objective/kl": 15.866674423217773, "objective/non_score_reward": -1.586667537689209, "objective/rlhf_reward": -5.946670508384704, "objective/scores": 0.1, "policy/approxkl_avg": 92.38317108154297, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6353904008865356, "step": 1685, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0007472038269043 }, { "episode": 26992, "epoch": 0.4851709386346479, "loss/policy_avg": 0.5134760737419128, "lr": 2.6767638036809815e-06, "objective/entropy": 75.39920806884766, "objective/kl": 16.352144241333008, "objective/non_score_reward": -1.6352144479751587, "objective/rlhf_reward": -8.540857315063477, "objective/scores": -0.5, "policy/approxkl_avg": 69.13995361328125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7286012172698975, "step": 1686, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9959840774536133 }, { "episode": 27008, "epoch": 0.4854585325520365, "loss/policy_avg": -0.531006395816803, "lr": 2.6765720858895707e-06, "objective/entropy": 145.20367431640625, "objective/kl": 10.78251838684082, "objective/non_score_reward": -1.078251838684082, "objective/rlhf_reward": -6.313007354736328, "objective/scores": -0.5, "policy/approxkl_avg": 46.25226974487305, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5566188097000122, "step": 1687, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0006327629089355 }, { "episode": 27024, "epoch": 0.48574612646942517, "loss/policy_avg": -0.807724118232727, "lr": 2.6763803680981595e-06, "objective/entropy": -169.50186157226562, "objective/kl": 18.087905883789062, "objective/non_score_reward": -1.8087905645370483, "objective/rlhf_reward": -6.8351626157760625, "objective/scores": 0.1, "policy/approxkl_avg": 62.57245635986328, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.668152928352356, "step": 1688, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0666861534118652 }, { "episode": 27040, "epoch": 0.4860337203868138, "loss/policy_avg": 0.06836674362421036, "lr": 2.6761886503067483e-06, "objective/entropy": 90.70515441894531, "objective/kl": 14.302078247070312, "objective/non_score_reward": -1.4302079677581787, "objective/rlhf_reward": -1.3208320423960682, "objective/scores": 1.1, "policy/approxkl_avg": 45.73702621459961, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.41737520694732666, "step": 1689, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9990456104278564 }, { "episode": 27056, "epoch": 0.48632131430420245, "loss/policy_avg": 2.2695631980895996, "lr": 2.6759969325153376e-06, "objective/entropy": -119.82723999023438, "objective/kl": 14.979490280151367, "objective/non_score_reward": -1.4979491233825684, "objective/rlhf_reward": -4.258463070789972, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 169.4292755126953, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.44966164231300354, "step": 1690, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9990613460540771 }, { "episode": 27072, "epoch": 0.48660890822159114, "loss/policy_avg": 0.25318387150764465, "lr": 2.6758052147239264e-06, "objective/entropy": -54.465782165527344, "objective/kl": 10.187479019165039, "objective/non_score_reward": -1.0187478065490723, "objective/rlhf_reward": -6.074991226196289, "objective/scores": -0.5, "policy/approxkl_avg": 40.81450271606445, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6709445714950562, "step": 1691, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9990923404693604 }, { "episode": 27088, "epoch": 0.4868965021389798, "loss/policy_avg": 0.5283163189888, "lr": 2.6756134969325156e-06, "objective/entropy": 32.93486785888672, "objective/kl": 12.04556655883789, "objective/non_score_reward": -1.2045565843582153, "objective/rlhf_reward": -4.4182263821363446, "objective/scores": 0.1, "policy/approxkl_avg": 46.15248107910156, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7208871841430664, "step": 1692, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000389337539673 }, { "episode": 27104, "epoch": 0.4871840960563684, "loss/policy_avg": -0.20891764760017395, "lr": 2.6754217791411044e-06, "objective/entropy": -61.90003967285156, "objective/kl": 10.981453895568848, "objective/non_score_reward": -1.0981454849243164, "objective/rlhf_reward": -1.4688628062021463, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 35.74761199951172, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6324474215507507, "step": 1693, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0004220008850098 }, { "episode": 27120, "epoch": 0.48747168997375706, "loss/policy_avg": 0.4535183906555176, "lr": 2.6752300613496932e-06, "objective/entropy": 197.14102172851562, "objective/kl": 11.678915977478027, "objective/non_score_reward": -1.1678917407989502, "objective/rlhf_reward": -0.27156662046909297, "objective/scores": 1.1, "policy/approxkl_avg": 10.70810317993164, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5310260057449341, "step": 1694, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9988822937011719 }, { "episode": 27136, "epoch": 0.4877592838911457, "loss/policy_avg": 0.16736939549446106, "lr": 2.6750383435582825e-06, "objective/entropy": 57.92140579223633, "objective/kl": 17.87525177001953, "objective/non_score_reward": -1.7875254154205322, "objective/rlhf_reward": -5.545981440607624, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 77.61048889160156, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8444628715515137, "step": 1695, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9988882541656494 }, { "episode": 27152, "epoch": 0.48804687780853434, "loss/policy_avg": 0.05939989909529686, "lr": 2.6748466257668713e-06, "objective/entropy": -58.93162536621094, "objective/kl": 21.960491180419922, "objective/non_score_reward": -2.196049213409424, "objective/rlhf_reward": -4.384197330474853, "objective/scores": 1.1, "policy/approxkl_avg": 82.17927551269531, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6906427145004272, "step": 1696, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9995031356811523 }, { "episode": 27168, "epoch": 0.488334471725923, "loss/policy_avg": 0.2913649082183838, "lr": 2.6746549079754605e-06, "objective/entropy": 81.17434692382812, "objective/kl": 15.473855018615723, "objective/non_score_reward": -1.5473856925964355, "objective/rlhf_reward": -5.789542427659034, "objective/scores": 0.1, "policy/approxkl_avg": 39.416595458984375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7009164094924927, "step": 1697, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9989409446716309 }, { "episode": 27184, "epoch": 0.4886220656433116, "loss/policy_avg": 0.2651277780532837, "lr": 2.6744631901840493e-06, "objective/entropy": 58.803428649902344, "objective/kl": 18.283138275146484, "objective/non_score_reward": -1.8283135890960693, "objective/rlhf_reward": -4.389535520912382, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 105.51332092285156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5497996211051941, "step": 1698, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.996532678604126 }, { "episode": 27200, "epoch": 0.4889096595607003, "loss/policy_avg": 0.6435315608978271, "lr": 2.674271472392638e-06, "objective/entropy": -44.15504455566406, "objective/kl": 15.604392051696777, "objective/non_score_reward": -1.560438871383667, "objective/rlhf_reward": -1.8417557053267952, "objective/scores": 1.1, "policy/approxkl_avg": 74.01034545898438, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5064129829406738, "step": 1699, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9977962970733643 }, { "episode": 27216, "epoch": 0.48919725347808896, "loss/policy_avg": 0.5258274674415588, "lr": 2.6740797546012274e-06, "objective/entropy": 13.031089782714844, "objective/kl": 7.618081092834473, "objective/non_score_reward": -0.7618080377578735, "objective/rlhf_reward": -0.12351331555959844, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 9.073127746582031, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6002603769302368, "step": 1700, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9963665008544922 }, { "episode": 27232, "epoch": 0.4894848473954776, "loss/policy_avg": 0.5734251141548157, "lr": 2.6738880368098157e-06, "objective/entropy": 151.17642211914062, "objective/kl": 22.45712661743164, "objective/non_score_reward": -2.245712995529175, "objective/rlhf_reward": -10.9828519821167, "objective/scores": -0.5, "policy/approxkl_avg": 80.04830932617188, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8935544490814209, "step": 1701, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9976305961608887 }, { "episode": 27248, "epoch": 0.48977244131286624, "loss/policy_avg": 0.18563227355480194, "lr": 2.673696319018405e-06, "objective/entropy": 71.73806762695312, "objective/kl": 10.811704635620117, "objective/non_score_reward": -1.0811705589294434, "objective/rlhf_reward": -3.924682086706161, "objective/scores": 0.1, "policy/approxkl_avg": 68.60709381103516, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7594705820083618, "step": 1702, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9997520446777344 }, { "episode": 27264, "epoch": 0.4900600352302549, "loss/policy_avg": 0.010921552777290344, "lr": 2.6735046012269938e-06, "objective/entropy": 312.49151611328125, "objective/kl": 19.373149871826172, "objective/non_score_reward": -1.9373149871826172, "objective/rlhf_reward": -3.3492601871490475, "objective/scores": 1.1, "policy/approxkl_avg": 65.90713500976562, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7818717956542969, "step": 1703, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.997588872909546 }, { "episode": 27280, "epoch": 0.4903476291476435, "loss/policy_avg": 0.2919304668903351, "lr": 2.6733128834355826e-06, "objective/entropy": 100.27139282226562, "objective/kl": 10.23652458190918, "objective/non_score_reward": -1.0236525535583496, "objective/rlhf_reward": 0.30538999438285863, "objective/scores": 1.1, "policy/approxkl_avg": 1.8728735446929932, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5916879177093506, "step": 1704, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0014801025390625 }, { "episode": 27296, "epoch": 0.49063522306503216, "loss/policy_avg": 0.14356620609760284, "lr": 2.673121165644172e-06, "objective/entropy": 242.97398376464844, "objective/kl": 16.659366607666016, "objective/non_score_reward": -1.6659367084503174, "objective/rlhf_reward": -8.66374683380127, "objective/scores": -0.5, "policy/approxkl_avg": 29.75727081298828, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6769480109214783, "step": 1705, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9984476566314697 }, { "episode": 27312, "epoch": 0.4909228169824208, "loss/policy_avg": 0.2744619846343994, "lr": 2.6729294478527606e-06, "objective/entropy": -69.02651977539062, "objective/kl": 17.554481506347656, "objective/non_score_reward": -1.7554482221603394, "objective/rlhf_reward": -9.0217924118042, "objective/scores": -0.5, "policy/approxkl_avg": 40.135658264160156, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5950236320495605, "step": 1706, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9994828701019287 }, { "episode": 27328, "epoch": 0.4912104108998095, "loss/policy_avg": 0.3935040235519409, "lr": 2.67273773006135e-06, "objective/entropy": 196.258544921875, "objective/kl": 4.807914733886719, "objective/non_score_reward": -0.48079144954681396, "objective/rlhf_reward": 2.4768340677022938, "objective/scores": 1.1, "policy/approxkl_avg": 1.6954267024993896, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.75687575340271, "step": 1707, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0042829513549805 }, { "episode": 27344, "epoch": 0.49149800481719813, "loss/policy_avg": 0.09713247418403625, "lr": 2.6725460122699387e-06, "objective/entropy": -250.28390502929688, "objective/kl": 20.579936981201172, "objective/non_score_reward": -2.0579938888549805, "objective/rlhf_reward": -7.831975115835666, "objective/scores": 0.1, "policy/approxkl_avg": 49.66277313232422, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7922552824020386, "step": 1708, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9978537559509277 }, { "episode": 27360, "epoch": 0.49178559873458677, "loss/policy_avg": 1.3034917116165161, "lr": 2.6723542944785275e-06, "objective/entropy": 293.18182373046875, "objective/kl": 13.69601821899414, "objective/non_score_reward": -1.3696017265319824, "objective/rlhf_reward": -1.0784070849418637, "objective/scores": 1.1, "policy/approxkl_avg": 44.882728576660156, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7728196382522583, "step": 1709, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9976916313171387 }, { "episode": 27376, "epoch": 0.4920731926519754, "loss/policy_avg": 0.6501213312149048, "lr": 2.6721625766871167e-06, "objective/entropy": 26.59703826904297, "objective/kl": 17.755245208740234, "objective/non_score_reward": -1.7755244970321655, "objective/rlhf_reward": -9.102097511291504, "objective/scores": -0.5, "policy/approxkl_avg": 62.82991027832031, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.9298363924026489, "step": 1710, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0004334449768066 }, { "episode": 27392, "epoch": 0.49236078656936405, "loss/policy_avg": 0.5077639222145081, "lr": 2.6719708588957055e-06, "objective/entropy": 161.69691467285156, "objective/kl": 21.700233459472656, "objective/non_score_reward": -2.1700234413146973, "objective/rlhf_reward": -5.756374274135801, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 53.48500061035156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7480050325393677, "step": 1711, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9954512119293213 }, { "episode": 27408, "epoch": 0.4926483804867527, "loss/policy_avg": 0.42052412033081055, "lr": 2.6717791411042948e-06, "objective/entropy": -36.82831954956055, "objective/kl": 11.032814979553223, "objective/non_score_reward": -1.1032816171646118, "objective/rlhf_reward": -6.413126468658447, "objective/scores": -0.5, "policy/approxkl_avg": 38.36772537231445, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5636872053146362, "step": 1712, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998363733291626 }, { "episode": 27424, "epoch": 0.49293597440414133, "loss/policy_avg": 1.5223966836929321, "lr": 2.6715874233128836e-06, "objective/entropy": 125.52906799316406, "objective/kl": 9.578523635864258, "objective/non_score_reward": -0.9578523635864258, "objective/rlhf_reward": -1.7087034008660653, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 11.727434158325195, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.28627026081085205, "step": 1713, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.001208543777466 }, { "episode": 27440, "epoch": 0.49322356832153, "loss/policy_avg": 0.16528268158435822, "lr": 2.6713957055214724e-06, "objective/entropy": -198.51060485839844, "objective/kl": 11.166755676269531, "objective/non_score_reward": -1.116675615310669, "objective/rlhf_reward": -0.06670258045196498, "objective/scores": 1.1, "policy/approxkl_avg": 48.924591064453125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4975529909133911, "step": 1714, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.997665286064148 }, { "episode": 27456, "epoch": 0.49351116223891867, "loss/policy_avg": -0.17147034406661987, "lr": 2.6712039877300616e-06, "objective/entropy": 172.23587036132812, "objective/kl": 13.310028076171875, "objective/non_score_reward": -1.3310028314590454, "objective/rlhf_reward": -3.7677519535094053, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 2.9677109718322754, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.7124722003936768, "step": 1715, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.003903865814209 }, { "episode": 27472, "epoch": 0.4937987561563073, "loss/policy_avg": 0.05405956506729126, "lr": 2.6710122699386504e-06, "objective/entropy": -11.503082275390625, "objective/kl": 17.24118995666504, "objective/non_score_reward": -1.724118947982788, "objective/rlhf_reward": -6.496476149559021, "objective/scores": 0.1, "policy/approxkl_avg": 22.422582626342773, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7170135974884033, "step": 1716, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000546455383301 }, { "episode": 27488, "epoch": 0.49408635007369595, "loss/policy_avg": 0.5887688398361206, "lr": 2.6708205521472392e-06, "objective/entropy": 17.882118225097656, "objective/kl": 19.30318260192871, "objective/non_score_reward": -1.9303183555603027, "objective/rlhf_reward": -4.797554348350737, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 135.39852905273438, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6600601077079773, "step": 1717, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9995791912078857 }, { "episode": 27504, "epoch": 0.4943739439910846, "loss/policy_avg": -0.27067333459854126, "lr": 2.6706288343558285e-06, "objective/entropy": 125.53389739990234, "objective/kl": 16.03993797302246, "objective/non_score_reward": -1.6039938926696777, "objective/rlhf_reward": -8.415975570678711, "objective/scores": -0.5, "policy/approxkl_avg": 8.563714027404785, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.48206979036331177, "step": 1718, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0037009716033936 }, { "episode": 27520, "epoch": 0.4946615379084732, "loss/policy_avg": 0.017645031213760376, "lr": 2.6704371165644173e-06, "objective/entropy": 232.59396362304688, "objective/kl": 15.594569206237793, "objective/non_score_reward": -1.5594568252563477, "objective/rlhf_reward": -8.23782730102539, "objective/scores": -0.5, "policy/approxkl_avg": 1.77301025390625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8485502004623413, "step": 1719, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0005722045898438 }, { "episode": 27536, "epoch": 0.49494913182586187, "loss/policy_avg": 0.6044291257858276, "lr": 2.6702453987730065e-06, "objective/entropy": 112.50225067138672, "objective/kl": 15.722814559936523, "objective/non_score_reward": -1.5722814798355103, "objective/rlhf_reward": -4.341715033130582, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 4.219062805175781, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5630087852478027, "step": 1720, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9984500408172607 }, { "episode": 27552, "epoch": 0.4952367257432505, "loss/policy_avg": 0.33518537878990173, "lr": 2.6700536809815953e-06, "objective/entropy": 158.0392608642578, "objective/kl": 19.25482177734375, "objective/non_score_reward": -1.9254825115203857, "objective/rlhf_reward": -7.301930522918701, "objective/scores": 0.1, "policy/approxkl_avg": 105.86214447021484, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6998374462127686, "step": 1721, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9984279870986938 }, { "episode": 27568, "epoch": 0.4955243196606392, "loss/policy_avg": 0.09444519132375717, "lr": 2.669861963190184e-06, "objective/entropy": 187.67007446289062, "objective/kl": 19.470523834228516, "objective/non_score_reward": -1.9470525979995728, "objective/rlhf_reward": -9.788209915161133, "objective/scores": -0.5, "policy/approxkl_avg": 110.34794616699219, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7890366315841675, "step": 1722, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9979407787322998 }, { "episode": 27584, "epoch": 0.49581191357802784, "loss/policy_avg": 0.9407260417938232, "lr": 2.669670245398773e-06, "objective/entropy": -31.73898696899414, "objective/kl": 14.171585083007812, "objective/non_score_reward": -1.4171584844589233, "objective/rlhf_reward": -7.668633937835693, "objective/scores": -0.5, "policy/approxkl_avg": 55.22858428955078, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7059850692749023, "step": 1723, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9999644756317139 }, { "episode": 27600, "epoch": 0.4960995074954165, "loss/policy_avg": 0.5673115253448486, "lr": 2.6694785276073617e-06, "objective/entropy": 2.834369659423828, "objective/kl": 12.505474090576172, "objective/non_score_reward": -1.2505474090576172, "objective/rlhf_reward": -2.0784708007585735, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 35.140323638916016, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5003098845481873, "step": 1724, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9997611045837402 }, { "episode": 27616, "epoch": 0.4963871014128051, "loss/policy_avg": 0.37552765011787415, "lr": 2.669286809815951e-06, "objective/entropy": 48.47637939453125, "objective/kl": 14.658306121826172, "objective/non_score_reward": -1.4658305644989014, "objective/rlhf_reward": -5.463322436809539, "objective/scores": 0.1, "policy/approxkl_avg": 78.75804138183594, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7523536682128906, "step": 1725, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9979631900787354 }, { "episode": 27632, "epoch": 0.49667469533019376, "loss/policy_avg": 0.27567678689956665, "lr": 2.66909509202454e-06, "objective/entropy": 111.3084487915039, "objective/kl": 20.33347511291504, "objective/non_score_reward": -2.0333473682403564, "objective/rlhf_reward": -7.7333897113800045, "objective/scores": 0.1, "policy/approxkl_avg": 27.45749855041504, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5236789584159851, "step": 1726, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9998615980148315 }, { "episode": 27648, "epoch": 0.4969622892475824, "loss/policy_avg": 0.4971972107887268, "lr": 2.6689033742331286e-06, "objective/entropy": -37.139991760253906, "objective/kl": 11.294246673583984, "objective/non_score_reward": -1.1294245719909668, "objective/rlhf_reward": -6.517698287963867, "objective/scores": -0.5, "policy/approxkl_avg": 31.072978973388672, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7589508295059204, "step": 1727, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0010251998901367 }, { "episode": 27664, "epoch": 0.49724988316497104, "loss/policy_avg": 1.9962494373321533, "lr": 2.668711656441718e-06, "objective/entropy": 120.74639892578125, "objective/kl": 20.389034271240234, "objective/non_score_reward": -2.03890323638916, "objective/rlhf_reward": -10.15561294555664, "objective/scores": -0.5, "policy/approxkl_avg": 152.50051879882812, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5712071657180786, "step": 1728, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9977648258209229 }, { "episode": 27680, "epoch": 0.4975374770823597, "loss/policy_avg": 0.04471936449408531, "lr": 2.6685199386503066e-06, "objective/entropy": 172.6311492919922, "objective/kl": 17.905120849609375, "objective/non_score_reward": -1.7905118465423584, "objective/rlhf_reward": -6.762047624588012, "objective/scores": 0.1, "policy/approxkl_avg": 77.14485168457031, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.46560442447662354, "step": 1729, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.997314453125 }, { "episode": 27696, "epoch": 0.4978250709997484, "loss/policy_avg": 0.12098614126443863, "lr": 2.668328220858896e-06, "objective/entropy": 61.26271438598633, "objective/kl": 14.224048614501953, "objective/non_score_reward": -1.4224047660827637, "objective/rlhf_reward": -4.027759557188141, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 41.26526641845703, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7243987917900085, "step": 1730, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9998290538787842 }, { "episode": 27712, "epoch": 0.498112664917137, "loss/policy_avg": 0.09704755246639252, "lr": 2.6681365030674847e-06, "objective/entropy": 226.2583465576172, "objective/kl": 13.247810363769531, "objective/non_score_reward": -1.3247809410095215, "objective/rlhf_reward": -2.3754050775778026, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 14.467374801635742, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5639190673828125, "step": 1731, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.99894380569458 }, { "episode": 27728, "epoch": 0.49840025883452566, "loss/policy_avg": 0.22803986072540283, "lr": 2.6679447852760735e-06, "objective/entropy": 24.980182647705078, "objective/kl": 9.997011184692383, "objective/non_score_reward": -0.9997010827064514, "objective/rlhf_reward": -1.8760980687299111, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 48.40413284301758, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4330744743347168, "step": 1732, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9993398189544678 }, { "episode": 27744, "epoch": 0.4986878527519143, "loss/policy_avg": 0.2788919508457184, "lr": 2.6677530674846627e-06, "objective/entropy": -17.63727569580078, "objective/kl": 16.732961654663086, "objective/non_score_reward": -1.6732960939407349, "objective/rlhf_reward": -6.293184316158294, "objective/scores": 0.1, "policy/approxkl_avg": 65.41502380371094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.40989822149276733, "step": 1733, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9983696937561035 }, { "episode": 27760, "epoch": 0.49897544666930294, "loss/policy_avg": 0.13127373158931732, "lr": 2.6675613496932515e-06, "objective/entropy": 123.1053237915039, "objective/kl": 17.913818359375, "objective/non_score_reward": -1.7913819551467896, "objective/rlhf_reward": -6.7655277907848355, "objective/scores": 0.1, "policy/approxkl_avg": 37.40669631958008, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.48393747210502625, "step": 1734, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9971840381622314 }, { "episode": 27776, "epoch": 0.4992630405866916, "loss/policy_avg": 0.1614466905593872, "lr": 2.6673696319018408e-06, "objective/entropy": 70.39647674560547, "objective/kl": 15.208030700683594, "objective/non_score_reward": -1.5208032131195068, "objective/rlhf_reward": -5.683212614059448, "objective/scores": 0.1, "policy/approxkl_avg": 20.147375106811523, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6510974764823914, "step": 1735, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9979395866394043 }, { "episode": 27792, "epoch": 0.4995506345040802, "loss/policy_avg": 0.19504263997077942, "lr": 2.6671779141104296e-06, "objective/entropy": 24.889122009277344, "objective/kl": 18.118755340576172, "objective/non_score_reward": -1.811875581741333, "objective/rlhf_reward": -5.422673340114663, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 11.235595703125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7391984462738037, "step": 1736, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0010764598846436 }, { "episode": 27808, "epoch": 0.4998382284214689, "loss/policy_avg": -0.01884336769580841, "lr": 2.6669861963190184e-06, "objective/entropy": -1.0885772705078125, "objective/kl": 13.130938529968262, "objective/non_score_reward": -1.313093900680542, "objective/rlhf_reward": -2.328656946064207, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 10.169123649597168, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.5120453238487244, "step": 1737, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0012874603271484 }, { "episode": 27824, "epoch": 0.5001258223388575, "loss/policy_avg": 0.19150349497795105, "lr": 2.6667944785276076e-06, "objective/entropy": -21.409439086914062, "objective/kl": 14.654093742370605, "objective/non_score_reward": -1.465409517288208, "objective/rlhf_reward": -3.9142266166972473, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 33.64068603515625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5209637880325317, "step": 1738, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000447988510132 }, { "episode": 27840, "epoch": 0.5004134162562461, "loss/policy_avg": 0.4384639859199524, "lr": 2.6666027607361964e-06, "objective/entropy": -306.7899169921875, "objective/kl": 16.510332107543945, "objective/non_score_reward": -1.6510331630706787, "objective/rlhf_reward": -5.000012729231434, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 38.28997039794922, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5968725681304932, "step": 1739, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9977682828903198 }, { "episode": 27856, "epoch": 0.5007010101736348, "loss/policy_avg": 0.30068957805633545, "lr": 2.6664110429447852e-06, "objective/entropy": -86.3069839477539, "objective/kl": 19.811098098754883, "objective/non_score_reward": -1.9811099767684937, "objective/rlhf_reward": -5.524439787864685, "objective/scores": 0.6, "policy/approxkl_avg": 37.62815856933594, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7373462915420532, "step": 1740, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9983165264129639 }, { "episode": 27872, "epoch": 0.5009886040910235, "loss/policy_avg": 0.17040492594242096, "lr": 2.6662193251533745e-06, "objective/entropy": 156.28121948242188, "objective/kl": 11.497791290283203, "objective/non_score_reward": -1.1497790813446045, "objective/rlhf_reward": -2.1991163849830624, "objective/scores": 0.6, "policy/approxkl_avg": 7.689607620239258, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.38815832138061523, "step": 1741, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.002171516418457 }, { "episode": 27888, "epoch": 0.5012761980084122, "loss/policy_avg": 0.06310490518808365, "lr": 2.6660276073619633e-06, "objective/entropy": 142.2759246826172, "objective/kl": 22.004135131835938, "objective/non_score_reward": -2.2004141807556152, "objective/rlhf_reward": -4.401655888557434, "objective/scores": 1.1, "policy/approxkl_avg": 36.01221466064453, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6704832315444946, "step": 1742, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.99953293800354 }, { "episode": 27904, "epoch": 0.5015637919258008, "loss/policy_avg": -0.008167348802089691, "lr": 2.6658358895705525e-06, "objective/entropy": 140.29356384277344, "objective/kl": 16.384765625, "objective/non_score_reward": -1.6384766101837158, "objective/rlhf_reward": -6.153906142711639, "objective/scores": 0.1, "policy/approxkl_avg": 38.04493713378906, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.802377462387085, "step": 1743, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9984660148620605 }, { "episode": 27920, "epoch": 0.5018513858431894, "loss/policy_avg": 0.3492729067802429, "lr": 2.6656441717791413e-06, "objective/entropy": 30.628379821777344, "objective/kl": 12.58380126953125, "objective/non_score_reward": -1.2583800554275513, "objective/rlhf_reward": -2.6335202217102047, "objective/scores": 0.6, "policy/approxkl_avg": 9.596057891845703, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.50040602684021, "step": 1744, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998366117477417 }, { "episode": 27936, "epoch": 0.5021389797605781, "loss/policy_avg": 0.5450857281684875, "lr": 2.66545245398773e-06, "objective/entropy": 18.2037296295166, "objective/kl": 18.23172378540039, "objective/non_score_reward": -1.8231723308563232, "objective/rlhf_reward": -5.467860485586236, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 52.9035530090332, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5289323329925537, "step": 1745, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9986926317214966 }, { "episode": 27952, "epoch": 0.5024265736779667, "loss/policy_avg": 0.11652082949876785, "lr": 2.665260736196319e-06, "objective/entropy": 336.8443603515625, "objective/kl": 15.785937309265137, "objective/non_score_reward": -1.5785937309265137, "objective/rlhf_reward": -8.314374923706055, "objective/scores": -0.5, "policy/approxkl_avg": 112.4112548828125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8711972832679749, "step": 1746, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9983396530151367 }, { "episode": 27968, "epoch": 0.5027141675953554, "loss/policy_avg": 0.02146860957145691, "lr": 2.6650690184049078e-06, "objective/entropy": 202.36102294921875, "objective/kl": 13.70978832244873, "objective/non_score_reward": -1.370978832244873, "objective/rlhf_reward": -7.483915328979492, "objective/scores": -0.5, "policy/approxkl_avg": 171.70550537109375, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.80576491355896, "step": 1747, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9973173141479492 }, { "episode": 27984, "epoch": 0.503001761512744, "loss/policy_avg": 0.46403220295906067, "lr": 2.664877300613497e-06, "objective/entropy": 336.539794921875, "objective/kl": 12.597597122192383, "objective/non_score_reward": -1.2597599029541016, "objective/rlhf_reward": -0.6390396714210507, "objective/scores": 1.1, "policy/approxkl_avg": 34.25482940673828, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.808406412601471, "step": 1748, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9988746643066406 }, { "episode": 28000, "epoch": 0.5032893554301326, "loss/policy_avg": 0.746623158454895, "lr": 2.664685582822086e-06, "objective/entropy": 236.44168090820312, "objective/kl": 13.086444854736328, "objective/non_score_reward": -1.3086445331573486, "objective/rlhf_reward": -2.83457795381546, "objective/scores": 0.6, "policy/approxkl_avg": 8.036091804504395, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5963040590286255, "step": 1749, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9990828037261963 }, { "episode": 28016, "epoch": 0.5035769493475213, "loss/policy_avg": 0.07168757915496826, "lr": 2.664493865030675e-06, "objective/entropy": 71.59014892578125, "objective/kl": 18.932260513305664, "objective/non_score_reward": -1.8932262659072876, "objective/rlhf_reward": -5.450198697225128, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 51.12376403808594, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.3951878249645233, "step": 1750, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9968023300170898 }, { "episode": 28032, "epoch": 0.5038645432649099, "loss/policy_avg": -0.30793046951293945, "lr": 2.664302147239264e-06, "objective/entropy": 42.83002853393555, "objective/kl": 10.219228744506836, "objective/non_score_reward": -1.0219228267669678, "objective/rlhf_reward": -6.087691307067871, "objective/scores": -0.5, "policy/approxkl_avg": 11.549211502075195, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6399141550064087, "step": 1751, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.00927734375 }, { "episode": 28048, "epoch": 0.5041521371822986, "loss/policy_avg": 0.32605302333831787, "lr": 2.6641104294478526e-06, "objective/entropy": -77.07260131835938, "objective/kl": 11.111099243164062, "objective/non_score_reward": -1.111109972000122, "objective/rlhf_reward": -4.0444394707679745, "objective/scores": 0.1, "policy/approxkl_avg": 2.713918685913086, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5597299337387085, "step": 1752, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9988629817962646 }, { "episode": 28064, "epoch": 0.5044397310996872, "loss/policy_avg": -0.13548192381858826, "lr": 2.663918711656442e-06, "objective/entropy": -74.50413513183594, "objective/kl": 11.32368278503418, "objective/non_score_reward": -1.1323683261871338, "objective/rlhf_reward": -1.6057543500673501, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 44.30047607421875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6872704029083252, "step": 1753, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0003886222839355 }, { "episode": 28080, "epoch": 0.5047273250170758, "loss/policy_avg": 0.25978559255599976, "lr": 2.6637269938650307e-06, "objective/entropy": -102.45779418945312, "objective/kl": 11.285503387451172, "objective/non_score_reward": -1.1285502910614014, "objective/rlhf_reward": -4.114201521873474, "objective/scores": 0.1, "policy/approxkl_avg": 44.16554260253906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.646943211555481, "step": 1754, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9980204105377197 }, { "episode": 28096, "epoch": 0.5050149189344645, "loss/policy_avg": 0.017559155821800232, "lr": 2.6635352760736195e-06, "objective/entropy": 48.8609619140625, "objective/kl": 10.03787612915039, "objective/non_score_reward": -1.0037877559661865, "objective/rlhf_reward": -6.015151023864746, "objective/scores": -0.5, "policy/approxkl_avg": 1.1227680444717407, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7357013821601868, "step": 1755, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.001687526702881 }, { "episode": 28112, "epoch": 0.5053025128518532, "loss/policy_avg": 0.6664595603942871, "lr": 2.6633435582822087e-06, "objective/entropy": 269.71820068359375, "objective/kl": 20.73150062561035, "objective/non_score_reward": -2.073150157928467, "objective/rlhf_reward": -5.892600929737091, "objective/scores": 0.6, "policy/approxkl_avg": 91.48931121826172, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6852459907531738, "step": 1756, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9988569021224976 }, { "episode": 28128, "epoch": 0.5055901067692419, "loss/policy_avg": 0.5310183763504028, "lr": 2.6631518404907975e-06, "objective/entropy": -99.25455474853516, "objective/kl": 9.640950202941895, "objective/non_score_reward": -0.9640949964523315, "objective/rlhf_reward": -3.4563798666000363, "objective/scores": 0.1, "policy/approxkl_avg": 33.46493911743164, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4926908612251282, "step": 1757, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9990811347961426 }, { "episode": 28144, "epoch": 0.5058777006866305, "loss/policy_avg": 0.12302426993846893, "lr": 2.6629601226993868e-06, "objective/entropy": 26.574722290039062, "objective/kl": 14.709476470947266, "objective/non_score_reward": -1.4709476232528687, "objective/rlhf_reward": -7.883790493011475, "objective/scores": -0.5, "policy/approxkl_avg": 27.887361526489258, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7425447702407837, "step": 1758, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000166416168213 }, { "episode": 28160, "epoch": 0.5061652946040192, "loss/policy_avg": 0.12881505489349365, "lr": 2.6627684049079756e-06, "objective/entropy": 7.130855560302734, "objective/kl": 14.024930000305176, "objective/non_score_reward": -1.4024930000305176, "objective/rlhf_reward": -5.2099721193313595, "objective/scores": 0.1, "policy/approxkl_avg": 23.538719177246094, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6327688694000244, "step": 1759, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0035862922668457 }, { "episode": 28176, "epoch": 0.5064528885214078, "loss/policy_avg": 0.04699630290269852, "lr": 2.6625766871165644e-06, "objective/entropy": 207.1057891845703, "objective/kl": 17.828094482421875, "objective/non_score_reward": -1.7828094959259033, "objective/rlhf_reward": -2.731237924098968, "objective/scores": 1.1, "policy/approxkl_avg": 48.873268127441406, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5094485282897949, "step": 1760, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9982911348342896 }, { "episode": 28192, "epoch": 0.5067404824387964, "loss/policy_avg": 0.2013719379901886, "lr": 2.6623849693251536e-06, "objective/entropy": -141.62539672851562, "objective/kl": 11.267175674438477, "objective/non_score_reward": -1.1267175674438477, "objective/rlhf_reward": -0.1068704336881634, "objective/scores": 1.1, "policy/approxkl_avg": 11.064361572265625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.41444897651672363, "step": 1761, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0017099380493164 }, { "episode": 28208, "epoch": 0.5070280763561851, "loss/policy_avg": 0.541221022605896, "lr": 2.6621932515337424e-06, "objective/entropy": 206.86508178710938, "objective/kl": 16.236549377441406, "objective/non_score_reward": -1.623655080795288, "objective/rlhf_reward": -8.494619369506836, "objective/scores": -0.5, "policy/approxkl_avg": 30.70893096923828, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7763517498970032, "step": 1762, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9991366863250732 }, { "episode": 28224, "epoch": 0.5073156702735737, "loss/policy_avg": 0.4266519844532013, "lr": 2.6620015337423317e-06, "objective/entropy": -39.3725700378418, "objective/kl": 19.346342086791992, "objective/non_score_reward": -1.9346342086791992, "objective/rlhf_reward": -3.3385367751121517, "objective/scores": 1.1, "policy/approxkl_avg": 95.08259582519531, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5408810973167419, "step": 1763, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9986789226531982 }, { "episode": 28240, "epoch": 0.5076032641909624, "loss/policy_avg": 1.052474856376648, "lr": 2.6618098159509205e-06, "objective/entropy": 68.28462219238281, "objective/kl": 17.100852966308594, "objective/non_score_reward": -1.710085153579712, "objective/rlhf_reward": -4.717634620443855, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 130.52293395996094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5202251672744751, "step": 1764, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9974597692489624 }, { "episode": 28256, "epoch": 0.507890858108351, "loss/policy_avg": 0.022500693798065186, "lr": 2.6616180981595093e-06, "objective/entropy": -7.4169158935546875, "objective/kl": 15.090633392333984, "objective/non_score_reward": -1.509063482284546, "objective/rlhf_reward": -8.036253929138184, "objective/scores": -0.5, "policy/approxkl_avg": 60.160728454589844, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7391468286514282, "step": 1765, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9971258640289307 }, { "episode": 28272, "epoch": 0.5081784520257396, "loss/policy_avg": 0.23286819458007812, "lr": 2.6614263803680985e-06, "objective/entropy": -67.46270751953125, "objective/kl": 9.249923706054688, "objective/non_score_reward": -0.9249924421310425, "objective/rlhf_reward": -5.699970245361328, "objective/scores": -0.5, "policy/approxkl_avg": 0.629024088382721, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6066184043884277, "step": 1766, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0047874450683594 }, { "episode": 28288, "epoch": 0.5084660459431283, "loss/policy_avg": 0.4985557794570923, "lr": 2.6612346625766873e-06, "objective/entropy": 212.60858154296875, "objective/kl": 17.220657348632812, "objective/non_score_reward": -1.7220659255981445, "objective/rlhf_reward": -8.888263702392578, "objective/scores": -0.5, "policy/approxkl_avg": 83.53437805175781, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7002837657928467, "step": 1767, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.998044490814209 }, { "episode": 28304, "epoch": 0.5087536398605169, "loss/policy_avg": 0.224037304520607, "lr": 2.661042944785276e-06, "objective/entropy": 146.78701782226562, "objective/kl": 21.866727828979492, "objective/non_score_reward": -2.1866729259490967, "objective/rlhf_reward": -8.346691688895227, "objective/scores": 0.1, "policy/approxkl_avg": 42.53478240966797, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6788499355316162, "step": 1768, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9997270107269287 }, { "episode": 28320, "epoch": 0.5090412337779056, "loss/policy_avg": 0.11605388671159744, "lr": 2.660851226993865e-06, "objective/entropy": 204.34317016601562, "objective/kl": 21.220643997192383, "objective/non_score_reward": -2.1220645904541016, "objective/rlhf_reward": -6.365551712290321, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 57.263092041015625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5966320037841797, "step": 1769, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9974641799926758 }, { "episode": 28336, "epoch": 0.5093288276952942, "loss/policy_avg": 0.058426156640052795, "lr": 2.6606595092024538e-06, "objective/entropy": 119.5937271118164, "objective/kl": 10.65241527557373, "objective/non_score_reward": -1.0652415752410889, "objective/rlhf_reward": -3.860966151952743, "objective/scores": 0.1, "policy/approxkl_avg": 14.677213668823242, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6696159243583679, "step": 1770, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0011467933654785 }, { "episode": 28352, "epoch": 0.509616421612683, "loss/policy_avg": 0.2700793743133545, "lr": 2.660467791411043e-06, "objective/entropy": 34.39029312133789, "objective/kl": 17.694852828979492, "objective/non_score_reward": -1.7694852352142334, "objective/rlhf_reward": -4.6779410302639, "objective/scores": 0.6, "policy/approxkl_avg": 42.89826965332031, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.44230255484580994, "step": 1771, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0011649131774902 }, { "episode": 28368, "epoch": 0.5099040155300716, "loss/policy_avg": 0.22650663554668427, "lr": 2.660276073619632e-06, "objective/entropy": 62.46409606933594, "objective/kl": 18.707399368286133, "objective/non_score_reward": -1.8707401752471924, "objective/rlhf_reward": -7.082960522174835, "objective/scores": 0.1, "policy/approxkl_avg": 122.68310546875, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.4607383608818054, "step": 1772, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000152826309204 }, { "episode": 28384, "epoch": 0.5101916094474602, "loss/policy_avg": 0.9882694482803345, "lr": 2.660084355828221e-06, "objective/entropy": 185.96481323242188, "objective/kl": 18.96381378173828, "objective/non_score_reward": -1.8963813781738281, "objective/rlhf_reward": -3.185525274276733, "objective/scores": 1.1, "policy/approxkl_avg": 50.954315185546875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8979307413101196, "step": 1773, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9962632656097412 }, { "episode": 28400, "epoch": 0.5104792033648489, "loss/policy_avg": -0.10225933790206909, "lr": 2.65989263803681e-06, "objective/entropy": -2.4116439819335938, "objective/kl": 10.670534133911133, "objective/non_score_reward": -1.0670535564422607, "objective/rlhf_reward": -1.868213868141174, "objective/scores": 0.6, "policy/approxkl_avg": 35.16023254394531, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.62814861536026, "step": 1774, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0009822845458984 }, { "episode": 28416, "epoch": 0.5107667972822375, "loss/policy_avg": 0.24588751792907715, "lr": 2.6597009202453987e-06, "objective/entropy": 126.577880859375, "objective/kl": 14.00534439086914, "objective/non_score_reward": -1.4005343914031982, "objective/rlhf_reward": -2.6784184023153514, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 56.3424072265625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.689423143863678, "step": 1775, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9982950687408447 }, { "episode": 28432, "epoch": 0.5110543911996261, "loss/policy_avg": 0.263106107711792, "lr": 2.659509202453988e-06, "objective/entropy": -146.71322631835938, "objective/kl": 18.640098571777344, "objective/non_score_reward": -1.8640098571777344, "objective/rlhf_reward": -9.456039428710938, "objective/scores": -0.5, "policy/approxkl_avg": 43.64714050292969, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.46649169921875, "step": 1776, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.997348666191101 }, { "episode": 28448, "epoch": 0.5113419851170148, "loss/policy_avg": -0.28649938106536865, "lr": 2.6593174846625767e-06, "objective/entropy": -46.71950149536133, "objective/kl": 15.516162872314453, "objective/non_score_reward": -1.5516164302825928, "objective/rlhf_reward": -1.8064654231071469, "objective/scores": 1.1, "policy/approxkl_avg": 18.319168090820312, "policy/clipfrac_avg": 2.0, "policy/entropy_avg": 0.71426922082901, "step": 1777, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.006796360015869 }, { "episode": 28464, "epoch": 0.5116295790344034, "loss/policy_avg": 0.12089434266090393, "lr": 2.6591257668711655e-06, "objective/entropy": 116.71041107177734, "objective/kl": 16.0557804107666, "objective/non_score_reward": -1.605578064918518, "objective/rlhf_reward": -6.022312170267105, "objective/scores": 0.1, "policy/approxkl_avg": 35.31344985961914, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5637420415878296, "step": 1778, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9995025396347046 }, { "episode": 28480, "epoch": 0.5119171729517921, "loss/policy_avg": 0.5408236980438232, "lr": 2.6589340490797547e-06, "objective/entropy": -41.114479064941406, "objective/kl": 13.265435218811035, "objective/non_score_reward": -1.3265435695648193, "objective/rlhf_reward": -4.906174151599407, "objective/scores": 0.1, "policy/approxkl_avg": 13.033656120300293, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.48320022225379944, "step": 1779, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000502586364746 }, { "episode": 28496, "epoch": 0.5122047668691807, "loss/policy_avg": 0.5629329681396484, "lr": 2.6587423312883435e-06, "objective/entropy": 25.448585510253906, "objective/kl": 10.900973320007324, "objective/non_score_reward": -1.090097427368164, "objective/rlhf_reward": -1.9603894189000126, "objective/scores": 0.6, "policy/approxkl_avg": 29.427316665649414, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7866188287734985, "step": 1780, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9992313385009766 }, { "episode": 28512, "epoch": 0.5124923607865693, "loss/policy_avg": 0.3418985903263092, "lr": 2.6585506134969328e-06, "objective/entropy": 77.93727111816406, "objective/kl": 11.338996887207031, "objective/non_score_reward": -1.1338996887207031, "objective/rlhf_reward": -1.6118795021784034, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 4.558671951293945, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.4622589349746704, "step": 1781, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.005500555038452 }, { "episode": 28528, "epoch": 0.512779954703958, "loss/policy_avg": -0.2597534954547882, "lr": 2.6583588957055216e-06, "objective/entropy": 211.84303283691406, "objective/kl": 15.383636474609375, "objective/non_score_reward": -1.5383635759353638, "objective/rlhf_reward": -3.2297350808393688, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 13.554239273071289, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7210692167282104, "step": 1782, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0022835731506348 }, { "episode": 28544, "epoch": 0.5130675486213466, "loss/policy_avg": 0.6771409511566162, "lr": 2.6581671779141104e-06, "objective/entropy": 82.52877044677734, "objective/kl": 16.593626022338867, "objective/non_score_reward": -1.659362554550171, "objective/rlhf_reward": -6.237450411915779, "objective/scores": 0.1, "policy/approxkl_avg": 18.595134735107422, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7311805486679077, "step": 1783, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9988240003585815 }, { "episode": 28560, "epoch": 0.5133551425387353, "loss/policy_avg": 0.5855733156204224, "lr": 2.6579754601226996e-06, "objective/entropy": 78.42135620117188, "objective/kl": 16.883634567260742, "objective/non_score_reward": -1.6883635520935059, "objective/rlhf_reward": -6.353453657031059, "objective/scores": 0.1, "policy/approxkl_avg": 23.954368591308594, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7775710821151733, "step": 1784, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9976449012756348 }, { "episode": 28576, "epoch": 0.5136427364561239, "loss/policy_avg": -0.47379228472709656, "lr": 2.6577837423312884e-06, "objective/entropy": 183.82095336914062, "objective/kl": 17.60445213317871, "objective/non_score_reward": -1.7604451179504395, "objective/rlhf_reward": -2.6417807102203366, "objective/scores": 1.1, "policy/approxkl_avg": 16.96839141845703, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.4322192072868347, "step": 1785, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.000903844833374 }, { "episode": 28592, "epoch": 0.5139303303735125, "loss/policy_avg": 0.040809325873851776, "lr": 2.6575920245398777e-06, "objective/entropy": 219.00308227539062, "objective/kl": 18.765731811523438, "objective/non_score_reward": -1.8765733242034912, "objective/rlhf_reward": -3.1062932968139645, "objective/scores": 1.1, "policy/approxkl_avg": 92.84384155273438, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5429558753967285, "step": 1786, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9964911937713623 }, { "episode": 28608, "epoch": 0.5142179242909013, "loss/policy_avg": 0.05377146229147911, "lr": 2.6574003067484665e-06, "objective/entropy": 90.84602355957031, "objective/kl": 20.2624568939209, "objective/non_score_reward": -2.026245594024658, "objective/rlhf_reward": -7.704982435703277, "objective/scores": 0.1, "policy/approxkl_avg": 193.996337890625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7286136150360107, "step": 1787, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.996244192123413 }, { "episode": 28624, "epoch": 0.5145055182082899, "loss/policy_avg": -0.007620755583047867, "lr": 2.6572085889570553e-06, "objective/entropy": -268.669921875, "objective/kl": 12.50502872467041, "objective/non_score_reward": -1.2505029439926147, "objective/rlhf_reward": -0.6020117759704586, "objective/scores": 1.1, "policy/approxkl_avg": 10.24152946472168, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6491585373878479, "step": 1788, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9990335702896118 }, { "episode": 28640, "epoch": 0.5147931121256786, "loss/policy_avg": 0.09260250627994537, "lr": 2.6570168711656445e-06, "objective/entropy": -22.395694732666016, "objective/kl": 14.976754188537598, "objective/non_score_reward": -1.4976755380630493, "objective/rlhf_reward": -5.590702152252197, "objective/scores": 0.1, "policy/approxkl_avg": 34.219722747802734, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5158121585845947, "step": 1789, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9984734058380127 }, { "episode": 28656, "epoch": 0.5150807060430672, "loss/policy_avg": 0.1599373072385788, "lr": 2.656825153374233e-06, "objective/entropy": -151.83506774902344, "objective/kl": 13.787343978881836, "objective/non_score_reward": -1.3787343502044678, "objective/rlhf_reward": -1.1149376392364498, "objective/scores": 1.1, "policy/approxkl_avg": 2.0728182792663574, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6705546379089355, "step": 1790, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0042343139648438 }, { "episode": 28672, "epoch": 0.5153682999604559, "loss/policy_avg": 0.5093972086906433, "lr": 2.656633435582822e-06, "objective/entropy": 189.69818115234375, "objective/kl": 14.04090690612793, "objective/non_score_reward": -1.4040907621383667, "objective/rlhf_reward": -7.616363048553467, "objective/scores": -0.5, "policy/approxkl_avg": 74.02149963378906, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7557144165039062, "step": 1791, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0006518363952637 }, { "episode": 28688, "epoch": 0.5156558938778445, "loss/policy_avg": 0.37555861473083496, "lr": 2.656441717791411e-06, "objective/entropy": -76.02454376220703, "objective/kl": 19.386255264282227, "objective/non_score_reward": -1.938625693321228, "objective/rlhf_reward": -6.198243289199427, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 76.97025299072266, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6998109817504883, "step": 1792, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998724341392517 }, { "episode": 28704, "epoch": 0.5159434877952331, "loss/policy_avg": 0.16148006916046143, "lr": 2.6562499999999998e-06, "objective/entropy": 62.66816329956055, "objective/kl": 13.42384147644043, "objective/non_score_reward": -1.3423840999603271, "objective/rlhf_reward": -7.369536399841309, "objective/scores": -0.5, "policy/approxkl_avg": 58.304691314697266, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5450599789619446, "step": 1793, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999281644821167 }, { "episode": 28720, "epoch": 0.5162310817126218, "loss/policy_avg": 0.27762842178344727, "lr": 2.656058282208589e-06, "objective/entropy": 103.40982055664062, "objective/kl": 12.799996376037598, "objective/non_score_reward": -1.2799994945526123, "objective/rlhf_reward": -4.719998276233673, "objective/scores": 0.1, "policy/approxkl_avg": 4.410861015319824, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.637082576751709, "step": 1794, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999462366104126 }, { "episode": 28736, "epoch": 0.5165186756300104, "loss/policy_avg": 0.409230500459671, "lr": 2.655866564417178e-06, "objective/entropy": 232.2705841064453, "objective/kl": 22.624011993408203, "objective/non_score_reward": -2.2624013423919678, "objective/rlhf_reward": -6.92689901806501, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 48.6843147277832, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6138824820518494, "step": 1795, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9988563060760498 }, { "episode": 28752, "epoch": 0.516806269547399, "loss/policy_avg": 0.7418479919433594, "lr": 2.655674846625767e-06, "objective/entropy": -84.00031280517578, "objective/kl": 14.806282043457031, "objective/non_score_reward": -1.480628252029419, "objective/rlhf_reward": -1.5225130677223202, "objective/scores": 1.1, "policy/approxkl_avg": 149.33465576171875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6433936953544617, "step": 1796, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.997800350189209 }, { "episode": 28768, "epoch": 0.5170938634647877, "loss/policy_avg": 0.217472642660141, "lr": 2.655483128834356e-06, "objective/entropy": -59.4697380065918, "objective/kl": 17.53496551513672, "objective/non_score_reward": -1.7534964084625244, "objective/rlhf_reward": -4.6139857232570645, "objective/scores": 0.6, "policy/approxkl_avg": 69.00981140136719, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.49576112627983093, "step": 1797, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9981770515441895 }, { "episode": 28784, "epoch": 0.5173814573821763, "loss/policy_avg": 0.26614606380462646, "lr": 2.6552914110429447e-06, "objective/entropy": 188.75660705566406, "objective/kl": 11.638067245483398, "objective/non_score_reward": -1.1638067960739136, "objective/rlhf_reward": -4.25522700548172, "objective/scores": 0.1, "policy/approxkl_avg": 58.32883071899414, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7997337579727173, "step": 1798, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9977247714996338 }, { "episode": 28800, "epoch": 0.517669051299565, "loss/policy_avg": 0.12947332859039307, "lr": 2.655099693251534e-06, "objective/entropy": 151.32919311523438, "objective/kl": 15.166569709777832, "objective/non_score_reward": -1.5166568756103516, "objective/rlhf_reward": -5.666627234220504, "objective/scores": 0.1, "policy/approxkl_avg": 11.051401138305664, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.699778139591217, "step": 1799, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9994456768035889 }, { "episode": 28816, "epoch": 0.5179566452169536, "loss/policy_avg": -0.002175837755203247, "lr": 2.6549079754601227e-06, "objective/entropy": 35.160484313964844, "objective/kl": 21.300491333007812, "objective/non_score_reward": -2.130049228668213, "objective/rlhf_reward": -10.520196914672852, "objective/scores": -0.5, "policy/approxkl_avg": 30.026445388793945, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5395582914352417, "step": 1800, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9978394508361816 }, { "episode": 28832, "epoch": 0.5182442391343423, "loss/policy_avg": 0.16893471777439117, "lr": 2.654716257668712e-06, "objective/entropy": 155.825439453125, "objective/kl": 15.55232048034668, "objective/non_score_reward": -1.555232048034668, "objective/rlhf_reward": -1.8209277451038357, "objective/scores": 1.1, "policy/approxkl_avg": 118.18456268310547, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5876666307449341, "step": 1801, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9970424175262451 }, { "episode": 28848, "epoch": 0.518531833051731, "loss/policy_avg": 0.22040048241615295, "lr": 2.6545245398773007e-06, "objective/entropy": -31.10854721069336, "objective/kl": 15.175678253173828, "objective/non_score_reward": -1.5175678730010986, "objective/rlhf_reward": -3.9475650063910823, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 8.521119117736816, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6477307677268982, "step": 1802, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999565839767456 }, { "episode": 28864, "epoch": 0.5188194269691196, "loss/policy_avg": 0.7363486289978027, "lr": 2.6543328220858896e-06, "objective/entropy": -71.38179779052734, "objective/kl": 14.623321533203125, "objective/non_score_reward": -1.4623322486877441, "objective/rlhf_reward": -1.4493289351463314, "objective/scores": 1.1, "policy/approxkl_avg": 117.98739624023438, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7105915546417236, "step": 1803, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9983479976654053 }, { "episode": 28880, "epoch": 0.5191070208865083, "loss/policy_avg": 0.09098640829324722, "lr": 2.654141104294479e-06, "objective/entropy": 13.222919464111328, "objective/kl": 16.781892776489258, "objective/non_score_reward": -1.6781893968582153, "objective/rlhf_reward": -8.712757110595703, "objective/scores": -0.5, "policy/approxkl_avg": 107.18089294433594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6652854084968567, "step": 1804, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.000478744506836 }, { "episode": 28896, "epoch": 0.5193946148038969, "loss/policy_avg": -0.08573319762945175, "lr": 2.6539493865030676e-06, "objective/entropy": 1.4567337036132812, "objective/kl": 9.958293914794922, "objective/non_score_reward": -0.9958294630050659, "objective/rlhf_reward": 0.4166820734739307, "objective/scores": 1.1, "policy/approxkl_avg": 1.789250373840332, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.8160400390625, "step": 1805, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0015993118286133 }, { "episode": 28912, "epoch": 0.5196822087212856, "loss/policy_avg": 0.21332049369812012, "lr": 2.6537576687116564e-06, "objective/entropy": 28.27048683166504, "objective/kl": 11.810449600219727, "objective/non_score_reward": -1.1810449361801147, "objective/rlhf_reward": -2.3241798043251034, "objective/scores": 0.6, "policy/approxkl_avg": 3.751129627227783, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.644847571849823, "step": 1806, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.001784324645996 }, { "episode": 28928, "epoch": 0.5199698026386742, "loss/policy_avg": 0.2684786915779114, "lr": 2.6535659509202456e-06, "objective/entropy": -41.66497039794922, "objective/kl": 18.632709503173828, "objective/non_score_reward": -1.863270878791809, "objective/rlhf_reward": -7.0530836343765255, "objective/scores": 0.1, "policy/approxkl_avg": 50.73341751098633, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5328782796859741, "step": 1807, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9980311393737793 }, { "episode": 28944, "epoch": 0.5202573965560628, "loss/policy_avg": 0.27005735039711, "lr": 2.6533742331288345e-06, "objective/entropy": -92.68614959716797, "objective/kl": 16.1339054107666, "objective/non_score_reward": -1.6133904457092285, "objective/rlhf_reward": -4.053561812639236, "objective/scores": 0.6, "policy/approxkl_avg": 75.5761489868164, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.435627818107605, "step": 1808, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.997697114944458 }, { "episode": 28960, "epoch": 0.5205449904734515, "loss/policy_avg": 0.47697752714157104, "lr": 2.6531825153374237e-06, "objective/entropy": 70.55982208251953, "objective/kl": 15.462162017822266, "objective/non_score_reward": -1.5462161302566528, "objective/rlhf_reward": -3.261145387531492, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 25.127525329589844, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5837845802307129, "step": 1809, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9995059967041016 }, { "episode": 28976, "epoch": 0.5208325843908401, "loss/policy_avg": 0.0944967269897461, "lr": 2.6529907975460125e-06, "objective/entropy": -106.81289672851562, "objective/kl": 12.209148406982422, "objective/non_score_reward": -1.2209150791168213, "objective/rlhf_reward": -1.9599410190593927, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 1.313828945159912, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5029849410057068, "step": 1810, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0012412071228027 }, { "episode": 28992, "epoch": 0.5211201783082288, "loss/policy_avg": 0.16460567712783813, "lr": 2.6527990797546013e-06, "objective/entropy": -46.76816940307617, "objective/kl": 13.290376663208008, "objective/non_score_reward": -1.3290376663208008, "objective/rlhf_reward": -4.916150665283203, "objective/scores": 0.1, "policy/approxkl_avg": 43.19758987426758, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6998944282531738, "step": 1811, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9993230104446411 }, { "episode": 29008, "epoch": 0.5214077722256174, "loss/policy_avg": 0.09649604558944702, "lr": 2.65260736196319e-06, "objective/entropy": -97.85505676269531, "objective/kl": 13.006240844726562, "objective/non_score_reward": -1.3006240129470825, "objective/rlhf_reward": -4.802495850622654, "objective/scores": 0.1, "policy/approxkl_avg": 14.20913314819336, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6500852108001709, "step": 1812, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0012776851654053 }, { "episode": 29024, "epoch": 0.521695366143006, "loss/policy_avg": 0.13328894972801208, "lr": 2.652415644171779e-06, "objective/entropy": 2.2023277282714844, "objective/kl": 10.00713062286377, "objective/non_score_reward": -1.0007131099700928, "objective/rlhf_reward": -6.002852439880371, "objective/scores": -0.5, "policy/approxkl_avg": 13.523300170898438, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6957378387451172, "step": 1813, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.99851655960083 }, { "episode": 29040, "epoch": 0.5219829600603947, "loss/policy_avg": 0.1445261836051941, "lr": 2.652223926380368e-06, "objective/entropy": -19.08232879638672, "objective/kl": 10.624837875366211, "objective/non_score_reward": -1.0624839067459106, "objective/rlhf_reward": -2.6936763813167364, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 14.453914642333984, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7210265398025513, "step": 1814, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9996994733810425 }, { "episode": 29056, "epoch": 0.5222705539777833, "loss/policy_avg": -0.1162421777844429, "lr": 2.652032208588957e-06, "objective/entropy": 148.0369873046875, "objective/kl": 11.403135299682617, "objective/non_score_reward": -1.1403135061264038, "objective/rlhf_reward": -2.1612541288137432, "objective/scores": 0.6, "policy/approxkl_avg": 45.55539321899414, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6635168194770813, "step": 1815, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.039712429046631 }, { "episode": 29072, "epoch": 0.522558147895172, "loss/policy_avg": 0.2675674557685852, "lr": 2.6518404907975458e-06, "objective/entropy": -52.286190032958984, "objective/kl": 18.950439453125, "objective/non_score_reward": -1.8950438499450684, "objective/rlhf_reward": -5.7553467109528285, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 27.757678985595703, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6635863780975342, "step": 1816, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9989075660705566 }, { "episode": 29088, "epoch": 0.5228457418125607, "loss/policy_avg": 0.2123212069272995, "lr": 2.651648773006135e-06, "objective/entropy": -30.773658752441406, "objective/kl": 13.214252471923828, "objective/non_score_reward": -1.321425199508667, "objective/rlhf_reward": -7.285700798034668, "objective/scores": -0.5, "policy/approxkl_avg": 10.348447799682617, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6751729846000671, "step": 1817, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9963544607162476 }, { "episode": 29104, "epoch": 0.5231333357299494, "loss/policy_avg": 1.2047995328903198, "lr": 2.651457055214724e-06, "objective/entropy": 261.843505859375, "objective/kl": 20.371292114257812, "objective/non_score_reward": -2.0371291637420654, "objective/rlhf_reward": -7.74851701259613, "objective/scores": 0.1, "policy/approxkl_avg": 24.547252655029297, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.940751314163208, "step": 1818, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9997062683105469 }, { "episode": 29120, "epoch": 0.523420929647338, "loss/policy_avg": 0.37461650371551514, "lr": 2.651265337423313e-06, "objective/entropy": -122.65673828125, "objective/kl": 16.444629669189453, "objective/non_score_reward": -1.6444628238677979, "objective/rlhf_reward": -5.021592318025187, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 67.11688995361328, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5965983867645264, "step": 1819, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9979946613311768 }, { "episode": 29136, "epoch": 0.5237085235647266, "loss/policy_avg": -0.005302314180880785, "lr": 2.651073619631902e-06, "objective/entropy": 36.619300842285156, "objective/kl": 19.354164123535156, "objective/non_score_reward": -1.9354164600372314, "objective/rlhf_reward": -3.341666108369827, "objective/scores": 1.1, "policy/approxkl_avg": 4.855372428894043, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6970053911209106, "step": 1820, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0006046295166016 }, { "episode": 29152, "epoch": 0.5239961174821153, "loss/policy_avg": 0.0998329371213913, "lr": 2.6508819018404907e-06, "objective/entropy": 48.56463623046875, "objective/kl": 16.415325164794922, "objective/non_score_reward": -1.64153254032135, "objective/rlhf_reward": -6.166130459308624, "objective/scores": 0.1, "policy/approxkl_avg": 26.19106674194336, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7929558753967285, "step": 1821, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9998055696487427 }, { "episode": 29168, "epoch": 0.5242837113995039, "loss/policy_avg": 1.2181178331375122, "lr": 2.65069018404908e-06, "objective/entropy": 29.835037231445312, "objective/kl": 14.585583686828613, "objective/non_score_reward": -1.4585583209991455, "objective/rlhf_reward": -1.4342335820198056, "objective/scores": 1.1, "policy/approxkl_avg": 46.434059143066406, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6647939682006836, "step": 1822, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9998629093170166 }, { "episode": 29184, "epoch": 0.5245713053168926, "loss/policy_avg": -0.3791189193725586, "lr": 2.6504984662576687e-06, "objective/entropy": -51.36210250854492, "objective/kl": 11.42615795135498, "objective/non_score_reward": -1.1426159143447876, "objective/rlhf_reward": -1.6467445834886758, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 5.951854705810547, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4953696131706238, "step": 1823, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.003664970397949 }, { "episode": 29200, "epoch": 0.5248588992342812, "loss/policy_avg": 0.2820661962032318, "lr": 2.650306748466258e-06, "objective/entropy": 51.438140869140625, "objective/kl": 12.149774551391602, "objective/non_score_reward": -1.214977502822876, "objective/rlhf_reward": -4.4599096983671185, "objective/scores": 0.1, "policy/approxkl_avg": 31.368728637695312, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.477517306804657, "step": 1824, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998262643814087 }, { "episode": 29216, "epoch": 0.5251464931516698, "loss/policy_avg": -0.019476689398288727, "lr": 2.6501150306748468e-06, "objective/entropy": -204.80804443359375, "objective/kl": 12.380779266357422, "objective/non_score_reward": -1.238077998161316, "objective/rlhf_reward": -0.5523121416568753, "objective/scores": 1.1, "policy/approxkl_avg": 85.80036163330078, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.704627275466919, "step": 1825, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0015084743499756 }, { "episode": 29232, "epoch": 0.5254340870690585, "loss/policy_avg": -0.14664429426193237, "lr": 2.6499233128834356e-06, "objective/entropy": 72.73933410644531, "objective/kl": 15.728586196899414, "objective/non_score_reward": -1.5728585720062256, "objective/rlhf_reward": -8.291434288024902, "objective/scores": -0.5, "policy/approxkl_avg": 22.642614364624023, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.8739404082298279, "step": 1826, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0036849975585938 }, { "episode": 29248, "epoch": 0.5257216809864471, "loss/policy_avg": 0.25007522106170654, "lr": 2.649731595092025e-06, "objective/entropy": -111.04607391357422, "objective/kl": 9.555280685424805, "objective/non_score_reward": -0.9555281400680542, "objective/rlhf_reward": -1.422112500667572, "objective/scores": 0.6, "policy/approxkl_avg": 7.105292320251465, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6159435510635376, "step": 1827, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9995931386947632 }, { "episode": 29264, "epoch": 0.5260092749038358, "loss/policy_avg": 0.14585937559604645, "lr": 2.6495398773006136e-06, "objective/entropy": 27.570262908935547, "objective/kl": 17.47567367553711, "objective/non_score_reward": -1.7475672960281372, "objective/rlhf_reward": -8.99026870727539, "objective/scores": -0.5, "policy/approxkl_avg": 36.218692779541016, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5340030193328857, "step": 1828, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9987754821777344 }, { "episode": 29280, "epoch": 0.5262968688212244, "loss/policy_avg": 0.6246511936187744, "lr": 2.6493481595092024e-06, "objective/entropy": -51.908145904541016, "objective/kl": 16.57758140563965, "objective/non_score_reward": -1.6577579975128174, "objective/rlhf_reward": -8.63103199005127, "objective/scores": -0.5, "policy/approxkl_avg": 27.057701110839844, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5691357254981995, "step": 1829, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9991605281829834 }, { "episode": 29296, "epoch": 0.526584462738613, "loss/policy_avg": 0.12075556814670563, "lr": 2.6491564417177916e-06, "objective/entropy": 76.26769256591797, "objective/kl": 14.916351318359375, "objective/non_score_reward": -1.4916352033615112, "objective/rlhf_reward": -5.566540813446045, "objective/scores": 0.1, "policy/approxkl_avg": 17.57536506652832, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6550225019454956, "step": 1830, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0020956993103027 }, { "episode": 29312, "epoch": 0.5268720566560017, "loss/policy_avg": 0.2506754696369171, "lr": 2.6489647239263805e-06, "objective/entropy": -35.4891242980957, "objective/kl": 14.797635078430176, "objective/non_score_reward": -1.4797635078430176, "objective/rlhf_reward": -5.519054269790649, "objective/scores": 0.1, "policy/approxkl_avg": 36.26906204223633, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7706562280654907, "step": 1831, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9965496063232422 }, { "episode": 29328, "epoch": 0.5271596505733904, "loss/policy_avg": 0.1543758064508438, "lr": 2.6487730061349697e-06, "objective/entropy": 67.9344482421875, "objective/kl": 13.906883239746094, "objective/non_score_reward": -1.390688419342041, "objective/rlhf_reward": -1.1627536773681637, "objective/scores": 1.1, "policy/approxkl_avg": 70.80870056152344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6718401908874512, "step": 1832, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9979827404022217 }, { "episode": 29344, "epoch": 0.5274472444907791, "loss/policy_avg": 0.05845007672905922, "lr": 2.6485812883435585e-06, "objective/entropy": -174.62997436523438, "objective/kl": 13.782123565673828, "objective/non_score_reward": -1.3782124519348145, "objective/rlhf_reward": -2.5891307338487834, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 21.229196548461914, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8168323040008545, "step": 1833, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.9980207681655884 }, { "episode": 29360, "epoch": 0.5277348384081677, "loss/policy_avg": 0.12811380624771118, "lr": 2.6483895705521473e-06, "objective/entropy": -122.46810913085938, "objective/kl": 18.168519973754883, "objective/non_score_reward": -1.8168519735336304, "objective/rlhf_reward": -6.867408013343811, "objective/scores": 0.1, "policy/approxkl_avg": 35.205116271972656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6772712469100952, "step": 1834, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9982894659042358 }, { "episode": 29376, "epoch": 0.5280224323255563, "loss/policy_avg": 0.10961230099201202, "lr": 2.648197852760736e-06, "objective/entropy": 32.48823547363281, "objective/kl": 21.051025390625, "objective/non_score_reward": -2.1051025390625, "objective/rlhf_reward": -10.42041015625, "objective/scores": -0.5, "policy/approxkl_avg": 6.921422481536865, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8856111168861389, "step": 1835, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0003602504730225 }, { "episode": 29392, "epoch": 0.528310026242945, "loss/policy_avg": 0.2646952271461487, "lr": 2.648006134969325e-06, "objective/entropy": 106.65617370605469, "objective/kl": 18.93328857421875, "objective/non_score_reward": -1.8933287858963013, "objective/rlhf_reward": -4.649596397520277, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 219.3662567138672, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4539373517036438, "step": 1836, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.997753620147705 }, { "episode": 29408, "epoch": 0.5285976201603336, "loss/policy_avg": 0.18501770496368408, "lr": 2.647814417177914e-06, "objective/entropy": -203.39031982421875, "objective/kl": 13.033735275268555, "objective/non_score_reward": -1.3033735752105713, "objective/rlhf_reward": -4.813494479656219, "objective/scores": 0.1, "policy/approxkl_avg": 58.33476638793945, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5079236030578613, "step": 1837, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9998493194580078 }, { "episode": 29424, "epoch": 0.5288852140777223, "loss/policy_avg": 0.2593972384929657, "lr": 2.647622699386503e-06, "objective/entropy": 169.56747436523438, "objective/kl": 20.271284103393555, "objective/non_score_reward": -2.0271284580230713, "objective/rlhf_reward": -10.108513832092285, "objective/scores": -0.5, "policy/approxkl_avg": 38.37139129638672, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6737958192825317, "step": 1838, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9973093271255493 }, { "episode": 29440, "epoch": 0.5291728079951109, "loss/policy_avg": 0.15093323588371277, "lr": 2.647430981595092e-06, "objective/entropy": 173.2559814453125, "objective/kl": 12.594919204711914, "objective/non_score_reward": -1.2594919204711914, "objective/rlhf_reward": -4.637967845797538, "objective/scores": 0.1, "policy/approxkl_avg": 21.31730842590332, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.664380669593811, "step": 1839, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000157594680786 }, { "episode": 29456, "epoch": 0.5294604019124995, "loss/policy_avg": 0.09447266906499863, "lr": 2.647239263803681e-06, "objective/entropy": 161.65419006347656, "objective/kl": 8.186479568481445, "objective/non_score_reward": -0.8186479806900024, "objective/rlhf_reward": -2.874591892957687, "objective/scores": 0.1, "policy/approxkl_avg": 23.823326110839844, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8956143856048584, "step": 1840, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.996443748474121 }, { "episode": 29472, "epoch": 0.5297479958298882, "loss/policy_avg": 0.24017485976219177, "lr": 2.64704754601227e-06, "objective/entropy": 194.16900634765625, "objective/kl": 16.956283569335938, "objective/non_score_reward": -1.6956284046173096, "objective/rlhf_reward": -8.782513618469238, "objective/scores": -0.5, "policy/approxkl_avg": 30.098064422607422, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7093454599380493, "step": 1841, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9990488290786743 }, { "episode": 29488, "epoch": 0.5300355897472768, "loss/policy_avg": 0.23009948432445526, "lr": 2.646855828220859e-06, "objective/entropy": 191.48776245117188, "objective/kl": 8.397547721862793, "objective/non_score_reward": -0.8397548198699951, "objective/rlhf_reward": -2.9590191602706906, "objective/scores": 0.1, "policy/approxkl_avg": 93.32293701171875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.46769386529922485, "step": 1842, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9982507228851318 }, { "episode": 29504, "epoch": 0.5303231836646655, "loss/policy_avg": 0.31807905435562134, "lr": 2.646664110429448e-06, "objective/entropy": 190.9818115234375, "objective/kl": 17.607507705688477, "objective/non_score_reward": -1.7607507705688477, "objective/rlhf_reward": -2.6430031716823574, "objective/scores": 1.1, "policy/approxkl_avg": 27.155506134033203, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6137632727622986, "step": 1843, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9971423149108887 }, { "episode": 29520, "epoch": 0.5306107775820541, "loss/policy_avg": 0.5301668643951416, "lr": 2.6464723926380367e-06, "objective/entropy": 81.72561645507812, "objective/kl": 16.590045928955078, "objective/non_score_reward": -1.659004807472229, "objective/rlhf_reward": -6.236018991470337, "objective/scores": 0.1, "policy/approxkl_avg": 23.18244743347168, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8008009195327759, "step": 1844, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9989930391311646 }, { "episode": 29536, "epoch": 0.5308983714994427, "loss/policy_avg": -0.15928786993026733, "lr": 2.646280674846626e-06, "objective/entropy": 124.23045349121094, "objective/kl": 16.684246063232422, "objective/non_score_reward": -1.6684244871139526, "objective/rlhf_reward": -4.726286898331578, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 19.319072723388672, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7122434377670288, "step": 1845, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.003610134124756 }, { "episode": 29552, "epoch": 0.5311859654168314, "loss/policy_avg": 0.3252880573272705, "lr": 2.6460889570552147e-06, "objective/entropy": 95.69757843017578, "objective/kl": 17.60025405883789, "objective/non_score_reward": -1.7600253820419312, "objective/rlhf_reward": -5.092690105514462, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 82.03265380859375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.771241307258606, "step": 1846, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9977837800979614 }, { "episode": 29568, "epoch": 0.5314735593342201, "loss/policy_avg": 0.23750726878643036, "lr": 2.645897239263804e-06, "objective/entropy": 186.9475555419922, "objective/kl": 13.092645645141602, "objective/non_score_reward": -1.3092646598815918, "objective/rlhf_reward": -4.837058520317077, "objective/scores": 0.1, "policy/approxkl_avg": 6.29829216003418, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6885484457015991, "step": 1847, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0034914016723633 }, { "episode": 29584, "epoch": 0.5317611532516088, "loss/policy_avg": 0.2734951376914978, "lr": 2.6457055214723928e-06, "objective/entropy": -124.91612243652344, "objective/kl": 15.988296508789062, "objective/non_score_reward": -1.5988296270370483, "objective/rlhf_reward": -8.395318031311035, "objective/scores": -0.5, "policy/approxkl_avg": 5.176590442657471, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7133872509002686, "step": 1848, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998915672302246 }, { "episode": 29600, "epoch": 0.5320487471689974, "loss/policy_avg": 0.6326997876167297, "lr": 2.6455138036809816e-06, "objective/entropy": 102.43971252441406, "objective/kl": 14.527070999145508, "objective/non_score_reward": -1.452707052230835, "objective/rlhf_reward": -3.863417278008397, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 13.749048233032227, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6314961910247803, "step": 1849, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9977028369903564 }, { "episode": 29616, "epoch": 0.5323363410863861, "loss/policy_avg": 0.018764328211545944, "lr": 2.645322085889571e-06, "objective/entropy": 173.0929718017578, "objective/kl": 13.599279403686523, "objective/non_score_reward": -1.359928011894226, "objective/rlhf_reward": -7.439712047576904, "objective/scores": -0.5, "policy/approxkl_avg": 26.08881187438965, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6948906779289246, "step": 1850, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9987165927886963 }, { "episode": 29632, "epoch": 0.5326239350037747, "loss/policy_avg": 0.8213224411010742, "lr": 2.6451303680981596e-06, "objective/entropy": 84.24604797363281, "objective/kl": 19.240615844726562, "objective/non_score_reward": -1.92406165599823, "objective/rlhf_reward": -6.092126611534672, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 32.59326171875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7166078090667725, "step": 1851, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9984686374664307 }, { "episode": 29648, "epoch": 0.5329115289211633, "loss/policy_avg": 0.20839475095272064, "lr": 2.644938650306749e-06, "objective/entropy": 143.82113647460938, "objective/kl": 16.232925415039062, "objective/non_score_reward": -1.6232926845550537, "objective/rlhf_reward": -2.093170380592346, "objective/scores": 1.1, "policy/approxkl_avg": 14.679071426391602, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7232511043548584, "step": 1852, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9998900890350342 }, { "episode": 29664, "epoch": 0.533199122838552, "loss/policy_avg": 0.07763684540987015, "lr": 2.6447469325153377e-06, "objective/entropy": -36.77897262573242, "objective/kl": 16.796016693115234, "objective/non_score_reward": -1.6796014308929443, "objective/rlhf_reward": -6.318406081199646, "objective/scores": 0.1, "policy/approxkl_avg": 42.441017150878906, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6772186756134033, "step": 1853, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9989824295043945 }, { "episode": 29680, "epoch": 0.5334867167559406, "loss/policy_avg": 0.025462936609983444, "lr": 2.6445552147239265e-06, "objective/entropy": 132.22561645507812, "objective/kl": 9.946213722229004, "objective/non_score_reward": -0.9946214556694031, "objective/rlhf_reward": -3.5784857928752896, "objective/scores": 0.1, "policy/approxkl_avg": 6.1699419021606445, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5213096737861633, "step": 1854, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9997193813323975 }, { "episode": 29696, "epoch": 0.5337743106733293, "loss/policy_avg": 0.8330253958702087, "lr": 2.6443634969325157e-06, "objective/entropy": 294.1936950683594, "objective/kl": 16.371532440185547, "objective/non_score_reward": -1.6371533870697021, "objective/rlhf_reward": -8.548613548278809, "objective/scores": -0.5, "policy/approxkl_avg": 9.149263381958008, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.9009605050086975, "step": 1855, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9972525835037231 }, { "episode": 29712, "epoch": 0.5340619045907179, "loss/policy_avg": 0.07781802117824554, "lr": 2.644171779141104e-06, "objective/entropy": 165.48971557617188, "objective/kl": 18.383617401123047, "objective/non_score_reward": -1.8383618593215942, "objective/rlhf_reward": -6.953447675704956, "objective/scores": 0.1, "policy/approxkl_avg": 14.224011421203613, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.9337248802185059, "step": 1856, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9977425336837769 }, { "episode": 29728, "epoch": 0.5343494985081065, "loss/policy_avg": 0.3984661102294922, "lr": 2.6439800613496933e-06, "objective/entropy": 0.33826446533203125, "objective/kl": 11.344948768615723, "objective/non_score_reward": -1.1344949007034302, "objective/rlhf_reward": -2.4152732513108592, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 17.914337158203125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7256457805633545, "step": 1857, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9982662200927734 }, { "episode": 29744, "epoch": 0.5346370924254952, "loss/policy_avg": 0.19412755966186523, "lr": 2.643788343558282e-06, "objective/entropy": 99.91259765625, "objective/kl": 22.232093811035156, "objective/non_score_reward": -2.2232093811035156, "objective/rlhf_reward": -10.892837524414062, "objective/scores": -0.5, "policy/approxkl_avg": 46.14105987548828, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7228943109512329, "step": 1858, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999452829360962 }, { "episode": 29760, "epoch": 0.5349246863428838, "loss/policy_avg": 0.44888734817504883, "lr": 2.643596625766871e-06, "objective/entropy": 206.21575927734375, "objective/kl": 17.497486114501953, "objective/non_score_reward": -1.749748706817627, "objective/rlhf_reward": -4.876288416163002, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 157.12306213378906, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.598418116569519, "step": 1859, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.997058629989624 }, { "episode": 29776, "epoch": 0.5352122802602725, "loss/policy_avg": 0.11433567106723785, "lr": 2.64340490797546e-06, "objective/entropy": -88.20219421386719, "objective/kl": 11.040931701660156, "objective/non_score_reward": -1.104093074798584, "objective/rlhf_reward": -2.4689614725875213, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 9.95333480834961, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6028738021850586, "step": 1860, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000257730484009 }, { "episode": 29792, "epoch": 0.5354998741776611, "loss/policy_avg": 0.45190608501434326, "lr": 2.643213190184049e-06, "objective/entropy": 223.72943115234375, "objective/kl": 17.677797317504883, "objective/non_score_reward": -1.76777982711792, "objective/rlhf_reward": -4.14740041339514, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 76.53868103027344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5580284595489502, "step": 1861, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9989244937896729 }, { "episode": 29808, "epoch": 0.5357874680950497, "loss/policy_avg": 0.5275196433067322, "lr": 2.643021472392638e-06, "objective/entropy": -49.960601806640625, "objective/kl": 15.230354309082031, "objective/non_score_reward": -1.5230354070663452, "objective/rlhf_reward": -8.092142105102539, "objective/scores": -0.5, "policy/approxkl_avg": 59.79082489013672, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7794216275215149, "step": 1862, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9985644817352295 }, { "episode": 29824, "epoch": 0.5360750620124385, "loss/policy_avg": 1.3458707332611084, "lr": 2.642829754601227e-06, "objective/entropy": -221.41986083984375, "objective/kl": 9.127204895019531, "objective/non_score_reward": -0.9127205014228821, "objective/rlhf_reward": -1.7034706575440723, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 18.279823303222656, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.560336172580719, "step": 1863, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9984030723571777 }, { "episode": 29840, "epoch": 0.5363626559298271, "loss/policy_avg": 0.23017987608909607, "lr": 2.642638036809816e-06, "objective/entropy": 99.97102355957031, "objective/kl": 18.287355422973633, "objective/non_score_reward": -1.8287353515625, "objective/rlhf_reward": -9.31494140625, "objective/scores": -0.5, "policy/approxkl_avg": 28.416362762451172, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.539484977722168, "step": 1864, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9988839626312256 }, { "episode": 29856, "epoch": 0.5366502498472158, "loss/policy_avg": 0.11260244250297546, "lr": 2.642446319018405e-06, "objective/entropy": 68.40955352783203, "objective/kl": 17.344467163085938, "objective/non_score_reward": -1.7344467639923096, "objective/rlhf_reward": -2.537786996364593, "objective/scores": 1.1, "policy/approxkl_avg": 6.906153678894043, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6735920906066895, "step": 1865, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000178813934326 }, { "episode": 29872, "epoch": 0.5369378437646044, "loss/policy_avg": 0.5199981927871704, "lr": 2.642254601226994e-06, "objective/entropy": 223.3328857421875, "objective/kl": 10.768610000610352, "objective/non_score_reward": -1.076861023902893, "objective/rlhf_reward": -6.307444095611572, "objective/scores": -0.5, "policy/approxkl_avg": 47.59469223022461, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5734953880310059, "step": 1866, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9990028142929077 }, { "episode": 29888, "epoch": 0.537225437681993, "loss/policy_avg": 0.10744701325893402, "lr": 2.6420628834355827e-06, "objective/entropy": 63.99552536010742, "objective/kl": 14.580144882202148, "objective/non_score_reward": -1.4580143690109253, "objective/rlhf_reward": -5.43205771446228, "objective/scores": 0.1, "policy/approxkl_avg": 52.58818817138672, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6608311533927917, "step": 1867, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9967219829559326 }, { "episode": 29904, "epoch": 0.5375130315993817, "loss/policy_avg": 0.44945427775382996, "lr": 2.641871165644172e-06, "objective/entropy": -281.01654052734375, "objective/kl": 14.940108299255371, "objective/non_score_reward": -1.4940109252929688, "objective/rlhf_reward": -3.5760435372591015, "objective/scores": 0.6, "policy/approxkl_avg": 74.54122924804688, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5839606523513794, "step": 1868, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9974722862243652 }, { "episode": 29920, "epoch": 0.5378006255167703, "loss/policy_avg": -0.037052758038043976, "lr": 2.6416794478527607e-06, "objective/entropy": 179.74900817871094, "objective/kl": 15.392721176147461, "objective/non_score_reward": -1.5392720699310303, "objective/rlhf_reward": -3.2333691015255184, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 66.53801727294922, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6649525165557861, "step": 1869, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.002619743347168 }, { "episode": 29936, "epoch": 0.538088219434159, "loss/policy_avg": 0.15332826972007751, "lr": 2.64148773006135e-06, "objective/entropy": 219.11968994140625, "objective/kl": 11.19294261932373, "objective/non_score_reward": -1.119294285774231, "objective/rlhf_reward": -1.553457964898321, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 17.272003173828125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5493988990783691, "step": 1870, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.000128746032715 }, { "episode": 29952, "epoch": 0.5383758133515476, "loss/policy_avg": 0.3887026011943817, "lr": 2.6412960122699388e-06, "objective/entropy": 327.54498291015625, "objective/kl": 15.907135963439941, "objective/non_score_reward": -1.5907135009765625, "objective/rlhf_reward": -8.36285400390625, "objective/scores": -0.5, "policy/approxkl_avg": 9.208090782165527, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 1.0575509071350098, "step": 1871, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9971492290496826 }, { "episode": 29968, "epoch": 0.5386634072689362, "loss/policy_avg": 0.703474760055542, "lr": 2.6411042944785276e-06, "objective/entropy": 128.798095703125, "objective/kl": 14.402985572814941, "objective/non_score_reward": -1.4402985572814941, "objective/rlhf_reward": -3.6384882948556285, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 14.528717041015625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6321216225624084, "step": 1872, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0003087520599365 }, { "episode": 29984, "epoch": 0.5389510011863249, "loss/policy_avg": 0.5478439331054688, "lr": 2.640912576687117e-06, "objective/entropy": 42.24315643310547, "objective/kl": 17.929065704345703, "objective/non_score_reward": -1.7929065227508545, "objective/rlhf_reward": -9.171626091003418, "objective/scores": -0.5, "policy/approxkl_avg": 78.95010375976562, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6490499973297119, "step": 1873, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9995921850204468 }, { "episode": 30000, "epoch": 0.5392385951037135, "loss/policy_avg": 0.30609291791915894, "lr": 2.6407208588957056e-06, "objective/entropy": 126.87837982177734, "objective/kl": 12.08336067199707, "objective/non_score_reward": -1.2083361148834229, "objective/rlhf_reward": -4.433344638347625, "objective/scores": 0.1, "policy/approxkl_avg": 78.1723403930664, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4774414300918579, "step": 1874, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9979808330535889 }, { "episode": 30016, "epoch": 0.5395261890211022, "loss/policy_avg": 0.1784551441669464, "lr": 2.640529141104295e-06, "objective/entropy": -67.91968536376953, "objective/kl": 12.234102249145508, "objective/non_score_reward": -1.2234103679656982, "objective/rlhf_reward": -3.068812485012125, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 7.247363567352295, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6835782527923584, "step": 1875, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000981330871582 }, { "episode": 30032, "epoch": 0.5398137829384908, "loss/policy_avg": 0.07866216450929642, "lr": 2.6403374233128837e-06, "objective/entropy": 31.470932006835938, "objective/kl": 18.203540802001953, "objective/non_score_reward": -1.8203542232513428, "objective/rlhf_reward": -6.881416893005371, "objective/scores": 0.1, "policy/approxkl_avg": 223.04364013671875, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6657415628433228, "step": 1876, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9984439611434937 }, { "episode": 30048, "epoch": 0.5401013768558794, "loss/policy_avg": 0.3167308568954468, "lr": 2.6401457055214725e-06, "objective/entropy": 320.1375732421875, "objective/kl": 18.73411750793457, "objective/non_score_reward": -1.8734118938446045, "objective/rlhf_reward": -3.093647754192352, "objective/scores": 1.1, "policy/approxkl_avg": 74.82015228271484, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.857011079788208, "step": 1877, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9953978061676025 }, { "episode": 30064, "epoch": 0.5403889707732682, "loss/policy_avg": 0.21011970937252045, "lr": 2.6399539877300617e-06, "objective/entropy": 133.35537719726562, "objective/kl": 14.358227729797363, "objective/non_score_reward": -1.4358227252960205, "objective/rlhf_reward": -5.343290990591049, "objective/scores": 0.1, "policy/approxkl_avg": 2.1520862579345703, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5805197954177856, "step": 1878, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999664545059204 }, { "episode": 30080, "epoch": 0.5406765646906568, "loss/policy_avg": 0.21725696325302124, "lr": 2.63976226993865e-06, "objective/entropy": 126.29757690429688, "objective/kl": 13.66500473022461, "objective/non_score_reward": -1.3665004968643188, "objective/rlhf_reward": -3.5185906989144637, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 18.468700408935547, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.32678791880607605, "step": 1879, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9992204904556274 }, { "episode": 30096, "epoch": 0.5409641586080455, "loss/policy_avg": 0.3724169135093689, "lr": 2.6395705521472393e-06, "objective/entropy": 23.84024429321289, "objective/kl": 14.622039794921875, "objective/non_score_reward": -1.4622039794921875, "objective/rlhf_reward": -7.84881591796875, "objective/scores": -0.5, "policy/approxkl_avg": 58.616756439208984, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6998543739318848, "step": 1880, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.997481107711792 }, { "episode": 30112, "epoch": 0.5412517525254341, "loss/policy_avg": 0.09946730732917786, "lr": 2.639378834355828e-06, "objective/entropy": 73.49078369140625, "objective/kl": 20.70635986328125, "objective/non_score_reward": -2.0706357955932617, "objective/rlhf_reward": -7.882543003559112, "objective/scores": 0.1, "policy/approxkl_avg": 55.02135467529297, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6006935834884644, "step": 1881, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999374508857727 }, { "episode": 30128, "epoch": 0.5415393464428228, "loss/policy_avg": 0.25386878848075867, "lr": 2.639187116564417e-06, "objective/entropy": 15.929668426513672, "objective/kl": 19.90158462524414, "objective/non_score_reward": -1.9901586771011353, "objective/rlhf_reward": -5.560634529590606, "objective/scores": 0.6, "policy/approxkl_avg": 44.23625183105469, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6773188710212708, "step": 1882, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9970426559448242 }, { "episode": 30144, "epoch": 0.5418269403602114, "loss/policy_avg": 2.0969796180725098, "lr": 2.638995398773006e-06, "objective/entropy": 52.69280242919922, "objective/kl": 18.107351303100586, "objective/non_score_reward": -1.8107349872589111, "objective/rlhf_reward": -6.84293989688158, "objective/scores": 0.1, "policy/approxkl_avg": 52.367340087890625, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7185496091842651, "step": 1883, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999009370803833 }, { "episode": 30160, "epoch": 0.5421145342776, "loss/policy_avg": -0.183127298951149, "lr": 2.638803680981595e-06, "objective/entropy": -332.404296875, "objective/kl": 13.645541191101074, "objective/non_score_reward": -1.3645542860031128, "objective/rlhf_reward": -3.796357398450957, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 1.9626456499099731, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7224613428115845, "step": 1884, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0001583099365234 }, { "episode": 30176, "epoch": 0.5424021281949887, "loss/policy_avg": 0.23557086288928986, "lr": 2.6386119631901842e-06, "objective/entropy": 119.83067321777344, "objective/kl": 15.023857116699219, "objective/non_score_reward": -1.5023858547210693, "objective/rlhf_reward": -1.6095432996749874, "objective/scores": 1.1, "policy/approxkl_avg": 49.335533142089844, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6670380234718323, "step": 1885, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9991014003753662 }, { "episode": 30192, "epoch": 0.5426897221123773, "loss/policy_avg": -0.24799206852912903, "lr": 2.638420245398773e-06, "objective/entropy": -135.42721557617188, "objective/kl": 16.508018493652344, "objective/non_score_reward": -1.6508018970489502, "objective/rlhf_reward": -6.203207468986511, "objective/scores": 0.1, "policy/approxkl_avg": 39.26435852050781, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.4349724054336548, "step": 1886, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.996954321861267 }, { "episode": 30208, "epoch": 0.542977316029766, "loss/policy_avg": 0.2918689250946045, "lr": 2.638228527607362e-06, "objective/entropy": 101.60655212402344, "objective/kl": 18.337299346923828, "objective/non_score_reward": -1.8337297439575195, "objective/rlhf_reward": -5.673059051454651, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 104.64295196533203, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4675699472427368, "step": 1887, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9995253086090088 }, { "episode": 30224, "epoch": 0.5432649099471546, "loss/policy_avg": 0.018769599497318268, "lr": 2.638036809815951e-06, "objective/entropy": 24.645986557006836, "objective/kl": 19.31072235107422, "objective/non_score_reward": -1.9310722351074219, "objective/rlhf_reward": -7.324289298057556, "objective/scores": 0.1, "policy/approxkl_avg": 69.67178344726562, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.47627758979797363, "step": 1888, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0013792514801025 }, { "episode": 30240, "epoch": 0.5435525038645432, "loss/policy_avg": 0.15706397593021393, "lr": 2.63784509202454e-06, "objective/entropy": -278.9888916015625, "objective/kl": 1.9181170463562012, "objective/non_score_reward": -0.19181174039840698, "objective/rlhf_reward": 2.1564720824945245, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 16.385345458984375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6495269536972046, "step": 1889, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.99857497215271 }, { "episode": 30256, "epoch": 0.5438400977819319, "loss/policy_avg": -0.10978720337152481, "lr": 2.637653374233129e-06, "objective/entropy": 221.46072387695312, "objective/kl": 18.129825592041016, "objective/non_score_reward": -1.8129827976226807, "objective/rlhf_reward": -6.851930877566337, "objective/scores": 0.1, "policy/approxkl_avg": 98.64704895019531, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5976404547691345, "step": 1890, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0071773529052734 }, { "episode": 30272, "epoch": 0.5441276916993205, "loss/policy_avg": 0.5263071060180664, "lr": 2.637461656441718e-06, "objective/entropy": -24.288192749023438, "objective/kl": 13.550774574279785, "objective/non_score_reward": -1.3550773859024048, "objective/rlhf_reward": -2.496590469719145, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 25.168991088867188, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7016432285308838, "step": 1891, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9976330995559692 }, { "episode": 30288, "epoch": 0.5444152856167092, "loss/policy_avg": 0.14902125298976898, "lr": 2.6372699386503067e-06, "objective/entropy": 19.211254119873047, "objective/kl": 12.499555587768555, "objective/non_score_reward": -1.249955654144287, "objective/rlhf_reward": -4.599822735786438, "objective/scores": 0.1, "policy/approxkl_avg": 3.2693653106689453, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4941726326942444, "step": 1892, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9993488788604736 }, { "episode": 30304, "epoch": 0.5447028795340979, "loss/policy_avg": 1.661439061164856, "lr": 2.637078220858896e-06, "objective/entropy": -65.19476318359375, "objective/kl": 14.480757713317871, "objective/non_score_reward": -1.4480756521224976, "objective/rlhf_reward": -3.3923026382923123, "objective/scores": 0.6, "policy/approxkl_avg": 107.93418884277344, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6490131616592407, "step": 1893, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9979932308197021 }, { "episode": 30320, "epoch": 0.5449904734514865, "loss/policy_avg": 0.3524066209793091, "lr": 2.6368865030674848e-06, "objective/entropy": -170.36708068847656, "objective/kl": 16.79137420654297, "objective/non_score_reward": -1.6791372299194336, "objective/rlhf_reward": -6.316548681259155, "objective/scores": 0.1, "policy/approxkl_avg": 29.46489715576172, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.507827877998352, "step": 1894, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9996378421783447 }, { "episode": 30336, "epoch": 0.5452780673688752, "loss/policy_avg": 0.3015563488006592, "lr": 2.6366947852760736e-06, "objective/entropy": 7.907737731933594, "objective/kl": 11.733333587646484, "objective/non_score_reward": -1.1733335256576538, "objective/rlhf_reward": -4.293333983421325, "objective/scores": 0.1, "policy/approxkl_avg": 32.35322952270508, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6082165241241455, "step": 1895, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9987642765045166 }, { "episode": 30352, "epoch": 0.5455656612862638, "loss/policy_avg": -0.0767693892121315, "lr": 2.636503067484663e-06, "objective/entropy": 42.51039505004883, "objective/kl": 14.21924114227295, "objective/non_score_reward": -1.421924114227295, "objective/rlhf_reward": -2.7639773234140605, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 7.097938537597656, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4579886794090271, "step": 1896, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000124931335449 }, { "episode": 30368, "epoch": 0.5458532552036525, "loss/policy_avg": 0.009962936863303185, "lr": 2.6363113496932516e-06, "objective/entropy": 119.24744415283203, "objective/kl": 6.005527496337891, "objective/non_score_reward": -0.6005527973175049, "objective/rlhf_reward": 1.997788900136948, "objective/scores": 1.1, "policy/approxkl_avg": 1.5343382358551025, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5637143850326538, "step": 1897, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.001119613647461 }, { "episode": 30384, "epoch": 0.5461408491210411, "loss/policy_avg": 0.3311734199523926, "lr": 2.636119631901841e-06, "objective/entropy": 139.10549926757812, "objective/kl": 11.098630905151367, "objective/non_score_reward": -1.109863042831421, "objective/rlhf_reward": -0.03945221602916682, "objective/scores": 1.1, "policy/approxkl_avg": 14.385029792785645, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.596035361289978, "step": 1898, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0025429725646973 }, { "episode": 30400, "epoch": 0.5464284430384297, "loss/policy_avg": 0.14937268197536469, "lr": 2.6359279141104297e-06, "objective/entropy": -126.49579620361328, "objective/kl": 13.532024383544922, "objective/non_score_reward": -1.3532025814056396, "objective/rlhf_reward": -1.0128101207315918, "objective/scores": 1.1, "policy/approxkl_avg": 56.03593444824219, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5827751159667969, "step": 1899, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9989010095596313 }, { "episode": 30416, "epoch": 0.5467160369558184, "loss/policy_avg": 0.2829972207546234, "lr": 2.6357361963190185e-06, "objective/entropy": 216.04718017578125, "objective/kl": 13.735719680786133, "objective/non_score_reward": -1.3735718727111816, "objective/rlhf_reward": -3.546876351313527, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 50.316139221191406, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5368937253952026, "step": 1900, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.997572660446167 }, { "episode": 30432, "epoch": 0.547003630873207, "loss/policy_avg": 0.17570659518241882, "lr": 2.6355444785276073e-06, "objective/entropy": 354.5912780761719, "objective/kl": 16.02384376525879, "objective/non_score_reward": -1.6023844480514526, "objective/rlhf_reward": -6.009537851810455, "objective/scores": 0.1, "policy/approxkl_avg": 21.36066246032715, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8833104372024536, "step": 1901, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9990440607070923 }, { "episode": 30448, "epoch": 0.5472912247905957, "loss/policy_avg": 0.4607427418231964, "lr": 2.635352760736196e-06, "objective/entropy": 20.947402954101562, "objective/kl": 14.86680793762207, "objective/non_score_reward": -1.4866809844970703, "objective/rlhf_reward": -3.5467239081859585, "objective/scores": 0.6, "policy/approxkl_avg": 58.469356536865234, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6606134176254272, "step": 1902, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9991655349731445 }, { "episode": 30464, "epoch": 0.5475788187079843, "loss/policy_avg": 0.5278885364532471, "lr": 2.6351610429447853e-06, "objective/entropy": 65.04945373535156, "objective/kl": 13.691702842712402, "objective/non_score_reward": -1.3691703081130981, "objective/rlhf_reward": -5.076681351661682, "objective/scores": 0.1, "policy/approxkl_avg": 39.13874816894531, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5448737144470215, "step": 1903, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.996506690979004 }, { "episode": 30480, "epoch": 0.547866412625373, "loss/policy_avg": 0.1716153770685196, "lr": 2.634969325153374e-06, "objective/entropy": 271.1617126464844, "objective/kl": 19.076047897338867, "objective/non_score_reward": -1.9076049327850342, "objective/rlhf_reward": -9.630420684814453, "objective/scores": -0.5, "policy/approxkl_avg": 192.4165496826172, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6272881031036377, "step": 1904, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9973095655441284 }, { "episode": 30496, "epoch": 0.5481540065427616, "loss/policy_avg": 0.16718728840351105, "lr": 2.634777607361963e-06, "objective/entropy": 56.779449462890625, "objective/kl": 16.912702560424805, "objective/non_score_reward": -1.691270351409912, "objective/rlhf_reward": -6.365081644058227, "objective/scores": 0.1, "policy/approxkl_avg": 58.04198455810547, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6575337648391724, "step": 1905, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.00191593170166 }, { "episode": 30512, "epoch": 0.5484416004601502, "loss/policy_avg": 0.02517540752887726, "lr": 2.634585889570552e-06, "objective/entropy": -61.73188018798828, "objective/kl": 8.17990493774414, "objective/non_score_reward": -0.8179904222488403, "objective/rlhf_reward": -0.3482427566659183, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 21.236021041870117, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5999727249145508, "step": 1906, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999282956123352 }, { "episode": 30528, "epoch": 0.5487291943775389, "loss/policy_avg": 0.11145927011966705, "lr": 2.634394171779141e-06, "objective/entropy": -360.6292419433594, "objective/kl": 15.74618911743164, "objective/non_score_reward": -1.5746190547943115, "objective/rlhf_reward": -5.898476040363311, "objective/scores": 0.1, "policy/approxkl_avg": 29.0206298828125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8304791450500488, "step": 1907, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9990675449371338 }, { "episode": 30544, "epoch": 0.5490167882949276, "loss/policy_avg": 0.17270202934741974, "lr": 2.6342024539877302e-06, "objective/entropy": 256.72802734375, "objective/kl": 20.78005599975586, "objective/non_score_reward": -2.07800555229187, "objective/rlhf_reward": -7.912022566795349, "objective/scores": 0.1, "policy/approxkl_avg": 22.87220573425293, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6917515993118286, "step": 1908, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.00004506111145 }, { "episode": 30560, "epoch": 0.5493043822123163, "loss/policy_avg": 0.13270515203475952, "lr": 2.634010736196319e-06, "objective/entropy": 116.8352279663086, "objective/kl": 15.580759048461914, "objective/non_score_reward": -1.5580757856369019, "objective/rlhf_reward": -5.832303127646446, "objective/scores": 0.1, "policy/approxkl_avg": 53.574005126953125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6051105856895447, "step": 1909, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9993774890899658 }, { "episode": 30576, "epoch": 0.5495919761297049, "loss/policy_avg": 0.5251754522323608, "lr": 2.633819018404908e-06, "objective/entropy": -125.42312622070312, "objective/kl": 14.342058181762695, "objective/non_score_reward": -1.4342057704925537, "objective/rlhf_reward": -3.3368228435516354, "objective/scores": 0.6, "policy/approxkl_avg": 7.771772861480713, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5477792024612427, "step": 1910, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0014166831970215 }, { "episode": 30592, "epoch": 0.5498795700470935, "loss/policy_avg": 0.18452884256839752, "lr": 2.633627300613497e-06, "objective/entropy": 240.2740020751953, "objective/kl": 12.670340538024902, "objective/non_score_reward": -1.2670340538024902, "objective/rlhf_reward": -4.668136155605316, "objective/scores": 0.1, "policy/approxkl_avg": 3.453449010848999, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6076258420944214, "step": 1911, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0000457763671875 }, { "episode": 30608, "epoch": 0.5501671639644822, "loss/policy_avg": 0.12677043676376343, "lr": 2.633435582822086e-06, "objective/entropy": 31.767406463623047, "objective/kl": 11.432640075683594, "objective/non_score_reward": -1.1432639360427856, "objective/rlhf_reward": -6.573055744171143, "objective/scores": -0.5, "policy/approxkl_avg": 34.6868896484375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5203061699867249, "step": 1912, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0009207725524902 }, { "episode": 30624, "epoch": 0.5504547578818708, "loss/policy_avg": 0.47530901432037354, "lr": 2.633243865030675e-06, "objective/entropy": 224.4400634765625, "objective/kl": 12.745110511779785, "objective/non_score_reward": -1.2745109796524048, "objective/rlhf_reward": -7.098043918609619, "objective/scores": -0.5, "policy/approxkl_avg": 56.56214904785156, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5925279855728149, "step": 1913, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.000298023223877 }, { "episode": 30640, "epoch": 0.5507423517992595, "loss/policy_avg": 0.029194243252277374, "lr": 2.633052147239264e-06, "objective/entropy": 156.1571044921875, "objective/kl": 8.93233871459961, "objective/non_score_reward": -0.8932338953018188, "objective/rlhf_reward": -3.172935730218887, "objective/scores": 0.1, "policy/approxkl_avg": 17.79572296142578, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7106249332427979, "step": 1914, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0002284049987793 }, { "episode": 30656, "epoch": 0.5510299457166481, "loss/policy_avg": 0.03485263139009476, "lr": 2.6328604294478527e-06, "objective/entropy": 230.6031036376953, "objective/kl": 13.536725997924805, "objective/non_score_reward": -1.3536725044250488, "objective/rlhf_reward": -5.014690196514129, "objective/scores": 0.1, "policy/approxkl_avg": 56.59600067138672, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5597454309463501, "step": 1915, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9974443912506104 }, { "episode": 30672, "epoch": 0.5513175396340367, "loss/policy_avg": 0.2497161328792572, "lr": 2.632668711656442e-06, "objective/entropy": -96.27470397949219, "objective/kl": 12.014034271240234, "objective/non_score_reward": -1.2014034986495972, "objective/rlhf_reward": -1.8818948611032693, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 2.185368776321411, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5934675931930542, "step": 1916, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.004218101501465 }, { "episode": 30688, "epoch": 0.5516051335514254, "loss/policy_avg": 0.36477264761924744, "lr": 2.6324769938650308e-06, "objective/entropy": 210.66531372070312, "objective/kl": 12.38323974609375, "objective/non_score_reward": -1.2383239269256592, "objective/rlhf_reward": -6.953295707702637, "objective/scores": -0.5, "policy/approxkl_avg": 45.72761154174805, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5291770696640015, "step": 1917, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9988799095153809 }, { "episode": 30704, "epoch": 0.551892727468814, "loss/policy_avg": 0.4472461938858032, "lr": 2.6322852760736196e-06, "objective/entropy": 104.30414581298828, "objective/kl": 13.77920913696289, "objective/non_score_reward": -1.3779209852218628, "objective/rlhf_reward": -5.111683821678161, "objective/scores": 0.1, "policy/approxkl_avg": 11.586515426635742, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5660973787307739, "step": 1918, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9980164766311646 }, { "episode": 30720, "epoch": 0.5521803213862027, "loss/policy_avg": 0.37102967500686646, "lr": 2.632093558282209e-06, "objective/entropy": 60.02656936645508, "objective/kl": 10.058416366577148, "objective/non_score_reward": -1.0058417320251465, "objective/rlhf_reward": -6.023366928100586, "objective/scores": -0.5, "policy/approxkl_avg": 10.679643630981445, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7811880111694336, "step": 1919, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0008389949798584 }, { "episode": 30736, "epoch": 0.5524679153035913, "loss/policy_avg": 0.21096105873584747, "lr": 2.6319018404907976e-06, "objective/entropy": -134.3823699951172, "objective/kl": 13.197624206542969, "objective/non_score_reward": -1.3197624683380127, "objective/rlhf_reward": -2.355330799461576, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 45.305763244628906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.47559309005737305, "step": 1920, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9972378015518188 }, { "episode": 30752, "epoch": 0.5527555092209799, "loss/policy_avg": 0.40054818987846375, "lr": 2.631710122699387e-06, "objective/entropy": 175.58895874023438, "objective/kl": 17.43608856201172, "objective/non_score_reward": -1.743609070777893, "objective/rlhf_reward": -8.974435806274414, "objective/scores": -0.5, "policy/approxkl_avg": 59.900245666503906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8297168016433716, "step": 1921, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998053789138794 }, { "episode": 30768, "epoch": 0.5530431031383686, "loss/policy_avg": -0.3259451985359192, "lr": 2.6315184049079757e-06, "objective/entropy": -119.45405578613281, "objective/kl": 16.492374420166016, "objective/non_score_reward": -1.6492376327514648, "objective/rlhf_reward": -8.59695053100586, "objective/scores": -0.5, "policy/approxkl_avg": 56.74144744873047, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.831850528717041, "step": 1922, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.998541235923767 }, { "episode": 30784, "epoch": 0.5533306970557573, "loss/policy_avg": 1.059677243232727, "lr": 2.6313266871165645e-06, "objective/entropy": -17.18384552001953, "objective/kl": 17.018753051757812, "objective/non_score_reward": -1.7018752098083496, "objective/rlhf_reward": -6.4075008690357205, "objective/scores": 0.1, "policy/approxkl_avg": 84.392333984375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7476541996002197, "step": 1923, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9991600513458252 }, { "episode": 30800, "epoch": 0.553618290973146, "loss/policy_avg": 0.5696789622306824, "lr": 2.6311349693251533e-06, "objective/entropy": 37.21489715576172, "objective/kl": 17.65481948852539, "objective/non_score_reward": -1.76548171043396, "objective/rlhf_reward": -9.06192684173584, "objective/scores": -0.5, "policy/approxkl_avg": 46.97014617919922, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.46043169498443604, "step": 1924, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998187780380249 }, { "episode": 30816, "epoch": 0.5539058848905346, "loss/policy_avg": 0.4576580822467804, "lr": 2.630943251533742e-06, "objective/entropy": 163.5697021484375, "objective/kl": 19.395936965942383, "objective/non_score_reward": -1.93959379196167, "objective/rlhf_reward": -9.75837516784668, "objective/scores": -0.5, "policy/approxkl_avg": 27.783203125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.633905827999115, "step": 1925, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0000224113464355 }, { "episode": 30832, "epoch": 0.5541934788079232, "loss/policy_avg": 0.370788037776947, "lr": 2.6307515337423313e-06, "objective/entropy": -40.51841735839844, "objective/kl": 12.110527038574219, "objective/non_score_reward": -1.2110527753829956, "objective/rlhf_reward": -2.8967998725938156, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 15.352380752563477, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6605029106140137, "step": 1926, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.997687816619873 }, { "episode": 30848, "epoch": 0.5544810727253119, "loss/policy_avg": 0.5198642611503601, "lr": 2.63055981595092e-06, "objective/entropy": 123.91789245605469, "objective/kl": 17.366943359375, "objective/non_score_reward": -1.736694097518921, "objective/rlhf_reward": -6.546776747703552, "objective/scores": 0.1, "policy/approxkl_avg": 54.66370391845703, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7496399879455566, "step": 1927, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9992420673370361 }, { "episode": 30864, "epoch": 0.5547686666427005, "loss/policy_avg": 0.46590036153793335, "lr": 2.6303680981595094e-06, "objective/entropy": 179.61434936523438, "objective/kl": 12.078149795532227, "objective/non_score_reward": -1.2078149318695068, "objective/rlhf_reward": -2.708553614393745, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 37.074127197265625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6669144630432129, "step": 1928, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9998729228973389 }, { "episode": 30880, "epoch": 0.5550562605600892, "loss/policy_avg": 0.23146501183509827, "lr": 2.630176380368098e-06, "objective/entropy": -43.257476806640625, "objective/kl": 10.434656143188477, "objective/non_score_reward": -1.0434656143188477, "objective/rlhf_reward": -2.512002890527831, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 14.093836784362793, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6398032903671265, "step": 1929, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9998741149902344 }, { "episode": 30896, "epoch": 0.5553438544774778, "loss/policy_avg": -0.3979191780090332, "lr": 2.629984662576687e-06, "objective/entropy": 85.81379699707031, "objective/kl": 13.083731651306152, "objective/non_score_reward": -1.308373212814331, "objective/rlhf_reward": -3.408664400848459, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 16.67183494567871, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5375758409500122, "step": 1930, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0003044605255127 }, { "episode": 30912, "epoch": 0.5556314483948664, "loss/policy_avg": 0.4099561870098114, "lr": 2.6297929447852762e-06, "objective/entropy": -108.34793853759766, "objective/kl": 19.073108673095703, "objective/non_score_reward": -1.9073107242584229, "objective/rlhf_reward": -6.025123406116085, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 47.74517822265625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6598101854324341, "step": 1931, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0001935958862305 }, { "episode": 30928, "epoch": 0.5559190423122551, "loss/policy_avg": 0.8242303729057312, "lr": 2.629601226993865e-06, "objective/entropy": 56.59760284423828, "objective/kl": 17.36560821533203, "objective/non_score_reward": -1.7365608215332031, "objective/rlhf_reward": -2.5462436139583584, "objective/scores": 1.1, "policy/approxkl_avg": 25.9139404296875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4316277503967285, "step": 1932, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9991772174835205 }, { "episode": 30944, "epoch": 0.5562066362296437, "loss/policy_avg": 1.5252747535705566, "lr": 2.629409509202454e-06, "objective/entropy": 9.067985534667969, "objective/kl": 16.401121139526367, "objective/non_score_reward": -1.6401121616363525, "objective/rlhf_reward": -2.1604487061500546, "objective/scores": 1.1, "policy/approxkl_avg": 18.144981384277344, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6066452264785767, "step": 1933, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9995198249816895 }, { "episode": 30960, "epoch": 0.5564942301470324, "loss/policy_avg": 0.16620782017707825, "lr": 2.629217791411043e-06, "objective/entropy": 20.33401107788086, "objective/kl": 19.010009765625, "objective/non_score_reward": -1.9010009765625, "objective/rlhf_reward": -5.779175217422555, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 24.804540634155273, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5517880916595459, "step": 1934, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9990503787994385 }, { "episode": 30976, "epoch": 0.556781824064421, "loss/policy_avg": 0.1389436423778534, "lr": 2.629026073619632e-06, "objective/entropy": -255.1221923828125, "objective/kl": 11.334492683410645, "objective/non_score_reward": -1.1334493160247803, "objective/rlhf_reward": -4.133797264099121, "objective/scores": 0.1, "policy/approxkl_avg": 26.180503845214844, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6840099096298218, "step": 1935, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9998408555984497 }, { "episode": 30992, "epoch": 0.5570694179818096, "loss/policy_avg": 0.14210224151611328, "lr": 2.628834355828221e-06, "objective/entropy": 51.18537139892578, "objective/kl": 15.396123886108398, "objective/non_score_reward": -1.5396125316619873, "objective/rlhf_reward": -1.758449977636337, "objective/scores": 1.1, "policy/approxkl_avg": 49.77583312988281, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4005848169326782, "step": 1936, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9984276294708252 }, { "episode": 31008, "epoch": 0.5573570118991983, "loss/policy_avg": 0.22125059366226196, "lr": 2.62864263803681e-06, "objective/entropy": 101.8367691040039, "objective/kl": 12.313599586486816, "objective/non_score_reward": -1.2313599586486816, "objective/rlhf_reward": -6.925439834594727, "objective/scores": -0.5, "policy/approxkl_avg": 40.22418212890625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.33145105838775635, "step": 1937, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9973564147949219 }, { "episode": 31024, "epoch": 0.5576446058165869, "loss/policy_avg": 0.060771048069000244, "lr": 2.6284509202453987e-06, "objective/entropy": 197.3813934326172, "objective/kl": 14.080726623535156, "objective/non_score_reward": -1.4080727100372314, "objective/rlhf_reward": -7.632290840148926, "objective/scores": -0.5, "policy/approxkl_avg": 18.806310653686523, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5231298804283142, "step": 1938, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000919818878174 }, { "episode": 31040, "epoch": 0.5579321997339757, "loss/policy_avg": 0.28798460960388184, "lr": 2.628259202453988e-06, "objective/entropy": -39.44812774658203, "objective/kl": 10.068058967590332, "objective/non_score_reward": -1.0068060159683228, "objective/rlhf_reward": 0.37277611494064367, "objective/scores": 1.1, "policy/approxkl_avg": 21.18744659423828, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5737462639808655, "step": 1939, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9989917278289795 }, { "episode": 31056, "epoch": 0.5582197936513643, "loss/policy_avg": 0.2492627054452896, "lr": 2.6280674846625768e-06, "objective/entropy": 301.3166198730469, "objective/kl": 15.651753425598145, "objective/non_score_reward": -1.5651755332946777, "objective/rlhf_reward": -3.3369828208696575, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 38.802520751953125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7086349129676819, "step": 1940, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9971044063568115 }, { "episode": 31072, "epoch": 0.558507387568753, "loss/policy_avg": 0.5365092754364014, "lr": 2.627875766871166e-06, "objective/entropy": -48.53523254394531, "objective/kl": 16.297000885009766, "objective/non_score_reward": -1.6297001838684082, "objective/rlhf_reward": -2.118800616264343, "objective/scores": 1.1, "policy/approxkl_avg": 51.571128845214844, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5370905995368958, "step": 1941, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0003647804260254 }, { "episode": 31088, "epoch": 0.5587949814861416, "loss/policy_avg": 0.07891640812158585, "lr": 2.627684049079755e-06, "objective/entropy": -131.36619567871094, "objective/kl": 15.51970100402832, "objective/non_score_reward": -1.5519700050354004, "objective/rlhf_reward": -4.085174175278221, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 234.3270721435547, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5256676077842712, "step": 1942, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9981193542480469 }, { "episode": 31104, "epoch": 0.5590825754035302, "loss/policy_avg": 0.17335310578346252, "lr": 2.6274923312883436e-06, "objective/entropy": 43.700927734375, "objective/kl": 18.097789764404297, "objective/non_score_reward": -1.809779167175293, "objective/rlhf_reward": -2.839116489887237, "objective/scores": 1.1, "policy/approxkl_avg": 15.332867622375488, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5896799564361572, "step": 1943, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998621940612793 }, { "episode": 31120, "epoch": 0.5593701693209189, "loss/policy_avg": 0.6577703952789307, "lr": 2.627300613496933e-06, "objective/entropy": -123.42121124267578, "objective/kl": 13.580167770385742, "objective/non_score_reward": -1.3580169677734375, "objective/rlhf_reward": -2.508348677993986, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 18.91336441040039, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6367899775505066, "step": 1944, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9985871315002441 }, { "episode": 31136, "epoch": 0.5596577632383075, "loss/policy_avg": 0.207408145070076, "lr": 2.6271088957055213e-06, "objective/entropy": 51.933204650878906, "objective/kl": 21.24639129638672, "objective/non_score_reward": -2.1246392726898193, "objective/rlhf_reward": -6.942297666278437, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 20.490079879760742, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7846795320510864, "step": 1945, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0000905990600586 }, { "episode": 31152, "epoch": 0.5599453571556962, "loss/policy_avg": 0.3303542733192444, "lr": 2.6269171779141105e-06, "objective/entropy": 53.62565994262695, "objective/kl": 11.193902015686035, "objective/non_score_reward": -1.1193903684616089, "objective/rlhf_reward": -2.744227902094523, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 3.3035974502563477, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4402533769607544, "step": 1946, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9998788833618164 }, { "episode": 31168, "epoch": 0.5602329510730848, "loss/policy_avg": 0.5981451272964478, "lr": 2.6267254601226993e-06, "objective/entropy": 21.825145721435547, "objective/kl": 14.38486385345459, "objective/non_score_reward": -1.4384863376617432, "objective/rlhf_reward": -1.3539454698562619, "objective/scores": 1.1, "policy/approxkl_avg": 37.79143524169922, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5124356746673584, "step": 1947, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9986077547073364 }, { "episode": 31184, "epoch": 0.5605205449904734, "loss/policy_avg": -0.11670660972595215, "lr": 2.626533742331288e-06, "objective/entropy": 86.26981353759766, "objective/kl": 18.290149688720703, "objective/non_score_reward": -1.8290153741836548, "objective/rlhf_reward": -6.916061496734619, "objective/scores": 0.1, "policy/approxkl_avg": 27.349811553955078, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6505062580108643, "step": 1948, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9995582103729248 }, { "episode": 31200, "epoch": 0.5608081389078621, "loss/policy_avg": 0.1440569907426834, "lr": 2.6263420245398773e-06, "objective/entropy": -76.61561584472656, "objective/kl": 10.80213737487793, "objective/non_score_reward": -1.0802137851715088, "objective/rlhf_reward": -2.496026153835367, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 13.197822570800781, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6293392181396484, "step": 1949, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0017919540405273 }, { "episode": 31216, "epoch": 0.5610957328252507, "loss/policy_avg": -0.1457497775554657, "lr": 2.626150306748466e-06, "objective/entropy": -66.14817810058594, "objective/kl": 11.837576866149902, "objective/non_score_reward": -1.1837577819824219, "objective/rlhf_reward": -3.001697854200999, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 4.371072292327881, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.601198136806488, "step": 1950, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9993054866790771 }, { "episode": 31232, "epoch": 0.5613833267426394, "loss/policy_avg": 0.08133930712938309, "lr": 2.6259585889570554e-06, "objective/entropy": 35.34192657470703, "objective/kl": 16.1737003326416, "objective/non_score_reward": -1.6173698902130127, "objective/rlhf_reward": -3.5457607849848003, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 51.5657958984375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4502888321876526, "step": 1951, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.99739408493042 }, { "episode": 31248, "epoch": 0.561670920660028, "loss/policy_avg": 0.5748536586761475, "lr": 2.625766871165644e-06, "objective/entropy": 277.04815673828125, "objective/kl": 15.610389709472656, "objective/non_score_reward": -1.5610390901565552, "objective/rlhf_reward": -8.244155883789062, "objective/scores": -0.5, "policy/approxkl_avg": 22.583017349243164, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7663341760635376, "step": 1952, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9974110126495361 }, { "episode": 31264, "epoch": 0.5619585145774166, "loss/policy_avg": 0.40739142894744873, "lr": 2.625575153374233e-06, "objective/entropy": 44.04022979736328, "objective/kl": 13.912942886352539, "objective/non_score_reward": -1.391294240951538, "objective/rlhf_reward": -2.6414577111017437, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 132.23989868164062, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8223682641983032, "step": 1953, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9980732202529907 }, { "episode": 31280, "epoch": 0.5622461084948054, "loss/policy_avg": 0.2794537842273712, "lr": 2.6253834355828222e-06, "objective/entropy": 52.798545837402344, "objective/kl": 15.072214126586914, "objective/non_score_reward": -1.5072214603424072, "objective/rlhf_reward": -8.028885841369629, "objective/scores": -0.5, "policy/approxkl_avg": 33.49165344238281, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7069833278656006, "step": 1954, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9987144470214844 }, { "episode": 31296, "epoch": 0.562533702412194, "loss/policy_avg": 0.1479717493057251, "lr": 2.625191717791411e-06, "objective/entropy": 59.11641311645508, "objective/kl": 20.31096649169922, "objective/non_score_reward": -2.031096935272217, "objective/rlhf_reward": -6.176975856499608, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 26.35702133178711, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.502386212348938, "step": 1955, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998948097229004 }, { "episode": 31312, "epoch": 0.5628212963295827, "loss/policy_avg": 0.1483973264694214, "lr": 2.6250000000000003e-06, "objective/entropy": 172.51075744628906, "objective/kl": 14.092042922973633, "objective/non_score_reward": -1.409204363822937, "objective/rlhf_reward": -3.6894061667489364, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 43.896331787109375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5590693950653076, "step": 1956, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0002806186676025 }, { "episode": 31328, "epoch": 0.5631088902469713, "loss/policy_avg": 0.7653570175170898, "lr": 2.624808282208589e-06, "objective/entropy": 128.2239532470703, "objective/kl": 11.804733276367188, "objective/non_score_reward": -1.1804733276367188, "objective/rlhf_reward": -6.721893310546875, "objective/scores": -0.5, "policy/approxkl_avg": 4.992034912109375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6958256363868713, "step": 1957, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.003971576690674 }, { "episode": 31344, "epoch": 0.56339648416436, "loss/policy_avg": -0.05574982613325119, "lr": 2.624616564417178e-06, "objective/entropy": -8.689697265625, "objective/kl": 16.48877716064453, "objective/non_score_reward": -1.6488778591156006, "objective/rlhf_reward": -3.671792362571928, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 20.264589309692383, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.45056283473968506, "step": 1958, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9995653629302979 }, { "episode": 31360, "epoch": 0.5636840780817486, "loss/policy_avg": 0.15298953652381897, "lr": 2.624424846625767e-06, "objective/entropy": 50.52933120727539, "objective/kl": 22.40363311767578, "objective/non_score_reward": -2.240363597869873, "objective/rlhf_reward": -10.961454391479492, "objective/scores": -0.5, "policy/approxkl_avg": 36.90216827392578, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5374445915222168, "step": 1959, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0000500679016113 }, { "episode": 31376, "epoch": 0.5639716719991372, "loss/policy_avg": 1.3337948322296143, "lr": 2.624233128834356e-06, "objective/entropy": 76.81420135498047, "objective/kl": 13.708114624023438, "objective/non_score_reward": -1.3708115816116333, "objective/rlhf_reward": -5.083246028423309, "objective/scores": 0.1, "policy/approxkl_avg": 39.27752685546875, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7236926555633545, "step": 1960, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998272180557251 }, { "episode": 31392, "epoch": 0.5642592659165259, "loss/policy_avg": 0.016748720780014992, "lr": 2.6240414110429447e-06, "objective/entropy": 227.30496215820312, "objective/kl": 16.957714080810547, "objective/non_score_reward": -1.6957714557647705, "objective/rlhf_reward": -6.383085823059082, "objective/scores": 0.1, "policy/approxkl_avg": 53.046302795410156, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5604294538497925, "step": 1961, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9976122379302979 }, { "episode": 31408, "epoch": 0.5645468598339145, "loss/policy_avg": 0.45783132314682007, "lr": 2.623849693251534e-06, "objective/entropy": -48.12693786621094, "objective/kl": 15.814268112182617, "objective/non_score_reward": -1.5814268589019775, "objective/rlhf_reward": -5.925707584619522, "objective/scores": 0.1, "policy/approxkl_avg": 32.74335479736328, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5133075714111328, "step": 1962, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9977269172668457 }, { "episode": 31424, "epoch": 0.5648344537513031, "loss/policy_avg": -0.22306829690933228, "lr": 2.623657975460123e-06, "objective/entropy": 128.39199829101562, "objective/kl": 10.377822875976562, "objective/non_score_reward": -1.0377821922302246, "objective/rlhf_reward": -3.751129066944122, "objective/scores": 0.1, "policy/approxkl_avg": 2.1646804809570312, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.598357081413269, "step": 1963, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0028271675109863 }, { "episode": 31440, "epoch": 0.5651220476686918, "loss/policy_avg": 0.2750639021396637, "lr": 2.623466257668712e-06, "objective/entropy": 8.598228454589844, "objective/kl": 8.15994644165039, "objective/non_score_reward": -0.8159946203231812, "objective/rlhf_reward": -5.263978481292725, "objective/scores": -0.5, "policy/approxkl_avg": 29.306640625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.572208046913147, "step": 1964, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998093843460083 }, { "episode": 31456, "epoch": 0.5654096415860804, "loss/policy_avg": 0.4767600893974304, "lr": 2.623274539877301e-06, "objective/entropy": 110.88385009765625, "objective/kl": 14.536544799804688, "objective/non_score_reward": -1.453654408454895, "objective/rlhf_reward": -3.4146176934242245, "objective/scores": 0.6, "policy/approxkl_avg": 44.370018005371094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7621891498565674, "step": 1965, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9984304904937744 }, { "episode": 31472, "epoch": 0.5656972355034691, "loss/policy_avg": 0.6700938940048218, "lr": 2.6230828220858896e-06, "objective/entropy": 91.74417114257812, "objective/kl": 16.03462791442871, "objective/non_score_reward": -1.6034626960754395, "objective/rlhf_reward": -3.4901316508066387, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 47.99883270263672, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.536391019821167, "step": 1966, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9991071224212646 }, { "episode": 31488, "epoch": 0.5659848294208577, "loss/policy_avg": 0.2143508791923523, "lr": 2.622891104294479e-06, "objective/entropy": 50.70976257324219, "objective/kl": 13.917573928833008, "objective/non_score_reward": -1.3917572498321533, "objective/rlhf_reward": -1.1670292973518368, "objective/scores": 1.1, "policy/approxkl_avg": 34.19868850708008, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7093191742897034, "step": 1967, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0008420944213867 }, { "episode": 31504, "epoch": 0.5662724233382463, "loss/policy_avg": 0.27268171310424805, "lr": 2.6226993865030673e-06, "objective/entropy": 74.58323669433594, "objective/kl": 13.26856803894043, "objective/non_score_reward": -1.3268567323684692, "objective/rlhf_reward": -7.307426929473877, "objective/scores": -0.5, "policy/approxkl_avg": 17.578540802001953, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4392435550689697, "step": 1968, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9976592063903809 }, { "episode": 31520, "epoch": 0.5665600172556351, "loss/policy_avg": 0.26809120178222656, "lr": 2.6225076687116565e-06, "objective/entropy": -37.09600067138672, "objective/kl": 13.520483016967773, "objective/non_score_reward": -1.352048397064209, "objective/rlhf_reward": -3.008193230628967, "objective/scores": 0.6, "policy/approxkl_avg": 23.37268829345703, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.9211034774780273, "step": 1969, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9987969398498535 }, { "episode": 31536, "epoch": 0.5668476111730237, "loss/policy_avg": 0.0321061909198761, "lr": 2.6223159509202453e-06, "objective/entropy": -193.0868682861328, "objective/kl": 10.896371841430664, "objective/non_score_reward": -1.089637041091919, "objective/rlhf_reward": 0.0414515376091007, "objective/scores": 1.1, "policy/approxkl_avg": 4.7421770095825195, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6736397743225098, "step": 1970, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9986076354980469 }, { "episode": 31552, "epoch": 0.5671352050904124, "loss/policy_avg": 0.9158979654312134, "lr": 2.622124233128834e-06, "objective/entropy": 98.08502197265625, "objective/kl": 11.713408470153809, "objective/non_score_reward": -1.1713407039642334, "objective/rlhf_reward": -4.285363173484802, "objective/scores": 0.1, "policy/approxkl_avg": 9.338741302490234, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6211938858032227, "step": 1971, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.001859188079834 }, { "episode": 31568, "epoch": 0.567422799007801, "loss/policy_avg": 0.15321816504001617, "lr": 2.6219325153374233e-06, "objective/entropy": 57.19317626953125, "objective/kl": 12.833736419677734, "objective/non_score_reward": -1.2833735942840576, "objective/rlhf_reward": -4.73349437713623, "objective/scores": 0.1, "policy/approxkl_avg": 14.374547958374023, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8906912803649902, "step": 1972, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9997797012329102 }, { "episode": 31584, "epoch": 0.5677103929251897, "loss/policy_avg": 0.04171551764011383, "lr": 2.621740797546012e-06, "objective/entropy": -98.7196044921875, "objective/kl": 12.432456016540527, "objective/non_score_reward": -1.2432456016540527, "objective/rlhf_reward": -4.572982108592987, "objective/scores": 0.1, "policy/approxkl_avg": 30.483003616333008, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6093274354934692, "step": 1973, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9983088970184326 }, { "episode": 31600, "epoch": 0.5679979868425783, "loss/policy_avg": 0.2402482032775879, "lr": 2.6215490797546014e-06, "objective/entropy": 67.22429656982422, "objective/kl": 19.42490005493164, "objective/non_score_reward": -1.9424902200698853, "objective/rlhf_reward": -6.21370169421728, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 28.987808227539062, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6655051708221436, "step": 1974, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9978091716766357 }, { "episode": 31616, "epoch": 0.5682855807599669, "loss/policy_avg": -0.026089750230312347, "lr": 2.62135736196319e-06, "objective/entropy": 158.90890502929688, "objective/kl": 20.56407928466797, "objective/non_score_reward": -2.056407928466797, "objective/rlhf_reward": -10.225631713867188, "objective/scores": -0.5, "policy/approxkl_avg": 6.276256561279297, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6126078367233276, "step": 1975, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.00205397605896 }, { "episode": 31632, "epoch": 0.5685731746773556, "loss/policy_avg": 0.38028374314308167, "lr": 2.621165644171779e-06, "objective/entropy": 239.01248168945312, "objective/kl": 13.714025497436523, "objective/non_score_reward": -1.371402621269226, "objective/rlhf_reward": -5.0856106042861935, "objective/scores": 0.1, "policy/approxkl_avg": 54.81404495239258, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5809019804000854, "step": 1976, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9958641529083252 }, { "episode": 31648, "epoch": 0.5688607685947442, "loss/policy_avg": -0.05175406485795975, "lr": 2.6209739263803682e-06, "objective/entropy": 124.25830078125, "objective/kl": 10.488260269165039, "objective/non_score_reward": -1.048825979232788, "objective/rlhf_reward": -1.2715851708662238, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 2.1010313034057617, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.8621615171432495, "step": 1977, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0017189979553223 }, { "episode": 31664, "epoch": 0.5691483625121329, "loss/policy_avg": 0.26172560453414917, "lr": 2.620782208588957e-06, "objective/entropy": -3.961435317993164, "objective/kl": 15.588273048400879, "objective/non_score_reward": -1.5588274002075195, "objective/rlhf_reward": -5.835309481620788, "objective/scores": 0.1, "policy/approxkl_avg": 8.479052543640137, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5890230536460876, "step": 1978, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999295949935913 }, { "episode": 31680, "epoch": 0.5694359564295215, "loss/policy_avg": 0.07885098457336426, "lr": 2.6205904907975463e-06, "objective/entropy": 163.91082763671875, "objective/kl": 15.549473762512207, "objective/non_score_reward": -1.5549473762512207, "objective/rlhf_reward": -3.8197897434234616, "objective/scores": 0.6, "policy/approxkl_avg": 12.931072235107422, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7356927394866943, "step": 1979, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9969947338104248 }, { "episode": 31696, "epoch": 0.5697235503469101, "loss/policy_avg": 0.3573415279388428, "lr": 2.620398773006135e-06, "objective/entropy": -116.21089935302734, "objective/kl": 13.6551513671875, "objective/non_score_reward": -1.3655149936676025, "objective/rlhf_reward": -5.062060168385505, "objective/scores": 0.1, "policy/approxkl_avg": 33.5490608215332, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5857123136520386, "step": 1980, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999571442604065 }, { "episode": 31712, "epoch": 0.5700111442642988, "loss/policy_avg": -0.036694854497909546, "lr": 2.620207055214724e-06, "objective/entropy": 126.82040405273438, "objective/kl": 8.93669319152832, "objective/non_score_reward": -0.8936692476272583, "objective/rlhf_reward": -5.574676990509033, "objective/scores": -0.5, "policy/approxkl_avg": 72.33108520507812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6285960078239441, "step": 1981, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0009305477142334 }, { "episode": 31728, "epoch": 0.5702987381816874, "loss/policy_avg": 0.2230103760957718, "lr": 2.620015337423313e-06, "objective/entropy": 83.2691879272461, "objective/kl": 13.386994361877441, "objective/non_score_reward": -1.338699460029602, "objective/rlhf_reward": -3.5299691214886417, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 76.8690185546875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.49291151762008667, "step": 1982, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000518798828125 }, { "episode": 31744, "epoch": 0.570586332099076, "loss/policy_avg": 0.6935573220252991, "lr": 2.619823619631902e-06, "objective/entropy": 334.6748046875, "objective/kl": 23.331436157226562, "objective/non_score_reward": -2.333143711090088, "objective/rlhf_reward": -8.932574546337127, "objective/scores": 0.1, "policy/approxkl_avg": 95.91483306884766, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.8640803694725037, "step": 1983, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9957650899887085 }, { "episode": 31760, "epoch": 0.5708739260164648, "loss/policy_avg": -0.06788041442632675, "lr": 2.6196319018404908e-06, "objective/entropy": -40.34531784057617, "objective/kl": 10.183060646057129, "objective/non_score_reward": -1.018306016921997, "objective/rlhf_reward": -2.4113647095566852, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 17.93832015991211, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.506335973739624, "step": 1984, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000466823577881 }, { "episode": 31776, "epoch": 0.5711615199338534, "loss/policy_avg": 0.27647337317466736, "lr": 2.61944018404908e-06, "objective/entropy": 219.0719451904297, "objective/kl": 15.476117134094238, "objective/non_score_reward": -1.5476117134094238, "objective/rlhf_reward": -5.790447032451629, "objective/scores": 0.1, "policy/approxkl_avg": 34.48750686645508, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5134598016738892, "step": 1985, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.003304958343506 }, { "episode": 31792, "epoch": 0.5714491138512421, "loss/policy_avg": 0.36243271827697754, "lr": 2.619248466257669e-06, "objective/entropy": -5.402244567871094, "objective/kl": 13.373044967651367, "objective/non_score_reward": -1.3373044729232788, "objective/rlhf_reward": -7.349217891693115, "objective/scores": -0.5, "policy/approxkl_avg": 31.055587768554688, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5952838659286499, "step": 1986, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0000016689300537 }, { "episode": 31808, "epoch": 0.5717367077686307, "loss/policy_avg": 0.030699964612722397, "lr": 2.619056748466258e-06, "objective/entropy": 235.13650512695312, "objective/kl": 14.822487831115723, "objective/non_score_reward": -1.4822489023208618, "objective/rlhf_reward": -7.928995609283447, "objective/scores": -0.5, "policy/approxkl_avg": 64.26378631591797, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7725933790206909, "step": 1987, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9992258548736572 }, { "episode": 31824, "epoch": 0.5720243016860194, "loss/policy_avg": 0.03670354560017586, "lr": 2.618865030674847e-06, "objective/entropy": -120.67810821533203, "objective/kl": 13.513415336608887, "objective/non_score_reward": -1.3513414859771729, "objective/rlhf_reward": -7.405365943908691, "objective/scores": -0.5, "policy/approxkl_avg": 14.002901077270508, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6766031980514526, "step": 1988, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9997832775115967 }, { "episode": 31840, "epoch": 0.572311895603408, "loss/policy_avg": 0.2261669933795929, "lr": 2.6186733128834357e-06, "objective/entropy": 183.0972442626953, "objective/kl": 21.516197204589844, "objective/non_score_reward": -2.1516199111938477, "objective/rlhf_reward": -10.60647964477539, "objective/scores": -0.5, "policy/approxkl_avg": 28.96692657470703, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.43175703287124634, "step": 1989, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9977149963378906 }, { "episode": 31856, "epoch": 0.5725994895207966, "loss/policy_avg": 0.533334493637085, "lr": 2.6184815950920245e-06, "objective/entropy": 176.357421875, "objective/kl": 17.872817993164062, "objective/non_score_reward": -1.7872819900512695, "objective/rlhf_reward": -9.149127960205078, "objective/scores": -0.5, "policy/approxkl_avg": 185.1457061767578, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5334278345108032, "step": 1990, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9992812871932983 }, { "episode": 31872, "epoch": 0.5728870834381853, "loss/policy_avg": 0.15894320607185364, "lr": 2.6182898773006133e-06, "objective/entropy": -197.00643920898438, "objective/kl": 10.364501953125, "objective/non_score_reward": -1.0364501476287842, "objective/rlhf_reward": -6.145800590515137, "objective/scores": -0.5, "policy/approxkl_avg": 12.997642517089844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6820682883262634, "step": 1991, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9986462593078613 }, { "episode": 31888, "epoch": 0.5731746773555739, "loss/policy_avg": 0.2844724953174591, "lr": 2.6180981595092025e-06, "objective/entropy": -84.8385238647461, "objective/kl": 14.503667831420898, "objective/non_score_reward": -1.450366735458374, "objective/rlhf_reward": -3.678760858551536, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 28.349754333496094, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6058744788169861, "step": 1992, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.997042179107666 }, { "episode": 31904, "epoch": 0.5734622712729626, "loss/policy_avg": 0.5715179443359375, "lr": 2.6179064417177913e-06, "objective/entropy": -42.54454803466797, "objective/kl": 17.865976333618164, "objective/non_score_reward": -1.786597728729248, "objective/rlhf_reward": -6.746390855312347, "objective/scores": 0.1, "policy/approxkl_avg": 28.264297485351562, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4473644196987152, "step": 1993, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9976756572723389 }, { "episode": 31920, "epoch": 0.5737498651903512, "loss/policy_avg": 0.45799100399017334, "lr": 2.6177147239263805e-06, "objective/entropy": -278.7918701171875, "objective/kl": 8.003768920898438, "objective/non_score_reward": -0.8003770112991333, "objective/rlhf_reward": -1.5396485380536182, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 38.73306655883789, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.650258481502533, "step": 1994, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.999009132385254 }, { "episode": 31936, "epoch": 0.5740374591077398, "loss/policy_avg": -0.06823521852493286, "lr": 2.6175230061349694e-06, "objective/entropy": -70.7914810180664, "objective/kl": 12.955906867980957, "objective/non_score_reward": -1.2955907583236694, "objective/rlhf_reward": -3.6261036982208044, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 24.642898559570312, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4823078513145447, "step": 1995, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.002995014190674 }, { "episode": 31952, "epoch": 0.5743250530251285, "loss/policy_avg": -0.12416256964206696, "lr": 2.617331288343558e-06, "objective/entropy": -112.54239654541016, "objective/kl": 3.643691062927246, "objective/non_score_reward": -0.36436912417411804, "objective/rlhf_reward": 0.9425235554575919, "objective/scores": 0.6, "policy/approxkl_avg": 1.6883997917175293, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.4229438304901123, "step": 1996, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.001127004623413 }, { "episode": 31968, "epoch": 0.5746126469425171, "loss/policy_avg": 0.8278412222862244, "lr": 2.6171395705521474e-06, "objective/entropy": -195.83651733398438, "objective/kl": 14.30837631225586, "objective/non_score_reward": -1.4308377504348755, "objective/rlhf_reward": -3.898522193702768, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 101.39013671875, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5440574884414673, "step": 1997, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9996654987335205 }, { "episode": 31984, "epoch": 0.5749002408599058, "loss/policy_avg": 0.5108898282051086, "lr": 2.616947852760736e-06, "objective/entropy": 372.1040344238281, "objective/kl": 18.92198944091797, "objective/non_score_reward": -1.8921990394592285, "objective/rlhf_reward": -7.1687962174415585, "objective/scores": 0.1, "policy/approxkl_avg": 18.65614128112793, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.9018811583518982, "step": 1998, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.997493028640747 }, { "episode": 32000, "epoch": 0.5751878347772945, "loss/policy_avg": 2.341435670852661, "lr": 2.616756134969325e-06, "objective/entropy": 200.54232788085938, "objective/kl": 6.54392147064209, "objective/non_score_reward": -0.6543921232223511, "objective/rlhf_reward": -2.217568612098694, "objective/scores": 0.1, "policy/approxkl_avg": 2.1673641204833984, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5942652225494385, "step": 1999, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0033254623413086 }, { "episode": 32016, "epoch": 0.5754754286946832, "loss/policy_avg": 0.32693976163864136, "lr": 2.6165644171779142e-06, "objective/entropy": 50.55780029296875, "objective/kl": 11.196540832519531, "objective/non_score_reward": -1.1196540594100952, "objective/rlhf_reward": -1.554897238255712, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 25.62256622314453, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8175270557403564, "step": 2000, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.996117115020752 }, { "episode": 32032, "epoch": 0.5757630226120718, "loss/policy_avg": 0.523784875869751, "lr": 2.616372699386503e-06, "objective/entropy": 142.1219024658203, "objective/kl": 19.365474700927734, "objective/non_score_reward": -1.9365475177764893, "objective/rlhf_reward": -6.012856737772623, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 87.06364440917969, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5942108035087585, "step": 2001, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9976255893707275 }, { "episode": 32048, "epoch": 0.5760506165294604, "loss/policy_avg": 0.37893959879875183, "lr": 2.6161809815950923e-06, "objective/entropy": -63.05537414550781, "objective/kl": 14.981361389160156, "objective/non_score_reward": -1.498136281967163, "objective/rlhf_reward": -3.5925449490547177, "objective/scores": 0.6, "policy/approxkl_avg": 7.1709418296813965, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6707227230072021, "step": 2002, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9987297058105469 }, { "episode": 32064, "epoch": 0.5763382104468491, "loss/policy_avg": 0.6728400588035583, "lr": 2.615989263803681e-06, "objective/entropy": -191.27810668945312, "objective/kl": 12.201635360717773, "objective/non_score_reward": -1.2201635837554932, "objective/rlhf_reward": -2.4806545138359066, "objective/scores": 0.6, "policy/approxkl_avg": 37.39985656738281, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5896199941635132, "step": 2003, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9985238313674927 }, { "episode": 32080, "epoch": 0.5766258043642377, "loss/policy_avg": -0.35722067952156067, "lr": 2.61579754601227e-06, "objective/entropy": 25.450927734375, "objective/kl": 13.033873558044434, "objective/non_score_reward": -1.3033874034881592, "objective/rlhf_reward": -2.8135497927665707, "objective/scores": 0.6, "policy/approxkl_avg": 39.72835159301758, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.44449368119239807, "step": 2004, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0003347396850586 }, { "episode": 32096, "epoch": 0.5769133982816264, "loss/policy_avg": 0.43654775619506836, "lr": 2.615605828220859e-06, "objective/entropy": -39.03499984741211, "objective/kl": 15.21171760559082, "objective/non_score_reward": -1.521172046661377, "objective/rlhf_reward": -5.684687769412994, "objective/scores": 0.1, "policy/approxkl_avg": 20.998878479003906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7011727094650269, "step": 2005, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9980759620666504 }, { "episode": 32112, "epoch": 0.577200992199015, "loss/policy_avg": -0.0905875414609909, "lr": 2.615414110429448e-06, "objective/entropy": 184.36927795410156, "objective/kl": 18.664302825927734, "objective/non_score_reward": -1.866430401802063, "objective/rlhf_reward": -5.343015732542549, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 12.670072555541992, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8122575283050537, "step": 2006, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0021438598632812 }, { "episode": 32128, "epoch": 0.5774885861164036, "loss/policy_avg": 0.4686034619808197, "lr": 2.615222392638037e-06, "objective/entropy": 6.0128631591796875, "objective/kl": 15.820640563964844, "objective/non_score_reward": -1.582064151763916, "objective/rlhf_reward": -8.328256607055664, "objective/scores": -0.5, "policy/approxkl_avg": 30.504484176635742, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6733450889587402, "step": 2007, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.997711420059204 }, { "episode": 32144, "epoch": 0.5777761800337923, "loss/policy_avg": 0.046472251415252686, "lr": 2.615030674846626e-06, "objective/entropy": -84.07270050048828, "objective/kl": 11.857654571533203, "objective/non_score_reward": -1.1857655048370361, "objective/rlhf_reward": -6.7430620193481445, "objective/scores": -0.5, "policy/approxkl_avg": 8.025468826293945, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.9546033143997192, "step": 2008, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0003514289855957 }, { "episode": 32160, "epoch": 0.5780637739511809, "loss/policy_avg": 0.0446510948240757, "lr": 2.614838957055215e-06, "objective/entropy": 38.73136901855469, "objective/kl": 17.77886962890625, "objective/non_score_reward": -1.7778868675231934, "objective/rlhf_reward": -5.286718364032815, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 15.033638954162598, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5836319923400879, "step": 2009, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000471591949463 }, { "episode": 32176, "epoch": 0.5783513678685696, "loss/policy_avg": -0.23901139199733734, "lr": 2.614647239263804e-06, "objective/entropy": -46.67261505126953, "objective/kl": 12.934122085571289, "objective/non_score_reward": -1.293412208557129, "objective/rlhf_reward": -3.0509426019349437, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 77.60780334472656, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5740710496902466, "step": 2010, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0050253868103027 }, { "episode": 32192, "epoch": 0.5786389617859582, "loss/policy_avg": -0.44756045937538147, "lr": 2.614455521472393e-06, "objective/entropy": -230.7035369873047, "objective/kl": 10.906891822814941, "objective/non_score_reward": -1.0906891822814941, "objective/rlhf_reward": -3.962757086753845, "objective/scores": 0.1, "policy/approxkl_avg": 1.9308230876922607, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5085049867630005, "step": 2011, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.002194404602051 }, { "episode": 32208, "epoch": 0.5789265557033468, "loss/policy_avg": 0.5888445973396301, "lr": 2.6142638036809817e-06, "objective/entropy": 220.92822265625, "objective/kl": 17.640647888183594, "objective/non_score_reward": -1.7640647888183594, "objective/rlhf_reward": -6.656258976459503, "objective/scores": 0.1, "policy/approxkl_avg": 26.095348358154297, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.809324324131012, "step": 2012, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0005431175231934 }, { "episode": 32224, "epoch": 0.5792141496207355, "loss/policy_avg": 0.026016470044851303, "lr": 2.6140720858895705e-06, "objective/entropy": 299.5074462890625, "objective/kl": 16.170055389404297, "objective/non_score_reward": -1.617005705833435, "objective/rlhf_reward": -6.06802282333374, "objective/scores": 0.1, "policy/approxkl_avg": 19.101377487182617, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7076947689056396, "step": 2013, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9998183250427246 }, { "episode": 32240, "epoch": 0.5795017435381241, "loss/policy_avg": 0.38306957483291626, "lr": 2.6138803680981593e-06, "objective/entropy": 248.753173828125, "objective/kl": 9.927104949951172, "objective/non_score_reward": -0.992710530757904, "objective/rlhf_reward": -3.570842123031616, "objective/scores": 0.1, "policy/approxkl_avg": 52.19784927368164, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7442108392715454, "step": 2014, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.99979829788208 }, { "episode": 32256, "epoch": 0.5797893374555129, "loss/policy_avg": 0.7347298860549927, "lr": 2.6136886503067485e-06, "objective/entropy": 264.7408447265625, "objective/kl": 20.418472290039062, "objective/non_score_reward": -2.041846990585327, "objective/rlhf_reward": -10.167387962341309, "objective/scores": -0.5, "policy/approxkl_avg": 30.207717895507812, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8410416841506958, "step": 2015, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9964735507965088 }, { "episode": 32272, "epoch": 0.5800769313729015, "loss/policy_avg": 0.10777494311332703, "lr": 2.6134969325153373e-06, "objective/entropy": -33.255218505859375, "objective/kl": 12.130520820617676, "objective/non_score_reward": -1.2130520343780518, "objective/rlhf_reward": -4.452207899093628, "objective/scores": 0.1, "policy/approxkl_avg": 58.70673370361328, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5788019895553589, "step": 2016, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999755620956421 }, { "episode": 32288, "epoch": 0.5803645252902901, "loss/policy_avg": 0.4941823482513428, "lr": 2.6133052147239266e-06, "objective/entropy": 166.5862579345703, "objective/kl": 18.242103576660156, "objective/non_score_reward": -1.824210286140442, "objective/rlhf_reward": -2.896841204166412, "objective/scores": 1.1, "policy/approxkl_avg": 6.646388053894043, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6881924271583557, "step": 2017, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9998159408569336 }, { "episode": 32304, "epoch": 0.5806521192076788, "loss/policy_avg": -0.22474974393844604, "lr": 2.6131134969325154e-06, "objective/entropy": -193.62680053710938, "objective/kl": 10.035699844360352, "objective/non_score_reward": -1.0035700798034668, "objective/rlhf_reward": -1.090561126114103, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 5.371432304382324, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5917261838912964, "step": 2018, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0010905265808105 }, { "episode": 32320, "epoch": 0.5809397131250674, "loss/policy_avg": 0.24477297067642212, "lr": 2.612921779141104e-06, "objective/entropy": -146.20303344726562, "objective/kl": 13.633316040039062, "objective/non_score_reward": -1.3633315563201904, "objective/rlhf_reward": -7.453326225280762, "objective/scores": -0.5, "policy/approxkl_avg": 43.155723571777344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.474772572517395, "step": 2019, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9979207515716553 }, { "episode": 32336, "epoch": 0.5812273070424561, "loss/policy_avg": 0.1283385157585144, "lr": 2.6127300613496934e-06, "objective/entropy": -5.738563537597656, "objective/kl": 20.078689575195312, "objective/non_score_reward": -2.007869243621826, "objective/rlhf_reward": -7.631476855278015, "objective/scores": 0.1, "policy/approxkl_avg": 46.95381164550781, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6904228329658508, "step": 2020, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9993395805358887 }, { "episode": 32352, "epoch": 0.5815149009598447, "loss/policy_avg": 0.26754477620124817, "lr": 2.6125383435582822e-06, "objective/entropy": 127.22401428222656, "objective/kl": 13.649679183959961, "objective/non_score_reward": -1.364967942237854, "objective/rlhf_reward": -7.459871768951416, "objective/scores": -0.5, "policy/approxkl_avg": 23.265398025512695, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.39020296931266785, "step": 2021, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9974396228790283 }, { "episode": 32368, "epoch": 0.5818024948772333, "loss/policy_avg": 0.20494411885738373, "lr": 2.612346625766871e-06, "objective/entropy": 185.99484252929688, "objective/kl": 11.199531555175781, "objective/non_score_reward": -1.1199530363082886, "objective/rlhf_reward": -4.0798122644424435, "objective/scores": 0.1, "policy/approxkl_avg": 26.743867874145508, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4661474823951721, "step": 2022, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0036611557006836 }, { "episode": 32384, "epoch": 0.582090088794622, "loss/policy_avg": -0.4422033429145813, "lr": 2.6121549079754603e-06, "objective/entropy": 106.64537048339844, "objective/kl": 9.224336624145508, "objective/non_score_reward": -0.9224337339401245, "objective/rlhf_reward": -0.7660157426607339, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 3.0834896564483643, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5475035309791565, "step": 2023, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0093464851379395 }, { "episode": 32400, "epoch": 0.5823776827120106, "loss/policy_avg": 0.2676440179347992, "lr": 2.611963190184049e-06, "objective/entropy": 111.9757080078125, "objective/kl": 17.51507568359375, "objective/non_score_reward": -1.7515077590942383, "objective/rlhf_reward": -9.006031036376953, "objective/scores": -0.5, "policy/approxkl_avg": 11.372546195983887, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5439157485961914, "step": 2024, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9999245405197144 }, { "episode": 32416, "epoch": 0.5826652766293993, "loss/policy_avg": 0.18945106863975525, "lr": 2.6117714723926383e-06, "objective/entropy": 183.9626007080078, "objective/kl": 12.516935348510742, "objective/non_score_reward": -1.2516934871673584, "objective/rlhf_reward": -4.6067740380764, "objective/scores": 0.1, "policy/approxkl_avg": 19.953580856323242, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6190844774246216, "step": 2025, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0008950233459473 }, { "episode": 32432, "epoch": 0.5829528705467879, "loss/policy_avg": 0.09500116109848022, "lr": 2.611579754601227e-06, "objective/entropy": 75.4105453491211, "objective/kl": 18.89362144470215, "objective/non_score_reward": -1.8893619775772095, "objective/rlhf_reward": -3.157448089122772, "objective/scores": 1.1, "policy/approxkl_avg": 45.4237060546875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4325343668460846, "step": 2026, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9984755516052246 }, { "episode": 32448, "epoch": 0.5832404644641765, "loss/policy_avg": 0.6804702281951904, "lr": 2.611388036809816e-06, "objective/entropy": -103.95718383789062, "objective/kl": 20.174652099609375, "objective/non_score_reward": -2.017465353012085, "objective/rlhf_reward": -3.6698614120483395, "objective/scores": 1.1, "policy/approxkl_avg": 23.145023345947266, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5804574489593506, "step": 2027, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.99713134765625 }, { "episode": 32464, "epoch": 0.5835280583815652, "loss/policy_avg": 0.39008814096450806, "lr": 2.611196319018405e-06, "objective/entropy": 193.8568572998047, "objective/kl": 10.51252555847168, "objective/non_score_reward": -1.0512524843215942, "objective/rlhf_reward": -2.471676708261172, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 48.12255859375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7742000818252563, "step": 2028, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9971537590026855 }, { "episode": 32480, "epoch": 0.5838156522989538, "loss/policy_avg": -0.27750644087791443, "lr": 2.611004601226994e-06, "objective/entropy": 0.22932052612304688, "objective/kl": 12.089995384216309, "objective/non_score_reward": -1.2089996337890625, "objective/rlhf_reward": -2.4359982967376705, "objective/scores": 0.6, "policy/approxkl_avg": 17.11049461364746, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4802630543708801, "step": 2029, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0016231536865234 }, { "episode": 32496, "epoch": 0.5841032462163426, "loss/policy_avg": 0.12393666803836823, "lr": 2.610812883435583e-06, "objective/entropy": 98.4588623046875, "objective/kl": 19.640907287597656, "objective/non_score_reward": -1.9640907049179077, "objective/rlhf_reward": -9.856363296508789, "objective/scores": -0.5, "policy/approxkl_avg": 7.085240364074707, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.26065224409103394, "step": 2030, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.997912049293518 }, { "episode": 32512, "epoch": 0.5843908401337312, "loss/policy_avg": -0.44430527091026306, "lr": 2.610621165644172e-06, "objective/entropy": 79.62841033935547, "objective/kl": 12.913888931274414, "objective/non_score_reward": -1.2913891077041626, "objective/rlhf_reward": -2.7655563116073605, "objective/scores": 0.6, "policy/approxkl_avg": 25.055837631225586, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.44760024547576904, "step": 2031, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.002227544784546 }, { "episode": 32528, "epoch": 0.5846784340511199, "loss/policy_avg": -0.6092654466629028, "lr": 2.610429447852761e-06, "objective/entropy": 88.92134094238281, "objective/kl": 14.717385292053223, "objective/non_score_reward": -1.471738338470459, "objective/rlhf_reward": -7.886953353881836, "objective/scores": -0.5, "policy/approxkl_avg": 37.71894836425781, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5509711503982544, "step": 2032, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0035958290100098 }, { "episode": 32544, "epoch": 0.5849660279685085, "loss/policy_avg": 0.23970016837120056, "lr": 2.61023773006135e-06, "objective/entropy": 152.98385620117188, "objective/kl": 15.108621597290039, "objective/non_score_reward": -1.510862112045288, "objective/rlhf_reward": -3.9207423648991924, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 23.307903289794922, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6641147136688232, "step": 2033, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9983627796173096 }, { "episode": 32560, "epoch": 0.5852536218858971, "loss/policy_avg": -0.2796148955821991, "lr": 2.6100460122699384e-06, "objective/entropy": 218.56283569335938, "objective/kl": 13.127628326416016, "objective/non_score_reward": -1.31276273727417, "objective/rlhf_reward": -3.6947917928367406, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 4.39356803894043, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5682885646820068, "step": 2034, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0047197341918945 }, { "episode": 32576, "epoch": 0.5855412158032858, "loss/policy_avg": 0.4500732421875, "lr": 2.6098542944785277e-06, "objective/entropy": 165.1419677734375, "objective/kl": 16.184364318847656, "objective/non_score_reward": -1.618436336517334, "objective/rlhf_reward": -6.073745763301849, "objective/scores": 0.1, "policy/approxkl_avg": 25.664451599121094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4045563340187073, "step": 2035, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9982361793518066 }, { "episode": 32592, "epoch": 0.5858288097206744, "loss/policy_avg": 1.1184520721435547, "lr": 2.6096625766871165e-06, "objective/entropy": 74.55671691894531, "objective/kl": 24.226789474487305, "objective/non_score_reward": -2.4226789474487305, "objective/rlhf_reward": -11.690715789794922, "objective/scores": -0.5, "policy/approxkl_avg": 29.897789001464844, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5844389200210571, "step": 2036, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.001523494720459 }, { "episode": 32608, "epoch": 0.586116403638063, "loss/policy_avg": -0.016746334731578827, "lr": 2.6094708588957053e-06, "objective/entropy": 117.7492904663086, "objective/kl": 16.263198852539062, "objective/non_score_reward": -1.6263197660446167, "objective/rlhf_reward": -4.382573070303474, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 147.6581573486328, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.9091885089874268, "step": 2037, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.005404472351074 }, { "episode": 32624, "epoch": 0.5864039975554517, "loss/policy_avg": 1.110663652420044, "lr": 2.6092791411042945e-06, "objective/entropy": 176.1644744873047, "objective/kl": 15.09540843963623, "objective/non_score_reward": -1.5095406770706177, "objective/rlhf_reward": -8.038162231445312, "objective/scores": -0.5, "policy/approxkl_avg": 28.064037322998047, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7703660726547241, "step": 2038, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9991657733917236 }, { "episode": 32640, "epoch": 0.5866915914728403, "loss/policy_avg": 0.08125920593738556, "lr": 2.6090874233128833e-06, "objective/entropy": 63.565513610839844, "objective/kl": 14.566051483154297, "objective/non_score_reward": -1.4566051959991455, "objective/rlhf_reward": -3.8790097338723495, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 35.45296096801758, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.3615097105503082, "step": 2039, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9950077533721924 }, { "episode": 32656, "epoch": 0.586979185390229, "loss/policy_avg": 0.6111937165260315, "lr": 2.6088957055214726e-06, "objective/entropy": 24.75860595703125, "objective/kl": 9.614357948303223, "objective/non_score_reward": -0.9614357948303223, "objective/rlhf_reward": -1.898332144098218, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 22.132753372192383, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.48295459151268005, "step": 2040, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9996814727783203 }, { "episode": 32672, "epoch": 0.5872667793076176, "loss/policy_avg": 0.3595702648162842, "lr": 2.6087039877300614e-06, "objective/entropy": -32.59522247314453, "objective/kl": 9.41091537475586, "objective/non_score_reward": -0.9410915374755859, "objective/rlhf_reward": 0.6356339693069462, "objective/scores": 1.1, "policy/approxkl_avg": 7.560471057891846, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.46686244010925293, "step": 2041, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0003132820129395 }, { "episode": 32688, "epoch": 0.5875543732250063, "loss/policy_avg": 0.41997677087783813, "lr": 2.60851226993865e-06, "objective/entropy": 269.6581726074219, "objective/kl": 15.583868980407715, "objective/non_score_reward": -1.5583869218826294, "objective/rlhf_reward": -3.8335475683212277, "objective/scores": 0.6, "policy/approxkl_avg": 26.866287231445312, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7443997859954834, "step": 2042, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0007753372192383 }, { "episode": 32704, "epoch": 0.5878419671423949, "loss/policy_avg": -0.09083029627799988, "lr": 2.6083205521472394e-06, "objective/entropy": 225.44314575195312, "objective/kl": 7.201667785644531, "objective/non_score_reward": -0.7201669216156006, "objective/rlhf_reward": 1.51933246254921, "objective/scores": 1.1, "policy/approxkl_avg": 0.43545225262641907, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5843120217323303, "step": 2043, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.004441976547241 }, { "episode": 32720, "epoch": 0.5881295610597835, "loss/policy_avg": 0.22586756944656372, "lr": 2.6081288343558282e-06, "objective/entropy": 8.289024353027344, "objective/kl": 22.944671630859375, "objective/non_score_reward": -2.2944672107696533, "objective/rlhf_reward": -7.353040213855813, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 92.32232666015625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6938124895095825, "step": 2044, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0006496906280518 }, { "episode": 32736, "epoch": 0.5884171549771723, "loss/policy_avg": -0.015067555010318756, "lr": 2.6079371165644175e-06, "objective/entropy": -206.26466369628906, "objective/kl": 14.04336929321289, "objective/non_score_reward": -1.4043371677398682, "objective/rlhf_reward": -1.2173484623432156, "objective/scores": 1.1, "policy/approxkl_avg": 4.8251872062683105, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.613540530204773, "step": 2045, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0013580322265625 }, { "episode": 32752, "epoch": 0.5887047488945609, "loss/policy_avg": 0.14012257754802704, "lr": 2.6077453987730063e-06, "objective/entropy": -78.76776885986328, "objective/kl": 18.866756439208984, "objective/non_score_reward": -1.8866755962371826, "objective/rlhf_reward": -9.54670238494873, "objective/scores": -0.5, "policy/approxkl_avg": 4.896300792694092, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.9173866510391235, "step": 2046, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9997925758361816 }, { "episode": 32768, "epoch": 0.5889923428119496, "loss/policy_avg": 0.01659621298313141, "lr": 2.607553680981595e-06, "objective/entropy": -23.22461700439453, "objective/kl": 12.49586009979248, "objective/non_score_reward": -1.2495861053466797, "objective/rlhf_reward": -4.598344361782074, "objective/scores": 0.1, "policy/approxkl_avg": 30.9034366607666, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5299052000045776, "step": 2047, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0000593662261963 }, { "episode": 32784, "epoch": 0.5892799367293382, "loss/policy_avg": 0.2230335772037506, "lr": 2.6073619631901843e-06, "objective/entropy": -54.427371978759766, "objective/kl": 16.49000358581543, "objective/non_score_reward": -1.6490004062652588, "objective/rlhf_reward": -4.862668232123056, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 39.52677917480469, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6089744567871094, "step": 2048, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998667597770691 }, { "episode": 32800, "epoch": 0.5895675306467268, "loss/policy_avg": 0.08745601773262024, "lr": 2.607170245398773e-06, "objective/entropy": -16.302215576171875, "objective/kl": 19.69635009765625, "objective/non_score_reward": -1.9696348905563354, "objective/rlhf_reward": -5.478539860248565, "objective/scores": 0.6, "policy/approxkl_avg": 31.309520721435547, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7404862642288208, "step": 2049, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9993966817855835 }, { "episode": 32816, "epoch": 0.5898551245641155, "loss/policy_avg": 0.2967149019241333, "lr": 2.606978527607362e-06, "objective/entropy": 202.35987854003906, "objective/kl": 21.735036849975586, "objective/non_score_reward": -2.173503875732422, "objective/rlhf_reward": -8.294015264511108, "objective/scores": 0.1, "policy/approxkl_avg": 16.59661293029785, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5977518558502197, "step": 2050, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.998718023300171 }, { "episode": 32832, "epoch": 0.5901427184815041, "loss/policy_avg": -0.04524856433272362, "lr": 2.606786809815951e-06, "objective/entropy": 199.6927490234375, "objective/kl": 12.524423599243164, "objective/non_score_reward": -1.2524423599243164, "objective/rlhf_reward": -2.086050663830015, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 7.180833339691162, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7892223596572876, "step": 2051, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000894069671631 }, { "episode": 32848, "epoch": 0.5904303123988928, "loss/policy_avg": 0.1637168824672699, "lr": 2.60659509202454e-06, "objective/entropy": -87.76792907714844, "objective/kl": 11.287168502807617, "objective/non_score_reward": -1.1287169456481934, "objective/rlhf_reward": -4.11486736536026, "objective/scores": 0.1, "policy/approxkl_avg": 2.6567301750183105, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6446081399917603, "step": 2052, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000990390777588 }, { "episode": 32864, "epoch": 0.5907179063162814, "loss/policy_avg": 0.6465239524841309, "lr": 2.606403374233129e-06, "objective/entropy": 207.52310180664062, "objective/kl": 14.48089599609375, "objective/non_score_reward": -1.448089599609375, "objective/rlhf_reward": -5.392358517646789, "objective/scores": 0.1, "policy/approxkl_avg": 39.36521911621094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.819767951965332, "step": 2053, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9989714622497559 }, { "episode": 32880, "epoch": 0.59100550023367, "loss/policy_avg": 0.5887457132339478, "lr": 2.606211656441718e-06, "objective/entropy": -170.1577606201172, "objective/kl": 13.745086669921875, "objective/non_score_reward": -1.3745086193084717, "objective/rlhf_reward": -5.098034194111824, "objective/scores": 0.1, "policy/approxkl_avg": 31.378250122070312, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4332253634929657, "step": 2054, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.001190185546875 }, { "episode": 32896, "epoch": 0.5912930941510587, "loss/policy_avg": -0.0851367712020874, "lr": 2.606019938650307e-06, "objective/entropy": -175.6912384033203, "objective/kl": 11.137053489685059, "objective/non_score_reward": -1.1137053966522217, "objective/rlhf_reward": -6.454821586608887, "objective/scores": -0.5, "policy/approxkl_avg": 40.83671569824219, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.573056697845459, "step": 2055, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0009331703186035 }, { "episode": 32912, "epoch": 0.5915806880684473, "loss/policy_avg": 0.05345374345779419, "lr": 2.605828220858896e-06, "objective/entropy": 35.686378479003906, "objective/kl": 15.517290115356445, "objective/non_score_reward": -1.5517290830612183, "objective/rlhf_reward": -1.8069164514541622, "objective/scores": 1.1, "policy/approxkl_avg": 61.70820999145508, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6552486419677734, "step": 2056, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9993808269500732 }, { "episode": 32928, "epoch": 0.591868281985836, "loss/policy_avg": 0.23575273156166077, "lr": 2.6056365030674844e-06, "objective/entropy": -190.5808868408203, "objective/kl": 16.56703758239746, "objective/non_score_reward": -1.6567037105560303, "objective/rlhf_reward": -6.226814544200897, "objective/scores": 0.1, "policy/approxkl_avg": 64.16632080078125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.47018104791641235, "step": 2057, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9991967678070068 }, { "episode": 32944, "epoch": 0.5921558759032246, "loss/policy_avg": 0.44772571325302124, "lr": 2.6054447852760737e-06, "objective/entropy": 51.64216995239258, "objective/kl": 17.7371826171875, "objective/non_score_reward": -1.7737181186676025, "objective/rlhf_reward": -6.694872593879699, "objective/scores": 0.1, "policy/approxkl_avg": 35.988739013671875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6093859672546387, "step": 2058, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9980683326721191 }, { "episode": 32960, "epoch": 0.5924434698206132, "loss/policy_avg": 0.11212430894374847, "lr": 2.6052530674846625e-06, "objective/entropy": 202.07647705078125, "objective/kl": 14.013270378112793, "objective/non_score_reward": -1.4013270139694214, "objective/rlhf_reward": -2.681589041591856, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 51.12955093383789, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6560755968093872, "step": 2059, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9973139762878418 }, { "episode": 32976, "epoch": 0.592731063738002, "loss/policy_avg": 0.16852912306785583, "lr": 2.6050613496932513e-06, "objective/entropy": 158.34957885742188, "objective/kl": 17.040546417236328, "objective/non_score_reward": -1.7040544748306274, "objective/rlhf_reward": -6.416217899322509, "objective/scores": 0.1, "policy/approxkl_avg": 39.814979553222656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8345825672149658, "step": 2060, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9986486434936523 }, { "episode": 32992, "epoch": 0.5930186576553906, "loss/policy_avg": 0.5923249125480652, "lr": 2.6048696319018405e-06, "objective/entropy": 77.84188079833984, "objective/kl": 13.63203239440918, "objective/non_score_reward": -1.3632032871246338, "objective/rlhf_reward": -7.452813148498535, "objective/scores": -0.5, "policy/approxkl_avg": 71.78942108154297, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6449399590492249, "step": 2061, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000882625579834 }, { "episode": 33008, "epoch": 0.5933062515727793, "loss/policy_avg": 0.13562044501304626, "lr": 2.6046779141104293e-06, "objective/entropy": 22.380054473876953, "objective/kl": 14.888589859008789, "objective/non_score_reward": -1.488858938217163, "objective/rlhf_reward": -4.008024792151387, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 2.43534779548645, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5684009790420532, "step": 2062, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000018358230591 }, { "episode": 33024, "epoch": 0.5935938454901679, "loss/policy_avg": -0.22385066747665405, "lr": 2.6044861963190186e-06, "objective/entropy": -12.513151168823242, "objective/kl": 15.879064559936523, "objective/non_score_reward": -1.5879064798355103, "objective/rlhf_reward": -8.351625442504883, "objective/scores": -0.5, "policy/approxkl_avg": 0.4901871085166931, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7548037767410278, "step": 2063, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.002842426300049 }, { "episode": 33040, "epoch": 0.5938814394075566, "loss/policy_avg": -0.4307985305786133, "lr": 2.6042944785276074e-06, "objective/entropy": 151.069091796875, "objective/kl": 16.00432777404785, "objective/non_score_reward": -1.6004326343536377, "objective/rlhf_reward": -4.2790240667024, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 53.35271453857422, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.42856454849243164, "step": 2064, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0085434913635254 }, { "episode": 33056, "epoch": 0.5941690333249452, "loss/policy_avg": -0.3380109965801239, "lr": 2.604102760736196e-06, "objective/entropy": 298.4809265136719, "objective/kl": 11.528972625732422, "objective/non_score_reward": -1.1528971195220947, "objective/rlhf_reward": -2.7867600872841587, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 62.940818786621094, "policy/clipfrac_avg": 2.0, "policy/entropy_avg": 0.983273983001709, "step": 2065, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0027170181274414 }, { "episode": 33072, "epoch": 0.5944566272423338, "loss/policy_avg": 0.20114445686340332, "lr": 2.6039110429447854e-06, "objective/entropy": 193.4998779296875, "objective/kl": 15.039509773254395, "objective/non_score_reward": -1.503950834274292, "objective/rlhf_reward": -5.615803307294845, "objective/scores": 0.1, "policy/approxkl_avg": 107.57582092285156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5463466048240662, "step": 2066, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.00111722946167 }, { "episode": 33088, "epoch": 0.5947442211597225, "loss/policy_avg": 0.43512579798698425, "lr": 2.6037193251533742e-06, "objective/entropy": 73.24812316894531, "objective/kl": 22.267837524414062, "objective/non_score_reward": -2.2267839908599854, "objective/rlhf_reward": -8.507136023044586, "objective/scores": 0.1, "policy/approxkl_avg": 79.75790405273438, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.835204005241394, "step": 2067, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9973831176757812 }, { "episode": 33104, "epoch": 0.5950318150771111, "loss/policy_avg": 0.06888563930988312, "lr": 2.6035276073619635e-06, "objective/entropy": 232.14309692382812, "objective/kl": 15.005265235900879, "objective/non_score_reward": -1.5005265474319458, "objective/rlhf_reward": -5.602106219530105, "objective/scores": 0.1, "policy/approxkl_avg": 34.626739501953125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5669118165969849, "step": 2068, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9966709613800049 }, { "episode": 33120, "epoch": 0.5953194089944998, "loss/policy_avg": 0.2841304838657379, "lr": 2.6033358895705523e-06, "objective/entropy": -28.131656646728516, "objective/kl": 15.817472457885742, "objective/non_score_reward": -1.58174729347229, "objective/rlhf_reward": -4.204282762781654, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 12.378704071044922, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7138805389404297, "step": 2069, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9970505237579346 }, { "episode": 33136, "epoch": 0.5956070029118884, "loss/policy_avg": 0.0031527914106845856, "lr": 2.603144171779141e-06, "objective/entropy": 21.773712158203125, "objective/kl": 10.743021011352539, "objective/non_score_reward": -1.0743021965026855, "objective/rlhf_reward": 0.10279151201248204, "objective/scores": 1.1, "policy/approxkl_avg": 1.1652061939239502, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7505438327789307, "step": 2070, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000839948654175 }, { "episode": 33152, "epoch": 0.595894596829277, "loss/policy_avg": 0.1873525083065033, "lr": 2.6029524539877303e-06, "objective/entropy": 12.441814422607422, "objective/kl": 18.700258255004883, "objective/non_score_reward": -1.8700261116027832, "objective/rlhf_reward": -7.080104267597198, "objective/scores": 0.1, "policy/approxkl_avg": 30.97005271911621, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5343834161758423, "step": 2071, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0008387565612793 }, { "episode": 33168, "epoch": 0.5961821907466657, "loss/policy_avg": 0.5001279711723328, "lr": 2.602760736196319e-06, "objective/entropy": -264.6827392578125, "objective/kl": 12.770312309265137, "objective/non_score_reward": -1.2770311832427979, "objective/rlhf_reward": -4.7081250011920925, "objective/scores": 0.1, "policy/approxkl_avg": 89.31438446044922, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5510841608047485, "step": 2072, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9999988079071045 }, { "episode": 33184, "epoch": 0.5964697846640543, "loss/policy_avg": 0.15056979656219482, "lr": 2.602569018404908e-06, "objective/entropy": -190.62677001953125, "objective/kl": 13.944238662719727, "objective/non_score_reward": -1.3944240808486938, "objective/rlhf_reward": -3.1776962041854855, "objective/scores": 0.6, "policy/approxkl_avg": 52.246315002441406, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7502607107162476, "step": 2073, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.996614933013916 }, { "episode": 33200, "epoch": 0.596757378581443, "loss/policy_avg": 0.37330591678619385, "lr": 2.602377300613497e-06, "objective/entropy": 47.70793533325195, "objective/kl": 14.258697509765625, "objective/non_score_reward": -1.4258697032928467, "objective/rlhf_reward": -5.3034790515899655, "objective/scores": 0.1, "policy/approxkl_avg": 10.65910530090332, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6878872513771057, "step": 2074, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0026094913482666 }, { "episode": 33216, "epoch": 0.5970449724988316, "loss/policy_avg": 0.08630452305078506, "lr": 2.602185582822086e-06, "objective/entropy": -24.456968307495117, "objective/kl": 13.694202423095703, "objective/non_score_reward": -1.3694202899932861, "objective/rlhf_reward": -5.077681159973144, "objective/scores": 0.1, "policy/approxkl_avg": 10.62160873413086, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6508252620697021, "step": 2075, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9972835779190063 }, { "episode": 33232, "epoch": 0.5973325664162203, "loss/policy_avg": 0.7143691778182983, "lr": 2.601993865030675e-06, "objective/entropy": 244.7336883544922, "objective/kl": 15.322755813598633, "objective/non_score_reward": -1.5322757959365845, "objective/rlhf_reward": -4.006396951452766, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 27.416717529296875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.9038101434707642, "step": 2076, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9971814155578613 }, { "episode": 33248, "epoch": 0.597620160333609, "loss/policy_avg": 0.8709487915039062, "lr": 2.601802147239264e-06, "objective/entropy": 22.043338775634766, "objective/kl": 13.561304092407227, "objective/non_score_reward": -1.356130599975586, "objective/rlhf_reward": -1.0245220422744747, "objective/scores": 1.1, "policy/approxkl_avg": 17.434871673583984, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5641729235649109, "step": 2077, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9974422454833984 }, { "episode": 33264, "epoch": 0.5979077542509976, "loss/policy_avg": 0.02021685242652893, "lr": 2.601610429447853e-06, "objective/entropy": -15.377296447753906, "objective/kl": 13.26352310180664, "objective/non_score_reward": -1.3263523578643799, "objective/rlhf_reward": -4.9054096102714535, "objective/scores": 0.1, "policy/approxkl_avg": 10.072490692138672, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.62602698802948, "step": 2078, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0023858547210693 }, { "episode": 33280, "epoch": 0.5981953481683863, "loss/policy_avg": 0.3721355199813843, "lr": 2.6014187116564416e-06, "objective/entropy": -88.40010070800781, "objective/kl": 19.315494537353516, "objective/non_score_reward": -1.9315496683120728, "objective/rlhf_reward": -7.326198710501194, "objective/scores": 0.1, "policy/approxkl_avg": 127.0091552734375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8153165578842163, "step": 2079, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9973070621490479 }, { "episode": 33296, "epoch": 0.5984829420857749, "loss/policy_avg": 0.30539119243621826, "lr": 2.6012269938650304e-06, "objective/entropy": 47.699588775634766, "objective/kl": 14.241891860961914, "objective/non_score_reward": -1.4241890907287598, "objective/rlhf_reward": -5.296756184101104, "objective/scores": 0.1, "policy/approxkl_avg": 1.3573634624481201, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.3682810068130493, "step": 2080, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000620126724243 }, { "episode": 33312, "epoch": 0.5987705360031635, "loss/policy_avg": 0.04993399232625961, "lr": 2.6010352760736197e-06, "objective/entropy": -33.56816101074219, "objective/kl": 16.420448303222656, "objective/non_score_reward": -1.6420449018478394, "objective/rlhf_reward": -2.168179845809936, "objective/scores": 1.1, "policy/approxkl_avg": 29.767711639404297, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6949491500854492, "step": 2081, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9982733726501465 }, { "episode": 33328, "epoch": 0.5990581299205522, "loss/policy_avg": 0.12590837478637695, "lr": 2.6008435582822085e-06, "objective/entropy": 70.84333801269531, "objective/kl": 18.988134384155273, "objective/non_score_reward": -1.898813247680664, "objective/rlhf_reward": -5.195253229141235, "objective/scores": 0.6, "policy/approxkl_avg": 52.74480438232422, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5343348979949951, "step": 2082, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998291015625 }, { "episode": 33344, "epoch": 0.5993457238379408, "loss/policy_avg": 0.4130307734012604, "lr": 2.6006518404907977e-06, "objective/entropy": 143.94061279296875, "objective/kl": 17.626792907714844, "objective/non_score_reward": -1.7626793384552002, "objective/rlhf_reward": -5.225888605388711, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 10.599780082702637, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5684624910354614, "step": 2083, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.001006841659546 }, { "episode": 33360, "epoch": 0.5996333177553295, "loss/policy_avg": 0.35321566462516785, "lr": 2.6004601226993865e-06, "objective/entropy": 143.16958618164062, "objective/kl": 15.126535415649414, "objective/non_score_reward": -1.5126534700393677, "objective/rlhf_reward": -4.103202651219304, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 11.734973907470703, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.3501328229904175, "step": 2084, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.999567985534668 }, { "episode": 33376, "epoch": 0.5999209116727181, "loss/policy_avg": 0.1641218066215515, "lr": 2.6002684049079753e-06, "objective/entropy": -69.33512115478516, "objective/kl": 18.16229248046875, "objective/non_score_reward": -1.8162293434143066, "objective/rlhf_reward": -4.864917194843292, "objective/scores": 0.6, "policy/approxkl_avg": 48.57612991333008, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7362520098686218, "step": 2085, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9975570440292358 }, { "episode": 33392, "epoch": 0.6002085055901067, "loss/policy_avg": 0.4311325252056122, "lr": 2.6000766871165646e-06, "objective/entropy": 76.26156616210938, "objective/kl": 15.489213943481445, "objective/non_score_reward": -1.5489213466644287, "objective/rlhf_reward": -1.7956856250762936, "objective/scores": 1.1, "policy/approxkl_avg": 85.90757751464844, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4286644756793976, "step": 2086, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.998852252960205 }, { "episode": 33408, "epoch": 0.6004960995074954, "loss/policy_avg": 0.591722846031189, "lr": 2.5998849693251534e-06, "objective/entropy": 189.01287841796875, "objective/kl": 19.79343032836914, "objective/non_score_reward": -1.979343295097351, "objective/rlhf_reward": -9.917373657226562, "objective/scores": -0.5, "policy/approxkl_avg": 44.156105041503906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8051648736000061, "step": 2087, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000519037246704 }, { "episode": 33424, "epoch": 0.600783693424884, "loss/policy_avg": 0.10551775991916656, "lr": 2.599693251533742e-06, "objective/entropy": 189.19497680664062, "objective/kl": 17.67028045654297, "objective/non_score_reward": -1.7670280933380127, "objective/rlhf_reward": -4.945406439081703, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 30.548511505126953, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.47096550464630127, "step": 2088, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9998936653137207 }, { "episode": 33440, "epoch": 0.6010712873422727, "loss/policy_avg": 0.00404435396194458, "lr": 2.5995015337423314e-06, "objective/entropy": 2.9050827026367188, "objective/kl": 14.470444679260254, "objective/non_score_reward": -1.4470446109771729, "objective/rlhf_reward": -7.788178443908691, "objective/scores": -0.5, "policy/approxkl_avg": 13.609698295593262, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4019898772239685, "step": 2089, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999104619026184 }, { "episode": 33456, "epoch": 0.6013588812596613, "loss/policy_avg": -0.2547715902328491, "lr": 2.5993098159509202e-06, "objective/entropy": 64.21862030029297, "objective/kl": 15.499526023864746, "objective/non_score_reward": -1.5499526262283325, "objective/rlhf_reward": -8.199810981750488, "objective/scores": -0.5, "policy/approxkl_avg": 1.8819937705993652, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.7023075222969055, "step": 2090, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.001739025115967 }, { "episode": 33472, "epoch": 0.60164647517705, "loss/policy_avg": 0.21478338539600372, "lr": 2.5991180981595095e-06, "objective/entropy": -2.176471710205078, "objective/kl": 9.324807167053223, "objective/non_score_reward": -0.9324808120727539, "objective/rlhf_reward": -3.3299232482910153, "objective/scores": 0.1, "policy/approxkl_avg": 16.506996154785156, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.37977907061576843, "step": 2091, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9980961084365845 }, { "episode": 33488, "epoch": 0.6019340690944387, "loss/policy_avg": 0.3434308171272278, "lr": 2.5989263803680983e-06, "objective/entropy": -41.94181823730469, "objective/kl": 14.127401351928711, "objective/non_score_reward": -1.4127402305603027, "objective/rlhf_reward": -1.250961190462112, "objective/scores": 1.1, "policy/approxkl_avg": 36.69062042236328, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.31178468465805054, "step": 2092, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.998267650604248 }, { "episode": 33504, "epoch": 0.6022216630118273, "loss/policy_avg": 0.012844249606132507, "lr": 2.598734662576687e-06, "objective/entropy": -70.4189224243164, "objective/kl": 9.646286964416504, "objective/non_score_reward": -0.9646286368370056, "objective/rlhf_reward": -3.458514547348022, "objective/scores": 0.1, "policy/approxkl_avg": 9.505643844604492, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5593554973602295, "step": 2093, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0008978843688965 }, { "episode": 33520, "epoch": 0.602509256929216, "loss/policy_avg": 2.3791656494140625, "lr": 2.5985429447852763e-06, "objective/entropy": -168.18963623046875, "objective/kl": 10.93196964263916, "objective/non_score_reward": -1.0931968688964844, "objective/rlhf_reward": -2.7109279088383778, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 10.194269180297852, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.639975368976593, "step": 2094, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9989033937454224 }, { "episode": 33536, "epoch": 0.6027968508466046, "loss/policy_avg": 0.23867468535900116, "lr": 2.598351226993865e-06, "objective/entropy": 28.35572052001953, "objective/kl": 17.075637817382812, "objective/non_score_reward": -1.7075636386871338, "objective/rlhf_reward": -2.4302548527717587, "objective/scores": 1.1, "policy/approxkl_avg": 29.440872192382812, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6343339681625366, "step": 2095, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9969229698181152 }, { "episode": 33552, "epoch": 0.6030844447639933, "loss/policy_avg": 0.27316632866859436, "lr": 2.5981595092024544e-06, "objective/entropy": -110.1327896118164, "objective/kl": 14.059263229370117, "objective/non_score_reward": -1.40592622756958, "objective/rlhf_reward": -3.798876370462488, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 13.267318725585938, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7431437969207764, "step": 2096, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000326633453369 }, { "episode": 33568, "epoch": 0.6033720386813819, "loss/policy_avg": 0.04450540989637375, "lr": 2.597967791411043e-06, "objective/entropy": -200.97593688964844, "objective/kl": 14.460651397705078, "objective/non_score_reward": -1.4460651874542236, "objective/rlhf_reward": -4.050927476088206, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 13.181662559509277, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6776976585388184, "step": 2097, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9983975887298584 }, { "episode": 33584, "epoch": 0.6036596325987705, "loss/policy_avg": 0.43644464015960693, "lr": 2.597776073619632e-06, "objective/entropy": -169.66427612304688, "objective/kl": 14.372163772583008, "objective/non_score_reward": -1.4372166395187378, "objective/rlhf_reward": -1.348866498470306, "objective/scores": 1.1, "policy/approxkl_avg": 105.80145263671875, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.46849268674850464, "step": 2098, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9996814727783203 }, { "episode": 33600, "epoch": 0.6039472265161592, "loss/policy_avg": -0.29122257232666016, "lr": 2.597584355828221e-06, "objective/entropy": 185.86685180664062, "objective/kl": 13.328239440917969, "objective/non_score_reward": -1.3328239917755127, "objective/rlhf_reward": -4.9312962055206295, "objective/scores": 0.1, "policy/approxkl_avg": 2.79552960395813, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.4727972149848938, "step": 2099, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0019476413726807 }, { "episode": 33616, "epoch": 0.6042348204335478, "loss/policy_avg": 0.20983460545539856, "lr": 2.59739263803681e-06, "objective/entropy": -98.51879119873047, "objective/kl": 16.45016860961914, "objective/non_score_reward": -1.6450170278549194, "objective/rlhf_reward": -4.755239243778298, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 99.38389587402344, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.591879665851593, "step": 2100, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0003983974456787 }, { "episode": 33632, "epoch": 0.6045224143509365, "loss/policy_avg": 0.22341737151145935, "lr": 2.597200920245399e-06, "objective/entropy": -89.65531921386719, "objective/kl": 13.054361343383789, "objective/non_score_reward": -1.305436134338379, "objective/rlhf_reward": -7.221744537353516, "objective/scores": -0.5, "policy/approxkl_avg": 7.273288726806641, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.5509786605834961, "step": 2101, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0029349327087402 }, { "episode": 33648, "epoch": 0.6048100082683251, "loss/policy_avg": 0.3788665533065796, "lr": 2.5970092024539876e-06, "objective/entropy": 6.503326416015625, "objective/kl": 19.492740631103516, "objective/non_score_reward": -1.9492741823196411, "objective/rlhf_reward": -9.797096252441406, "objective/scores": -0.5, "policy/approxkl_avg": 48.16189193725586, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5450559854507446, "step": 2102, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9992316961288452 }, { "episode": 33664, "epoch": 0.6050976021857137, "loss/policy_avg": -0.001059025526046753, "lr": 2.5968174846625764e-06, "objective/entropy": 101.92574310302734, "objective/kl": 8.146560668945312, "objective/non_score_reward": -0.8146560192108154, "objective/rlhf_reward": -0.8586242109537123, "objective/scores": 0.6, "policy/approxkl_avg": 1.331566572189331, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.59111088514328, "step": 2103, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000415086746216 }, { "episode": 33680, "epoch": 0.6053851961031024, "loss/policy_avg": -0.27263638377189636, "lr": 2.5966257668711657e-06, "objective/entropy": 77.05372619628906, "objective/kl": 17.506303787231445, "objective/non_score_reward": -1.7506303787231445, "objective/rlhf_reward": -6.602521447837352, "objective/scores": 0.1, "policy/approxkl_avg": 11.02352523803711, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5078814029693604, "step": 2104, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.00350284576416 }, { "episode": 33696, "epoch": 0.605672790020491, "loss/policy_avg": 0.2301843762397766, "lr": 2.5964340490797545e-06, "objective/entropy": 137.177734375, "objective/kl": 9.50554084777832, "objective/non_score_reward": -0.9505541324615479, "objective/rlhf_reward": -1.4022166490554808, "objective/scores": 0.6, "policy/approxkl_avg": 3.2152836322784424, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.690402626991272, "step": 2105, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0016870498657227 }, { "episode": 33712, "epoch": 0.6059603839378798, "loss/policy_avg": 0.06770651787519455, "lr": 2.5962423312883437e-06, "objective/entropy": 2.0992813110351562, "objective/kl": 9.80885124206543, "objective/non_score_reward": -0.9808850288391113, "objective/rlhf_reward": -0.9998212500822273, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 2.15012526512146, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5921926498413086, "step": 2106, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.003202438354492 }, { "episode": 33728, "epoch": 0.6062479778552684, "loss/policy_avg": 0.25330886244773865, "lr": 2.5960506134969325e-06, "objective/entropy": 196.8042449951172, "objective/kl": 22.02609634399414, "objective/non_score_reward": -2.2026097774505615, "objective/rlhf_reward": -4.410439348220825, "objective/scores": 1.1, "policy/approxkl_avg": 146.92312622070312, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.510128915309906, "step": 2107, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.996383547782898 }, { "episode": 33744, "epoch": 0.606535571772657, "loss/policy_avg": 0.3180674910545349, "lr": 2.5958588957055213e-06, "objective/entropy": 109.73847198486328, "objective/kl": 10.875676155090332, "objective/non_score_reward": -1.087567687034607, "objective/rlhf_reward": -3.95027065873146, "objective/scores": 0.1, "policy/approxkl_avg": 34.97682189941406, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.3390194773674011, "step": 2108, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9979220628738403 }, { "episode": 33760, "epoch": 0.6068231656900457, "loss/policy_avg": 0.31364506483078003, "lr": 2.5956671779141106e-06, "objective/entropy": 108.84683227539062, "objective/kl": 22.46896743774414, "objective/non_score_reward": -2.246896743774414, "objective/rlhf_reward": -7.4313281466632635, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 53.97173309326172, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.512633204460144, "step": 2109, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998118281364441 }, { "episode": 33776, "epoch": 0.6071107596074343, "loss/policy_avg": 0.10164511948823929, "lr": 2.5954754601226994e-06, "objective/entropy": 154.34152221679688, "objective/kl": 13.435369491577148, "objective/non_score_reward": -1.3435368537902832, "objective/rlhf_reward": -2.4504285796892376, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 6.303553104400635, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6599438190460205, "step": 2110, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999377965927124 }, { "episode": 33792, "epoch": 0.607398353524823, "loss/policy_avg": 0.16709858179092407, "lr": 2.595283742331288e-06, "objective/entropy": 188.6673583984375, "objective/kl": 19.079668045043945, "objective/non_score_reward": -1.9079668521881104, "objective/rlhf_reward": -5.68445612020963, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 9.285051345825195, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7654898166656494, "step": 2111, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0025992393493652 }, { "episode": 33808, "epoch": 0.6076859474422116, "loss/policy_avg": 0.09121402353048325, "lr": 2.5950920245398774e-06, "objective/entropy": 31.230186462402344, "objective/kl": 15.165401458740234, "objective/non_score_reward": -1.516540288925171, "objective/rlhf_reward": -8.066161155700684, "objective/scores": -0.5, "policy/approxkl_avg": 43.637332916259766, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7986932396888733, "step": 2112, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9979455471038818 }, { "episode": 33824, "epoch": 0.6079735413596002, "loss/policy_avg": 0.25094541907310486, "lr": 2.5949003067484662e-06, "objective/entropy": -11.48862075805664, "objective/kl": 18.022008895874023, "objective/non_score_reward": -1.8022009134292603, "objective/rlhf_reward": -6.80880377292633, "objective/scores": 0.1, "policy/approxkl_avg": 36.03727722167969, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.582868218421936, "step": 2113, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9999412298202515 }, { "episode": 33840, "epoch": 0.6082611352769889, "loss/policy_avg": 0.4924889802932739, "lr": 2.5947085889570555e-06, "objective/entropy": -97.55308532714844, "objective/kl": 14.677192687988281, "objective/non_score_reward": -1.467719316482544, "objective/rlhf_reward": -1.4708772957324978, "objective/scores": 1.1, "policy/approxkl_avg": 17.880861282348633, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7399845719337463, "step": 2114, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0017504692077637 }, { "episode": 33856, "epoch": 0.6085487291943775, "loss/policy_avg": 0.4132453203201294, "lr": 2.5945168711656443e-06, "objective/entropy": 52.88839340209961, "objective/kl": 18.023258209228516, "objective/non_score_reward": -1.802325963973999, "objective/rlhf_reward": -5.3844751968708735, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 36.086788177490234, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5082260370254517, "step": 2115, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9986095428466797 }, { "episode": 33872, "epoch": 0.6088363231117662, "loss/policy_avg": 0.22052277624607086, "lr": 2.594325153374233e-06, "objective/entropy": 129.5975799560547, "objective/kl": 18.520706176757812, "objective/non_score_reward": -1.8520703315734863, "objective/rlhf_reward": -7.008281609416008, "objective/scores": 0.1, "policy/approxkl_avg": 51.77848815917969, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7177315354347229, "step": 2116, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9968171119689941 }, { "episode": 33888, "epoch": 0.6091239170291548, "loss/policy_avg": 0.14162278175354004, "lr": 2.5941334355828223e-06, "objective/entropy": -17.719482421875, "objective/kl": 7.724222183227539, "objective/non_score_reward": -0.7724223136901855, "objective/rlhf_reward": 1.3103109240531925, "objective/scores": 1.1, "policy/approxkl_avg": 69.4462890625, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8562661409378052, "step": 2117, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9992499351501465 }, { "episode": 33904, "epoch": 0.6094115109465434, "loss/policy_avg": 0.1741405427455902, "lr": 2.593941717791411e-06, "objective/entropy": 275.2198486328125, "objective/kl": 18.36294937133789, "objective/non_score_reward": -1.8362950086593628, "objective/rlhf_reward": -6.94518015384674, "objective/scores": 0.1, "policy/approxkl_avg": 19.012439727783203, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.9217780828475952, "step": 2118, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9999818801879883 }, { "episode": 33920, "epoch": 0.6096991048639321, "loss/policy_avg": 4.892614364624023, "lr": 2.5937500000000004e-06, "objective/entropy": -6.740753173828125, "objective/kl": 8.5400390625, "objective/non_score_reward": -0.85400390625, "objective/rlhf_reward": -5.416015625, "objective/scores": -0.5, "policy/approxkl_avg": 1.4418704509735107, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.538404107093811, "step": 2119, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0201470851898193 }, { "episode": 33936, "epoch": 0.6099866987813207, "loss/policy_avg": 0.21670030057430267, "lr": 2.593558282208589e-06, "objective/entropy": 2.4147682189941406, "objective/kl": 15.552010536193848, "objective/non_score_reward": -1.5552010536193848, "objective/rlhf_reward": -8.220804214477539, "objective/scores": -0.5, "policy/approxkl_avg": 105.36181640625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6115633249282837, "step": 2120, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9976463317871094 }, { "episode": 33952, "epoch": 0.6102742926987095, "loss/policy_avg": 0.033039819449186325, "lr": 2.593366564417178e-06, "objective/entropy": 73.02333068847656, "objective/kl": 22.716238021850586, "objective/non_score_reward": -2.2716236114501953, "objective/rlhf_reward": -8.686495041847229, "objective/scores": 0.1, "policy/approxkl_avg": 73.9190444946289, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.49127280712127686, "step": 2121, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.999624252319336 }, { "episode": 33968, "epoch": 0.6105618866160981, "loss/policy_avg": -0.7839405536651611, "lr": 2.5931748466257672e-06, "objective/entropy": 69.94853973388672, "objective/kl": 12.057262420654297, "objective/non_score_reward": -1.2057262659072876, "objective/rlhf_reward": -0.4229049146175381, "objective/scores": 1.1, "policy/approxkl_avg": 41.606651306152344, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6370517015457153, "step": 2122, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0159318447113037 }, { "episode": 33984, "epoch": 0.6108494805334868, "loss/policy_avg": 0.07773086428642273, "lr": 2.5929831288343556e-06, "objective/entropy": -92.78646087646484, "objective/kl": 13.057992935180664, "objective/non_score_reward": -1.3057992458343506, "objective/rlhf_reward": -0.8231969535350796, "objective/scores": 1.1, "policy/approxkl_avg": 11.172804832458496, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6352230310440063, "step": 2123, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.99819016456604 }, { "episode": 34000, "epoch": 0.6111370744508754, "loss/policy_avg": 0.22555482387542725, "lr": 2.592791411042945e-06, "objective/entropy": -72.2308349609375, "objective/kl": 17.175045013427734, "objective/non_score_reward": -1.7175045013427734, "objective/rlhf_reward": -6.4700182437896725, "objective/scores": 0.1, "policy/approxkl_avg": 29.779685974121094, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4434419870376587, "step": 2124, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.996457576751709 }, { "episode": 34016, "epoch": 0.611424668368264, "loss/policy_avg": 0.05994892120361328, "lr": 2.5925996932515336e-06, "objective/entropy": 246.38250732421875, "objective/kl": 14.213615417480469, "objective/non_score_reward": -1.4213614463806152, "objective/rlhf_reward": -7.685445785522461, "objective/scores": -0.5, "policy/approxkl_avg": 44.60198974609375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7083953619003296, "step": 2125, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.002152442932129 }, { "episode": 34032, "epoch": 0.6117122622856527, "loss/policy_avg": 0.12799659371376038, "lr": 2.5924079754601225e-06, "objective/entropy": 208.40374755859375, "objective/kl": 19.928834915161133, "objective/non_score_reward": -1.9928836822509766, "objective/rlhf_reward": -3.5715342521667477, "objective/scores": 1.1, "policy/approxkl_avg": 145.67617797851562, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5317258238792419, "step": 2126, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9993952512741089 }, { "episode": 34048, "epoch": 0.6119998562030413, "loss/policy_avg": 0.6138862371444702, "lr": 2.5922162576687117e-06, "objective/entropy": -125.1441421508789, "objective/kl": 18.083675384521484, "objective/non_score_reward": -1.8083676099777222, "objective/rlhf_reward": -6.833470410108566, "objective/scores": 0.1, "policy/approxkl_avg": 72.46153259277344, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6206263899803162, "step": 2127, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9993391036987305 }, { "episode": 34064, "epoch": 0.61228745012043, "loss/policy_avg": 0.24099749326705933, "lr": 2.5920245398773005e-06, "objective/entropy": 63.56041717529297, "objective/kl": 15.835171699523926, "objective/non_score_reward": -1.583517074584961, "objective/rlhf_reward": -4.509239728721688, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 29.088022232055664, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7318596243858337, "step": 2128, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9989407062530518 }, { "episode": 34080, "epoch": 0.6125750440378186, "loss/policy_avg": 0.10305094718933105, "lr": 2.5918328220858897e-06, "objective/entropy": 70.02751159667969, "objective/kl": 16.354942321777344, "objective/non_score_reward": -1.6354944705963135, "objective/rlhf_reward": -8.541976928710938, "objective/scores": -0.5, "policy/approxkl_avg": 22.052690505981445, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6812468767166138, "step": 2129, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.002415180206299 }, { "episode": 34096, "epoch": 0.6128626379552072, "loss/policy_avg": 0.42362314462661743, "lr": 2.5916411042944785e-06, "objective/entropy": -75.88069152832031, "objective/kl": 17.92064094543457, "objective/non_score_reward": -1.7920641899108887, "objective/rlhf_reward": -5.220845411496098, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 29.6253662109375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.632318377494812, "step": 2130, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9992579221725464 }, { "episode": 34112, "epoch": 0.6131502318725959, "loss/policy_avg": 0.37457984685897827, "lr": 2.5914493865030674e-06, "objective/entropy": 124.02549743652344, "objective/kl": 18.799131393432617, "objective/non_score_reward": -1.879913091659546, "objective/rlhf_reward": -9.5196533203125, "objective/scores": -0.5, "policy/approxkl_avg": 69.57762908935547, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6399707794189453, "step": 2131, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9947148561477661 }, { "episode": 34128, "epoch": 0.6134378257899845, "loss/policy_avg": 0.46973419189453125, "lr": 2.5912576687116566e-06, "objective/entropy": -160.57627868652344, "objective/kl": 11.583633422851562, "objective/non_score_reward": -1.1583632230758667, "objective/rlhf_reward": -1.709734116436216, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 9.947965621948242, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7383812665939331, "step": 2132, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9979161024093628 }, { "episode": 34144, "epoch": 0.6137254197073732, "loss/policy_avg": 0.0801706463098526, "lr": 2.5910659509202454e-06, "objective/entropy": -42.889068603515625, "objective/kl": 15.087331771850586, "objective/non_score_reward": -1.5087332725524902, "objective/rlhf_reward": -1.6349332094192501, "objective/scores": 1.1, "policy/approxkl_avg": 4.216672897338867, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4354102611541748, "step": 2133, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9970632791519165 }, { "episode": 34160, "epoch": 0.6140130136247618, "loss/policy_avg": 0.3341831564903259, "lr": 2.5908742331288346e-06, "objective/entropy": -53.92280197143555, "objective/kl": 22.34456443786621, "objective/non_score_reward": -2.2344565391540527, "objective/rlhf_reward": -10.937826156616211, "objective/scores": -0.5, "policy/approxkl_avg": 37.245609283447266, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7126411199569702, "step": 2134, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9983025789260864 }, { "episode": 34176, "epoch": 0.6143006075421504, "loss/policy_avg": -0.20949918031692505, "lr": 2.5906825153374234e-06, "objective/entropy": 16.97821044921875, "objective/kl": 14.048188209533691, "objective/non_score_reward": -1.4048187732696533, "objective/rlhf_reward": -7.619275093078613, "objective/scores": -0.5, "policy/approxkl_avg": 11.851763725280762, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.42240235209465027, "step": 2135, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9991393089294434 }, { "episode": 34192, "epoch": 0.6145882014595392, "loss/policy_avg": 0.36326050758361816, "lr": 2.5904907975460122e-06, "objective/entropy": -49.466636657714844, "objective/kl": 14.126543045043945, "objective/non_score_reward": -1.412654161453247, "objective/rlhf_reward": -3.9172833124796544, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 15.922784805297852, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5579012632369995, "step": 2136, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9999401569366455 }, { "episode": 34208, "epoch": 0.6148757953769278, "loss/policy_avg": 0.023842569440603256, "lr": 2.5902990797546015e-06, "objective/entropy": 50.25873565673828, "objective/kl": 14.170000076293945, "objective/non_score_reward": -1.4170001745224, "objective/rlhf_reward": -5.268000727891922, "objective/scores": 0.1, "policy/approxkl_avg": 29.010520935058594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5795632600784302, "step": 2137, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9991328716278076 }, { "episode": 34224, "epoch": 0.6151633892943165, "loss/policy_avg": 0.04017828032374382, "lr": 2.5901073619631903e-06, "objective/entropy": 88.83770751953125, "objective/kl": 18.84758186340332, "objective/non_score_reward": -1.8847582340240479, "objective/rlhf_reward": -7.139032757282257, "objective/scores": 0.1, "policy/approxkl_avg": 74.74348449707031, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5087491273880005, "step": 2138, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0004384517669678 }, { "episode": 34240, "epoch": 0.6154509832117051, "loss/policy_avg": 0.026270896196365356, "lr": 2.589915644171779e-06, "objective/entropy": 81.49057006835938, "objective/kl": 17.17093276977539, "objective/non_score_reward": -1.7170931100845337, "objective/rlhf_reward": -6.468372440338134, "objective/scores": 0.1, "policy/approxkl_avg": 35.445743560791016, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4986514449119568, "step": 2139, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9982997179031372 }, { "episode": 34256, "epoch": 0.6157385771290937, "loss/policy_avg": 0.1535506248474121, "lr": 2.5897239263803683e-06, "objective/entropy": 138.4166259765625, "objective/kl": 15.88475227355957, "objective/non_score_reward": -1.588475227355957, "objective/rlhf_reward": -8.353900909423828, "objective/scores": -0.5, "policy/approxkl_avg": 81.95653533935547, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6369169354438782, "step": 2140, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9999618530273438 }, { "episode": 34272, "epoch": 0.6160261710464824, "loss/policy_avg": 0.22110715508460999, "lr": 2.589532208588957e-06, "objective/entropy": -51.144073486328125, "objective/kl": 16.943445205688477, "objective/non_score_reward": -1.6943447589874268, "objective/rlhf_reward": -2.377378916740417, "objective/scores": 1.1, "policy/approxkl_avg": 32.16203689575195, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5588784217834473, "step": 2141, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998297929763794 }, { "episode": 34288, "epoch": 0.616313764963871, "loss/policy_avg": -0.39085668325424194, "lr": 2.5893404907975464e-06, "objective/entropy": 218.20677185058594, "objective/kl": 11.77801513671875, "objective/non_score_reward": -1.177801489830017, "objective/rlhf_reward": -2.5884996376195293, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 39.3564338684082, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6796581745147705, "step": 2142, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.00933837890625 }, { "episode": 34304, "epoch": 0.6166013588812597, "loss/policy_avg": 0.7143984436988831, "lr": 2.589148773006135e-06, "objective/entropy": 257.098388671875, "objective/kl": 25.953664779663086, "objective/non_score_reward": -2.5953664779663086, "objective/rlhf_reward": -8.258759739176305, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 243.64572143554688, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6894684433937073, "step": 2143, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9984405040740967 }, { "episode": 34320, "epoch": 0.6168889527986483, "loss/policy_avg": 0.07599098980426788, "lr": 2.588957055214724e-06, "objective/entropy": -22.387248992919922, "objective/kl": 22.233367919921875, "objective/non_score_reward": -2.223336935043335, "objective/rlhf_reward": -10.89334774017334, "objective/scores": -0.5, "policy/approxkl_avg": 31.48845863342285, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.804703950881958, "step": 2144, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000074863433838 }, { "episode": 34336, "epoch": 0.617176546716037, "loss/policy_avg": 0.28670692443847656, "lr": 2.588765337423313e-06, "objective/entropy": 70.882568359375, "objective/kl": 14.724905014038086, "objective/non_score_reward": -1.4724903106689453, "objective/rlhf_reward": -3.4899616003036495, "objective/scores": 0.6, "policy/approxkl_avg": 10.478135108947754, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7913788557052612, "step": 2145, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9978686571121216 }, { "episode": 34352, "epoch": 0.6174641406334256, "loss/policy_avg": 0.19924291968345642, "lr": 2.5885736196319016e-06, "objective/entropy": 123.67643737792969, "objective/kl": 19.60727310180664, "objective/non_score_reward": -1.960727334022522, "objective/rlhf_reward": -7.4429093360900875, "objective/scores": 0.1, "policy/approxkl_avg": 35.37279510498047, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.49038147926330566, "step": 2146, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9988164901733398 }, { "episode": 34368, "epoch": 0.6177517345508142, "loss/policy_avg": 0.12335044145584106, "lr": 2.588381901840491e-06, "objective/entropy": 158.38768005371094, "objective/kl": 7.689789772033691, "objective/non_score_reward": -0.768979012966156, "objective/rlhf_reward": -0.9532098195710517, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 3.716804027557373, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.700070858001709, "step": 2147, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9987845420837402 }, { "episode": 34384, "epoch": 0.6180393284682029, "loss/policy_avg": 0.7577451467514038, "lr": 2.5881901840490797e-06, "objective/entropy": 111.6480712890625, "objective/kl": 20.38470458984375, "objective/non_score_reward": -2.038470506668091, "objective/rlhf_reward": -10.153882026672363, "objective/scores": -0.5, "policy/approxkl_avg": 47.94173049926758, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6813660860061646, "step": 2148, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9996566772460938 }, { "episode": 34400, "epoch": 0.6183269223855915, "loss/policy_avg": 0.2539398670196533, "lr": 2.5879984662576685e-06, "objective/entropy": -34.9099006652832, "objective/kl": 8.784804344177246, "objective/non_score_reward": -0.8784804940223694, "objective/rlhf_reward": -3.113921976089477, "objective/scores": 0.1, "policy/approxkl_avg": 26.73517608642578, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.671658992767334, "step": 2149, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0013880729675293 }, { "episode": 34416, "epoch": 0.6186145163029801, "loss/policy_avg": 0.14950716495513916, "lr": 2.5878067484662577e-06, "objective/entropy": 6.590118408203125, "objective/kl": 18.786617279052734, "objective/non_score_reward": -1.8786617517471313, "objective/rlhf_reward": -3.1146468877792355, "objective/scores": 1.1, "policy/approxkl_avg": 25.397262573242188, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5859331488609314, "step": 2150, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9985339641571045 }, { "episode": 34432, "epoch": 0.6189021102203688, "loss/policy_avg": -0.10131794214248657, "lr": 2.5876150306748465e-06, "objective/entropy": 168.03851318359375, "objective/kl": 5.868832588195801, "objective/non_score_reward": -0.5868832468986511, "objective/rlhf_reward": 2.052467012405396, "objective/scores": 1.1, "policy/approxkl_avg": 2.2829232215881348, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6661413908004761, "step": 2151, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.008175849914551 }, { "episode": 34448, "epoch": 0.6191897041377575, "loss/policy_avg": 0.39233455061912537, "lr": 2.5874233128834357e-06, "objective/entropy": 115.36898040771484, "objective/kl": 11.404361724853516, "objective/non_score_reward": -1.1404361724853516, "objective/rlhf_reward": -6.561744689941406, "objective/scores": -0.5, "policy/approxkl_avg": 32.5582275390625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8021819591522217, "step": 2152, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.00008487701416 }, { "episode": 34464, "epoch": 0.6194772980551462, "loss/policy_avg": 0.2662584185600281, "lr": 2.5872315950920245e-06, "objective/entropy": 249.67864990234375, "objective/kl": 17.200284957885742, "objective/non_score_reward": -1.7200286388397217, "objective/rlhf_reward": -8.880114555358887, "objective/scores": -0.5, "policy/approxkl_avg": 23.378746032714844, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6200641393661499, "step": 2153, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.999152660369873 }, { "episode": 34480, "epoch": 0.6197648919725348, "loss/policy_avg": -0.2773422300815582, "lr": 2.5870398773006134e-06, "objective/entropy": 203.25189208984375, "objective/kl": 16.20944595336914, "objective/non_score_reward": -1.620944619178772, "objective/rlhf_reward": -6.083778536319732, "objective/scores": 0.1, "policy/approxkl_avg": 19.978534698486328, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7069727182388306, "step": 2154, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9999535083770752 }, { "episode": 34496, "epoch": 0.6200524858899235, "loss/policy_avg": 0.25541549921035767, "lr": 2.5868481595092026e-06, "objective/entropy": -46.375946044921875, "objective/kl": 12.585369110107422, "objective/non_score_reward": -1.2585369348526, "objective/rlhf_reward": -4.634147977828979, "objective/scores": 0.1, "policy/approxkl_avg": 53.30701446533203, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4830355644226074, "step": 2155, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9997302293777466 }, { "episode": 34512, "epoch": 0.6203400798073121, "loss/policy_avg": 0.5278187394142151, "lr": 2.5866564417177914e-06, "objective/entropy": 19.008319854736328, "objective/kl": 10.38141918182373, "objective/non_score_reward": -1.0381418466567993, "objective/rlhf_reward": 0.24743238985538518, "objective/scores": 1.1, "policy/approxkl_avg": 39.551109313964844, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7112669348716736, "step": 2156, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0008163452148438 }, { "episode": 34528, "epoch": 0.6206276737247007, "loss/policy_avg": 0.23893393576145172, "lr": 2.5864647239263806e-06, "objective/entropy": -12.230049133300781, "objective/kl": 22.22760009765625, "objective/non_score_reward": -2.2227602005004883, "objective/rlhf_reward": -4.491040995717048, "objective/scores": 1.1, "policy/approxkl_avg": 50.70768737792969, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7049212455749512, "step": 2157, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9978277683258057 }, { "episode": 34544, "epoch": 0.6209152676420894, "loss/policy_avg": 0.23087981343269348, "lr": 2.5862730061349694e-06, "objective/entropy": 85.62858581542969, "objective/kl": 15.143526077270508, "objective/non_score_reward": -1.5143526792526245, "objective/rlhf_reward": -8.057411193847656, "objective/scores": -0.5, "policy/approxkl_avg": 98.90402221679688, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7348974943161011, "step": 2158, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0005483627319336 }, { "episode": 34560, "epoch": 0.621202861559478, "loss/policy_avg": 0.13627511262893677, "lr": 2.5860812883435583e-06, "objective/entropy": 126.21749877929688, "objective/kl": 13.286699295043945, "objective/non_score_reward": -1.3286700248718262, "objective/rlhf_reward": -2.390960876585218, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 29.746185302734375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8702835440635681, "step": 2159, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9985427856445312 }, { "episode": 34576, "epoch": 0.6214904554768667, "loss/policy_avg": 0.5095393657684326, "lr": 2.5858895705521475e-06, "objective/entropy": -173.3975830078125, "objective/kl": 19.235570907592773, "objective/non_score_reward": -1.9235572814941406, "objective/rlhf_reward": -5.746817897038396, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 23.77338981628418, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4544439911842346, "step": 2160, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9970526695251465 }, { "episode": 34592, "epoch": 0.6217780493942553, "loss/policy_avg": -0.14496827125549316, "lr": 2.5856978527607363e-06, "objective/entropy": -33.920597076416016, "objective/kl": 12.791587829589844, "objective/non_score_reward": -1.2791587114334106, "objective/rlhf_reward": -4.716634964942932, "objective/scores": 0.1, "policy/approxkl_avg": 3.218846321105957, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7137322425842285, "step": 2161, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000558376312256 }, { "episode": 34608, "epoch": 0.6220656433116439, "loss/policy_avg": 0.478646457195282, "lr": 2.585506134969325e-06, "objective/entropy": 182.32119750976562, "objective/kl": 20.583545684814453, "objective/non_score_reward": -2.058354616165161, "objective/rlhf_reward": -6.408590073856423, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 24.505878448486328, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5689951181411743, "step": 2162, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9981651306152344 }, { "episode": 34624, "epoch": 0.6223532372290326, "loss/policy_avg": 0.2710217237472534, "lr": 2.5853144171779143e-06, "objective/entropy": 16.150726318359375, "objective/kl": 15.956924438476562, "objective/non_score_reward": -1.5956923961639404, "objective/rlhf_reward": -5.9827698230743405, "objective/scores": 0.1, "policy/approxkl_avg": 22.574764251708984, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8749411106109619, "step": 2163, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9991207122802734 }, { "episode": 34640, "epoch": 0.6226408311464212, "loss/policy_avg": 0.17157940566539764, "lr": 2.585122699386503e-06, "objective/entropy": 147.69705200195312, "objective/kl": 12.435047149658203, "objective/non_score_reward": -1.2435047626495361, "objective/rlhf_reward": -6.974018573760986, "objective/scores": -0.5, "policy/approxkl_avg": 39.84374237060547, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7384412288665771, "step": 2164, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9993503093719482 }, { "episode": 34656, "epoch": 0.6229284250638099, "loss/policy_avg": 0.39116328954696655, "lr": 2.5849309815950924e-06, "objective/entropy": 124.02345275878906, "objective/kl": 17.45986557006836, "objective/non_score_reward": -1.7459867000579834, "objective/rlhf_reward": -6.583946830034256, "objective/scores": 0.1, "policy/approxkl_avg": 131.04702758789062, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.602171003818512, "step": 2165, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.996525526046753 }, { "episode": 34672, "epoch": 0.6232160189811985, "loss/policy_avg": -0.323068767786026, "lr": 2.584739263803681e-06, "objective/entropy": 23.200973510742188, "objective/kl": 19.314199447631836, "objective/non_score_reward": -1.931420087814331, "objective/rlhf_reward": -4.8019612475645275, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 11.689594268798828, "policy/clipfrac_avg": 2.0, "policy/entropy_avg": 0.6117805242538452, "step": 2166, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.005882501602173 }, { "episode": 34688, "epoch": 0.6235036128985872, "loss/policy_avg": 0.07887321710586548, "lr": 2.58454754601227e-06, "objective/entropy": 207.01947021484375, "objective/kl": 16.987045288085938, "objective/non_score_reward": -1.6987048387527466, "objective/rlhf_reward": -6.394819355010986, "objective/scores": 0.1, "policy/approxkl_avg": 18.009706497192383, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5339441299438477, "step": 2167, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0004165172576904 }, { "episode": 34704, "epoch": 0.6237912068159759, "loss/policy_avg": 0.6891222596168518, "lr": 2.584355828220859e-06, "objective/entropy": -21.742630004882812, "objective/kl": 17.947301864624023, "objective/non_score_reward": -1.794730305671692, "objective/rlhf_reward": -6.7789211332798, "objective/scores": 0.1, "policy/approxkl_avg": 28.476604461669922, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7632720470428467, "step": 2168, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9985806941986084 }, { "episode": 34720, "epoch": 0.6240788007333645, "loss/policy_avg": 0.48487964272499084, "lr": 2.5841641104294476e-06, "objective/entropy": 294.52569580078125, "objective/kl": 16.802213668823242, "objective/non_score_reward": -1.6802215576171875, "objective/rlhf_reward": -4.320886051654815, "objective/scores": 0.6, "policy/approxkl_avg": 57.61943054199219, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7062151432037354, "step": 2169, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.998595952987671 }, { "episode": 34736, "epoch": 0.6243663946507532, "loss/policy_avg": 0.16234064102172852, "lr": 2.583972392638037e-06, "objective/entropy": -8.229663848876953, "objective/kl": 12.112107276916504, "objective/non_score_reward": -1.21121084690094, "objective/rlhf_reward": -4.444843447208404, "objective/scores": 0.1, "policy/approxkl_avg": 66.40957641601562, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.623025119304657, "step": 2170, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0005271434783936 }, { "episode": 34752, "epoch": 0.6246539885681418, "loss/policy_avg": 0.4374160170555115, "lr": 2.5837806748466257e-06, "objective/entropy": 215.26991271972656, "objective/kl": 14.201522827148438, "objective/non_score_reward": -1.4201524257659912, "objective/rlhf_reward": -7.680609703063965, "objective/scores": -0.5, "policy/approxkl_avg": 37.60218811035156, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6455620527267456, "step": 2171, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000101089477539 }, { "episode": 34768, "epoch": 0.6249415824855304, "loss/policy_avg": 0.9038959741592407, "lr": 2.583588957055215e-06, "objective/entropy": 164.766357421875, "objective/kl": 10.354567527770996, "objective/non_score_reward": -1.035456895828247, "objective/rlhf_reward": -6.141827583312988, "objective/scores": -0.5, "policy/approxkl_avg": 35.68964385986328, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.4149360656738281, "step": 2172, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0044846534729004 }, { "episode": 34784, "epoch": 0.6252291764029191, "loss/policy_avg": -0.24729087948799133, "lr": 2.5833972392638037e-06, "objective/entropy": 109.50682067871094, "objective/kl": 12.598501205444336, "objective/non_score_reward": -1.259850263595581, "objective/rlhf_reward": -4.639400786161422, "objective/scores": 0.1, "policy/approxkl_avg": 24.71780776977539, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6037960648536682, "step": 2173, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0024328231811523 }, { "episode": 34800, "epoch": 0.6255167703203077, "loss/policy_avg": -0.07739436626434326, "lr": 2.5832055214723925e-06, "objective/entropy": 82.29547119140625, "objective/kl": 13.763072967529297, "objective/non_score_reward": -1.3763072490692139, "objective/rlhf_reward": -3.901109311644154, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 19.90868377685547, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.4547133445739746, "step": 2174, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0001060962677 }, { "episode": 34816, "epoch": 0.6258043642376964, "loss/policy_avg": 0.5104341506958008, "lr": 2.5830138036809817e-06, "objective/entropy": 193.64498901367188, "objective/kl": 19.853286743164062, "objective/non_score_reward": -1.9853289127349854, "objective/rlhf_reward": -7.54131588935852, "objective/scores": 0.1, "policy/approxkl_avg": 22.88241958618164, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8123583793640137, "step": 2175, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.996779441833496 }, { "episode": 34832, "epoch": 0.626091958155085, "loss/policy_avg": 0.26050132513046265, "lr": 2.5828220858895706e-06, "objective/entropy": 112.65647888183594, "objective/kl": 15.261848449707031, "objective/non_score_reward": -1.5261847972869873, "objective/rlhf_reward": -5.704739069938659, "objective/scores": 0.1, "policy/approxkl_avg": 14.26208209991455, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5949582457542419, "step": 2176, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.002542018890381 }, { "episode": 34848, "epoch": 0.6263795520724736, "loss/policy_avg": -0.006953395903110504, "lr": 2.5826303680981594e-06, "objective/entropy": -35.79608154296875, "objective/kl": 8.320389747619629, "objective/non_score_reward": -0.8320389986038208, "objective/rlhf_reward": -1.2054497919240332, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 2.80181622505188, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5749397873878479, "step": 2177, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.003983974456787 }, { "episode": 34864, "epoch": 0.6266671459898623, "loss/policy_avg": 0.44770270586013794, "lr": 2.5824386503067486e-06, "objective/entropy": -24.63408851623535, "objective/kl": 14.420526504516602, "objective/non_score_reward": -1.4420526027679443, "objective/rlhf_reward": -7.768210411071777, "objective/scores": -0.5, "policy/approxkl_avg": 61.44035339355469, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6937098503112793, "step": 2178, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9966654777526855 }, { "episode": 34880, "epoch": 0.6269547399072509, "loss/policy_avg": 0.4457979202270508, "lr": 2.5822469325153374e-06, "objective/entropy": 13.465904235839844, "objective/kl": 19.523706436157227, "objective/non_score_reward": -1.9523706436157227, "objective/rlhf_reward": -4.885763649584028, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 37.15239715576172, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5032546520233154, "step": 2179, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000812530517578 }, { "episode": 34896, "epoch": 0.6272423338246396, "loss/policy_avg": 0.487781822681427, "lr": 2.5820552147239266e-06, "objective/entropy": 143.8909912109375, "objective/kl": 19.466087341308594, "objective/non_score_reward": -1.9466089010238647, "objective/rlhf_reward": -5.663729192987953, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 75.92357635498047, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.524456262588501, "step": 2180, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9970227479934692 }, { "episode": 34912, "epoch": 0.6275299277420282, "loss/policy_avg": 0.6310595273971558, "lr": 2.5818634969325154e-06, "objective/entropy": 20.076377868652344, "objective/kl": 9.920660018920898, "objective/non_score_reward": -0.9920661449432373, "objective/rlhf_reward": -3.568264371156692, "objective/scores": 0.1, "policy/approxkl_avg": 20.638607025146484, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5525496006011963, "step": 2181, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9988200664520264 }, { "episode": 34928, "epoch": 0.627817521659417, "loss/policy_avg": -0.31129300594329834, "lr": 2.5816717791411043e-06, "objective/entropy": 63.76435089111328, "objective/kl": 15.30907154083252, "objective/non_score_reward": -1.530907154083252, "objective/rlhf_reward": -5.723628854751587, "objective/scores": 0.1, "policy/approxkl_avg": 39.06525802612305, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5918775200843811, "step": 2182, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.001953125 }, { "episode": 34944, "epoch": 0.6281051155768056, "loss/policy_avg": 0.24727630615234375, "lr": 2.5814800613496935e-06, "objective/entropy": 208.6817626953125, "objective/kl": 15.481653213500977, "objective/non_score_reward": -1.5481653213500977, "objective/rlhf_reward": -3.7926613152027127, "objective/scores": 0.6, "policy/approxkl_avg": 18.966259002685547, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5907472372055054, "step": 2183, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.998089075088501 }, { "episode": 34960, "epoch": 0.6283927094941942, "loss/policy_avg": 0.40158331394195557, "lr": 2.5812883435582823e-06, "objective/entropy": 100.16302490234375, "objective/kl": 16.88314437866211, "objective/non_score_reward": -1.6883143186569214, "objective/rlhf_reward": -6.353257274627685, "objective/scores": 0.1, "policy/approxkl_avg": 32.46660614013672, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5955220460891724, "step": 2184, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998399257659912 }, { "episode": 34976, "epoch": 0.6286803034115829, "loss/policy_avg": 0.44291186332702637, "lr": 2.5810966257668715e-06, "objective/entropy": 107.09803009033203, "objective/kl": 20.636152267456055, "objective/non_score_reward": -2.063615322113037, "objective/rlhf_reward": -5.330741797329161, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 71.88838958740234, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6331840753555298, "step": 2185, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.00006103515625 }, { "episode": 34992, "epoch": 0.6289678973289715, "loss/policy_avg": 0.20810046792030334, "lr": 2.5809049079754603e-06, "objective/entropy": 352.854736328125, "objective/kl": 21.351238250732422, "objective/non_score_reward": -2.1351239681243896, "objective/rlhf_reward": -8.140495753288269, "objective/scores": 0.1, "policy/approxkl_avg": 64.23518371582031, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8724610805511475, "step": 2186, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9992315769195557 }, { "episode": 35008, "epoch": 0.6292554912463602, "loss/policy_avg": 0.1791592687368393, "lr": 2.580713190184049e-06, "objective/entropy": 237.42965698242188, "objective/kl": 15.62234115600586, "objective/non_score_reward": -1.5622341632843018, "objective/rlhf_reward": -3.8489364743232723, "objective/scores": 0.6, "policy/approxkl_avg": 95.47721862792969, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.9437090158462524, "step": 2187, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9988491535186768 }, { "episode": 35024, "epoch": 0.6295430851637488, "loss/policy_avg": 0.3126150369644165, "lr": 2.5805214723926384e-06, "objective/entropy": -6.007709503173828, "objective/kl": 25.649818420410156, "objective/non_score_reward": -2.5649819374084473, "objective/rlhf_reward": -7.336208750249121, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 263.31591796875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4821949005126953, "step": 2188, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998019814491272 }, { "episode": 35040, "epoch": 0.6298306790811374, "loss/policy_avg": 0.041694313287734985, "lr": 2.580329754601227e-06, "objective/entropy": -69.11876678466797, "objective/kl": 11.226625442504883, "objective/non_score_reward": -1.1226625442504883, "objective/rlhf_reward": -4.090650296211242, "objective/scores": 0.1, "policy/approxkl_avg": 0.908916711807251, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.4388587474822998, "step": 2189, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0019636154174805 }, { "episode": 35056, "epoch": 0.6301182729985261, "loss/policy_avg": 0.11040852963924408, "lr": 2.580138036809816e-06, "objective/entropy": 247.57484436035156, "objective/kl": 21.945003509521484, "objective/non_score_reward": -2.194500207901001, "objective/rlhf_reward": -10.778000831604004, "objective/scores": -0.5, "policy/approxkl_avg": 44.54824447631836, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7419674396514893, "step": 2190, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9986804723739624 }, { "episode": 35072, "epoch": 0.6304058669159147, "loss/policy_avg": 0.19531464576721191, "lr": 2.579946319018405e-06, "objective/entropy": 258.46746826171875, "objective/kl": 12.224682807922363, "objective/non_score_reward": -1.222468376159668, "objective/rlhf_reward": -4.489873206615448, "objective/scores": 0.1, "policy/approxkl_avg": 24.236316680908203, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6577352285385132, "step": 2191, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.999692678451538 }, { "episode": 35088, "epoch": 0.6306934608333034, "loss/policy_avg": 0.2209586203098297, "lr": 2.5797546012269936e-06, "objective/entropy": -99.6376953125, "objective/kl": 20.67704963684082, "objective/non_score_reward": -2.067704916000366, "objective/rlhf_reward": -10.270819664001465, "objective/scores": -0.5, "policy/approxkl_avg": 105.64271545410156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6112191677093506, "step": 2192, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9972584247589111 }, { "episode": 35104, "epoch": 0.630981054750692, "loss/policy_avg": -0.1769150048494339, "lr": 2.579562883435583e-06, "objective/entropy": -255.36572265625, "objective/kl": 13.98208236694336, "objective/non_score_reward": -1.3982082605361938, "objective/rlhf_reward": -5.19283310174942, "objective/scores": 0.1, "policy/approxkl_avg": 16.08042335510254, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6317181587219238, "step": 2193, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0026397705078125 }, { "episode": 35120, "epoch": 0.6312686486680806, "loss/policy_avg": 0.07765771448612213, "lr": 2.5793711656441717e-06, "objective/entropy": -76.46177673339844, "objective/kl": 6.011279106140137, "objective/non_score_reward": -0.6011279821395874, "objective/rlhf_reward": -0.8003920353093918, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 2.6643407344818115, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5533638000488281, "step": 2194, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9986436367034912 }, { "episode": 35136, "epoch": 0.6315562425854693, "loss/policy_avg": 1.0513546466827393, "lr": 2.579179447852761e-06, "objective/entropy": 6.0850830078125, "objective/kl": 20.63864517211914, "objective/non_score_reward": -2.0638644695281982, "objective/rlhf_reward": -7.855458116531372, "objective/scores": 0.1, "policy/approxkl_avg": 87.43357849121094, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5335657000541687, "step": 2195, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9983572959899902 }, { "episode": 35152, "epoch": 0.6318438365028579, "loss/policy_avg": 0.9989783763885498, "lr": 2.5789877300613497e-06, "objective/entropy": 235.584716796875, "objective/kl": 14.521580696105957, "objective/non_score_reward": -1.4521582126617432, "objective/rlhf_reward": -5.408632612228393, "objective/scores": 0.1, "policy/approxkl_avg": 20.214359283447266, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8818501234054565, "step": 2196, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998250961303711 }, { "episode": 35168, "epoch": 0.6321314304202467, "loss/policy_avg": 0.17217445373535156, "lr": 2.5787960122699385e-06, "objective/entropy": -112.44560241699219, "objective/kl": 17.418087005615234, "objective/non_score_reward": -1.7418086528778076, "objective/rlhf_reward": -8.967233657836914, "objective/scores": -0.5, "policy/approxkl_avg": 49.461700439453125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6804555058479309, "step": 2197, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9982492923736572 }, { "episode": 35184, "epoch": 0.6324190243376353, "loss/policy_avg": -0.1301862895488739, "lr": 2.5786042944785278e-06, "objective/entropy": 202.34921264648438, "objective/kl": 19.765905380249023, "objective/non_score_reward": -1.9765905141830444, "objective/rlhf_reward": -7.506362175941467, "objective/scores": 0.1, "policy/approxkl_avg": 22.713653564453125, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7051633596420288, "step": 2198, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999467134475708 }, { "episode": 35200, "epoch": 0.632706618255024, "loss/policy_avg": 0.4120163917541504, "lr": 2.5784125766871166e-06, "objective/entropy": 178.94874572753906, "objective/kl": 18.871971130371094, "objective/non_score_reward": -1.8871972560882568, "objective/rlhf_reward": -9.548789024353027, "objective/scores": -0.5, "policy/approxkl_avg": 37.069515228271484, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7663606405258179, "step": 2199, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.001192569732666 }, { "episode": 35216, "epoch": 0.6329942121724126, "loss/policy_avg": 0.7163809537887573, "lr": 2.5782208588957054e-06, "objective/entropy": 66.29650115966797, "objective/kl": 14.291718482971191, "objective/non_score_reward": -1.4291718006134033, "objective/rlhf_reward": -7.716687202453613, "objective/scores": -0.5, "policy/approxkl_avg": 101.60475158691406, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5268129706382751, "step": 2200, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.996827483177185 }, { "episode": 35232, "epoch": 0.6332818060898012, "loss/policy_avg": 0.028121359646320343, "lr": 2.5780291411042946e-06, "objective/entropy": 282.8331604003906, "objective/kl": 16.655906677246094, "objective/non_score_reward": -1.665590763092041, "objective/rlhf_reward": -4.262362873554229, "objective/scores": 0.6, "policy/approxkl_avg": 1.9305273294448853, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7045209407806396, "step": 2201, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0025622844696045 }, { "episode": 35248, "epoch": 0.6335694000071899, "loss/policy_avg": 0.09198611974716187, "lr": 2.5778374233128834e-06, "objective/entropy": 9.080646514892578, "objective/kl": 14.53342056274414, "objective/non_score_reward": -1.4533421993255615, "objective/rlhf_reward": -5.413368529081344, "objective/scores": 0.1, "policy/approxkl_avg": 2.323448419570923, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8238525390625, "step": 2202, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999985694885254 }, { "episode": 35264, "epoch": 0.6338569939245785, "loss/policy_avg": 0.4124908745288849, "lr": 2.5776457055214726e-06, "objective/entropy": -354.57403564453125, "objective/kl": 12.172119140625, "objective/non_score_reward": -1.2172119617462158, "objective/rlhf_reward": -4.468847846984863, "objective/scores": 0.1, "policy/approxkl_avg": 46.26481628417969, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6718429327011108, "step": 2203, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.9982786178588867 }, { "episode": 35280, "epoch": 0.6341445878419671, "loss/policy_avg": 0.09881316870450974, "lr": 2.5774539877300615e-06, "objective/entropy": 38.90171813964844, "objective/kl": 13.71724796295166, "objective/non_score_reward": -1.3717248439788818, "objective/rlhf_reward": -5.086899465322494, "objective/scores": 0.1, "policy/approxkl_avg": 21.995763778686523, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6945664882659912, "step": 2204, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9988892078399658 }, { "episode": 35296, "epoch": 0.6344321817593558, "loss/policy_avg": 0.31177273392677307, "lr": 2.5772622699386503e-06, "objective/entropy": -98.65557861328125, "objective/kl": 15.464643478393555, "objective/non_score_reward": -1.5464643239974976, "objective/rlhf_reward": -3.7858574151992794, "objective/scores": 0.6, "policy/approxkl_avg": 19.114368438720703, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5550520420074463, "step": 2205, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9985768795013428 }, { "episode": 35312, "epoch": 0.6347197756767444, "loss/policy_avg": 0.22732603549957275, "lr": 2.5770705521472395e-06, "objective/entropy": 45.62663650512695, "objective/kl": 12.204948425292969, "objective/non_score_reward": -1.2204947471618652, "objective/rlhf_reward": -2.48197910785675, "objective/scores": 0.6, "policy/approxkl_avg": 23.474681854248047, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5377674102783203, "step": 2206, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9971551895141602 }, { "episode": 35328, "epoch": 0.6350073695941331, "loss/policy_avg": 0.6622054576873779, "lr": 2.5768788343558283e-06, "objective/entropy": -16.36989974975586, "objective/kl": 14.84906005859375, "objective/non_score_reward": -1.4849061965942383, "objective/rlhf_reward": -3.816918554083381, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 23.934528350830078, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.5687286853790283, "step": 2207, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9993095397949219 }, { "episode": 35344, "epoch": 0.6352949635115217, "loss/policy_avg": 0.6508431434631348, "lr": 2.5766871165644175e-06, "objective/entropy": 12.291481018066406, "objective/kl": 16.560619354248047, "objective/non_score_reward": -1.6560620069503784, "objective/rlhf_reward": -8.624248504638672, "objective/scores": -0.5, "policy/approxkl_avg": 10.811452865600586, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7654626369476318, "step": 2208, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9988648891448975 }, { "episode": 35360, "epoch": 0.6355825574289103, "loss/policy_avg": 0.24387690424919128, "lr": 2.5764953987730063e-06, "objective/entropy": 224.07534790039062, "objective/kl": 22.381715774536133, "objective/non_score_reward": -2.238171100616455, "objective/rlhf_reward": -7.219351843992868, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 65.56282043457031, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.9638044834136963, "step": 2209, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000033140182495 }, { "episode": 35376, "epoch": 0.635870151346299, "loss/policy_avg": 0.09040187299251556, "lr": 2.576303680981595e-06, "objective/entropy": 79.04753875732422, "objective/kl": 23.915878295898438, "objective/non_score_reward": -2.391587495803833, "objective/rlhf_reward": -9.166350162029268, "objective/scores": 0.1, "policy/approxkl_avg": 94.76409912109375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.37892991304397583, "step": 2210, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9963321685791016 }, { "episode": 35392, "epoch": 0.6361577452636876, "loss/policy_avg": -0.1181950569152832, "lr": 2.5761119631901844e-06, "objective/entropy": 209.74940490722656, "objective/kl": 15.779610633850098, "objective/non_score_reward": -1.5779609680175781, "objective/rlhf_reward": -8.311843872070312, "objective/scores": -0.5, "policy/approxkl_avg": 9.557290077209473, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7678151726722717, "step": 2211, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0032005310058594 }, { "episode": 35408, "epoch": 0.6364453391810764, "loss/policy_avg": 0.29135096073150635, "lr": 2.5759202453987728e-06, "objective/entropy": -126.32437133789062, "objective/kl": 14.804946899414062, "objective/non_score_reward": -1.480494499206543, "objective/rlhf_reward": -5.521978384256363, "objective/scores": 0.1, "policy/approxkl_avg": 96.0042724609375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8453115224838257, "step": 2212, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.997128963470459 }, { "episode": 35424, "epoch": 0.636732933098465, "loss/policy_avg": 0.23182812333106995, "lr": 2.575728527607362e-06, "objective/entropy": 116.05543518066406, "objective/kl": 16.660411834716797, "objective/non_score_reward": -1.666041374206543, "objective/rlhf_reward": -3.7404463633310527, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 33.42845916748047, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6534185409545898, "step": 2213, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9983100891113281 }, { "episode": 35440, "epoch": 0.6370205270158537, "loss/policy_avg": 0.3903784453868866, "lr": 2.575536809815951e-06, "objective/entropy": -84.97456359863281, "objective/kl": 11.240859985351562, "objective/non_score_reward": -1.1240859031677246, "objective/rlhf_reward": -2.548932800965245, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 29.660133361816406, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.41411858797073364, "step": 2214, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9963812828063965 }, { "episode": 35456, "epoch": 0.6373081209332423, "loss/policy_avg": 0.5894661545753479, "lr": 2.5753450920245396e-06, "objective/entropy": 63.571022033691406, "objective/kl": 17.98499870300293, "objective/non_score_reward": -1.7984999418258667, "objective/rlhf_reward": -4.793999528884887, "objective/scores": 0.6, "policy/approxkl_avg": 10.82928466796875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6973264217376709, "step": 2215, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9998317956924438 }, { "episode": 35472, "epoch": 0.6375957148506309, "loss/policy_avg": 0.04164008051156998, "lr": 2.575153374233129e-06, "objective/entropy": 111.24397277832031, "objective/kl": 13.686548233032227, "objective/non_score_reward": -1.3686549663543701, "objective/rlhf_reward": -7.4746198654174805, "objective/scores": -0.5, "policy/approxkl_avg": 3.952038049697876, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7927234172821045, "step": 2216, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0024447441101074 }, { "episode": 35488, "epoch": 0.6378833087680196, "loss/policy_avg": 0.00472402386367321, "lr": 2.5749616564417177e-06, "objective/entropy": 40.287269592285156, "objective/kl": 15.373626708984375, "objective/non_score_reward": -1.5373626947402954, "objective/rlhf_reward": -5.749450838565826, "objective/scores": 0.1, "policy/approxkl_avg": 15.220986366271973, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6288732290267944, "step": 2217, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9989573955535889 }, { "episode": 35504, "epoch": 0.6381709026854082, "loss/policy_avg": -0.1521018147468567, "lr": 2.574769938650307e-06, "objective/entropy": -65.29877471923828, "objective/kl": 16.19780921936035, "objective/non_score_reward": -1.6197808980941772, "objective/rlhf_reward": -8.479124069213867, "objective/scores": -0.5, "policy/approxkl_avg": 2.63850474357605, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7238868474960327, "step": 2218, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0011723041534424 }, { "episode": 35520, "epoch": 0.6384584966027969, "loss/policy_avg": 0.22721001505851746, "lr": 2.5745782208588957e-06, "objective/entropy": 8.837226867675781, "objective/kl": 12.960824966430664, "objective/non_score_reward": -1.296082615852356, "objective/rlhf_reward": -3.628071083632067, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 2.8337883949279785, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6526169776916504, "step": 2219, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999176025390625 }, { "episode": 35536, "epoch": 0.6387460905201855, "loss/policy_avg": 0.32163006067276, "lr": 2.5743865030674845e-06, "objective/entropy": 56.67760467529297, "objective/kl": 23.63744354248047, "objective/non_score_reward": -2.3637444972991943, "objective/rlhf_reward": -7.507566521840031, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 38.21224594116211, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7226811647415161, "step": 2220, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.99851655960083 }, { "episode": 35552, "epoch": 0.6390336844375741, "loss/policy_avg": 0.2586836814880371, "lr": 2.5741947852760738e-06, "objective/entropy": 127.28630828857422, "objective/kl": 14.451157569885254, "objective/non_score_reward": -1.445115566253662, "objective/rlhf_reward": -7.780462741851807, "objective/scores": -0.5, "policy/approxkl_avg": 10.218864440917969, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6248810887336731, "step": 2221, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9987244606018066 }, { "episode": 35568, "epoch": 0.6393212783549628, "loss/policy_avg": 0.5988067984580994, "lr": 2.5740030674846626e-06, "objective/entropy": 310.6684265136719, "objective/kl": 14.75908374786377, "objective/non_score_reward": -1.4759085178375244, "objective/rlhf_reward": -5.503633654117584, "objective/scores": 0.1, "policy/approxkl_avg": 10.692106246948242, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7594001293182373, "step": 2222, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9993855953216553 }, { "episode": 35584, "epoch": 0.6396088722723514, "loss/policy_avg": 0.01884036883711815, "lr": 2.573811349693252e-06, "objective/entropy": 269.0919494628906, "objective/kl": 11.99139404296875, "objective/non_score_reward": -1.1991393566131592, "objective/rlhf_reward": -2.396557545661926, "objective/scores": 0.6, "policy/approxkl_avg": 5.300789833068848, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6532878279685974, "step": 2223, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9982726573944092 }, { "episode": 35600, "epoch": 0.63989646618974, "loss/policy_avg": 0.11802544444799423, "lr": 2.5736196319018406e-06, "objective/entropy": -66.02192687988281, "objective/kl": 18.99173927307129, "objective/non_score_reward": -1.8991740942001343, "objective/rlhf_reward": -5.473990144506965, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 31.625484466552734, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6842813491821289, "step": 2224, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9983280897140503 }, { "episode": 35616, "epoch": 0.6401840601071287, "loss/policy_avg": 0.12923726439476013, "lr": 2.5734279141104294e-06, "objective/entropy": 312.881103515625, "objective/kl": 20.01099967956543, "objective/non_score_reward": -2.0011000633239746, "objective/rlhf_reward": -10.004400253295898, "objective/scores": -0.5, "policy/approxkl_avg": 64.80439758300781, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8023340106010437, "step": 2225, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9957354068756104 }, { "episode": 35632, "epoch": 0.6404716540245173, "loss/policy_avg": 0.16196031868457794, "lr": 2.5732361963190187e-06, "objective/entropy": -3.3410987854003906, "objective/kl": 20.95907974243164, "objective/non_score_reward": -2.0959081649780273, "objective/rlhf_reward": -3.983632212877273, "objective/scores": 1.1, "policy/approxkl_avg": 58.89521408081055, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8127495050430298, "step": 2226, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9977425336837769 }, { "episode": 35648, "epoch": 0.640759247941906, "loss/policy_avg": 0.3808813691139221, "lr": 2.5730444785276075e-06, "objective/entropy": -48.94996643066406, "objective/kl": 17.937030792236328, "objective/non_score_reward": -1.7937030792236328, "objective/rlhf_reward": -2.7748126149177548, "objective/scores": 1.1, "policy/approxkl_avg": 38.477439880371094, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7229498624801636, "step": 2227, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999549150466919 }, { "episode": 35664, "epoch": 0.6410468418592947, "loss/policy_avg": 0.5239992737770081, "lr": 2.5728527607361963e-06, "objective/entropy": -146.61708068847656, "objective/kl": 14.59036636352539, "objective/non_score_reward": -1.4590365886688232, "objective/rlhf_reward": -4.174287145555602, "objective/scores": 0.41546487678572874, "policy/approxkl_avg": 9.376331329345703, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6471960544586182, "step": 2228, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.998356819152832 }, { "episode": 35680, "epoch": 0.6413344357766834, "loss/policy_avg": 0.0809326097369194, "lr": 2.5726610429447855e-06, "objective/entropy": 139.27410888671875, "objective/kl": 12.63823127746582, "objective/non_score_reward": -1.26382315158844, "objective/rlhf_reward": -7.05529260635376, "objective/scores": -0.5, "policy/approxkl_avg": 5.977476119995117, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.9265446662902832, "step": 2229, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9994497299194336 }, { "episode": 35696, "epoch": 0.641622029694072, "loss/policy_avg": 0.40282607078552246, "lr": 2.5724693251533743e-06, "objective/entropy": 74.11846160888672, "objective/kl": 15.80052375793457, "objective/non_score_reward": -1.580052375793457, "objective/rlhf_reward": -5.920209205150604, "objective/scores": 0.1, "policy/approxkl_avg": 74.08509826660156, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.44877538084983826, "step": 2230, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.998140811920166 }, { "episode": 35712, "epoch": 0.6419096236114606, "loss/policy_avg": -0.023778066039085388, "lr": 2.5722776073619635e-06, "objective/entropy": 160.42071533203125, "objective/kl": 18.071002960205078, "objective/non_score_reward": -1.8071002960205078, "objective/rlhf_reward": -9.228401184082031, "objective/scores": -0.5, "policy/approxkl_avg": 42.72297668457031, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7528549432754517, "step": 2231, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9997116327285767 }, { "episode": 35728, "epoch": 0.6421972175288493, "loss/policy_avg": -0.12553301453590393, "lr": 2.5720858895705524e-06, "objective/entropy": -118.97586059570312, "objective/kl": 12.522685050964355, "objective/non_score_reward": -1.2522684335708618, "objective/rlhf_reward": -0.6090736746788021, "objective/scores": 1.1, "policy/approxkl_avg": 14.815485000610352, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5027907490730286, "step": 2232, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.002737045288086 }, { "episode": 35744, "epoch": 0.6424848114462379, "loss/policy_avg": 0.5497890710830688, "lr": 2.571894171779141e-06, "objective/entropy": 66.39590454101562, "objective/kl": 15.05737590789795, "objective/non_score_reward": -1.5057374238967896, "objective/rlhf_reward": -5.622949934005737, "objective/scores": 0.1, "policy/approxkl_avg": 24.636985778808594, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7436261773109436, "step": 2233, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9996707439422607 }, { "episode": 35760, "epoch": 0.6427724053636266, "loss/policy_avg": 0.035653773695230484, "lr": 2.57170245398773e-06, "objective/entropy": 91.70762634277344, "objective/kl": 21.097057342529297, "objective/non_score_reward": -2.109705924987793, "objective/rlhf_reward": -5.5151040896188945, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 11.354876518249512, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8169498443603516, "step": 2234, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.000910997390747 }, { "episode": 35776, "epoch": 0.6430599992810152, "loss/policy_avg": 0.043960705399513245, "lr": 2.5715107361963188e-06, "objective/entropy": -137.8557586669922, "objective/kl": 16.38060760498047, "objective/non_score_reward": -1.6380605697631836, "objective/rlhf_reward": -6.152242219448089, "objective/scores": 0.1, "policy/approxkl_avg": 71.73484802246094, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5255050659179688, "step": 2235, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9994916915893555 }, { "episode": 35792, "epoch": 0.6433475931984038, "loss/policy_avg": 0.1638680398464203, "lr": 2.571319018404908e-06, "objective/entropy": -82.89593505859375, "objective/kl": 18.11610221862793, "objective/non_score_reward": -1.811610221862793, "objective/rlhf_reward": -6.846440827846527, "objective/scores": 0.1, "policy/approxkl_avg": 43.05682373046875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5359176397323608, "step": 2236, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9976513385772705 }, { "episode": 35808, "epoch": 0.6436351871157925, "loss/policy_avg": 0.5455341339111328, "lr": 2.571127300613497e-06, "objective/entropy": 136.18603515625, "objective/kl": 15.515323638916016, "objective/non_score_reward": -1.55153226852417, "objective/rlhf_reward": -8.20612907409668, "objective/scores": -0.5, "policy/approxkl_avg": 29.07931137084961, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5907673835754395, "step": 2237, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9959040880203247 }, { "episode": 35824, "epoch": 0.6439227810331811, "loss/policy_avg": 0.5567790269851685, "lr": 2.570935582822086e-06, "objective/entropy": -230.35806274414062, "objective/kl": 15.261903762817383, "objective/non_score_reward": -1.5261905193328857, "objective/rlhf_reward": -3.7047623157501217, "objective/scores": 0.6, "policy/approxkl_avg": 28.395586013793945, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7888268232345581, "step": 2238, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9991778135299683 }, { "episode": 35840, "epoch": 0.6442103749505698, "loss/policy_avg": 0.22221067547798157, "lr": 2.570743865030675e-06, "objective/entropy": 51.5346794128418, "objective/kl": 20.33760643005371, "objective/non_score_reward": -2.0337605476379395, "objective/rlhf_reward": -10.135042190551758, "objective/scores": -0.5, "policy/approxkl_avg": 21.599842071533203, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6876698136329651, "step": 2239, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9994490146636963 }, { "episode": 35856, "epoch": 0.6444979688679584, "loss/policy_avg": 0.24794799089431763, "lr": 2.5705521472392637e-06, "objective/entropy": 101.41891479492188, "objective/kl": 10.934738159179688, "objective/non_score_reward": -1.0934739112854004, "objective/rlhf_reward": 0.026104474067688344, "objective/scores": 1.1, "policy/approxkl_avg": 10.448841094970703, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7271315455436707, "step": 2240, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9968032836914062 }, { "episode": 35872, "epoch": 0.644785562785347, "loss/policy_avg": -0.15643905103206635, "lr": 2.570360429447853e-06, "objective/entropy": -17.24741554260254, "objective/kl": 14.677450180053711, "objective/non_score_reward": -1.467745065689087, "objective/rlhf_reward": -1.4709803819656369, "objective/scores": 1.1, "policy/approxkl_avg": 27.442018508911133, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.48285290598869324, "step": 2241, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0026421546936035 }, { "episode": 35888, "epoch": 0.6450731567027357, "loss/policy_avg": 1.282882571220398, "lr": 2.5701687116564417e-06, "objective/entropy": -16.522808074951172, "objective/kl": 12.279808044433594, "objective/non_score_reward": -1.2279808521270752, "objective/rlhf_reward": -2.7892173997321468, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 17.468976974487305, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5521416068077087, "step": 2242, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.001244068145752 }, { "episode": 35904, "epoch": 0.6453607506201244, "loss/policy_avg": 0.6120246648788452, "lr": 2.5699769938650305e-06, "objective/entropy": -58.17369842529297, "objective/kl": 9.920572280883789, "objective/non_score_reward": -0.992057204246521, "objective/rlhf_reward": 0.43177100419998204, "objective/scores": 1.1, "policy/approxkl_avg": 12.923457145690918, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5883625745773315, "step": 2243, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0009925365448 }, { "episode": 35920, "epoch": 0.6456483445375131, "loss/policy_avg": 0.21067282557487488, "lr": 2.5697852760736198e-06, "objective/entropy": -4.984157562255859, "objective/kl": 18.276622772216797, "objective/non_score_reward": -1.8276622295379639, "objective/rlhf_reward": -4.386930142284605, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 100.22476196289062, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6311012506484985, "step": 2244, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.999492883682251 }, { "episode": 35936, "epoch": 0.6459359384549017, "loss/policy_avg": 0.12202519178390503, "lr": 2.5695935582822086e-06, "objective/entropy": 167.21475219726562, "objective/kl": 13.268052101135254, "objective/non_score_reward": -1.3268052339553833, "objective/rlhf_reward": -7.307220935821533, "objective/scores": -0.5, "policy/approxkl_avg": 40.42982482910156, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6104716658592224, "step": 2245, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0034589767456055 }, { "episode": 35952, "epoch": 0.6462235323722904, "loss/policy_avg": 0.04010404273867607, "lr": 2.569401840490798e-06, "objective/entropy": -75.36890411376953, "objective/kl": 17.823413848876953, "objective/non_score_reward": -1.782341480255127, "objective/rlhf_reward": -5.006659509913002, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 15.992976188659668, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.5671273469924927, "step": 2246, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0030524730682373 }, { "episode": 35968, "epoch": 0.646511126289679, "loss/policy_avg": 0.4200291037559509, "lr": 2.5692101226993866e-06, "objective/entropy": -238.55908203125, "objective/kl": 19.406217575073242, "objective/non_score_reward": -1.9406214952468872, "objective/rlhf_reward": -9.76248550415039, "objective/scores": -0.5, "policy/approxkl_avg": 42.762123107910156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.527471661567688, "step": 2247, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9961581230163574 }, { "episode": 35984, "epoch": 0.6467987202070676, "loss/policy_avg": -0.33676448464393616, "lr": 2.5690184049079754e-06, "objective/entropy": -120.24238586425781, "objective/kl": 13.346169471740723, "objective/non_score_reward": -1.334617018699646, "objective/rlhf_reward": -4.9384682610630986, "objective/scores": 0.1, "policy/approxkl_avg": 27.310237884521484, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.5737497210502625, "step": 2248, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0009799003601074 }, { "episode": 36000, "epoch": 0.6470863141244563, "loss/policy_avg": 0.5354056358337402, "lr": 2.5688266871165647e-06, "objective/entropy": -36.78726577758789, "objective/kl": 20.65894889831543, "objective/non_score_reward": -2.0658950805664062, "objective/rlhf_reward": -6.707320540156916, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 43.62550354003906, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7647566199302673, "step": 2249, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9989060163497925 }, { "episode": 36016, "epoch": 0.6473739080418449, "loss/policy_avg": 0.046671465039253235, "lr": 2.5686349693251535e-06, "objective/entropy": 301.4794921875, "objective/kl": 17.496397018432617, "objective/non_score_reward": -1.7496397495269775, "objective/rlhf_reward": -8.998558044433594, "objective/scores": -0.5, "policy/approxkl_avg": 9.562626838684082, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.73357754945755, "step": 2250, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.000436305999756 }, { "episode": 36032, "epoch": 0.6476615019592336, "loss/policy_avg": 0.14498819410800934, "lr": 2.5684432515337423e-06, "objective/entropy": -76.91134643554688, "objective/kl": 21.101152420043945, "objective/non_score_reward": -2.1101155281066895, "objective/rlhf_reward": -4.0404615759849545, "objective/scores": 1.1, "policy/approxkl_avg": 70.1800537109375, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.527640700340271, "step": 2251, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9971266984939575 }, { "episode": 36048, "epoch": 0.6479490958766222, "loss/policy_avg": 0.6092630624771118, "lr": 2.5682515337423315e-06, "objective/entropy": 156.32025146484375, "objective/kl": 26.440006256103516, "objective/non_score_reward": -2.644000768661499, "objective/rlhf_reward": -8.176003074645998, "objective/scores": 0.6, "policy/approxkl_avg": 122.7042007446289, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6378190517425537, "step": 2252, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9993071556091309 }, { "episode": 36064, "epoch": 0.6482366897940108, "loss/policy_avg": 0.08317308127880096, "lr": 2.5680598159509203e-06, "objective/entropy": -141.36428833007812, "objective/kl": 18.52130889892578, "objective/non_score_reward": -1.8521307706832886, "objective/rlhf_reward": -9.408522605895996, "objective/scores": -0.5, "policy/approxkl_avg": 117.25154113769531, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.835111141204834, "step": 2253, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9980623722076416 }, { "episode": 36080, "epoch": 0.6485242837113995, "loss/policy_avg": 0.1629417985677719, "lr": 2.5678680981595096e-06, "objective/entropy": 82.80020141601562, "objective/kl": 15.670761108398438, "objective/non_score_reward": -1.5670759677886963, "objective/rlhf_reward": -3.344585214496824, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 36.081214904785156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5541933178901672, "step": 2254, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9976167678833008 }, { "episode": 36096, "epoch": 0.6488118776287881, "loss/policy_avg": 0.596114456653595, "lr": 2.5676763803680984e-06, "objective/entropy": 69.89568328857422, "objective/kl": 22.839479446411133, "objective/non_score_reward": -2.2839479446411133, "objective/rlhf_reward": -11.135791778564453, "objective/scores": -0.5, "policy/approxkl_avg": 13.495725631713867, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.43410223722457886, "step": 2255, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.999086618423462 }, { "episode": 36112, "epoch": 0.6490994715461768, "loss/policy_avg": 0.07463404536247253, "lr": 2.567484662576687e-06, "objective/entropy": 118.45555877685547, "objective/kl": 14.987808227539062, "objective/non_score_reward": -1.4987808465957642, "objective/rlhf_reward": -7.995123386383057, "objective/scores": -0.5, "policy/approxkl_avg": 16.267757415771484, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.46388620138168335, "step": 2256, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0005156993865967 }, { "episode": 36128, "epoch": 0.6493870654635654, "loss/policy_avg": 0.23220334947109222, "lr": 2.567292944785276e-06, "objective/entropy": -73.17439270019531, "objective/kl": 17.924890518188477, "objective/non_score_reward": -1.7924890518188477, "objective/rlhf_reward": -9.16995620727539, "objective/scores": -0.5, "policy/approxkl_avg": 41.652099609375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7985033988952637, "step": 2257, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9974535703659058 }, { "episode": 36144, "epoch": 0.6496746593809541, "loss/policy_avg": 0.2749658524990082, "lr": 2.567101226993865e-06, "objective/entropy": 87.86886596679688, "objective/kl": 14.881671905517578, "objective/non_score_reward": -1.4881671667099, "objective/rlhf_reward": -1.5526686072349545, "objective/scores": 1.1, "policy/approxkl_avg": 5.160519599914551, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.545424222946167, "step": 2258, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000108242034912 }, { "episode": 36160, "epoch": 0.6499622532983428, "loss/policy_avg": 0.3695850372314453, "lr": 2.566909509202454e-06, "objective/entropy": -106.71471405029297, "objective/kl": 15.884528160095215, "objective/non_score_reward": -1.588452935218811, "objective/rlhf_reward": -5.953811800479889, "objective/scores": 0.1, "policy/approxkl_avg": 10.107734680175781, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.608582615852356, "step": 2259, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9991270303726196 }, { "episode": 36176, "epoch": 0.6502498472157314, "loss/policy_avg": 1.5744253396987915, "lr": 2.566717791411043e-06, "objective/entropy": -150.28012084960938, "objective/kl": 12.480113983154297, "objective/non_score_reward": -1.2480113506317139, "objective/rlhf_reward": -6.992045879364014, "objective/scores": -0.5, "policy/approxkl_avg": 6.144730091094971, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.687921404838562, "step": 2260, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0005011558532715 }, { "episode": 36192, "epoch": 0.6505374411331201, "loss/policy_avg": 0.46961259841918945, "lr": 2.566526073619632e-06, "objective/entropy": 355.0657958984375, "objective/kl": 15.923636436462402, "objective/non_score_reward": -1.5923638343811035, "objective/rlhf_reward": -1.9694551587104794, "objective/scores": 1.1, "policy/approxkl_avg": 23.633182525634766, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8576830625534058, "step": 2261, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.997279167175293 }, { "episode": 36208, "epoch": 0.6508250350505087, "loss/policy_avg": -0.017100073397159576, "lr": 2.566334355828221e-06, "objective/entropy": -42.104312896728516, "objective/kl": 19.936870574951172, "objective/non_score_reward": -1.9936869144439697, "objective/rlhf_reward": -5.051028941513273, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 43.8869514465332, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.541776180267334, "step": 2262, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9991776943206787 }, { "episode": 36224, "epoch": 0.6511126289678973, "loss/policy_avg": 0.16236397624015808, "lr": 2.5661426380368097e-06, "objective/entropy": 76.25511169433594, "objective/kl": 19.869869232177734, "objective/non_score_reward": -1.986986756324768, "objective/rlhf_reward": -9.947946548461914, "objective/scores": -0.5, "policy/approxkl_avg": 141.39076232910156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6694316864013672, "step": 2263, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9994683265686035 }, { "episode": 36240, "epoch": 0.651400222885286, "loss/policy_avg": 0.31850236654281616, "lr": 2.565950920245399e-06, "objective/entropy": 102.84500122070312, "objective/kl": 16.424327850341797, "objective/non_score_reward": -1.642432451248169, "objective/rlhf_reward": -8.569729804992676, "objective/scores": -0.5, "policy/approxkl_avg": 4.593269348144531, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7008790373802185, "step": 2264, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999637246131897 }, { "episode": 36256, "epoch": 0.6516878168026746, "loss/policy_avg": 0.37176790833473206, "lr": 2.5657592024539877e-06, "objective/entropy": -42.34572982788086, "objective/kl": 16.17561912536621, "objective/non_score_reward": -1.6175618171691895, "objective/rlhf_reward": -4.3475415430226665, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 54.164154052734375, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.8059068918228149, "step": 2265, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9993457794189453 }, { "episode": 36272, "epoch": 0.6519754107200633, "loss/policy_avg": 0.2269030511379242, "lr": 2.5655674846625765e-06, "objective/entropy": 156.19015502929688, "objective/kl": 20.02953338623047, "objective/non_score_reward": -2.002953052520752, "objective/rlhf_reward": -6.064401517586644, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 3.891697406768799, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.38503801822662354, "step": 2266, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0019311904907227 }, { "episode": 36288, "epoch": 0.6522630046374519, "loss/policy_avg": 0.3679116368293762, "lr": 2.5653757668711658e-06, "objective/entropy": 126.4683609008789, "objective/kl": 15.896159172058105, "objective/non_score_reward": -1.589616060256958, "objective/rlhf_reward": -8.358464241027832, "objective/scores": -0.5, "policy/approxkl_avg": 4.862239360809326, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7594268321990967, "step": 2267, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9980642795562744 }, { "episode": 36304, "epoch": 0.6525505985548405, "loss/policy_avg": 0.6537224650382996, "lr": 2.5651840490797546e-06, "objective/entropy": 216.36676025390625, "objective/kl": 12.314484596252441, "objective/non_score_reward": -1.2314484119415283, "objective/rlhf_reward": -4.525793886184692, "objective/scores": 0.1, "policy/approxkl_avg": 10.041047096252441, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.9338732957839966, "step": 2268, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9994885921478271 }, { "episode": 36320, "epoch": 0.6528381924722292, "loss/policy_avg": -0.007923688739538193, "lr": 2.564992331288344e-06, "objective/entropy": -149.3037872314453, "objective/kl": 11.473003387451172, "objective/non_score_reward": -1.1473004817962646, "objective/rlhf_reward": -6.589201927185059, "objective/scores": -0.5, "policy/approxkl_avg": 0.23434622585773468, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4156072735786438, "step": 2269, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000920057296753 }, { "episode": 36336, "epoch": 0.6531257863896178, "loss/policy_avg": 0.678230345249176, "lr": 2.5648006134969326e-06, "objective/entropy": 363.0588073730469, "objective/kl": 17.33286476135254, "objective/non_score_reward": -1.7332863807678223, "objective/rlhf_reward": -8.933145523071289, "objective/scores": -0.5, "policy/approxkl_avg": 26.384531021118164, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8610842227935791, "step": 2270, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.001692771911621 }, { "episode": 36352, "epoch": 0.6534133803070065, "loss/policy_avg": 0.08171165734529495, "lr": 2.5646088957055214e-06, "objective/entropy": -11.031600952148438, "objective/kl": 17.83512306213379, "objective/non_score_reward": -1.7835124731063843, "objective/rlhf_reward": -2.7340499520301815, "objective/scores": 1.1, "policy/approxkl_avg": 70.58218383789062, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6740350723266602, "step": 2271, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9959897994995117 }, { "episode": 36368, "epoch": 0.6537009742243951, "loss/policy_avg": 0.17662328481674194, "lr": 2.5644171779141107e-06, "objective/entropy": 88.57758331298828, "objective/kl": 9.481622695922852, "objective/non_score_reward": -0.9481624364852905, "objective/rlhf_reward": -3.3926497310400006, "objective/scores": 0.1, "policy/approxkl_avg": 4.293864727020264, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5531899929046631, "step": 2272, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.0002083778381348 }, { "episode": 36384, "epoch": 0.6539885681417839, "loss/policy_avg": -0.1894199550151825, "lr": 2.5642254601226995e-06, "objective/entropy": -25.101699829101562, "objective/kl": 11.592086791992188, "objective/non_score_reward": -1.1592087745666504, "objective/rlhf_reward": -2.5141291043916087, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 2.358327865600586, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.5184176564216614, "step": 2273, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.001044750213623 }, { "episode": 36400, "epoch": 0.6542761620591725, "loss/policy_avg": -0.09053383767604828, "lr": 2.5640337423312887e-06, "objective/entropy": 98.01313781738281, "objective/kl": 8.616978645324707, "objective/non_score_reward": -0.8616980314254761, "objective/rlhf_reward": -3.0467919468879696, "objective/scores": 0.1, "policy/approxkl_avg": 18.988311767578125, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5338178277015686, "step": 2274, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000063896179199 }, { "episode": 36416, "epoch": 0.6545637559765611, "loss/policy_avg": 0.43914473056793213, "lr": 2.5638420245398775e-06, "objective/entropy": 2.6595993041992188, "objective/kl": 19.53274154663086, "objective/non_score_reward": -1.9532740116119385, "objective/rlhf_reward": -9.813096046447754, "objective/scores": -0.5, "policy/approxkl_avg": 46.37147521972656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5686173439025879, "step": 2275, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9987603425979614 }, { "episode": 36432, "epoch": 0.6548513498939498, "loss/policy_avg": 0.0820363238453865, "lr": 2.5636503067484663e-06, "objective/entropy": 145.08493041992188, "objective/kl": 14.222003936767578, "objective/non_score_reward": -1.422200322151184, "objective/rlhf_reward": -3.9554681936899816, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 99.89260864257812, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6236465573310852, "step": 2276, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998683214187622 }, { "episode": 36448, "epoch": 0.6551389438113384, "loss/policy_avg": 0.00839349627494812, "lr": 2.5634585889570556e-06, "objective/entropy": 215.92877197265625, "objective/kl": 21.089658737182617, "objective/non_score_reward": -2.1089658737182617, "objective/rlhf_reward": -10.435863494873047, "objective/scores": -0.5, "policy/approxkl_avg": 24.066848754882812, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.8423826098442078, "step": 2277, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0013651847839355 }, { "episode": 36464, "epoch": 0.655426537728727, "loss/policy_avg": 0.675013542175293, "lr": 2.5632668711656444e-06, "objective/entropy": -108.94208526611328, "objective/kl": 19.815250396728516, "objective/non_score_reward": -1.9815250635147095, "objective/rlhf_reward": -9.92609977722168, "objective/scores": -0.5, "policy/approxkl_avg": 67.00296020507812, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8092669248580933, "step": 2278, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9995282888412476 }, { "episode": 36480, "epoch": 0.6557141316461157, "loss/policy_avg": 1.2140064239501953, "lr": 2.563075153374233e-06, "objective/entropy": -95.05614471435547, "objective/kl": 16.91280174255371, "objective/non_score_reward": -1.6912803649902344, "objective/rlhf_reward": -6.365121459960937, "objective/scores": 0.1, "policy/approxkl_avg": 20.181533813476562, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7570371627807617, "step": 2279, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9980716705322266 }, { "episode": 36496, "epoch": 0.6560017255635043, "loss/policy_avg": 0.0030971169471740723, "lr": 2.562883435582822e-06, "objective/entropy": -145.6276397705078, "objective/kl": 6.605377197265625, "objective/non_score_reward": -0.6605377197265625, "objective/rlhf_reward": -2.24215075969696, "objective/scores": 0.1, "policy/approxkl_avg": 1.3553369045257568, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5911043882369995, "step": 2280, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0003302097320557 }, { "episode": 36512, "epoch": 0.656289319480893, "loss/policy_avg": 0.3546699583530426, "lr": 2.562691717791411e-06, "objective/entropy": 233.58958435058594, "objective/kl": 17.121244430541992, "objective/non_score_reward": -1.7121243476867676, "objective/rlhf_reward": -3.9247784956705303, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 12.144831657409668, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8071939945220947, "step": 2281, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0023207664489746 }, { "episode": 36528, "epoch": 0.6565769133982816, "loss/policy_avg": 1.2533683776855469, "lr": 2.5625e-06, "objective/entropy": 40.26350402832031, "objective/kl": 11.750067710876465, "objective/non_score_reward": -1.1750068664550781, "objective/rlhf_reward": -2.577320875898872, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 34.881553649902344, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6683433055877686, "step": 2282, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9997665882110596 }, { "episode": 36544, "epoch": 0.6568645073156703, "loss/policy_avg": -0.07329759001731873, "lr": 2.562308282208589e-06, "objective/entropy": 74.04084777832031, "objective/kl": 14.00217342376709, "objective/non_score_reward": -1.4002174139022827, "objective/rlhf_reward": -7.600869655609131, "objective/scores": -0.5, "policy/approxkl_avg": 15.320232391357422, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.760829508304596, "step": 2283, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000995635986328 }, { "episode": 36560, "epoch": 0.6571521012330589, "loss/policy_avg": 0.40319544076919556, "lr": 2.562116564417178e-06, "objective/entropy": 132.4573974609375, "objective/kl": 17.98862075805664, "objective/non_score_reward": -1.7988622188568115, "objective/rlhf_reward": -4.795448756217956, "objective/scores": 0.6, "policy/approxkl_avg": 17.402114868164062, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6830170154571533, "step": 2284, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9980769157409668 }, { "episode": 36576, "epoch": 0.6574396951504475, "loss/policy_avg": 0.4163977801799774, "lr": 2.561924846625767e-06, "objective/entropy": 52.17835998535156, "objective/kl": 14.525779724121094, "objective/non_score_reward": -1.452578067779541, "objective/rlhf_reward": -1.410312300920486, "objective/scores": 1.1, "policy/approxkl_avg": 14.813898086547852, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.811333417892456, "step": 2285, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9997317790985107 }, { "episode": 36592, "epoch": 0.6577272890678362, "loss/policy_avg": 0.23318399488925934, "lr": 2.5617331288343557e-06, "objective/entropy": 176.15615844726562, "objective/kl": 17.697359085083008, "objective/non_score_reward": -1.7697358131408691, "objective/rlhf_reward": -9.078943252563477, "objective/scores": -0.5, "policy/approxkl_avg": 78.4808349609375, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7467453479766846, "step": 2286, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9992974996566772 }, { "episode": 36608, "epoch": 0.6580148829852248, "loss/policy_avg": 0.1549290418624878, "lr": 2.561541411042945e-06, "objective/entropy": 181.3204345703125, "objective/kl": 21.084064483642578, "objective/non_score_reward": -2.1084065437316895, "objective/rlhf_reward": -4.033626532554626, "objective/scores": 1.1, "policy/approxkl_avg": 16.693029403686523, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.790191650390625, "step": 2287, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9995472431182861 }, { "episode": 36624, "epoch": 0.6583024769026136, "loss/policy_avg": -0.020601997151970863, "lr": 2.5613496932515337e-06, "objective/entropy": 260.5650634765625, "objective/kl": 17.291284561157227, "objective/non_score_reward": -1.729128360748291, "objective/rlhf_reward": -8.916513442993164, "objective/scores": -0.5, "policy/approxkl_avg": 32.037269592285156, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6160063743591309, "step": 2288, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.999596357345581 }, { "episode": 36640, "epoch": 0.6585900708200022, "loss/policy_avg": 0.19600838422775269, "lr": 2.561157975460123e-06, "objective/entropy": -18.00920867919922, "objective/kl": 9.472246170043945, "objective/non_score_reward": -0.9472246170043945, "objective/rlhf_reward": -5.788898468017578, "objective/scores": -0.5, "policy/approxkl_avg": 53.95936584472656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6834456920623779, "step": 2289, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.998971700668335 }, { "episode": 36656, "epoch": 0.6588776647373908, "loss/policy_avg": 0.576319694519043, "lr": 2.5609662576687118e-06, "objective/entropy": -37.32265090942383, "objective/kl": 12.981508255004883, "objective/non_score_reward": -1.2981507778167725, "objective/rlhf_reward": -4.7926029920578, "objective/scores": 0.1, "policy/approxkl_avg": 4.780340671539307, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5498796701431274, "step": 2290, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9995492696762085 }, { "episode": 36672, "epoch": 0.6591652586547795, "loss/policy_avg": 0.64011150598526, "lr": 2.5607745398773006e-06, "objective/entropy": 123.31390380859375, "objective/kl": 21.683334350585938, "objective/non_score_reward": -2.1683335304260254, "objective/rlhf_reward": -10.673334121704102, "objective/scores": -0.5, "policy/approxkl_avg": 97.21592712402344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6466020345687866, "step": 2291, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9990426301956177 }, { "episode": 36688, "epoch": 0.6594528525721681, "loss/policy_avg": 0.804427981376648, "lr": 2.56058282208589e-06, "objective/entropy": -145.3238525390625, "objective/kl": 16.019611358642578, "objective/non_score_reward": -1.6019612550735474, "objective/rlhf_reward": -2.0078449010848995, "objective/scores": 1.1, "policy/approxkl_avg": 84.41429138183594, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4610861539840698, "step": 2292, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9991902112960815 }, { "episode": 36704, "epoch": 0.6597404464895568, "loss/policy_avg": 0.5242794752120972, "lr": 2.5603911042944786e-06, "objective/entropy": -87.87837219238281, "objective/kl": 14.958013534545898, "objective/non_score_reward": -1.4958014488220215, "objective/rlhf_reward": -7.983206272125244, "objective/scores": -0.5, "policy/approxkl_avg": 20.005577087402344, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6943378448486328, "step": 2293, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.9993853569030762 }, { "episode": 36720, "epoch": 0.6600280404069454, "loss/policy_avg": 0.19961407780647278, "lr": 2.5601993865030674e-06, "objective/entropy": -21.319744110107422, "objective/kl": 11.861923217773438, "objective/non_score_reward": -1.186192274093628, "objective/rlhf_reward": -1.8210502012979715, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 1.3134453296661377, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.8009828329086304, "step": 2294, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9999687671661377 }, { "episode": 36736, "epoch": 0.660315634324334, "loss/policy_avg": 0.148433655500412, "lr": 2.5600076687116567e-06, "objective/entropy": 152.937255859375, "objective/kl": 17.217933654785156, "objective/non_score_reward": -1.7217931747436523, "objective/rlhf_reward": -5.330913155284479, "objective/scores": 0.38906482631788786, "policy/approxkl_avg": 76.15084838867188, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7016232013702393, "step": 2295, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9986050128936768 }, { "episode": 36752, "epoch": 0.6606032282417227, "loss/policy_avg": 0.6339513659477234, "lr": 2.5598159509202455e-06, "objective/entropy": 89.31312561035156, "objective/kl": 18.98331642150879, "objective/non_score_reward": -1.898331642150879, "objective/rlhf_reward": -7.19332624077797, "objective/scores": 0.1, "policy/approxkl_avg": 37.01063919067383, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5004596710205078, "step": 2296, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9980783462524414 }, { "episode": 36768, "epoch": 0.6608908221591113, "loss/policy_avg": 0.32428914308547974, "lr": 2.5596242331288347e-06, "objective/entropy": 208.98477172851562, "objective/kl": 23.732393264770508, "objective/non_score_reward": -2.373239517211914, "objective/rlhf_reward": -11.492958068847656, "objective/scores": -0.5, "policy/approxkl_avg": 90.21807098388672, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7058777213096619, "step": 2297, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9969258308410645 }, { "episode": 36784, "epoch": 0.6611784160765, "loss/policy_avg": 0.426738977432251, "lr": 2.5594325153374235e-06, "objective/entropy": 59.808349609375, "objective/kl": 18.861650466918945, "objective/non_score_reward": -1.886164903640747, "objective/rlhf_reward": -4.620940540672514, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 51.7357177734375, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7321165204048157, "step": 2298, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9962069988250732 }, { "episode": 36800, "epoch": 0.6614660099938886, "loss/policy_avg": -0.3187381327152252, "lr": 2.5592407975460123e-06, "objective/entropy": 113.80389404296875, "objective/kl": 17.378707885742188, "objective/non_score_reward": -1.737870693206787, "objective/rlhf_reward": -4.551482772827148, "objective/scores": 0.6, "policy/approxkl_avg": 18.090682983398438, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6032657623291016, "step": 2299, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.001037120819092 }, { "episode": 36816, "epoch": 0.6617536039112772, "loss/policy_avg": 0.1893741935491562, "lr": 2.5590490797546016e-06, "objective/entropy": 88.16151428222656, "objective/kl": 18.612485885620117, "objective/non_score_reward": -1.86124849319458, "objective/rlhf_reward": -9.44499397277832, "objective/scores": -0.5, "policy/approxkl_avg": 74.23924255371094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6313012838363647, "step": 2300, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.00156831741333 }, { "episode": 36832, "epoch": 0.6620411978286659, "loss/policy_avg": -0.07124347239732742, "lr": 2.55885736196319e-06, "objective/entropy": 3.6212921142578125, "objective/kl": 16.59735107421875, "objective/non_score_reward": -1.6597352027893066, "objective/rlhf_reward": -8.638940811157227, "objective/scores": -0.5, "policy/approxkl_avg": 2.6608316898345947, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.586020827293396, "step": 2301, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.000122547149658 }, { "episode": 36848, "epoch": 0.6623287917460545, "loss/policy_avg": -0.32903608679771423, "lr": 2.558665644171779e-06, "objective/entropy": 4.168998718261719, "objective/kl": 16.042404174804688, "objective/non_score_reward": -1.6042404174804688, "objective/rlhf_reward": -4.016961371898651, "objective/scores": 0.6, "policy/approxkl_avg": 20.106163024902344, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6971254944801331, "step": 2302, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.00260066986084 }, { "episode": 36864, "epoch": 0.6626163856634432, "loss/policy_avg": 0.420703649520874, "lr": 2.558473926380368e-06, "objective/entropy": 203.9630889892578, "objective/kl": 16.670202255249023, "objective/non_score_reward": -1.6670202016830444, "objective/rlhf_reward": -6.2680810451507565, "objective/scores": 0.1, "policy/approxkl_avg": 72.04973602294922, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.749180793762207, "step": 2303, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9969289302825928 }, { "episode": 36880, "epoch": 0.6629039795808319, "loss/policy_avg": 0.05179433524608612, "lr": 2.558282208588957e-06, "objective/entropy": 0.6831283569335938, "objective/kl": 18.128053665161133, "objective/non_score_reward": -1.812805414199829, "objective/rlhf_reward": -5.303810517268117, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 13.856398582458496, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 1.0042017698287964, "step": 2304, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9989118576049805 }, { "episode": 36896, "epoch": 0.6631915734982206, "loss/policy_avg": 0.2960914075374603, "lr": 2.558090490797546e-06, "objective/entropy": 83.56949615478516, "objective/kl": 17.618938446044922, "objective/non_score_reward": -1.7618937492370605, "objective/rlhf_reward": -9.047574996948242, "objective/scores": -0.5, "policy/approxkl_avg": 8.121335983276367, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5466283559799194, "step": 2305, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000248908996582 }, { "episode": 36912, "epoch": 0.6634791674156092, "loss/policy_avg": 0.17726615071296692, "lr": 2.557898773006135e-06, "objective/entropy": 62.649749755859375, "objective/kl": 14.863105773925781, "objective/non_score_reward": -1.4863104820251465, "objective/rlhf_reward": -1.5452419877052304, "objective/scores": 1.1, "policy/approxkl_avg": 21.797748565673828, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7958052158355713, "step": 2306, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.999192237854004 }, { "episode": 36928, "epoch": 0.6637667613329978, "loss/policy_avg": 0.27799302339553833, "lr": 2.557707055214724e-06, "objective/entropy": -89.15963745117188, "objective/kl": 12.794493675231934, "objective/non_score_reward": -1.279449462890625, "objective/rlhf_reward": -2.194078732968542, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 7.691387176513672, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5591141581535339, "step": 2307, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9968161582946777 }, { "episode": 36944, "epoch": 0.6640543552503865, "loss/policy_avg": 0.22466981410980225, "lr": 2.557515337423313e-06, "objective/entropy": 128.76828002929688, "objective/kl": 16.3138484954834, "objective/non_score_reward": -1.6313848495483398, "objective/rlhf_reward": -2.125539308786392, "objective/scores": 1.1, "policy/approxkl_avg": 32.673011779785156, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7959351539611816, "step": 2308, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9986681938171387 }, { "episode": 36960, "epoch": 0.6643419491677751, "loss/policy_avg": 0.15630009770393372, "lr": 2.5573236196319017e-06, "objective/entropy": -250.48313903808594, "objective/kl": 13.846506118774414, "objective/non_score_reward": -1.384650707244873, "objective/rlhf_reward": -2.6148838742983074, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 10.164833068847656, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5223649740219116, "step": 2309, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9981579780578613 }, { "episode": 36976, "epoch": 0.6646295430851638, "loss/policy_avg": 0.0861392617225647, "lr": 2.557131901840491e-06, "objective/entropy": 149.0970916748047, "objective/kl": 15.741743087768555, "objective/non_score_reward": -1.574174165725708, "objective/rlhf_reward": -3.896696931123733, "objective/scores": 0.6, "policy/approxkl_avg": 103.422119140625, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8788071870803833, "step": 2310, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9999258518218994 }, { "episode": 36992, "epoch": 0.6649171370025524, "loss/policy_avg": 0.13818402588367462, "lr": 2.5569401840490797e-06, "objective/entropy": 198.20147705078125, "objective/kl": 22.796815872192383, "objective/non_score_reward": -2.279681444168091, "objective/rlhf_reward": -6.195007000805113, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 46.45262145996094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4907756447792053, "step": 2311, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9959955215454102 }, { "episode": 37008, "epoch": 0.665204730919941, "loss/policy_avg": 0.10368118435144424, "lr": 2.556748466257669e-06, "objective/entropy": 5.179817199707031, "objective/kl": 16.019678115844727, "objective/non_score_reward": -1.6019678115844727, "objective/rlhf_reward": -2.0078711867332455, "objective/scores": 1.1, "policy/approxkl_avg": 1.488909363746643, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7089933156967163, "step": 2312, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0010156631469727 }, { "episode": 37024, "epoch": 0.6654923248373297, "loss/policy_avg": 0.5133357644081116, "lr": 2.5565567484662578e-06, "objective/entropy": -228.21189880371094, "objective/kl": 11.758785247802734, "objective/non_score_reward": -1.1758785247802734, "objective/rlhf_reward": -0.3035142779350277, "objective/scores": 1.1, "policy/approxkl_avg": 10.367254257202148, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8332253694534302, "step": 2313, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 2.0001230239868164 }, { "episode": 37040, "epoch": 0.6657799187547183, "loss/policy_avg": 1.2695897817611694, "lr": 2.5563650306748466e-06, "objective/entropy": 69.35960388183594, "objective/kl": 14.953081130981445, "objective/non_score_reward": -1.4953081607818604, "objective/rlhf_reward": -7.9812331199646, "objective/scores": -0.5, "policy/approxkl_avg": 35.81647491455078, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7310347557067871, "step": 2314, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9994488954544067 }, { "episode": 37056, "epoch": 0.666067512672107, "loss/policy_avg": 0.3404615521430969, "lr": 2.556173312883436e-06, "objective/entropy": 55.503807067871094, "objective/kl": 19.9896240234375, "objective/non_score_reward": -1.998962163925171, "objective/rlhf_reward": -6.391729030672627, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 13.813854217529297, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6280677914619446, "step": 2315, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.997502326965332 }, { "episode": 37072, "epoch": 0.6663551065894956, "loss/policy_avg": 0.26698288321495056, "lr": 2.5559815950920246e-06, "objective/entropy": 66.94442749023438, "objective/kl": 11.107053756713867, "objective/non_score_reward": -1.1107053756713867, "objective/rlhf_reward": -6.442821502685547, "objective/scores": -0.5, "policy/approxkl_avg": 5.230381011962891, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6185259819030762, "step": 2316, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9984166622161865 }, { "episode": 37088, "epoch": 0.6666427005068842, "loss/policy_avg": 0.272176593542099, "lr": 2.5557898773006134e-06, "objective/entropy": -25.308387756347656, "objective/kl": 13.86503791809082, "objective/non_score_reward": -1.3865039348602295, "objective/rlhf_reward": -5.146015635132789, "objective/scores": 0.1, "policy/approxkl_avg": 84.04141235351562, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.708859920501709, "step": 2317, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0056681632995605 }, { "episode": 37104, "epoch": 0.6669302944242729, "loss/policy_avg": 0.4910784959793091, "lr": 2.5555981595092027e-06, "objective/entropy": 175.00706481933594, "objective/kl": 13.743255615234375, "objective/non_score_reward": -1.3743257522583008, "objective/rlhf_reward": -5.097303009033203, "objective/scores": 0.1, "policy/approxkl_avg": 29.28903579711914, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4330393671989441, "step": 2318, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.0001161098480225 }, { "episode": 37120, "epoch": 0.6672178883416616, "loss/policy_avg": 0.3694079518318176, "lr": 2.5554064417177915e-06, "objective/entropy": 148.99612426757812, "objective/kl": 19.65071678161621, "objective/non_score_reward": -1.965071678161621, "objective/rlhf_reward": -9.860286712646484, "objective/scores": -0.5, "policy/approxkl_avg": 144.99139404296875, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5872672200202942, "step": 2319, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.999612808227539 }, { "episode": 37136, "epoch": 0.6675054822590503, "loss/policy_avg": 0.3520659804344177, "lr": 2.5552147239263807e-06, "objective/entropy": 141.15826416015625, "objective/kl": 17.581241607666016, "objective/non_score_reward": -1.7581241130828857, "objective/rlhf_reward": -5.085085282998021, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 29.694534301757812, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.3657630980014801, "step": 2320, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9991331100463867 }, { "episode": 37152, "epoch": 0.6677930761764389, "loss/policy_avg": 0.11424215137958527, "lr": 2.5550230061349695e-06, "objective/entropy": 117.16325378417969, "objective/kl": 15.977231979370117, "objective/non_score_reward": -1.5977232456207275, "objective/rlhf_reward": -8.39089298248291, "objective/scores": -0.5, "policy/approxkl_avg": 38.100990295410156, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.598469614982605, "step": 2321, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9974943399429321 }, { "episode": 37168, "epoch": 0.6680806700938275, "loss/policy_avg": 0.06039281189441681, "lr": 2.5548312883435583e-06, "objective/entropy": 173.42393493652344, "objective/kl": 18.59466552734375, "objective/non_score_reward": -1.859466552734375, "objective/rlhf_reward": -3.0378662109374996, "objective/scores": 1.1, "policy/approxkl_avg": 83.97549438476562, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7640442848205566, "step": 2322, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.995422601699829 }, { "episode": 37184, "epoch": 0.6683682640112162, "loss/policy_avg": 0.4325703978538513, "lr": 2.554639570552147e-06, "objective/entropy": 101.97483825683594, "objective/kl": 18.731670379638672, "objective/non_score_reward": -1.8731671571731567, "objective/rlhf_reward": -4.568949614406797, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 98.36922454833984, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6910192370414734, "step": 2323, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.999758005142212 }, { "episode": 37200, "epoch": 0.6686558579286048, "loss/policy_avg": 0.013559557497501373, "lr": 2.554447852760736e-06, "objective/entropy": 123.21492767333984, "objective/kl": 15.45811653137207, "objective/non_score_reward": -1.5458115339279175, "objective/rlhf_reward": -5.783246254920959, "objective/scores": 0.1, "policy/approxkl_avg": 21.97567367553711, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6617206335067749, "step": 2324, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0010128021240234 }, { "episode": 37216, "epoch": 0.6689434518459935, "loss/policy_avg": 0.011508099734783173, "lr": 2.554256134969325e-06, "objective/entropy": 256.7122802734375, "objective/kl": 17.93226432800293, "objective/non_score_reward": -1.7932264804840088, "objective/rlhf_reward": -6.772906160354614, "objective/scores": 0.1, "policy/approxkl_avg": 14.55018424987793, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8229022026062012, "step": 2325, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000396251678467 }, { "episode": 37232, "epoch": 0.6692310457633821, "loss/policy_avg": 0.22444044053554535, "lr": 2.554064417177914e-06, "objective/entropy": 158.66749572753906, "objective/kl": 13.320199966430664, "objective/non_score_reward": -1.3320200443267822, "objective/rlhf_reward": -7.328080177307129, "objective/scores": -0.5, "policy/approxkl_avg": 49.490081787109375, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6645934581756592, "step": 2326, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0004334449768066 }, { "episode": 37248, "epoch": 0.6695186396807707, "loss/policy_avg": 0.24487340450286865, "lr": 2.5538726993865032e-06, "objective/entropy": 148.02096557617188, "objective/kl": 23.824968338012695, "objective/non_score_reward": -2.3824968338012695, "objective/rlhf_reward": -11.529987335205078, "objective/scores": -0.5, "policy/approxkl_avg": 41.53895950317383, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6445210576057434, "step": 2327, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9995183944702148 }, { "episode": 37264, "epoch": 0.6698062335981594, "loss/policy_avg": 0.021379921585321426, "lr": 2.553680981595092e-06, "objective/entropy": -103.62948608398438, "objective/kl": 13.832340240478516, "objective/non_score_reward": -1.3832340240478516, "objective/rlhf_reward": -3.5855251056718185, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 1.3906583786010742, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6477087140083313, "step": 2328, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0015311241149902 }, { "episode": 37280, "epoch": 0.670093827515548, "loss/policy_avg": 0.49968069791793823, "lr": 2.553489263803681e-06, "objective/entropy": 59.137794494628906, "objective/kl": 18.70618438720703, "objective/non_score_reward": -1.8706185817718506, "objective/rlhf_reward": -7.082474192976951, "objective/scores": 0.1, "policy/approxkl_avg": 209.8355255126953, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.597786545753479, "step": 2329, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9967823028564453 }, { "episode": 37296, "epoch": 0.6703814214329367, "loss/policy_avg": 0.4317970275878906, "lr": 2.55329754601227e-06, "objective/entropy": 15.281234741210938, "objective/kl": 8.350692749023438, "objective/non_score_reward": -0.8350692391395569, "objective/rlhf_reward": -5.340276718139648, "objective/scores": -0.5, "policy/approxkl_avg": 13.626282691955566, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7200409173965454, "step": 2330, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.997509479522705 }, { "episode": 37312, "epoch": 0.6706690153503253, "loss/policy_avg": 2.48857045173645, "lr": 2.553105828220859e-06, "objective/entropy": 10.84235954284668, "objective/kl": 15.428094863891602, "objective/non_score_reward": -1.5428093671798706, "objective/rlhf_reward": -1.7712374091148373, "objective/scores": 1.1, "policy/approxkl_avg": 38.43216323852539, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7159597873687744, "step": 2331, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0019233226776123 }, { "episode": 37328, "epoch": 0.6709566092677139, "loss/policy_avg": 0.27047663927078247, "lr": 2.5529141104294477e-06, "objective/entropy": 1.0194292068481445, "objective/kl": 19.12179946899414, "objective/non_score_reward": -1.9121801853179932, "objective/rlhf_reward": -5.70130969114774, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 67.82426452636719, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.4647403359413147, "step": 2332, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9991931915283203 }, { "episode": 37344, "epoch": 0.6712442031851026, "loss/policy_avg": 0.18591710925102234, "lr": 2.552722392638037e-06, "objective/entropy": -36.61052703857422, "objective/kl": 18.88507843017578, "objective/non_score_reward": -1.8885078430175781, "objective/rlhf_reward": -7.154031640291214, "objective/scores": 0.1, "policy/approxkl_avg": 35.58317565917969, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5592823624610901, "step": 2333, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9964879751205444 }, { "episode": 37360, "epoch": 0.6715317971024913, "loss/policy_avg": 0.05644374340772629, "lr": 2.5525306748466257e-06, "objective/entropy": 63.062744140625, "objective/kl": 17.671581268310547, "objective/non_score_reward": -1.7671581506729126, "objective/rlhf_reward": -6.668632781505584, "objective/scores": 0.1, "policy/approxkl_avg": 38.121299743652344, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8344275951385498, "step": 2334, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9961326122283936 }, { "episode": 37376, "epoch": 0.67181939101988, "loss/policy_avg": 0.17964068055152893, "lr": 2.552338957055215e-06, "objective/entropy": 50.110347747802734, "objective/kl": 12.703283309936523, "objective/non_score_reward": -1.270328164100647, "objective/rlhf_reward": -2.681312835216522, "objective/scores": 0.6, "policy/approxkl_avg": 9.324361801147461, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6545754671096802, "step": 2335, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0004725456237793 }, { "episode": 37392, "epoch": 0.6721069849372686, "loss/policy_avg": 0.18911662697792053, "lr": 2.552147239263804e-06, "objective/entropy": 197.40760803222656, "objective/kl": 16.612634658813477, "objective/non_score_reward": -1.6612634658813477, "objective/rlhf_reward": -4.911720425883928, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 58.14643096923828, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.634497880935669, "step": 2336, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9985204935073853 }, { "episode": 37408, "epoch": 0.6723945788546573, "loss/policy_avg": 0.4584101736545563, "lr": 2.5519555214723926e-06, "objective/entropy": 149.51400756835938, "objective/kl": 16.61104965209961, "objective/non_score_reward": -1.6611049175262451, "objective/rlhf_reward": -5.040299389425831, "objective/scores": 0.40102999566398123, "policy/approxkl_avg": 62.66534423828125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7145014405250549, "step": 2337, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9983527660369873 }, { "episode": 37424, "epoch": 0.6726821727720459, "loss/policy_avg": 0.6171430349349976, "lr": 2.551763803680982e-06, "objective/entropy": -9.677066802978516, "objective/kl": 13.620866775512695, "objective/non_score_reward": -1.3620866537094116, "objective/rlhf_reward": -3.048346465826034, "objective/scores": 0.6, "policy/approxkl_avg": 44.04124450683594, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6463913917541504, "step": 2338, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9975409507751465 }, { "episode": 37440, "epoch": 0.6729697666894345, "loss/policy_avg": 0.1318362057209015, "lr": 2.5515720858895706e-06, "objective/entropy": 38.19598388671875, "objective/kl": 19.960039138793945, "objective/non_score_reward": -1.9960038661956787, "objective/rlhf_reward": -7.5840154647827145, "objective/scores": 0.1, "policy/approxkl_avg": 4.450936794281006, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5201414823532104, "step": 2339, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000936269760132 }, { "episode": 37456, "epoch": 0.6732573606068232, "loss/policy_avg": -0.4302183985710144, "lr": 2.55138036809816e-06, "objective/entropy": -12.776840209960938, "objective/kl": 11.0087890625, "objective/non_score_reward": -1.1008789539337158, "objective/rlhf_reward": -6.403515815734863, "objective/scores": -0.5, "policy/approxkl_avg": 4.597243309020996, "policy/clipfrac_avg": 2.0, "policy/entropy_avg": 0.7018436789512634, "step": 2340, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0039219856262207 }, { "episode": 37472, "epoch": 0.6735449545242118, "loss/policy_avg": 0.1691046804189682, "lr": 2.5511886503067487e-06, "objective/entropy": 159.7733917236328, "objective/kl": 11.087263107299805, "objective/non_score_reward": -1.1087265014648438, "objective/rlhf_reward": -6.434906005859375, "objective/scores": -0.5, "policy/approxkl_avg": 16.17793083190918, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.43710124492645264, "step": 2341, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9960179328918457 }, { "episode": 37488, "epoch": 0.6738325484416005, "loss/policy_avg": 0.35037779808044434, "lr": 2.5509969325153375e-06, "objective/entropy": 56.622718811035156, "objective/kl": 17.291534423828125, "objective/non_score_reward": -1.7291532754898071, "objective/rlhf_reward": -2.5166131317615505, "objective/scores": 1.1, "policy/approxkl_avg": 29.37803077697754, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.3321661353111267, "step": 2342, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9989839792251587 }, { "episode": 37504, "epoch": 0.6741201423589891, "loss/policy_avg": -0.08184777200222015, "lr": 2.5508052147239267e-06, "objective/entropy": -168.5766143798828, "objective/kl": 16.240413665771484, "objective/non_score_reward": -1.6240415573120117, "objective/rlhf_reward": -6.096166288852691, "objective/scores": 0.1, "policy/approxkl_avg": 54.85392761230469, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7052419781684875, "step": 2343, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9971129894256592 }, { "episode": 37520, "epoch": 0.6744077362763777, "loss/policy_avg": -0.20576009154319763, "lr": 2.5506134969325155e-06, "objective/entropy": -11.535301208496094, "objective/kl": 15.220268249511719, "objective/non_score_reward": -1.522026777267456, "objective/rlhf_reward": -1.6881071090698239, "objective/scores": 1.1, "policy/approxkl_avg": 15.641721725463867, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.47453761100769043, "step": 2344, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9983747005462646 }, { "episode": 37536, "epoch": 0.6746953301937664, "loss/policy_avg": 0.7535731792449951, "lr": 2.5504217791411043e-06, "objective/entropy": 97.59146118164062, "objective/kl": 12.347482681274414, "objective/non_score_reward": -1.2347482442855835, "objective/rlhf_reward": -2.8162866256394725, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 5.028593063354492, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5101989507675171, "step": 2345, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0025951862335205 }, { "episode": 37552, "epoch": 0.674982924111155, "loss/policy_avg": 0.07940283417701721, "lr": 2.550230061349693e-06, "objective/entropy": 21.05797576904297, "objective/kl": 9.749893188476562, "objective/non_score_reward": -0.9749892354011536, "objective/rlhf_reward": -3.4999567627906796, "objective/scores": 0.1, "policy/approxkl_avg": 2.4519693851470947, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6360624432563782, "step": 2346, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.000640869140625 }, { "episode": 37568, "epoch": 0.6752705180285437, "loss/policy_avg": 0.5512768030166626, "lr": 2.550038343558282e-06, "objective/entropy": 23.716659545898438, "objective/kl": 17.07099151611328, "objective/non_score_reward": -1.707099199295044, "objective/rlhf_reward": -8.82839584350586, "objective/scores": -0.5, "policy/approxkl_avg": 37.055694580078125, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5303549766540527, "step": 2347, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.996858835220337 }, { "episode": 37584, "epoch": 0.6755581119459323, "loss/policy_avg": 1.214499831199646, "lr": 2.549846625766871e-06, "objective/entropy": -126.47698974609375, "objective/kl": 13.222586631774902, "objective/non_score_reward": -1.322258710861206, "objective/rlhf_reward": -0.8890346050262448, "objective/scores": 1.1, "policy/approxkl_avg": 3.0465681552886963, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.7465829849243164, "step": 2348, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9995687007904053 }, { "episode": 37600, "epoch": 0.675845705863321, "loss/policy_avg": 0.1287229359149933, "lr": 2.54965490797546e-06, "objective/entropy": -96.94557189941406, "objective/kl": 17.62995147705078, "objective/non_score_reward": -1.7629950046539307, "objective/rlhf_reward": -6.651980257034301, "objective/scores": 0.1, "policy/approxkl_avg": 23.454124450683594, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8209439516067505, "step": 2349, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9989672899246216 }, { "episode": 37616, "epoch": 0.6761332997807097, "loss/policy_avg": -0.0770440399646759, "lr": 2.5494631901840492e-06, "objective/entropy": -243.44192504882812, "objective/kl": 14.587154388427734, "objective/non_score_reward": -1.458715558052063, "objective/rlhf_reward": -3.88745106287473, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 8.927319526672363, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6826550960540771, "step": 2350, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.998389720916748 }, { "episode": 37632, "epoch": 0.6764208936980983, "loss/policy_avg": 0.21553274989128113, "lr": 2.549271472392638e-06, "objective/entropy": 9.667755126953125, "objective/kl": 17.31271743774414, "objective/non_score_reward": -1.731271743774414, "objective/rlhf_reward": -4.802380504385505, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 57.263301849365234, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6656813621520996, "step": 2351, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.000150203704834 }, { "episode": 37648, "epoch": 0.676708487615487, "loss/policy_avg": -0.047590918838977814, "lr": 2.549079754601227e-06, "objective/entropy": 90.6687240600586, "objective/kl": 14.7943696975708, "objective/non_score_reward": -1.479436993598938, "objective/rlhf_reward": -5.517748034000396, "objective/scores": 0.1, "policy/approxkl_avg": 5.7691826820373535, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6057709455490112, "step": 2352, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0000667572021484 }, { "episode": 37664, "epoch": 0.6769960815328756, "loss/policy_avg": 0.8578144311904907, "lr": 2.548888036809816e-06, "objective/entropy": 70.19683837890625, "objective/kl": 18.876052856445312, "objective/non_score_reward": -1.8876051902770996, "objective/rlhf_reward": -3.150420686602592, "objective/scores": 1.1, "policy/approxkl_avg": 41.112037658691406, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5748271942138672, "step": 2353, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.000706195831299 }, { "episode": 37680, "epoch": 0.6772836754502642, "loss/policy_avg": 7.543760776519775, "lr": 2.548696319018405e-06, "objective/entropy": 126.7467041015625, "objective/kl": 10.020503997802734, "objective/non_score_reward": -1.0020503997802734, "objective/rlhf_reward": -3.6082015097141262, "objective/scores": 0.1, "policy/approxkl_avg": 3.69321608543396, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5338419675827026, "step": 2354, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.015646457672119 }, { "episode": 37696, "epoch": 0.6775712693676529, "loss/policy_avg": 0.4916594624519348, "lr": 2.5485046012269937e-06, "objective/entropy": 15.650882720947266, "objective/kl": 16.535350799560547, "objective/non_score_reward": -1.653535008430481, "objective/rlhf_reward": -8.614140510559082, "objective/scores": -0.5, "policy/approxkl_avg": 42.67940902709961, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.46649202704429626, "step": 2355, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.998720407485962 }, { "episode": 37712, "epoch": 0.6778588632850415, "loss/policy_avg": 0.3172226846218109, "lr": 2.548312883435583e-06, "objective/entropy": 265.034912109375, "objective/kl": 17.4699764251709, "objective/non_score_reward": -1.746997594833374, "objective/rlhf_reward": -6.587990260124206, "objective/scores": 0.1, "policy/approxkl_avg": 33.108619689941406, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7924455404281616, "step": 2356, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9985918998718262 }, { "episode": 37728, "epoch": 0.6781464572024302, "loss/policy_avg": 0.11512107402086258, "lr": 2.5481211656441718e-06, "objective/entropy": 207.16583251953125, "objective/kl": 22.506271362304688, "objective/non_score_reward": -2.250627040863037, "objective/rlhf_reward": -8.602507925033569, "objective/scores": 0.1, "policy/approxkl_avg": 48.099510192871094, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5274972915649414, "step": 2357, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9984009265899658 }, { "episode": 37744, "epoch": 0.6784340511198188, "loss/policy_avg": -0.010802611708641052, "lr": 2.547929447852761e-06, "objective/entropy": -104.96611022949219, "objective/kl": 11.402233123779297, "objective/non_score_reward": -1.1402233839035034, "objective/rlhf_reward": -6.560893535614014, "objective/scores": -0.5, "policy/approxkl_avg": 1.0427619218826294, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.597365140914917, "step": 2358, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0010321140289307 }, { "episode": 37760, "epoch": 0.6787216450372074, "loss/policy_avg": 0.5463196039199829, "lr": 2.54773773006135e-06, "objective/entropy": 110.8509292602539, "objective/kl": 20.538684844970703, "objective/non_score_reward": -2.0538687705993652, "objective/rlhf_reward": -10.215475082397461, "objective/scores": -0.5, "policy/approxkl_avg": 35.8312873840332, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.4094330072402954, "step": 2359, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9977003335952759 }, { "episode": 37776, "epoch": 0.6790092389545961, "loss/policy_avg": 0.30159619450569153, "lr": 2.5475460122699386e-06, "objective/entropy": -3.8185462951660156, "objective/kl": 15.697071075439453, "objective/non_score_reward": -1.5697071552276611, "objective/rlhf_reward": -5.878828680515289, "objective/scores": 0.1, "policy/approxkl_avg": 32.05399703979492, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6199531555175781, "step": 2360, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9988099336624146 }, { "episode": 37792, "epoch": 0.6792968328719847, "loss/policy_avg": 0.5906786918640137, "lr": 2.547354294478528e-06, "objective/entropy": -6.215213775634766, "objective/kl": 11.929386138916016, "objective/non_score_reward": -1.1929385662078857, "objective/rlhf_reward": -4.37175435423851, "objective/scores": 0.1, "policy/approxkl_avg": 19.593338012695312, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7729129791259766, "step": 2361, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.997424602508545 }, { "episode": 37808, "epoch": 0.6795844267893734, "loss/policy_avg": 0.7287322282791138, "lr": 2.5471625766871166e-06, "objective/entropy": -13.709220886230469, "objective/kl": 10.416068077087402, "objective/non_score_reward": -1.0416066646575928, "objective/rlhf_reward": -3.7664268821477886, "objective/scores": 0.1, "policy/approxkl_avg": 21.55803108215332, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7005133628845215, "step": 2362, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9963767528533936 }, { "episode": 37824, "epoch": 0.679872020706762, "loss/policy_avg": 0.5065807104110718, "lr": 2.546970858895706e-06, "objective/entropy": 165.01046752929688, "objective/kl": 18.968791961669922, "objective/non_score_reward": -1.8968794345855713, "objective/rlhf_reward": -3.1875179469585415, "objective/scores": 1.1, "policy/approxkl_avg": 24.66299057006836, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7943272590637207, "step": 2363, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9983073472976685 }, { "episode": 37840, "epoch": 0.6801596146241508, "loss/policy_avg": -0.09428460896015167, "lr": 2.5467791411042947e-06, "objective/entropy": 57.868064880371094, "objective/kl": 19.484859466552734, "objective/non_score_reward": -1.9484859704971313, "objective/rlhf_reward": -5.393944060802459, "objective/scores": 0.6, "policy/approxkl_avg": 38.548187255859375, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.4131024479866028, "step": 2364, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.998547911643982 }, { "episode": 37856, "epoch": 0.6804472085415394, "loss/policy_avg": 0.014633553102612495, "lr": 2.5465874233128835e-06, "objective/entropy": 11.214363098144531, "objective/kl": 15.86156940460205, "objective/non_score_reward": -1.5861570835113525, "objective/rlhf_reward": -5.944628274440765, "objective/scores": 0.1, "policy/approxkl_avg": 8.504777908325195, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5623284578323364, "step": 2365, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9983117580413818 }, { "episode": 37872, "epoch": 0.680734802458928, "loss/policy_avg": -0.0003791128983721137, "lr": 2.5463957055214727e-06, "objective/entropy": 114.24466705322266, "objective/kl": 17.914379119873047, "objective/non_score_reward": -1.7914378643035889, "objective/rlhf_reward": -5.43241848150889, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 6.264281272888184, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6559627056121826, "step": 2366, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0004148483276367 }, { "episode": 37888, "epoch": 0.6810223963763167, "loss/policy_avg": 0.2917371690273285, "lr": 2.5462039877300615e-06, "objective/entropy": -3.1063995361328125, "objective/kl": 19.82198715209961, "objective/non_score_reward": -1.98219895362854, "objective/rlhf_reward": -7.5287956357002255, "objective/scores": 0.1, "policy/approxkl_avg": 146.663330078125, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.6002984046936035, "step": 2367, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 2.0023727416992188 }, { "episode": 37904, "epoch": 0.6813099902937053, "loss/policy_avg": 0.2226359248161316, "lr": 2.5460122699386504e-06, "objective/entropy": -102.39707946777344, "objective/kl": 19.871543884277344, "objective/non_score_reward": -1.987154483795166, "objective/rlhf_reward": -3.5486179426312443, "objective/scores": 1.1, "policy/approxkl_avg": 35.11748123168945, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7066978216171265, "step": 2368, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.00105619430542 }, { "episode": 37920, "epoch": 0.681597584211094, "loss/policy_avg": -0.22409255802631378, "lr": 2.545820552147239e-06, "objective/entropy": 28.46088409423828, "objective/kl": 12.455662727355957, "objective/non_score_reward": -1.2455663681030273, "objective/rlhf_reward": -0.5822652339935299, "objective/scores": 1.1, "policy/approxkl_avg": 22.929950714111328, "policy/clipfrac_avg": 1.75, "policy/entropy_avg": 0.4477207660675049, "step": 2369, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9993529319763184 }, { "episode": 37936, "epoch": 0.6818851781284826, "loss/policy_avg": 0.5391968488693237, "lr": 2.545628834355828e-06, "objective/entropy": 200.6731719970703, "objective/kl": 13.703605651855469, "objective/non_score_reward": -1.3703603744506836, "objective/rlhf_reward": -1.081441795825958, "objective/scores": 1.1, "policy/approxkl_avg": 22.747711181640625, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7975244522094727, "step": 2370, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000143051147461 }, { "episode": 37952, "epoch": 0.6821727720458712, "loss/policy_avg": 0.4769810736179352, "lr": 2.545437116564417e-06, "objective/entropy": 64.26600646972656, "objective/kl": 18.710128784179688, "objective/non_score_reward": -1.8710130453109741, "objective/rlhf_reward": -5.53664095230573, "objective/scores": 0.4868528072345416, "policy/approxkl_avg": 182.27891540527344, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5450712442398071, "step": 2371, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9994101524353027 }, { "episode": 37968, "epoch": 0.6824603659632599, "loss/policy_avg": -0.10944107174873352, "lr": 2.545245398773006e-06, "objective/entropy": -144.7941436767578, "objective/kl": 12.572160720825195, "objective/non_score_reward": -1.2572160959243774, "objective/rlhf_reward": -3.2955309907595316, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 27.950170516967773, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5846720933914185, "step": 2372, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 2.0016770362854004 }, { "episode": 37984, "epoch": 0.6827479598806485, "loss/policy_avg": 0.490910142660141, "lr": 2.5450536809815952e-06, "objective/entropy": -3.4798126220703125, "objective/kl": 16.669143676757812, "objective/non_score_reward": -1.6669142246246338, "objective/rlhf_reward": -4.5449509940305095, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 17.67532730102539, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.8314906358718872, "step": 2373, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9999407529830933 }, { "episode": 38000, "epoch": 0.6830355537980372, "loss/policy_avg": -0.02355530858039856, "lr": 2.544861963190184e-06, "objective/entropy": 2.0637435913085938, "objective/kl": 15.970002174377441, "objective/non_score_reward": -1.597000241279602, "objective/rlhf_reward": -4.563172216686319, "objective/scores": 0.4562071871080222, "policy/approxkl_avg": 41.44702911376953, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.43509942293167114, "step": 2374, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9991532564163208 }, { "episode": 38016, "epoch": 0.6833231477154258, "loss/policy_avg": 0.5410069227218628, "lr": 2.544670245398773e-06, "objective/entropy": -88.23371887207031, "objective/kl": 19.367130279541016, "objective/non_score_reward": -1.9367129802703857, "objective/rlhf_reward": -4.823133324028227, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 18.989822387695312, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4978526532649994, "step": 2375, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.9983242750167847 }, { "episode": 38032, "epoch": 0.6836107416328144, "loss/policy_avg": 0.07011735439300537, "lr": 2.544478527607362e-06, "objective/entropy": -34.97230529785156, "objective/kl": 11.747721672058105, "objective/non_score_reward": -1.1747722625732422, "objective/rlhf_reward": -4.299088871479034, "objective/scores": 0.1, "policy/approxkl_avg": 15.02751636505127, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6224102973937988, "step": 2376, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0016679763793945 }, { "episode": 38048, "epoch": 0.6838983355502031, "loss/policy_avg": 0.23295675218105316, "lr": 2.544286809815951e-06, "objective/entropy": -29.86457061767578, "objective/kl": 14.828042984008789, "objective/non_score_reward": -1.4828044176101685, "objective/rlhf_reward": -5.531217566132545, "objective/scores": 0.1, "policy/approxkl_avg": 80.19185638427734, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.647622287273407, "step": 2377, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.997412919998169 }, { "episode": 38064, "epoch": 0.6841859294675917, "loss/policy_avg": 0.24506890773773193, "lr": 2.54409509202454e-06, "objective/entropy": -49.15517807006836, "objective/kl": 13.11131477355957, "objective/non_score_reward": -1.3111315965652466, "objective/rlhf_reward": -0.8445264458656307, "objective/scores": 1.1, "policy/approxkl_avg": 10.21036148071289, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.7912728786468506, "step": 2378, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.999809980392456 }, { "episode": 38080, "epoch": 0.6844735233849804, "loss/policy_avg": 0.022225193679332733, "lr": 2.543903374233129e-06, "objective/entropy": 226.58547973632812, "objective/kl": 19.322124481201172, "objective/non_score_reward": -1.9322123527526855, "objective/rlhf_reward": -7.328849291801452, "objective/scores": 0.1, "policy/approxkl_avg": 84.02395629882812, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.836097776889801, "step": 2379, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 2.000347137451172 }, { "episode": 38096, "epoch": 0.6847611173023691, "loss/policy_avg": 0.15902036428451538, "lr": 2.5437116564417178e-06, "objective/entropy": -104.35630798339844, "objective/kl": 14.964473724365234, "objective/non_score_reward": -1.4964473247528076, "objective/rlhf_reward": -3.0620705827486248, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 42.95732879638672, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.4826052784919739, "step": 2380, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 1.9977092742919922 }, { "episode": 38112, "epoch": 0.6850487112197577, "loss/policy_avg": 0.17381584644317627, "lr": 2.543519938650307e-06, "objective/entropy": 153.31005859375, "objective/kl": 12.905035018920898, "objective/non_score_reward": -1.2905036211013794, "objective/rlhf_reward": -4.762014365196228, "objective/scores": 0.1, "policy/approxkl_avg": 6.92567253112793, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6593793630599976, "step": 2381, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 1.99961256980896 }, { "episode": 38128, "epoch": 0.6853363051371464, "loss/policy_avg": -0.15650756657123566, "lr": 2.543328220858896e-06, "objective/entropy": -78.62450408935547, "objective/kl": 16.713642120361328, "objective/non_score_reward": -1.6713643074035645, "objective/rlhf_reward": -4.285456991195678, "objective/scores": 0.6, "policy/approxkl_avg": 33.79967498779297, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7772436738014221, "step": 2382, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 4, "val/ratio": 2.0038373470306396 }, { "episode": 38144, "epoch": 0.685623899054535, "loss/policy_avg": 0.2791460156440735, "lr": 2.5431365030674846e-06, "objective/entropy": 45.02009582519531, "objective/kl": 14.298861503601074, "objective/non_score_reward": -1.4298863410949707, "objective/rlhf_reward": -7.719544887542725, "objective/scores": -0.5, "policy/approxkl_avg": 13.945395469665527, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7025325894355774, "step": 2383, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.9980533123016357 }, { "episode": 38160, "epoch": 0.6859114929719237, "loss/policy_avg": 0.14568883180618286, "lr": 2.542944785276074e-06, "objective/entropy": 195.76580810546875, "objective/kl": 16.112838745117188, "objective/non_score_reward": -1.611283779144287, "objective/rlhf_reward": -3.521416102291319, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 60.78529739379883, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5057171583175659, "step": 2384, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9989773035049438 }, { "episode": 38176, "epoch": 0.6861990868893123, "loss/policy_avg": 0.5063943862915039, "lr": 2.5427530674846627e-06, "objective/entropy": 165.6365203857422, "objective/kl": 17.116657257080078, "objective/non_score_reward": -1.7116656303405762, "objective/rlhf_reward": -8.846662521362305, "objective/scores": -0.5, "policy/approxkl_avg": 44.173545837402344, "policy/clipfrac_avg": 0.5, "policy/entropy_avg": 0.6770908832550049, "step": 2385, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9996287822723389 }, { "episode": 38192, "epoch": 0.6864866808067009, "loss/policy_avg": 0.1624879390001297, "lr": 2.542561349693252e-06, "objective/entropy": 11.813888549804688, "objective/kl": 10.612668991088867, "objective/non_score_reward": -1.0612671375274658, "objective/rlhf_reward": -6.245068550109863, "objective/scores": -0.5, "policy/approxkl_avg": 12.712059020996094, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7070558667182922, "step": 2386, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0033936500549316 }, { "episode": 38208, "epoch": 0.6867742747240896, "loss/policy_avg": -0.14744633436203003, "lr": 2.5423696319018407e-06, "objective/entropy": 15.351272583007812, "objective/kl": 20.102500915527344, "objective/non_score_reward": -2.0102505683898926, "objective/rlhf_reward": -7.641001945734025, "objective/scores": 0.1, "policy/approxkl_avg": 24.63510513305664, "policy/clipfrac_avg": 1.5, "policy/entropy_avg": 0.6769348978996277, "step": 2387, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.00319242477417 }, { "episode": 38224, "epoch": 0.6870618686414782, "loss/policy_avg": 0.3262820243835449, "lr": 2.5421779141104295e-06, "objective/entropy": -229.1365509033203, "objective/kl": 3.1505727767944336, "objective/non_score_reward": -0.31505724787712097, "objective/rlhf_reward": -0.8602290138602258, "objective/scores": 0.1, "policy/approxkl_avg": 1.0276970863342285, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.4453001916408539, "step": 2388, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 2.0001306533813477 }, { "episode": 38240, "epoch": 0.6873494625588669, "loss/policy_avg": 0.26754438877105713, "lr": 2.5419861963190187e-06, "objective/entropy": 255.03985595703125, "objective/kl": 13.205718994140625, "objective/non_score_reward": -1.3205718994140625, "objective/rlhf_reward": -0.8822874784469601, "objective/scores": 1.1, "policy/approxkl_avg": 17.32726287841797, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.6657876372337341, "step": 2389, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 2.001314640045166 }, { "episode": 38256, "epoch": 0.6876370564762555, "loss/policy_avg": 0.3306029438972473, "lr": 2.541794478527607e-06, "objective/entropy": -46.668853759765625, "objective/kl": 17.140121459960938, "objective/non_score_reward": -1.7140121459960938, "objective/rlhf_reward": -6.45604852437973, "objective/scores": 0.1, "policy/approxkl_avg": 55.39460754394531, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.8255484700202942, "step": 2390, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9972751140594482 }, { "episode": 38272, "epoch": 0.6879246503936441, "loss/policy_avg": 0.02296963334083557, "lr": 2.5416027607361964e-06, "objective/entropy": -19.883819580078125, "objective/kl": 13.475825309753418, "objective/non_score_reward": -1.3475825786590576, "objective/rlhf_reward": -7.3903303146362305, "objective/scores": -0.5, "policy/approxkl_avg": 5.257452487945557, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.7932485342025757, "step": 2391, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.9987152814865112 }, { "episode": 38288, "epoch": 0.6882122443110328, "loss/policy_avg": 0.1291026622056961, "lr": 2.541411042944785e-06, "objective/entropy": -122.60946655273438, "objective/kl": 19.44902801513672, "objective/non_score_reward": -1.9449028968811035, "objective/rlhf_reward": -5.656905176416908, "objective/scores": 0.5306765580733931, "policy/approxkl_avg": 32.762733459472656, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.7750039100646973, "step": 2392, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 5, "val/ratio": 1.997039794921875 }, { "episode": 38304, "epoch": 0.6884998382284214, "loss/policy_avg": 0.4377826452255249, "lr": 2.541219325153374e-06, "objective/entropy": 132.87936401367188, "objective/kl": 17.26844596862793, "objective/non_score_reward": -1.7268447875976562, "objective/rlhf_reward": -8.907379150390625, "objective/scores": -0.5, "policy/approxkl_avg": 9.914093017578125, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.5781127214431763, "step": 2393, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9998254776000977 }, { "episode": 38320, "epoch": 0.6887874321458101, "loss/policy_avg": 0.4851827025413513, "lr": 2.541027607361963e-06, "objective/entropy": 41.502628326416016, "objective/kl": 16.9117431640625, "objective/non_score_reward": -1.6911745071411133, "objective/rlhf_reward": -2.364697879552841, "objective/scores": 1.1, "policy/approxkl_avg": 21.00037384033203, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.7911096811294556, "step": 2394, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 1.9994444847106934 }, { "episode": 38336, "epoch": 0.6890750260631988, "loss/policy_avg": 0.2244480699300766, "lr": 2.540835889570552e-06, "objective/entropy": 92.34452819824219, "objective/kl": 17.634567260742188, "objective/non_score_reward": -1.7634568214416504, "objective/rlhf_reward": -5.320493773619333, "objective/scores": 0.43333333333333335, "policy/approxkl_avg": 41.524139404296875, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5093786716461182, "step": 2395, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.998244047164917 }, { "episode": 38352, "epoch": 0.6893626199805875, "loss/policy_avg": 0.04284711554646492, "lr": 2.5406441717791413e-06, "objective/entropy": 224.9025421142578, "objective/kl": 17.01390266418457, "objective/non_score_reward": -1.701390266418457, "objective/rlhf_reward": -8.805561065673828, "objective/scores": -0.5, "policy/approxkl_avg": 28.81897735595703, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.6981585025787354, "step": 2396, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 1, "val/ratio": 1.9998838901519775 }, { "episode": 38368, "epoch": 0.6896502138979761, "loss/policy_avg": 3.3234200477600098, "lr": 2.54045245398773e-06, "objective/entropy": 19.97397804260254, "objective/kl": 20.188440322875977, "objective/non_score_reward": -2.0188441276550293, "objective/rlhf_reward": -5.151657138706419, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 16.88092803955078, "policy/clipfrac_avg": 1.25, "policy/entropy_avg": 0.5968458652496338, "step": 2397, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.0011932849884033 }, { "episode": 38384, "epoch": 0.6899378078153647, "loss/policy_avg": 0.5251247882843018, "lr": 2.540260736196319e-06, "objective/entropy": 1.8503284454345703, "objective/kl": 19.552143096923828, "objective/non_score_reward": -1.9552143812179565, "objective/rlhf_reward": -4.89713830196974, "objective/scores": 0.7309297535714575, "policy/approxkl_avg": 6.352148532867432, "policy/clipfrac_avg": 0.75, "policy/entropy_avg": 0.6283992528915405, "step": 2398, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 3, "val/ratio": 2.00150990486145 }, { "episode": 38400, "epoch": 0.6902254017327534, "loss/policy_avg": 0.30495020747184753, "lr": 2.540069018404908e-06, "objective/entropy": 225.76051330566406, "objective/kl": 18.361190795898438, "objective/non_score_reward": -1.8361191749572754, "objective/rlhf_reward": -6.944476789236068, "objective/scores": 0.1, "policy/approxkl_avg": 11.112527847290039, "policy/clipfrac_avg": 1.0, "policy/entropy_avg": 0.5440366268157959, "step": 2399, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 1.9987056255340576 } ], "logging_steps": 500, "max_steps": 7824, "num_input_tokens_seen": 0, "num_train_epochs": 9.0, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0, "train_batch_size": null, "trial_name": null, "trial_params": null }