{ "best_metric": null, "best_model_checkpoint": null, "episode": 10240, "epoch": 0.14045290575664887, "eval_steps": 200.0, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "episode": 256, "epoch": 0.003511322643916222, "eps": 6, "loss/policy_avg": -0.07090990990400314, "loss/value_avg": 0.0, "lr": 3e-06, "objective/entropy": 49.42120361328125, "objective/kl": 0.006465356796979904, "objective/non_score_reward": -0.000646535714622587, "objective/rlhf_reward": -1.1137903928756714, "objective/scores": -1.109375, "policy/approxkl_avg": 27.096786499023438, "policy/clipfrac_avg": 0.732421875, "policy/entropy_avg": 0.92181396484375, "step": 5, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.0399832725524902, "val/ratio_var": 0.010045886039733887 }, { "episode": 512, "epoch": 0.007022645287832444, "eps": 6, "loss/policy_avg": -0.06497187167406082, "loss/value_avg": 0.0, "lr": 2.9923273657289e-06, "objective/entropy": 48.286014556884766, "objective/kl": 0.8119473457336426, "objective/non_score_reward": -0.08119472861289978, "objective/rlhf_reward": -1.266162633895874, "objective/scores": -1.1875, "policy/approxkl_avg": 18.666072845458984, "policy/clipfrac_avg": 0.7314453125, "policy/entropy_avg": 0.912261962890625, "step": 10, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 16, "val/ratio": 1.020957112312317, "val/ratio_var": 0.00411860179156065 }, { "episode": 768, "epoch": 0.010533967931748666, "eps": 6, "loss/policy_avg": -0.0872286781668663, "loss/value_avg": 0.0, "lr": 2.9846547314578008e-06, "objective/entropy": 49.34376525878906, "objective/kl": 1.9591996669769287, "objective/non_score_reward": -0.1959199756383896, "objective/rlhf_reward": -1.2858657836914062, "objective/scores": -1.09375, "policy/approxkl_avg": 20.772502899169922, "policy/clipfrac_avg": 0.73828125, "policy/entropy_avg": 0.927978515625, "step": 15, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.0191609859466553, "val/ratio_var": 0.00307083735242486 }, { "episode": 1024, "epoch": 0.014045290575664887, "eps": 6, "loss/policy_avg": -0.07566041499376297, "loss/value_avg": 0.0, "lr": 2.9769820971867007e-06, "objective/entropy": 53.13662338256836, "objective/kl": 2.4811532497406006, "objective/non_score_reward": -0.24811533093452454, "objective/rlhf_reward": -1.2548893690109253, "objective/scores": -1.0078125, "policy/approxkl_avg": 20.665164947509766, "policy/clipfrac_avg": 0.7314453125, "policy/entropy_avg": 0.989776611328125, "step": 20, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.011010766029358, "val/ratio_var": 0.004201602190732956 }, { "episode": 1280, "epoch": 0.01755661321958111, "eps": 6, "loss/policy_avg": -0.08593496680259705, "loss/value_avg": 0.0, "lr": 2.9693094629156014e-06, "objective/entropy": 53.72633743286133, "objective/kl": 3.3111624717712402, "objective/non_score_reward": -0.3311161994934082, "objective/rlhf_reward": -1.339456558227539, "objective/scores": -1.0078125, "policy/approxkl_avg": 25.559288024902344, "policy/clipfrac_avg": 0.7353515625, "policy/entropy_avg": 0.997894287109375, "step": 25, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.0134021043777466, "val/ratio_var": 0.0019979747012257576 }, { "episode": 1536, "epoch": 0.021067935863497332, "eps": 6, "loss/policy_avg": -0.09734417498111725, "loss/value_avg": 0.0, "lr": 2.9616368286445014e-06, "objective/entropy": 51.259735107421875, "objective/kl": 5.089182376861572, "objective/non_score_reward": -0.5089181661605835, "objective/rlhf_reward": -1.2202520370483398, "objective/scores": -0.7109375, "policy/approxkl_avg": 29.841636657714844, "policy/clipfrac_avg": 0.736328125, "policy/entropy_avg": 0.960479736328125, "step": 30, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 26, "val/ratio": 1.0178756713867188, "val/ratio_var": 0.009866585955023766 }, { "episode": 1792, "epoch": 0.024579258507413555, "eps": 6, "loss/policy_avg": -0.06831618398427963, "loss/value_avg": 0.0, "lr": 2.9539641943734013e-06, "objective/entropy": 40.643272399902344, "objective/kl": 6.974010944366455, "objective/non_score_reward": -0.6974011063575745, "objective/rlhf_reward": -1.2684605121612549, "objective/scores": -0.5703125, "policy/approxkl_avg": 35.33942413330078, "policy/clipfrac_avg": 0.6982421875, "policy/entropy_avg": 0.7505035400390625, "step": 35, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 16, "val/ratio": 1.00449800491333, "val/ratio_var": 0.0022142010275274515 }, { "episode": 2048, "epoch": 0.028090581151329775, "eps": 6, "loss/policy_avg": -0.04068079590797424, "loss/value_avg": 0.0, "lr": 2.946291560102302e-06, "objective/entropy": 23.142562866210938, "objective/kl": 8.180486679077148, "objective/non_score_reward": -0.8180487155914307, "objective/rlhf_reward": -1.0729957818984985, "objective/scores": -0.255859375, "policy/approxkl_avg": 23.68307876586914, "policy/clipfrac_avg": 0.5859375, "policy/entropy_avg": 0.4361400604248047, "step": 40, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.0077030658721924, "val/ratio_var": 0.0024766812566667795 }, { "episode": 2304, "epoch": 0.031601903795246, "eps": 6, "loss/policy_avg": -0.07307010889053345, "loss/value_avg": 0.0, "lr": 2.938618925831202e-06, "objective/entropy": 19.376842498779297, "objective/kl": 8.770210266113281, "objective/non_score_reward": -0.8770210146903992, "objective/rlhf_reward": -1.0002652406692505, "objective/scores": -0.12353515625, "policy/approxkl_avg": 31.00873565673828, "policy/clipfrac_avg": 0.5302734375, "policy/entropy_avg": 0.33237457275390625, "step": 45, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 20, "val/ratio": 0.996111273765564, "val/ratio_var": 0.001100091845728457 }, { "episode": 2560, "epoch": 0.03511322643916222, "eps": 6, "loss/policy_avg": -0.04584116116166115, "loss/value_avg": 0.0, "lr": 2.9309462915601027e-06, "objective/entropy": 11.984097480773926, "objective/kl": 8.4966402053833, "objective/non_score_reward": -0.849664032459259, "objective/rlhf_reward": -0.8017911911010742, "objective/scores": 0.0478515625, "policy/approxkl_avg": 22.561037063598633, "policy/clipfrac_avg": 0.451171875, "policy/entropy_avg": 0.19393539428710938, "step": 50, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 20, "val/ratio": 0.9952375888824463, "val/ratio_var": 0.000761833623982966 }, { "episode": 2816, "epoch": 0.03862454908307844, "eps": 5, "loss/policy_avg": -0.029720915481448174, "loss/value_avg": 0.0, "lr": 2.9232736572890026e-06, "objective/entropy": 4.9489898681640625, "objective/kl": 8.733837127685547, "objective/non_score_reward": -0.8733837604522705, "objective/rlhf_reward": -0.7492713928222656, "objective/scores": 0.1240234375, "policy/approxkl_avg": 16.253189086914062, "policy/clipfrac_avg": 0.341796875, "policy/entropy_avg": 0.07728099822998047, "step": 55, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 18, "val/ratio": 0.9972053170204163, "val/ratio_var": 0.00032430028659291565 }, { "episode": 3072, "epoch": 0.042135871726994664, "eps": 5, "loss/policy_avg": -0.01298562902957201, "loss/value_avg": 0.0, "lr": 2.9156010230179026e-06, "objective/entropy": 1.3101667165756226, "objective/kl": 8.699792861938477, "objective/non_score_reward": -0.8699792623519897, "objective/rlhf_reward": -0.5752952098846436, "objective/scores": 0.294921875, "policy/approxkl_avg": 2.27925968170166, "policy/clipfrac_avg": 0.236328125, "policy/entropy_avg": 0.02513742446899414, "step": 60, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 20, "val/ratio": 1.0017118453979492, "val/ratio_var": 0.00016639505338389426 }, { "episode": 3328, "epoch": 0.04564719437091089, "eps": 5, "loss/policy_avg": -0.02618303708732128, "loss/value_avg": 0.0, "lr": 2.9079283887468033e-06, "objective/entropy": 2.3685269355773926, "objective/kl": 9.208517074584961, "objective/non_score_reward": -0.9208516478538513, "objective/rlhf_reward": -0.5182289481163025, "objective/scores": 0.40234375, "policy/approxkl_avg": 2.6189699172973633, "policy/clipfrac_avg": 0.310546875, "policy/entropy_avg": 0.04020071029663086, "step": 65, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 20, "val/ratio": 1.003983497619629, "val/ratio_var": 0.0009448421187698841 }, { "episode": 3584, "epoch": 0.04915851701482711, "eps": 5, "loss/policy_avg": -0.02327096462249756, "loss/value_avg": 0.0, "lr": 2.9002557544757032e-06, "objective/entropy": 2.0416018962860107, "objective/kl": 9.701976776123047, "objective/non_score_reward": -0.9701976776123047, "objective/rlhf_reward": -0.49486449360847473, "objective/scores": 0.474609375, "policy/approxkl_avg": 1.271956443786621, "policy/clipfrac_avg": 0.2734375, "policy/entropy_avg": 0.041253089904785156, "step": 70, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 16, "val/ratio": 1.0039558410644531, "val/ratio_var": 0.00041477559716440737 }, { "episode": 3840, "epoch": 0.052669839658743334, "eps": 5, "loss/policy_avg": -0.033096276223659515, "loss/value_avg": 0.0, "lr": 2.892583120204604e-06, "objective/entropy": 2.7795495986938477, "objective/kl": 10.028523445129395, "objective/non_score_reward": -1.0028523206710815, "objective/rlhf_reward": -0.46555712819099426, "objective/scores": 0.5390625, "policy/approxkl_avg": 3.055203676223755, "policy/clipfrac_avg": 0.3427734375, "policy/entropy_avg": 0.053270816802978516, "step": 75, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 23, "val/ratio": 1.0012407302856445, "val/ratio_var": 0.00011274257121840492 }, { "episode": 4096, "epoch": 0.05618116230265955, "eps": 5, "loss/policy_avg": -0.01961323618888855, "loss/value_avg": 0.0, "lr": 2.884910485933504e-06, "objective/entropy": 2.5525641441345215, "objective/kl": 10.111019134521484, "objective/non_score_reward": -1.0111019611358643, "objective/rlhf_reward": -0.510233461856842, "objective/scores": 0.5, "policy/approxkl_avg": 1.331697940826416, "policy/clipfrac_avg": 0.2861328125, "policy/entropy_avg": 0.048857688903808594, "step": 80, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 25, "val/ratio": 1.011049509048462, "val/ratio_var": 0.004252108279615641 }, { "episode": 4352, "epoch": 0.05969248494657577, "eps": 5, "loss/policy_avg": -0.009127877652645111, "loss/value_avg": 0.0, "lr": 2.877237851662404e-06, "objective/entropy": 3.016789674758911, "objective/kl": 11.257818222045898, "objective/non_score_reward": -1.125781774520874, "objective/rlhf_reward": -0.4276960492134094, "objective/scores": 0.69921875, "policy/approxkl_avg": 1.4772686958312988, "policy/clipfrac_avg": 0.35546875, "policy/entropy_avg": 0.053719520568847656, "step": 85, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.0042904615402222, "val/ratio_var": 0.0008556774700991809 }, { "episode": 4608, "epoch": 0.063203807590492, "eps": 5, "loss/policy_avg": -0.025049656629562378, "loss/value_avg": 0.0, "lr": 2.8695652173913046e-06, "objective/entropy": 2.5907459259033203, "objective/kl": 10.457273483276367, "objective/non_score_reward": -1.0457274913787842, "objective/rlhf_reward": -0.3816419839859009, "objective/scores": 0.6640625, "policy/approxkl_avg": 2.3460922241210938, "policy/clipfrac_avg": 0.322265625, "policy/entropy_avg": 0.04626178741455078, "step": 90, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.0003862380981445, "val/ratio_var": 7.93520302977413e-05 }, { "episode": 4864, "epoch": 0.06671513023440821, "eps": 5, "loss/policy_avg": -0.01828361675143242, "loss/value_avg": 0.0, "lr": 2.8618925831202045e-06, "objective/entropy": 2.397810220718384, "objective/kl": 10.732559204101562, "objective/non_score_reward": -1.073256015777588, "objective/rlhf_reward": -0.35966813564300537, "objective/scores": 0.71484375, "policy/approxkl_avg": 1.1093428134918213, "policy/clipfrac_avg": 0.32421875, "policy/entropy_avg": 0.041881561279296875, "step": 95, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 15, "val/ratio": 1.0054664611816406, "val/ratio_var": 0.0017973663052543998 }, { "episode": 5120, "epoch": 0.07022645287832444, "eps": 5, "loss/policy_avg": -0.04088423401117325, "loss/value_avg": 0.0, "lr": 2.8542199488491053e-06, "objective/entropy": 2.343449592590332, "objective/kl": 11.780994415283203, "objective/non_score_reward": -1.1780993938446045, "objective/rlhf_reward": -0.4628324806690216, "objective/scores": 0.71484375, "policy/approxkl_avg": 0.894420325756073, "policy/clipfrac_avg": 0.46875, "policy/entropy_avg": 0.04486083984375, "step": 100, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.0009559392929077, "val/ratio_var": 4.804596756002866e-05 }, { "episode": 5376, "epoch": 0.07373777552224066, "eps": 5, "loss/policy_avg": -0.020697183907032013, "loss/value_avg": 0.0, "lr": 2.846547314578005e-06, "objective/entropy": 1.9023351669311523, "objective/kl": 10.29288101196289, "objective/non_score_reward": -1.0292882919311523, "objective/rlhf_reward": -0.29047834873199463, "objective/scores": 0.73828125, "policy/approxkl_avg": 0.9143690466880798, "policy/clipfrac_avg": 0.373046875, "policy/entropy_avg": 0.028568267822265625, "step": 105, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.000715732574463, "val/ratio_var": 4.201457340968773e-05 }, { "episode": 5632, "epoch": 0.07724909816615688, "eps": 5, "loss/policy_avg": -0.012633640319108963, "loss/value_avg": 0.0, "lr": 2.8388746803069055e-06, "objective/entropy": 1.3839142322540283, "objective/kl": 10.57151985168457, "objective/non_score_reward": -1.0571520328521729, "objective/rlhf_reward": -0.2935946583747864, "objective/scores": 0.765625, "policy/approxkl_avg": 0.6525547504425049, "policy/clipfrac_avg": 0.2646484375, "policy/entropy_avg": 0.0345916748046875, "step": 110, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 0.9999199509620667, "val/ratio_var": 2.6978697860613465e-05 }, { "episode": 5888, "epoch": 0.0807604208100731, "eps": 5, "loss/policy_avg": -0.026668714359402657, "loss/value_avg": 0.0, "lr": 2.831202046035806e-06, "objective/entropy": 2.17741322517395, "objective/kl": 11.39688491821289, "objective/non_score_reward": -1.139688491821289, "objective/rlhf_reward": -0.3027456998825073, "objective/scores": 0.8359375, "policy/approxkl_avg": 8.829752922058105, "policy/clipfrac_avg": 0.35546875, "policy/entropy_avg": 0.034277915954589844, "step": 115, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.0012441873550415, "val/ratio_var": 9.009366476675496e-05 }, { "episode": 6144, "epoch": 0.08427174345398933, "eps": 5, "loss/policy_avg": -0.011602860875427723, "loss/value_avg": 0.0, "lr": 2.823529411764706e-06, "objective/entropy": 1.418602466583252, "objective/kl": 10.246469497680664, "objective/non_score_reward": -1.0246469974517822, "objective/rlhf_reward": -0.22599510848522186, "objective/scores": 0.796875, "policy/approxkl_avg": 0.31790149211883545, "policy/clipfrac_avg": 0.2314453125, "policy/entropy_avg": 0.028847694396972656, "step": 120, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.0009679794311523, "val/ratio_var": 3.900106457876973e-05 }, { "episode": 6400, "epoch": 0.08778306609790555, "eps": 5, "loss/policy_avg": -0.0157505851238966, "loss/value_avg": 0.0, "lr": 2.8158567774936066e-06, "objective/entropy": 1.936393141746521, "objective/kl": 10.550077438354492, "objective/non_score_reward": -1.0550076961517334, "objective/rlhf_reward": -0.252943217754364, "objective/scores": 0.80078125, "policy/approxkl_avg": 6.545133113861084, "policy/clipfrac_avg": 0.341796875, "policy/entropy_avg": 0.039971351623535156, "step": 125, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.0001187324523926, "val/ratio_var": 0.00011527155584190041 }, { "episode": 6656, "epoch": 0.09129438874182177, "eps": 5, "loss/policy_avg": -0.00908716581761837, "loss/value_avg": 0.0, "lr": 2.8081841432225065e-06, "objective/entropy": 1.9167767763137817, "objective/kl": 10.831771850585938, "objective/non_score_reward": -1.0831772089004517, "objective/rlhf_reward": -0.24270595610141754, "objective/scores": 0.83984375, "policy/approxkl_avg": 13.507976531982422, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.034499168395996094, "step": 130, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.0004911422729492, "val/ratio_var": 0.00018595268193166703 }, { "episode": 6912, "epoch": 0.094805711385738, "eps": 5, "loss/policy_avg": -0.017197387292981148, "loss/value_avg": 0.0, "lr": 2.800511508951407e-06, "objective/entropy": 1.7237651348114014, "objective/kl": 11.095592498779297, "objective/non_score_reward": -1.1095592975616455, "objective/rlhf_reward": -0.21057555079460144, "objective/scores": 0.8984375, "policy/approxkl_avg": 2.7560040950775146, "policy/clipfrac_avg": 0.2841796875, "policy/entropy_avg": 0.032952308654785156, "step": 135, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9994020462036133, "val/ratio_var": 3.074964843108319e-05 }, { "episode": 7168, "epoch": 0.09831703402965422, "eps": 5, "loss/policy_avg": -0.012010859325528145, "loss/value_avg": 0.0, "lr": 2.792838874680307e-06, "objective/entropy": 1.5862581729888916, "objective/kl": 10.674396514892578, "objective/non_score_reward": -1.0674396753311157, "objective/rlhf_reward": -0.14433012902736664, "objective/scores": 0.921875, "policy/approxkl_avg": 1.1186727285385132, "policy/clipfrac_avg": 0.2783203125, "policy/entropy_avg": 0.0295562744140625, "step": 140, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.0007727146148682, "val/ratio_var": 4.557183274300769e-05 }, { "episode": 7424, "epoch": 0.10182835667357044, "eps": 5, "loss/policy_avg": -0.013728385791182518, "loss/value_avg": 0.0, "lr": 2.785166240409207e-06, "objective/entropy": 1.5388869047164917, "objective/kl": 10.359582901000977, "objective/non_score_reward": -1.035958170890808, "objective/rlhf_reward": -0.14511710405349731, "objective/scores": 0.890625, "policy/approxkl_avg": 0.5204602479934692, "policy/clipfrac_avg": 0.283203125, "policy/entropy_avg": 0.028924942016601562, "step": 145, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.056097149848938, "val/ratio_var": 0.13372056186199188 }, { "episode": 7680, "epoch": 0.10533967931748667, "eps": 5, "loss/policy_avg": -0.014945434406399727, "loss/value_avg": 0.0, "lr": 2.7774936061381074e-06, "objective/entropy": 2.0769755840301514, "objective/kl": 11.147063255310059, "objective/non_score_reward": -1.11470627784729, "objective/rlhf_reward": -0.08940108120441437, "objective/scores": 1.0234375, "policy/approxkl_avg": 0.5961493253707886, "policy/clipfrac_avg": 0.3681640625, "policy/entropy_avg": 0.037804603576660156, "step": 150, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.0033739805221558, "val/ratio_var": 0.00030022990540601313 }, { "episode": 7936, "epoch": 0.10885100196140288, "eps": 5, "loss/policy_avg": -0.02276831492781639, "loss/value_avg": 0.0, "lr": 2.7698209718670078e-06, "objective/entropy": 2.1412830352783203, "objective/kl": 11.697949409484863, "objective/non_score_reward": -1.169795036315918, "objective/rlhf_reward": -0.13582009077072144, "objective/scores": 1.03125, "policy/approxkl_avg": 0.7155288457870483, "policy/clipfrac_avg": 0.3193359375, "policy/entropy_avg": 0.037835121154785156, "step": 155, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.0014090538024902, "val/ratio_var": 5.2470270020421594e-05 }, { "episode": 8192, "epoch": 0.1123623246053191, "eps": 5, "loss/policy_avg": -0.013076605275273323, "loss/value_avg": 0.0, "lr": 2.762148337595908e-06, "objective/entropy": 1.634714126586914, "objective/kl": 11.629154205322266, "objective/non_score_reward": -1.1629154682159424, "objective/rlhf_reward": -0.28488799929618835, "objective/scores": 0.87890625, "policy/approxkl_avg": 0.4181188941001892, "policy/clipfrac_avg": 0.3037109375, "policy/entropy_avg": 0.029273509979248047, "step": 160, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 15, "val/ratio": 1.0008339881896973, "val/ratio_var": 1.4662801731901709e-05 }, { "episode": 8448, "epoch": 0.11587364724923532, "eps": 5, "loss/policy_avg": -0.01651182770729065, "loss/value_avg": 0.0, "lr": 2.7544757033248085e-06, "objective/entropy": 1.9540742635726929, "objective/kl": 11.4830322265625, "objective/non_score_reward": -1.1483032703399658, "objective/rlhf_reward": -0.05983233451843262, "objective/scores": 1.0859375, "policy/approxkl_avg": 18.791297912597656, "policy/clipfrac_avg": 0.2880859375, "policy/entropy_avg": 0.03601264953613281, "step": 165, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 16, "val/ratio": 1.0220942497253418, "val/ratio_var": 0.02208283357322216 }, { "episode": 8704, "epoch": 0.11938496989315155, "eps": 5, "loss/policy_avg": -0.013821810483932495, "loss/value_avg": 0.0, "lr": 2.7468030690537084e-06, "objective/entropy": 1.6243339776992798, "objective/kl": 11.435280799865723, "objective/non_score_reward": -1.1435281038284302, "objective/rlhf_reward": -0.12443088740110397, "objective/scores": 1.015625, "policy/approxkl_avg": 0.29013216495513916, "policy/clipfrac_avg": 0.28125, "policy/entropy_avg": 0.03498649597167969, "step": 170, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 15, "val/ratio": 1.0027971267700195, "val/ratio_var": 0.0002298366161994636 }, { "episode": 8960, "epoch": 0.12289629253706777, "eps": 5, "loss/policy_avg": -0.011003649793565273, "loss/value_avg": 0.0, "lr": 2.7391304347826087e-06, "objective/entropy": 2.000375986099243, "objective/kl": 11.78514575958252, "objective/non_score_reward": -1.1785145998001099, "objective/rlhf_reward": -0.2609584331512451, "objective/scores": 0.91796875, "policy/approxkl_avg": 0.8603074550628662, "policy/clipfrac_avg": 0.2998046875, "policy/entropy_avg": 0.034775733947753906, "step": 175, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 19, "val/ratio": 1.0012288093566895, "val/ratio_var": 3.532394111971371e-05 }, { "episode": 9216, "epoch": 0.126407615180984, "eps": 5, "loss/policy_avg": -0.010885423980653286, "loss/value_avg": 0.0, "lr": 2.731457800511509e-06, "objective/entropy": 1.5240473747253418, "objective/kl": 12.420597076416016, "objective/non_score_reward": -1.2420598268508911, "objective/rlhf_reward": -0.16641265153884888, "objective/scores": 1.078125, "policy/approxkl_avg": 0.46217110753059387, "policy/clipfrac_avg": 0.2783203125, "policy/entropy_avg": 0.029424667358398438, "step": 180, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 20, "val/ratio": 1.0007582902908325, "val/ratio_var": 2.4759892767178826e-05 }, { "episode": 9472, "epoch": 0.12991893782490022, "eps": 5, "loss/policy_avg": -0.01097183395177126, "loss/value_avg": 0.0, "lr": 2.7237851662404094e-06, "objective/entropy": 1.6292238235473633, "objective/kl": 12.73173713684082, "objective/non_score_reward": -1.2731736898422241, "objective/rlhf_reward": -0.10916168242692947, "objective/scores": 1.1640625, "policy/approxkl_avg": 0.5525862574577332, "policy/clipfrac_avg": 0.310546875, "policy/entropy_avg": 0.031815528869628906, "step": 185, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 22, "val/ratio": 1.0027148723602295, "val/ratio_var": 0.00016600274830125272 }, { "episode": 9728, "epoch": 0.13343026046881643, "eps": 5, "loss/policy_avg": -0.010572239756584167, "loss/value_avg": 0.0, "lr": 2.7161125319693097e-06, "objective/entropy": 2.028618335723877, "objective/kl": 12.439943313598633, "objective/non_score_reward": -1.2439942359924316, "objective/rlhf_reward": -0.06748821586370468, "objective/scores": 1.171875, "policy/approxkl_avg": 0.4930054843425751, "policy/clipfrac_avg": 0.2841796875, "policy/entropy_avg": 0.03688812255859375, "step": 190, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 22, "val/ratio": 1.001340627670288, "val/ratio_var": 4.4035481550963596e-05 }, { "episode": 9984, "epoch": 0.13694158311273266, "eps": 5, "loss/policy_avg": -0.019254155457019806, "loss/value_avg": 0.0, "lr": 2.7084398976982097e-06, "objective/entropy": 2.295351266860962, "objective/kl": 13.32223892211914, "objective/non_score_reward": -1.332223892211914, "objective/rlhf_reward": -0.1836824268102646, "objective/scores": 1.1484375, "policy/approxkl_avg": 3.1426281929016113, "policy/clipfrac_avg": 0.3251953125, "policy/entropy_avg": 0.03939247131347656, "step": 195, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 17, "val/ratio": 1.0032271146774292, "val/ratio_var": 0.00019827872165478766 }, { "episode": 10240, "epoch": 0.14045290575664887, "eps": 5, "loss/policy_avg": -0.018122296780347824, "loss/value_avg": 0.0, "lr": 2.70076726342711e-06, "objective/entropy": 2.345075845718384, "objective/kl": 12.536066055297852, "objective/non_score_reward": -1.2536065578460693, "objective/rlhf_reward": -0.056986674666404724, "objective/scores": 1.1953125, "policy/approxkl_avg": 27.5201473236084, "policy/clipfrac_avg": 0.3046875, "policy/entropy_avg": 0.04156017303466797, "step": 200, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 20, "val/ratio": 0.9993807077407837, "val/ratio_var": 0.00011275127326371148 } ], "logging_steps": 100, "max_steps": 391, "num_input_tokens_seen": 0, "num_train_epochs": 1.3716104077797742, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": true, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0, "train_batch_size": null, "trial_name": null, "trial_params": null }