{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.055662188099808, "eval_steps": 500, "global_step": 1100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 135.57144165039062, "epoch": 0.0009596928982725527, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.199040767386091e-09, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 1 }, { "completion_length": 213.1428680419922, "epoch": 0.0019193857965451055, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.398081534772182e-09, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 2 }, { "completion_length": 147.85714721679688, "epoch": 0.0028790786948176585, "grad_norm": 1.6446343660354614, "kl": 0.0, "learning_rate": 3.5971223021582734e-09, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 3 }, { "completion_length": 160.42857360839844, "epoch": 0.003838771593090211, "grad_norm": 0.00036910639028064907, "kl": 0.0005060847033746541, "learning_rate": 4.796163069544364e-09, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 4 }, { "completion_length": 196.71429443359375, "epoch": 0.0047984644913627635, "grad_norm": 0.0003374080406501889, "kl": 0.0005655621644109488, "learning_rate": 5.995203836930455e-09, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 5 }, { "completion_length": 182.35714721679688, "epoch": 0.005758157389635317, "grad_norm": 0.7470938563346863, "kl": 0.0005649856757372618, "learning_rate": 7.194244604316547e-09, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 6 }, { "completion_length": 143.85714721679688, "epoch": 0.0067178502879078695, "grad_norm": 0.0004381168691907078, "kl": 0.0005645591882057488, "learning_rate": 8.393285371702637e-09, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 7 }, { "completion_length": 184.50001525878906, "epoch": 0.007677543186180422, "grad_norm": 1.393969178199768, "kl": 0.00044664909364655614, "learning_rate": 9.592326139088728e-09, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 8 }, { "completion_length": 151.85714721679688, "epoch": 0.008637236084452975, "grad_norm": 0.0004806791839655489, "kl": 0.0005882256082259119, "learning_rate": 1.079136690647482e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 9 }, { "completion_length": 170.35714721679688, "epoch": 0.009596928982725527, "grad_norm": 0.0004148663720116019, "kl": 0.0005737830069847405, "learning_rate": 1.199040767386091e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 10 }, { "completion_length": 153.42857360839844, "epoch": 0.01055662188099808, "grad_norm": 0.0003266215790063143, "kl": 0.000490470789372921, "learning_rate": 1.3189448441247001e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 11 }, { "completion_length": 156.92857360839844, "epoch": 0.011516314779270634, "grad_norm": 0.0004904203815385699, "kl": 0.00046417259727604687, "learning_rate": 1.4388489208633094e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 12 }, { "completion_length": 143.35714721679688, "epoch": 0.012476007677543186, "grad_norm": 1.605542540550232, "kl": 0.0005615208065137267, "learning_rate": 1.5587529976019183e-08, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 13 }, { "completion_length": 191.35714721679688, "epoch": 0.013435700575815739, "grad_norm": 0.00028336732066236436, "kl": 0.0005303352954797447, "learning_rate": 1.6786570743405274e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 14 }, { "completion_length": 208.35714721679688, "epoch": 0.014395393474088292, "grad_norm": 0.0005024393321946263, "kl": 0.0006201151991263032, "learning_rate": 1.7985611510791365e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 15 }, { "completion_length": 145.07144165039062, "epoch": 0.015355086372360844, "grad_norm": 1.8346211910247803, "kl": 0.0004937839112244546, "learning_rate": 1.9184652278177456e-08, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 16 }, { "completion_length": 153.71429443359375, "epoch": 0.016314779270633396, "grad_norm": 0.0004108909342903644, "kl": 0.0005206220084801316, "learning_rate": 2.038369304556355e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 17 }, { "completion_length": 165.7857208251953, "epoch": 0.01727447216890595, "grad_norm": 0.00038674852112308145, "kl": 0.0003908892394974828, "learning_rate": 2.158273381294964e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 18 }, { "completion_length": 204.1428680419922, "epoch": 0.018234165067178502, "grad_norm": 0.00026810416602529585, "kl": 0.00048633874393999577, "learning_rate": 2.2781774580335732e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 19 }, { "completion_length": 173.42857360839844, "epoch": 0.019193857965451054, "grad_norm": 0.00037750048795714974, "kl": 0.0006084477063268423, "learning_rate": 2.398081534772182e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 20 }, { "completion_length": 152.21429443359375, "epoch": 0.02015355086372361, "grad_norm": 0.0003818501136265695, "kl": 0.0004866582457907498, "learning_rate": 2.517985611510791e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 21 }, { "completion_length": 185.50001525878906, "epoch": 0.02111324376199616, "grad_norm": 0.0003315625654067844, "kl": 0.0005662248586304486, "learning_rate": 2.6378896882494002e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 22 }, { "completion_length": 148.6428680419922, "epoch": 0.022072936660268713, "grad_norm": 0.803084671497345, "kl": 0.000591802760027349, "learning_rate": 2.7577937649880093e-08, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 23 }, { "completion_length": 188.21429443359375, "epoch": 0.023032629558541268, "grad_norm": 0.0002674800343811512, "kl": 0.0005390832666307688, "learning_rate": 2.8776978417266188e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 24 }, { "completion_length": 204.92857360839844, "epoch": 0.02399232245681382, "grad_norm": 1.1090196371078491, "kl": 0.0007019327604211867, "learning_rate": 2.9976019184652275e-08, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 25 }, { "completion_length": 159.85714721679688, "epoch": 0.02495201535508637, "grad_norm": 0.0003070258826483041, "kl": 0.0004155442293267697, "learning_rate": 3.1175059952038366e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 26 }, { "completion_length": 162.6428680419922, "epoch": 0.025911708253358926, "grad_norm": 2.1749982833862305, "kl": 0.0005691185942851007, "learning_rate": 3.237410071942446e-08, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 27 }, { "completion_length": 117.28572082519531, "epoch": 0.026871401151631478, "grad_norm": 0.0005045092548243701, "kl": 0.0006130345282144845, "learning_rate": 3.357314148681055e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 28 }, { "completion_length": 173.21429443359375, "epoch": 0.02783109404990403, "grad_norm": 0.00031108956318348646, "kl": 0.000510133511852473, "learning_rate": 3.477218225419664e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 29 }, { "completion_length": 146.1428680419922, "epoch": 0.028790786948176585, "grad_norm": 0.000975142465904355, "kl": 0.0006087806541472673, "learning_rate": 3.597122302158273e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 30 }, { "completion_length": 111.21429443359375, "epoch": 0.029750479846449136, "grad_norm": 0.0004982451209798455, "kl": 0.0004702489241026342, "learning_rate": 3.717026378896882e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 31 }, { "completion_length": 151.7857208251953, "epoch": 0.030710172744721688, "grad_norm": 0.0006360537372529507, "kl": 0.0007317874114960432, "learning_rate": 3.836930455635491e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 32 }, { "completion_length": 172.21429443359375, "epoch": 0.03166986564299424, "grad_norm": 0.0005086234887130558, "kl": 0.0006477407878264785, "learning_rate": 3.9568345323741003e-08, "loss": 0.0, "reward": 0.1428571492433548, "reward_std": 0.0, "rewards/check_originality_func": 0.1428571492433548, "step": 33 }, { "completion_length": 171.1428680419922, "epoch": 0.03262955854126679, "grad_norm": 0.9170761108398438, "kl": 0.0005105896852910519, "learning_rate": 4.07673860911271e-08, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 34 }, { "completion_length": 230.21429443359375, "epoch": 0.03358925143953935, "grad_norm": 0.0003504181222524494, "kl": 0.00046388726332224905, "learning_rate": 4.196642685851319e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 35 }, { "completion_length": 156.2857208251953, "epoch": 0.0345489443378119, "grad_norm": 0.001003033947199583, "kl": 0.0006527850637212396, "learning_rate": 4.316546762589928e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 36 }, { "completion_length": 177.00001525878906, "epoch": 0.03550863723608445, "grad_norm": 0.0008372681331820786, "kl": 0.0005447739968076348, "learning_rate": 4.4364508393285374e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 37 }, { "completion_length": 179.1428680419922, "epoch": 0.036468330134357005, "grad_norm": 0.0003778770042117685, "kl": 0.0005264849169179797, "learning_rate": 4.5563549160671465e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 38 }, { "completion_length": 140.85714721679688, "epoch": 0.03742802303262956, "grad_norm": 0.00045869124005548656, "kl": 0.0006461011944338679, "learning_rate": 4.676258992805755e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 39 }, { "completion_length": 216.7857208251953, "epoch": 0.03838771593090211, "grad_norm": 0.00032263746834360063, "kl": 0.0006066058995202184, "learning_rate": 4.796163069544364e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 40 }, { "completion_length": 261.5714416503906, "epoch": 0.03934740882917467, "grad_norm": 0.00029289969825185835, "kl": 0.0005265087820589542, "learning_rate": 4.916067146282973e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 41 }, { "completion_length": 152.2857208251953, "epoch": 0.04030710172744722, "grad_norm": 0.00042882084380835295, "kl": 0.000519172812346369, "learning_rate": 5.035971223021582e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 42 }, { "completion_length": 165.2857208251953, "epoch": 0.04126679462571977, "grad_norm": 0.0005159132997505367, "kl": 0.0004981404053978622, "learning_rate": 5.1558752997601913e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 43 }, { "completion_length": 157.92857360839844, "epoch": 0.04222648752399232, "grad_norm": 2.05961012840271, "kl": 0.0005051444750279188, "learning_rate": 5.2757793764988004e-08, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 44 }, { "completion_length": 219.57144165039062, "epoch": 0.04318618042226487, "grad_norm": 1.0319280624389648, "kl": 0.00048336974577978253, "learning_rate": 5.3956834532374095e-08, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 45 }, { "completion_length": 135.6428680419922, "epoch": 0.044145873320537425, "grad_norm": 0.0003835747775156051, "kl": 0.0006352445343509316, "learning_rate": 5.5155875299760186e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 46 }, { "completion_length": 193.71429443359375, "epoch": 0.045105566218809984, "grad_norm": 0.00034110460546799004, "kl": 0.0005837282515130937, "learning_rate": 5.635491606714628e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 47 }, { "completion_length": 207.1428680419922, "epoch": 0.046065259117082535, "grad_norm": 0.00026127984165214, "kl": 0.0004677344695664942, "learning_rate": 5.7553956834532375e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 48 }, { "completion_length": 129.42857360839844, "epoch": 0.04702495201535509, "grad_norm": 0.0004720119177363813, "kl": 0.0005594303365796804, "learning_rate": 5.8752997601918466e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 49 }, { "completion_length": 210.71429443359375, "epoch": 0.04798464491362764, "grad_norm": 0.0004740089934784919, "kl": 0.0005794462631456554, "learning_rate": 5.995203836930455e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 50 }, { "completion_length": 158.42857360839844, "epoch": 0.04894433781190019, "grad_norm": 0.00029957719380035996, "kl": 0.0004918202757835388, "learning_rate": 6.115107913669064e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 51 }, { "completion_length": 170.1428680419922, "epoch": 0.04990403071017274, "grad_norm": 1.2246661186218262, "kl": 0.0005658438894897699, "learning_rate": 6.235011990407673e-08, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 52 }, { "completion_length": 170.7857208251953, "epoch": 0.0508637236084453, "grad_norm": 0.0003407200565561652, "kl": 0.00046252604806795716, "learning_rate": 6.354916067146282e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 53 }, { "completion_length": 171.50001525878906, "epoch": 0.05182341650671785, "grad_norm": 0.000413825357099995, "kl": 0.0005888977902941406, "learning_rate": 6.474820143884891e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 54 }, { "completion_length": 200.50001525878906, "epoch": 0.052783109404990404, "grad_norm": 0.0003208342823199928, "kl": 0.00047237443504855037, "learning_rate": 6.594724220623502e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 55 }, { "completion_length": 153.1428680419922, "epoch": 0.053742802303262956, "grad_norm": 0.0002865141723304987, "kl": 0.0004340711748227477, "learning_rate": 6.71462829736211e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 56 }, { "completion_length": 190.07144165039062, "epoch": 0.05470249520153551, "grad_norm": 0.0002753006119746715, "kl": 0.0004322734021116048, "learning_rate": 6.83453237410072e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 57 }, { "completion_length": 160.5, "epoch": 0.05566218809980806, "grad_norm": 0.0004047853290103376, "kl": 0.0006186468526721001, "learning_rate": 6.954436450839328e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 58 }, { "completion_length": 157.7857208251953, "epoch": 0.05662188099808062, "grad_norm": 0.0007299200515262783, "kl": 0.0006332774646580219, "learning_rate": 7.074340527577938e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 59 }, { "completion_length": 208.2857208251953, "epoch": 0.05758157389635317, "grad_norm": 0.0003093322739005089, "kl": 0.0005150062497705221, "learning_rate": 7.194244604316546e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 60 }, { "completion_length": 185.71429443359375, "epoch": 0.05854126679462572, "grad_norm": 0.0004456975730136037, "kl": 0.0006128132226876915, "learning_rate": 7.314148681055156e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 61 }, { "completion_length": 123.0714340209961, "epoch": 0.05950095969289827, "grad_norm": 0.0005455185892060399, "kl": 0.0005309037514962256, "learning_rate": 7.434052757793764e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 62 }, { "completion_length": 233.1428680419922, "epoch": 0.060460652591170824, "grad_norm": 0.8514678478240967, "kl": 0.0004710037028416991, "learning_rate": 7.553956834532373e-08, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 63 }, { "completion_length": 175.50001525878906, "epoch": 0.061420345489443376, "grad_norm": 0.000515421386808157, "kl": 0.0005844376282766461, "learning_rate": 7.673860911270982e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 64 }, { "completion_length": 132.35714721679688, "epoch": 0.06238003838771593, "grad_norm": 0.0005553949158638716, "kl": 0.0005660270689986646, "learning_rate": 7.793764988009592e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 65 }, { "completion_length": 152.35714721679688, "epoch": 0.06333973128598848, "grad_norm": 0.0006625180831179023, "kl": 0.0006339925457723439, "learning_rate": 7.913669064748201e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 66 }, { "completion_length": 181.57144165039062, "epoch": 0.06429942418426103, "grad_norm": 1.5640690326690674, "kl": 0.0005268078530207276, "learning_rate": 8.03357314148681e-08, "loss": 0.0, "reward": 0.1428571492433548, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.1428571492433548, "step": 67 }, { "completion_length": 247.71429443359375, "epoch": 0.06525911708253358, "grad_norm": 0.0003310446918476373, "kl": 0.00045861746184527874, "learning_rate": 8.15347721822542e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 68 }, { "completion_length": 147.7857208251953, "epoch": 0.06621880998080615, "grad_norm": 0.0007288879714906216, "kl": 0.0005758238257840276, "learning_rate": 8.273381294964028e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 69 }, { "completion_length": 147.0, "epoch": 0.0671785028790787, "grad_norm": 0.00031062806374393404, "kl": 0.0004903021617792547, "learning_rate": 8.393285371702638e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 70 }, { "completion_length": 152.85714721679688, "epoch": 0.06813819577735125, "grad_norm": 0.00044071159209124744, "kl": 0.0005353126907721162, "learning_rate": 8.513189448441246e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 71 }, { "completion_length": 145.5, "epoch": 0.0690978886756238, "grad_norm": 0.0005308373947627842, "kl": 0.0005299055483192205, "learning_rate": 8.633093525179857e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 72 }, { "completion_length": 176.1428680419922, "epoch": 0.07005758157389635, "grad_norm": 0.0006112537230364978, "kl": 0.0005121036665514112, "learning_rate": 8.752997601918464e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 73 }, { "completion_length": 129.6428680419922, "epoch": 0.0710172744721689, "grad_norm": 0.0004405511717777699, "kl": 0.0005867794388905168, "learning_rate": 8.872901678657075e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 74 }, { "completion_length": 128.7857208251953, "epoch": 0.07197696737044146, "grad_norm": 0.0007860700134187937, "kl": 0.00069328275276348, "learning_rate": 8.992805755395683e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 75 }, { "completion_length": 174.50001525878906, "epoch": 0.07293666026871401, "grad_norm": 0.0002802668896038085, "kl": 0.0005197104765102267, "learning_rate": 9.112709832134293e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 76 }, { "completion_length": 146.7857208251953, "epoch": 0.07389635316698656, "grad_norm": 0.00039758585626259446, "kl": 0.0005077308742329478, "learning_rate": 9.232613908872901e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 77 }, { "completion_length": 183.6428680419922, "epoch": 0.07485604606525911, "grad_norm": 0.0003271863970439881, "kl": 0.0005645041237585247, "learning_rate": 9.35251798561151e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 78 }, { "completion_length": 223.21429443359375, "epoch": 0.07581573896353166, "grad_norm": 0.000325124739902094, "kl": 0.000542561465408653, "learning_rate": 9.472422062350119e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 79 }, { "completion_length": 167.5, "epoch": 0.07677543186180422, "grad_norm": 1.2339857816696167, "kl": 0.000657322583720088, "learning_rate": 9.592326139088728e-08, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 80 }, { "completion_length": 141.35714721679688, "epoch": 0.07773512476007678, "grad_norm": 0.0004968417924828827, "kl": 0.0007087711128406227, "learning_rate": 9.712230215827338e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 81 }, { "completion_length": 125.50000762939453, "epoch": 0.07869481765834933, "grad_norm": 0.0009816541569307446, "kl": 0.000571896496694535, "learning_rate": 9.832134292565946e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 82 }, { "completion_length": 228.21429443359375, "epoch": 0.07965451055662189, "grad_norm": 0.00030556568526662886, "kl": 0.00046765722800046206, "learning_rate": 9.952038369304557e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 83 }, { "completion_length": 137.2857208251953, "epoch": 0.08061420345489444, "grad_norm": 0.00039037931128405035, "kl": 0.0005185764166526496, "learning_rate": 1.0071942446043164e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 84 }, { "completion_length": 176.2857208251953, "epoch": 0.08157389635316699, "grad_norm": 0.00034896901343017817, "kl": 0.0005534705705940723, "learning_rate": 1.0191846522781775e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 85 }, { "completion_length": 141.0, "epoch": 0.08253358925143954, "grad_norm": 0.0005829152651131153, "kl": 0.0005998964770697057, "learning_rate": 1.0311750599520383e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 86 }, { "completion_length": 155.21429443359375, "epoch": 0.08349328214971209, "grad_norm": 0.00044023056398145854, "kl": 0.0005295532173477113, "learning_rate": 1.0431654676258993e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 87 }, { "completion_length": 125.92857360839844, "epoch": 0.08445297504798464, "grad_norm": 0.0004207340825814754, "kl": 0.0005696879234164953, "learning_rate": 1.0551558752997601e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 88 }, { "completion_length": 151.6428680419922, "epoch": 0.0854126679462572, "grad_norm": 0.0006630482384935021, "kl": 0.0006240293732844293, "learning_rate": 1.0671462829736211e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 89 }, { "completion_length": 174.1428680419922, "epoch": 0.08637236084452975, "grad_norm": 0.0004631876654457301, "kl": 0.0006516958237625659, "learning_rate": 1.0791366906474819e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 90 }, { "completion_length": 187.92857360839844, "epoch": 0.0873320537428023, "grad_norm": 0.0002980834397021681, "kl": 0.000502982409670949, "learning_rate": 1.091127098321343e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 91 }, { "completion_length": 153.0, "epoch": 0.08829174664107485, "grad_norm": 0.00037463370244950056, "kl": 0.0005217620055191219, "learning_rate": 1.1031175059952037e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 92 }, { "completion_length": 170.6428680419922, "epoch": 0.0892514395393474, "grad_norm": 0.0004220302798785269, "kl": 0.0005722779897041619, "learning_rate": 1.1151079136690646e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 93 }, { "completion_length": 133.5, "epoch": 0.09021113243761997, "grad_norm": 0.0005481647094711661, "kl": 0.0005710541154257953, "learning_rate": 1.1270983213429255e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 94 }, { "completion_length": 191.7857208251953, "epoch": 0.09117082533589252, "grad_norm": 0.0002727951796259731, "kl": 0.00047989259473979473, "learning_rate": 1.1390887290167865e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 95 }, { "completion_length": 164.1428680419922, "epoch": 0.09213051823416507, "grad_norm": 0.0002804905525408685, "kl": 0.0004200213006697595, "learning_rate": 1.1510791366906475e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 96 }, { "completion_length": 183.50001525878906, "epoch": 0.09309021113243762, "grad_norm": 0.00047365151112899184, "kl": 0.0007084443350322545, "learning_rate": 1.1630695443645083e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 97 }, { "completion_length": 191.00001525878906, "epoch": 0.09404990403071017, "grad_norm": 0.0002856609062291682, "kl": 0.00048228586092591286, "learning_rate": 1.1750599520383693e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 98 }, { "completion_length": 140.1428680419922, "epoch": 0.09500959692898273, "grad_norm": 2.202390432357788, "kl": 0.0006221291841939092, "learning_rate": 1.1870503597122301e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 99 }, { "completion_length": 175.00001525878906, "epoch": 0.09596928982725528, "grad_norm": 0.00043717565131373703, "kl": 0.0004098935751244426, "learning_rate": 1.199040767386091e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 100 }, { "completion_length": 160.85714721679688, "epoch": 0.09692898272552783, "grad_norm": 0.0006272057071328163, "kl": 0.0006100781611166894, "learning_rate": 1.211031175059952e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 101 }, { "completion_length": 157.2857208251953, "epoch": 0.09788867562380038, "grad_norm": 0.00035745432251133025, "kl": 0.000528128060977906, "learning_rate": 1.2230215827338128e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 102 }, { "completion_length": 140.85714721679688, "epoch": 0.09884836852207293, "grad_norm": 1.3156077861785889, "kl": 0.0005516663077287376, "learning_rate": 1.2350119904076737e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 103 }, { "completion_length": 152.35714721679688, "epoch": 0.09980806142034548, "grad_norm": 0.0004066934634465724, "kl": 0.0005398765206336975, "learning_rate": 1.2470023980815346e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 104 }, { "completion_length": 175.07144165039062, "epoch": 0.10076775431861804, "grad_norm": 0.0003851476649288088, "kl": 0.0005980779533274472, "learning_rate": 1.2589928057553956e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 105 }, { "completion_length": 193.1428680419922, "epoch": 0.1017274472168906, "grad_norm": 0.0003483214240986854, "kl": 0.0005786058609373868, "learning_rate": 1.2709832134292565e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 106 }, { "completion_length": 157.6428680419922, "epoch": 0.10268714011516315, "grad_norm": 0.00044869084376841784, "kl": 0.00039418123196810484, "learning_rate": 1.2829736211031176e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 107 }, { "completion_length": 164.21429443359375, "epoch": 0.1036468330134357, "grad_norm": 0.00037852817331440747, "kl": 0.0005093588843010366, "learning_rate": 1.2949640287769783e-07, "loss": 0.0, "reward": 0.1428571492433548, "reward_std": 0.0, "rewards/check_originality_func": 0.1428571492433548, "step": 108 }, { "completion_length": 152.85714721679688, "epoch": 0.10460652591170826, "grad_norm": 0.0010042759822681546, "kl": 0.0006809078040532768, "learning_rate": 1.3069544364508392e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 109 }, { "completion_length": 192.92857360839844, "epoch": 0.10556621880998081, "grad_norm": 1.1291066408157349, "kl": 0.0007079385104589164, "learning_rate": 1.3189448441247004e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 110 }, { "completion_length": 209.57144165039062, "epoch": 0.10652591170825336, "grad_norm": 0.0003598193870857358, "kl": 0.0005137412808835506, "learning_rate": 1.330935251798561e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 111 }, { "completion_length": 144.6428680419922, "epoch": 0.10748560460652591, "grad_norm": 0.00041182481800206006, "kl": 0.0005751232383772731, "learning_rate": 1.342925659472422e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 112 }, { "completion_length": 173.00001525878906, "epoch": 0.10844529750479846, "grad_norm": 0.00038379052421078086, "kl": 0.00048211216926574707, "learning_rate": 1.3549160671462828e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 113 }, { "completion_length": 182.35714721679688, "epoch": 0.10940499040307101, "grad_norm": 0.00028473680140450597, "kl": 0.0004612836637534201, "learning_rate": 1.366906474820144e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 114 }, { "completion_length": 165.21429443359375, "epoch": 0.11036468330134357, "grad_norm": 0.00036668445682153106, "kl": 0.00045623042387887836, "learning_rate": 1.3788968824940047e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 115 }, { "completion_length": 211.21429443359375, "epoch": 0.11132437619961612, "grad_norm": 0.0003457469865679741, "kl": 0.0005738778272643685, "learning_rate": 1.3908872901678656e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 116 }, { "completion_length": 144.6428680419922, "epoch": 0.11228406909788867, "grad_norm": 0.0004836239095311612, "kl": 0.0006402559811249375, "learning_rate": 1.4028776978417265e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 117 }, { "completion_length": 201.6428680419922, "epoch": 0.11324376199616124, "grad_norm": 0.0003594615845941007, "kl": 0.0005501715349964797, "learning_rate": 1.4148681055155877e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 118 }, { "completion_length": 211.71429443359375, "epoch": 0.11420345489443379, "grad_norm": 0.0003194218734279275, "kl": 0.0005302675999701023, "learning_rate": 1.4268585131894483e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 119 }, { "completion_length": 193.92857360839844, "epoch": 0.11516314779270634, "grad_norm": 0.00037177655030973256, "kl": 0.0005381559021770954, "learning_rate": 1.4388489208633092e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 120 }, { "completion_length": 156.2857208251953, "epoch": 0.11612284069097889, "grad_norm": 0.00038172269705682993, "kl": 0.000548342359252274, "learning_rate": 1.45083932853717e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 121 }, { "completion_length": 203.07144165039062, "epoch": 0.11708253358925144, "grad_norm": 0.0003587153914850205, "kl": 0.0005690960679203272, "learning_rate": 1.4628297362110313e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 122 }, { "completion_length": 133.21429443359375, "epoch": 0.118042226487524, "grad_norm": 0.0006197782349772751, "kl": 0.0006614086451008916, "learning_rate": 1.474820143884892e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 123 }, { "completion_length": 186.35714721679688, "epoch": 0.11900191938579655, "grad_norm": 0.0003213662130292505, "kl": 0.0005787609843537211, "learning_rate": 1.4868105515587529e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 124 }, { "completion_length": 192.2857208251953, "epoch": 0.1199616122840691, "grad_norm": 0.0003585697850212455, "kl": 0.0006313406047411263, "learning_rate": 1.498800959232614e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 125 }, { "completion_length": 198.85714721679688, "epoch": 0.12092130518234165, "grad_norm": 0.00038071570452302694, "kl": 0.0005526289460249245, "learning_rate": 1.5107913669064747e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 126 }, { "completion_length": 148.1428680419922, "epoch": 0.1218809980806142, "grad_norm": 2.430067539215088, "kl": 0.0006654429598711431, "learning_rate": 1.5227817745803356e-07, "loss": 0.0, "reward": 0.1428571492433548, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.1428571492433548, "step": 127 }, { "completion_length": 154.2857208251953, "epoch": 0.12284069097888675, "grad_norm": 0.0004112898022867739, "kl": 0.0005881115794181824, "learning_rate": 1.5347721822541965e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 128 }, { "completion_length": 165.42857360839844, "epoch": 0.1238003838771593, "grad_norm": 0.0003670788719318807, "kl": 0.000552545243408531, "learning_rate": 1.5467625899280577e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 129 }, { "completion_length": 139.5, "epoch": 0.12476007677543186, "grad_norm": 0.0005072008352726698, "kl": 0.0006167014362290502, "learning_rate": 1.5587529976019183e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 130 }, { "completion_length": 114.21429443359375, "epoch": 0.1257197696737044, "grad_norm": 1.6213480234146118, "kl": 0.00045832840260118246, "learning_rate": 1.5707434052757792e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 131 }, { "completion_length": 188.71429443359375, "epoch": 0.12667946257197696, "grad_norm": 0.00034278465318493545, "kl": 0.0005040564574301243, "learning_rate": 1.5827338129496401e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 132 }, { "completion_length": 165.07144165039062, "epoch": 0.1276391554702495, "grad_norm": 1.0834221839904785, "kl": 0.0006640236242674291, "learning_rate": 1.5947242206235013e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 133 }, { "completion_length": 164.1428680419922, "epoch": 0.12859884836852206, "grad_norm": 1.3865183591842651, "kl": 0.0005434429622255266, "learning_rate": 1.606714628297362e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 134 }, { "completion_length": 211.85714721679688, "epoch": 0.1295585412667946, "grad_norm": 0.00047402491327375174, "kl": 0.0006030331132933497, "learning_rate": 1.6187050359712229e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 135 }, { "completion_length": 170.6428680419922, "epoch": 0.13051823416506717, "grad_norm": 0.0006219033966772258, "kl": 0.0006824259762652218, "learning_rate": 1.630695443645084e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 136 }, { "completion_length": 145.35714721679688, "epoch": 0.13147792706333974, "grad_norm": 0.00039621928590349853, "kl": 0.0005970431957393885, "learning_rate": 1.642685851318945e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 137 }, { "completion_length": 161.2857208251953, "epoch": 0.1324376199616123, "grad_norm": 1.5835026502609253, "kl": 0.0007118249195627868, "learning_rate": 1.6546762589928056e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 138 }, { "completion_length": 150.21429443359375, "epoch": 0.13339731285988485, "grad_norm": 0.0003325817233417183, "kl": 0.0004756075795739889, "learning_rate": 1.6666666666666665e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 139 }, { "completion_length": 194.85714721679688, "epoch": 0.1343570057581574, "grad_norm": 0.00033568323124200106, "kl": 0.0004382836923468858, "learning_rate": 1.6786570743405277e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 140 }, { "completion_length": 217.21429443359375, "epoch": 0.13531669865642995, "grad_norm": 0.000305079243844375, "kl": 0.0005180971929803491, "learning_rate": 1.6906474820143883e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 141 }, { "completion_length": 208.1428680419922, "epoch": 0.1362763915547025, "grad_norm": 0.00043074399582110345, "kl": 0.0006000304128974676, "learning_rate": 1.7026378896882492e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 142 }, { "completion_length": 212.92857360839844, "epoch": 0.13723608445297505, "grad_norm": 0.0003859077114611864, "kl": 0.00048819396761246026, "learning_rate": 1.7146282973621101e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 143 }, { "completion_length": 207.07144165039062, "epoch": 0.1381957773512476, "grad_norm": 0.00028781959554180503, "kl": 0.00047392401029355824, "learning_rate": 1.7266187050359713e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 144 }, { "completion_length": 208.7857208251953, "epoch": 0.13915547024952016, "grad_norm": 0.0003213859163224697, "kl": 0.0005155238904990256, "learning_rate": 1.738609112709832e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 145 }, { "completion_length": 203.00001525878906, "epoch": 0.1401151631477927, "grad_norm": 0.00028773315716534853, "kl": 0.00044469384010881186, "learning_rate": 1.750599520383693e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 146 }, { "completion_length": 190.85714721679688, "epoch": 0.14107485604606526, "grad_norm": 0.0003112384583801031, "kl": 0.0004886479000560939, "learning_rate": 1.7625899280575538e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 147 }, { "completion_length": 197.85714721679688, "epoch": 0.1420345489443378, "grad_norm": 0.00042582955211400986, "kl": 0.0006728200823999941, "learning_rate": 1.774580335731415e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 148 }, { "completion_length": 145.5, "epoch": 0.14299424184261036, "grad_norm": 0.000419868272729218, "kl": 0.0005456935614347458, "learning_rate": 1.7865707434052756e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 149 }, { "completion_length": 162.35714721679688, "epoch": 0.14395393474088292, "grad_norm": 0.0003780382394324988, "kl": 0.0005462466506287456, "learning_rate": 1.7985611510791365e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 150 }, { "completion_length": 143.71429443359375, "epoch": 0.14491362763915547, "grad_norm": 0.0004807198129128665, "kl": 0.0005312473513185978, "learning_rate": 1.8105515587529977e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 151 }, { "completion_length": 191.6428680419922, "epoch": 0.14587332053742802, "grad_norm": 1.955148458480835, "kl": 0.0005963364965282381, "learning_rate": 1.8225419664268586e-07, "loss": 0.0, "reward": 0.1428571492433548, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.1428571492433548, "step": 152 }, { "completion_length": 219.07144165039062, "epoch": 0.14683301343570057, "grad_norm": 0.00024871437926776707, "kl": 0.0005098316469229758, "learning_rate": 1.8345323741007192e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 153 }, { "completion_length": 196.1428680419922, "epoch": 0.14779270633397312, "grad_norm": 1.200289249420166, "kl": 0.0004947687266394496, "learning_rate": 1.8465227817745802e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 154 }, { "completion_length": 203.2857208251953, "epoch": 0.14875239923224567, "grad_norm": 1.4688223600387573, "kl": 0.0007565559353679419, "learning_rate": 1.8585131894484413e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 155 }, { "completion_length": 199.21429443359375, "epoch": 0.14971209213051823, "grad_norm": 0.0003543286875355989, "kl": 0.0006551481201313436, "learning_rate": 1.870503597122302e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 156 }, { "completion_length": 186.07144165039062, "epoch": 0.15067178502879078, "grad_norm": 0.00027435983065515757, "kl": 0.0005245337961241603, "learning_rate": 1.882494004796163e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 157 }, { "completion_length": 219.1428680419922, "epoch": 0.15163147792706333, "grad_norm": 0.0006726210121996701, "kl": 0.0006771465996280313, "learning_rate": 1.8944844124700238e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 158 }, { "completion_length": 223.00001525878906, "epoch": 0.15259117082533588, "grad_norm": 0.0003375273081474006, "kl": 0.0005506616435013711, "learning_rate": 1.906474820143885e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 159 }, { "completion_length": 185.71429443359375, "epoch": 0.15355086372360843, "grad_norm": 0.0003473393735475838, "kl": 0.0005297569441609085, "learning_rate": 1.9184652278177456e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 160 }, { "completion_length": 201.2857208251953, "epoch": 0.15451055662188098, "grad_norm": 0.0007428972749039531, "kl": 0.0005914120702072978, "learning_rate": 1.9304556354916065e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 161 }, { "completion_length": 135.92857360839844, "epoch": 0.15547024952015356, "grad_norm": 0.0009035385446622968, "kl": 0.0006196120521053672, "learning_rate": 1.9424460431654677e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 162 }, { "completion_length": 154.35714721679688, "epoch": 0.15642994241842612, "grad_norm": 0.0004327139467932284, "kl": 0.0005227324436418712, "learning_rate": 1.9544364508393286e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 163 }, { "completion_length": 154.42857360839844, "epoch": 0.15738963531669867, "grad_norm": 0.0004569028096739203, "kl": 0.000654122733976692, "learning_rate": 1.9664268585131893e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 164 }, { "completion_length": 153.21429443359375, "epoch": 0.15834932821497122, "grad_norm": 1.5683661699295044, "kl": 0.0005950650665909052, "learning_rate": 1.9784172661870502e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 165 }, { "completion_length": 120.5714340209961, "epoch": 0.15930902111324377, "grad_norm": 0.0005859537050127983, "kl": 0.0007576498319394886, "learning_rate": 1.9904076738609113e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 166 }, { "completion_length": 176.21429443359375, "epoch": 0.16026871401151632, "grad_norm": 0.0005688881501555443, "kl": 0.000695934344548732, "learning_rate": 2.0023980815347723e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 167 }, { "completion_length": 182.2857208251953, "epoch": 0.16122840690978887, "grad_norm": 0.000450094259576872, "kl": 0.0007291167275980115, "learning_rate": 2.014388489208633e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 168 }, { "completion_length": 150.1428680419922, "epoch": 0.16218809980806143, "grad_norm": 1.6065456867218018, "kl": 0.0007944860262796283, "learning_rate": 2.0263788968824938e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 169 }, { "completion_length": 183.85714721679688, "epoch": 0.16314779270633398, "grad_norm": 0.00038812385173514485, "kl": 0.0006125522195361555, "learning_rate": 2.038369304556355e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 170 }, { "completion_length": 170.1428680419922, "epoch": 0.16410748560460653, "grad_norm": 0.0008480788674205542, "kl": 0.0007221138803288341, "learning_rate": 2.0503597122302156e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 171 }, { "completion_length": 131.5, "epoch": 0.16506717850287908, "grad_norm": 0.0009021424339152873, "kl": 0.0007649558247067034, "learning_rate": 2.0623501199040765e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 172 }, { "completion_length": 162.5, "epoch": 0.16602687140115163, "grad_norm": 0.0005484732682816684, "kl": 0.0005437697400338948, "learning_rate": 2.0743405275779374e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 173 }, { "completion_length": 154.6428680419922, "epoch": 0.16698656429942418, "grad_norm": 0.0008841804228723049, "kl": 0.0007526960107497871, "learning_rate": 2.0863309352517986e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 174 }, { "completion_length": 147.7857208251953, "epoch": 0.16794625719769674, "grad_norm": 0.0007578100776299834, "kl": 0.0008119751582853496, "learning_rate": 2.0983213429256593e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 175 }, { "completion_length": 185.92857360839844, "epoch": 0.1689059500959693, "grad_norm": 0.00043362495489418507, "kl": 0.0006107023218646646, "learning_rate": 2.1103117505995202e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 176 }, { "completion_length": 155.07144165039062, "epoch": 0.16986564299424184, "grad_norm": 0.7326560020446777, "kl": 0.0005852950853295624, "learning_rate": 2.1223021582733814e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 177 }, { "completion_length": 175.57144165039062, "epoch": 0.1708253358925144, "grad_norm": 0.0005696373991668224, "kl": 0.0005363225354813039, "learning_rate": 2.1342925659472423e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 178 }, { "completion_length": 150.6428680419922, "epoch": 0.17178502879078694, "grad_norm": 0.7648991942405701, "kl": 0.0005510062328539789, "learning_rate": 2.146282973621103e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 179 }, { "completion_length": 119.42857360839844, "epoch": 0.1727447216890595, "grad_norm": 0.000839638989418745, "kl": 0.0006232323357835412, "learning_rate": 2.1582733812949638e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 180 }, { "completion_length": 193.00001525878906, "epoch": 0.17370441458733205, "grad_norm": 0.0007283779559656978, "kl": 0.0006753405323252082, "learning_rate": 2.170263788968825e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 181 }, { "completion_length": 265.0714416503906, "epoch": 0.1746641074856046, "grad_norm": 0.0006918279104866087, "kl": 0.000600978615693748, "learning_rate": 2.182254196642686e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 182 }, { "completion_length": 129.42857360839844, "epoch": 0.17562380038387715, "grad_norm": 0.0011069298489019275, "kl": 0.0007930785068310797, "learning_rate": 2.1942446043165465e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 183 }, { "completion_length": 169.2857208251953, "epoch": 0.1765834932821497, "grad_norm": 0.0011413100874051452, "kl": 0.0009220493957400322, "learning_rate": 2.2062350119904075e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 184 }, { "completion_length": 167.57144165039062, "epoch": 0.17754318618042225, "grad_norm": 0.0008806412224657834, "kl": 0.0008190371445380151, "learning_rate": 2.2182254196642686e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 185 }, { "completion_length": 154.07144165039062, "epoch": 0.1785028790786948, "grad_norm": 0.0007587157306261361, "kl": 0.000735841749701649, "learning_rate": 2.2302158273381293e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 186 }, { "completion_length": 165.6428680419922, "epoch": 0.17946257197696738, "grad_norm": 0.0006860272842459381, "kl": 0.0008189899381250143, "learning_rate": 2.2422062350119902e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 187 }, { "completion_length": 142.7857208251953, "epoch": 0.18042226487523993, "grad_norm": 0.0006982545601204038, "kl": 0.0008019261294975877, "learning_rate": 2.254196642685851e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 188 }, { "completion_length": 201.35714721679688, "epoch": 0.1813819577735125, "grad_norm": 0.0005836878553964198, "kl": 0.0006640757201239467, "learning_rate": 2.2661870503597123e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 189 }, { "completion_length": 166.85714721679688, "epoch": 0.18234165067178504, "grad_norm": 0.0009256108314730227, "kl": 0.000873617478646338, "learning_rate": 2.278177458033573e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 190 }, { "completion_length": 123.50000762939453, "epoch": 0.1833013435700576, "grad_norm": 0.0027157473377883434, "kl": 0.0013227493036538363, "learning_rate": 2.2901678657074338e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 191 }, { "completion_length": 138.0, "epoch": 0.18426103646833014, "grad_norm": 0.0009342596749775112, "kl": 0.0007333616958931088, "learning_rate": 2.302158273381295e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 192 }, { "completion_length": 173.57144165039062, "epoch": 0.1852207293666027, "grad_norm": 0.0013187481090426445, "kl": 0.0009545387583784759, "learning_rate": 2.314148681055156e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 193 }, { "completion_length": 197.1428680419922, "epoch": 0.18618042226487524, "grad_norm": 0.0006880752625875175, "kl": 0.0007264750311151147, "learning_rate": 2.3261390887290166e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 194 }, { "completion_length": 199.42857360839844, "epoch": 0.1871401151631478, "grad_norm": 0.7748671770095825, "kl": 0.0008393727475777268, "learning_rate": 2.3381294964028775e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 195 }, { "completion_length": 136.92857360839844, "epoch": 0.18809980806142035, "grad_norm": 0.0008711254340596497, "kl": 0.000657685799524188, "learning_rate": 2.3501199040767386e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 196 }, { "completion_length": 211.35714721679688, "epoch": 0.1890595009596929, "grad_norm": 0.000773859501350671, "kl": 0.0006889952928759158, "learning_rate": 2.3621103117505996e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 197 }, { "completion_length": 169.0, "epoch": 0.19001919385796545, "grad_norm": 0.0006125992513261735, "kl": 0.000707263418007642, "learning_rate": 2.3741007194244602e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 198 }, { "completion_length": 189.7857208251953, "epoch": 0.190978886756238, "grad_norm": 0.0008087116293609142, "kl": 0.0006217738846316934, "learning_rate": 2.3860911270983214e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 199 }, { "completion_length": 167.6428680419922, "epoch": 0.19193857965451055, "grad_norm": 0.0011518154060468078, "kl": 0.0008382133673876524, "learning_rate": 2.398081534772182e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 200 }, { "completion_length": 199.57144165039062, "epoch": 0.1928982725527831, "grad_norm": 0.0006516108405776322, "kl": 0.0007004616199992597, "learning_rate": 2.410071942446043e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 201 }, { "completion_length": 180.07144165039062, "epoch": 0.19385796545105566, "grad_norm": 0.0009824404260143638, "kl": 0.0007662419811822474, "learning_rate": 2.422062350119904e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 202 }, { "completion_length": 155.35714721679688, "epoch": 0.1948176583493282, "grad_norm": 2.158590316772461, "kl": 0.001021720003336668, "learning_rate": 2.434052757793765e-07, "loss": 0.0, "reward": 0.2142857313156128, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.2142857313156128, "step": 203 }, { "completion_length": 197.42857360839844, "epoch": 0.19577735124760076, "grad_norm": 1.158469319343567, "kl": 0.0005528393085114658, "learning_rate": 2.4460431654676257e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 204 }, { "completion_length": 107.0714340209961, "epoch": 0.1967370441458733, "grad_norm": 0.0012115774443373084, "kl": 0.0008443004917353392, "learning_rate": 2.458033573141487e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 205 }, { "completion_length": 184.92857360839844, "epoch": 0.19769673704414586, "grad_norm": 0.0008226028876379132, "kl": 0.0007202682900242507, "learning_rate": 2.4700239808153475e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 206 }, { "completion_length": 133.71429443359375, "epoch": 0.19865642994241842, "grad_norm": 0.0014496436342597008, "kl": 0.0009565892396494746, "learning_rate": 2.4820143884892087e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 207 }, { "completion_length": 162.7857208251953, "epoch": 0.19961612284069097, "grad_norm": 0.0007881002384237945, "kl": 0.0007638644310645759, "learning_rate": 2.4940047961630693e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 208 }, { "completion_length": 145.0, "epoch": 0.20057581573896352, "grad_norm": 0.0009940582094714046, "kl": 0.0009214243618771434, "learning_rate": 2.5059952038369305e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 209 }, { "completion_length": 180.50001525878906, "epoch": 0.20153550863723607, "grad_norm": 0.001215582713484764, "kl": 0.0009285681881010532, "learning_rate": 2.517985611510791e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 210 }, { "completion_length": 138.57144165039062, "epoch": 0.20249520153550865, "grad_norm": 0.002426323713734746, "kl": 0.0012417841935530305, "learning_rate": 2.529976019184652e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 211 }, { "completion_length": 195.1428680419922, "epoch": 0.2034548944337812, "grad_norm": 0.0006519036833196878, "kl": 0.0006649614661000669, "learning_rate": 2.541966426858513e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 212 }, { "completion_length": 167.42857360839844, "epoch": 0.20441458733205375, "grad_norm": 1.3879348039627075, "kl": 0.0008901157998479903, "learning_rate": 2.553956834532374e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 213 }, { "completion_length": 233.21429443359375, "epoch": 0.2053742802303263, "grad_norm": 0.0005398900248110294, "kl": 0.0005769425770267844, "learning_rate": 2.5659472422062353e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 214 }, { "completion_length": 164.07144165039062, "epoch": 0.20633397312859886, "grad_norm": 0.0011360155185684562, "kl": 0.0009616695460863411, "learning_rate": 2.577937649880096e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 215 }, { "completion_length": 177.07144165039062, "epoch": 0.2072936660268714, "grad_norm": 0.001026230864226818, "kl": 0.0008613543468527496, "learning_rate": 2.5899280575539566e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 216 }, { "completion_length": 194.50001525878906, "epoch": 0.20825335892514396, "grad_norm": 0.0010892812861129642, "kl": 0.0008288582903333008, "learning_rate": 2.601918465227818e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 217 }, { "completion_length": 156.57144165039062, "epoch": 0.2092130518234165, "grad_norm": 0.0014859149232506752, "kl": 0.0011846995912492275, "learning_rate": 2.6139088729016784e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 218 }, { "completion_length": 145.0, "epoch": 0.21017274472168906, "grad_norm": 0.0015101025346666574, "kl": 0.001177833415567875, "learning_rate": 2.625899280575539e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 219 }, { "completion_length": 226.6428680419922, "epoch": 0.21113243761996162, "grad_norm": 0.0007979250513017178, "kl": 0.0008849891019053757, "learning_rate": 2.637889688249401e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 220 }, { "completion_length": 155.1428680419922, "epoch": 0.21209213051823417, "grad_norm": 0.0014650049852207303, "kl": 0.001100677065551281, "learning_rate": 2.6498800959232614e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 221 }, { "completion_length": 153.42857360839844, "epoch": 0.21305182341650672, "grad_norm": 0.001838172785937786, "kl": 0.001235618838109076, "learning_rate": 2.661870503597122e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 222 }, { "completion_length": 206.21429443359375, "epoch": 0.21401151631477927, "grad_norm": 0.0012702227104455233, "kl": 0.0010753560345619917, "learning_rate": 2.673860911270983e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 223 }, { "completion_length": 156.5, "epoch": 0.21497120921305182, "grad_norm": 0.001898351707495749, "kl": 0.0014250983949750662, "learning_rate": 2.685851318944844e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 224 }, { "completion_length": 180.1428680419922, "epoch": 0.21593090211132437, "grad_norm": 1.299948811531067, "kl": 0.001172384712845087, "learning_rate": 2.697841726618705e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 225 }, { "completion_length": 160.71429443359375, "epoch": 0.21689059500959693, "grad_norm": 0.0020634811371564865, "kl": 0.0015121198957785964, "learning_rate": 2.7098321342925657e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 226 }, { "completion_length": 190.2857208251953, "epoch": 0.21785028790786948, "grad_norm": 0.0012137209996581078, "kl": 0.0011903165141120553, "learning_rate": 2.7218225419664263e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 227 }, { "completion_length": 179.85714721679688, "epoch": 0.21880998080614203, "grad_norm": 0.0015006123576313257, "kl": 0.0011109196348115802, "learning_rate": 2.733812949640288e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 228 }, { "completion_length": 185.2857208251953, "epoch": 0.21976967370441458, "grad_norm": 0.0010969473514705896, "kl": 0.0010765750193968415, "learning_rate": 2.7458033573141487e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 229 }, { "completion_length": 128.0, "epoch": 0.22072936660268713, "grad_norm": 0.0024165872018784285, "kl": 0.0017360516358166933, "learning_rate": 2.7577937649880093e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 230 }, { "completion_length": 166.2857208251953, "epoch": 0.22168905950095968, "grad_norm": 0.0019573229365050793, "kl": 0.0016222953563556075, "learning_rate": 2.7697841726618705e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 231 }, { "completion_length": 139.85714721679688, "epoch": 0.22264875239923224, "grad_norm": 0.0020130425691604614, "kl": 0.0015894394600763917, "learning_rate": 2.781774580335731e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 232 }, { "completion_length": 133.7857208251953, "epoch": 0.2236084452975048, "grad_norm": 0.9661685824394226, "kl": 0.0013019731268286705, "learning_rate": 2.793764988009592e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 233 }, { "completion_length": 183.71429443359375, "epoch": 0.22456813819577734, "grad_norm": 0.9696537256240845, "kl": 0.0018521607853472233, "learning_rate": 2.805755395683453e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 234 }, { "completion_length": 164.6428680419922, "epoch": 0.2255278310940499, "grad_norm": 0.0012538745068013668, "kl": 0.0010992323514074087, "learning_rate": 2.817745803357314e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 235 }, { "completion_length": 127.64286041259766, "epoch": 0.22648752399232247, "grad_norm": 0.001807168941013515, "kl": 0.0015386344166472554, "learning_rate": 2.8297362110311753e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 236 }, { "completion_length": 158.6428680419922, "epoch": 0.22744721689059502, "grad_norm": 0.0020955807995051146, "kl": 0.0017593932570889592, "learning_rate": 2.841726618705036e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 237 }, { "completion_length": 187.7857208251953, "epoch": 0.22840690978886757, "grad_norm": 0.0018621302442625165, "kl": 0.0015393722569569945, "learning_rate": 2.8537170263788966e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 238 }, { "completion_length": 153.21429443359375, "epoch": 0.22936660268714013, "grad_norm": 0.0023217785637825727, "kl": 0.002015487989410758, "learning_rate": 2.865707434052758e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 239 }, { "completion_length": 142.71429443359375, "epoch": 0.23032629558541268, "grad_norm": 0.002789160469546914, "kl": 0.0021790177561342716, "learning_rate": 2.8776978417266184e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 240 }, { "completion_length": 209.35714721679688, "epoch": 0.23128598848368523, "grad_norm": 0.0019388291984796524, "kl": 0.0016431508120149374, "learning_rate": 2.889688249400479e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 241 }, { "completion_length": 150.21429443359375, "epoch": 0.23224568138195778, "grad_norm": 0.00273157749325037, "kl": 0.0021783753763884306, "learning_rate": 2.90167865707434e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 242 }, { "completion_length": 167.42857360839844, "epoch": 0.23320537428023033, "grad_norm": 0.002374428790062666, "kl": 0.001792409922927618, "learning_rate": 2.9136690647482014e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 243 }, { "completion_length": 196.85714721679688, "epoch": 0.23416506717850288, "grad_norm": 0.0026038656942546368, "kl": 0.0020209960639476776, "learning_rate": 2.9256594724220626e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 244 }, { "completion_length": 220.7857208251953, "epoch": 0.23512476007677544, "grad_norm": 0.001225349842570722, "kl": 0.0011861540842801332, "learning_rate": 2.937649880095923e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 245 }, { "completion_length": 157.92857360839844, "epoch": 0.236084452975048, "grad_norm": 0.0026088713202625513, "kl": 0.002129191532731056, "learning_rate": 2.949640287769784e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 246 }, { "completion_length": 187.07144165039062, "epoch": 0.23704414587332054, "grad_norm": 0.002025354653596878, "kl": 0.001947617856785655, "learning_rate": 2.961630695443645e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 247 }, { "completion_length": 134.57144165039062, "epoch": 0.2380038387715931, "grad_norm": 0.002978349570184946, "kl": 0.002433012006804347, "learning_rate": 2.9736211031175057e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 248 }, { "completion_length": 188.07144165039062, "epoch": 0.23896353166986564, "grad_norm": 0.0019192213658243418, "kl": 0.0016239362303167582, "learning_rate": 2.9856115107913663e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 249 }, { "completion_length": 177.92857360839844, "epoch": 0.2399232245681382, "grad_norm": 0.0022956784814596176, "kl": 0.0019293871009722352, "learning_rate": 2.997601918465228e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 250 }, { "completion_length": 148.7857208251953, "epoch": 0.24088291746641075, "grad_norm": 0.0025290795601904392, "kl": 0.0021560739260166883, "learning_rate": 3.0095923261390887e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 251 }, { "completion_length": 149.85714721679688, "epoch": 0.2418426103646833, "grad_norm": 0.003358300542458892, "kl": 0.0024248952977359295, "learning_rate": 3.0215827338129493e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 252 }, { "completion_length": 182.21429443359375, "epoch": 0.24280230326295585, "grad_norm": 0.002087182132527232, "kl": 0.0018849438056349754, "learning_rate": 3.0335731414868105e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 253 }, { "completion_length": 152.85714721679688, "epoch": 0.2437619961612284, "grad_norm": 0.0022951846476644278, "kl": 0.0020260834135115147, "learning_rate": 3.045563549160671e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 254 }, { "completion_length": 181.35714721679688, "epoch": 0.24472168905950095, "grad_norm": 0.0024261039216071367, "kl": 0.0021250813733786345, "learning_rate": 3.0575539568345323e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 255 }, { "completion_length": 163.6428680419922, "epoch": 0.2456813819577735, "grad_norm": 0.002039488172158599, "kl": 0.0018252377631142735, "learning_rate": 3.069544364508393e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 256 }, { "completion_length": 176.6428680419922, "epoch": 0.24664107485604606, "grad_norm": 0.0021182410418987274, "kl": 0.0017519788816571236, "learning_rate": 3.081534772182254e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 257 }, { "completion_length": 124.71429443359375, "epoch": 0.2476007677543186, "grad_norm": 0.003411118872463703, "kl": 0.002667119028046727, "learning_rate": 3.0935251798561153e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 258 }, { "completion_length": 147.5, "epoch": 0.24856046065259116, "grad_norm": 0.0020024196710437536, "kl": 0.0017016871133819222, "learning_rate": 3.105515587529976e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 259 }, { "completion_length": 119.0714340209961, "epoch": 0.2495201535508637, "grad_norm": 0.0027498656418174505, "kl": 0.0021361790131777525, "learning_rate": 3.1175059952038366e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 260 }, { "completion_length": 119.21429443359375, "epoch": 0.2504798464491363, "grad_norm": 0.002395584248006344, "kl": 0.0019264081493020058, "learning_rate": 3.129496402877698e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 261 }, { "completion_length": 157.21429443359375, "epoch": 0.2514395393474088, "grad_norm": 0.0032541570253670216, "kl": 0.002302316017448902, "learning_rate": 3.1414868105515584e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 262 }, { "completion_length": 202.42857360839844, "epoch": 0.2523992322456814, "grad_norm": 0.0020991608034819365, "kl": 0.001845602411776781, "learning_rate": 3.153477218225419e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 263 }, { "completion_length": 184.2857208251953, "epoch": 0.2533589251439539, "grad_norm": 0.0017441392410546541, "kl": 0.0016937253531068563, "learning_rate": 3.1654676258992803e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 264 }, { "completion_length": 174.2857208251953, "epoch": 0.2543186180422265, "grad_norm": 0.0020491599570959806, "kl": 0.001726771704852581, "learning_rate": 3.1774580335731414e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 265 }, { "completion_length": 186.42857360839844, "epoch": 0.255278310940499, "grad_norm": 0.0018193688010796905, "kl": 0.0016390173695981503, "learning_rate": 3.1894484412470026e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 266 }, { "completion_length": 129.21429443359375, "epoch": 0.2562380038387716, "grad_norm": 0.002859668806195259, "kl": 0.0023970818147063255, "learning_rate": 3.201438848920863e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 267 }, { "completion_length": 212.6428680419922, "epoch": 0.2571976967370441, "grad_norm": 0.0019939548801630735, "kl": 0.0019298209808766842, "learning_rate": 3.213429256594724e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 268 }, { "completion_length": 182.57144165039062, "epoch": 0.2581573896353167, "grad_norm": 0.0023688061628490686, "kl": 0.001987688010558486, "learning_rate": 3.225419664268585e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 269 }, { "completion_length": 174.35714721679688, "epoch": 0.2591170825335892, "grad_norm": 0.002322971122339368, "kl": 0.001956464257091284, "learning_rate": 3.2374100719424457e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 270 }, { "completion_length": 190.50001525878906, "epoch": 0.2600767754318618, "grad_norm": 0.0026816839817911386, "kl": 0.001965252449735999, "learning_rate": 3.2494004796163064e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 271 }, { "completion_length": 170.07144165039062, "epoch": 0.26103646833013433, "grad_norm": 0.0018740884261205792, "kl": 0.0017238996224477887, "learning_rate": 3.261390887290168e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 272 }, { "completion_length": 185.6428680419922, "epoch": 0.2619961612284069, "grad_norm": 0.0022177747450768948, "kl": 0.0017764398362487555, "learning_rate": 3.2733812949640287e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 273 }, { "completion_length": 180.57144165039062, "epoch": 0.2629558541266795, "grad_norm": 0.0018041409784927964, "kl": 0.00168732984457165, "learning_rate": 3.28537170263789e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 274 }, { "completion_length": 156.2857208251953, "epoch": 0.263915547024952, "grad_norm": 1.2447295188903809, "kl": 0.0019863115157932043, "learning_rate": 3.2973621103117505e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 275 }, { "completion_length": 178.21429443359375, "epoch": 0.2648752399232246, "grad_norm": 0.0023839271161705256, "kl": 0.0020074776839464903, "learning_rate": 3.309352517985611e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 276 }, { "completion_length": 204.50001525878906, "epoch": 0.2658349328214971, "grad_norm": 0.002735461574047804, "kl": 0.0022335564717650414, "learning_rate": 3.3213429256594724e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 277 }, { "completion_length": 137.35714721679688, "epoch": 0.2667946257197697, "grad_norm": 1.183794617652893, "kl": 0.0030716650653630495, "learning_rate": 3.333333333333333e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 278 }, { "completion_length": 157.07144165039062, "epoch": 0.2677543186180422, "grad_norm": 0.0020497306250035763, "kl": 0.0018415265949442983, "learning_rate": 3.3453237410071937e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 279 }, { "completion_length": 166.6428680419922, "epoch": 0.2687140115163148, "grad_norm": 0.0027754022739827633, "kl": 0.0021763029508292675, "learning_rate": 3.3573141486810554e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 280 }, { "completion_length": 162.85714721679688, "epoch": 0.2696737044145873, "grad_norm": 1.5974063873291016, "kl": 0.0021157171577215195, "learning_rate": 3.369304556354916e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 281 }, { "completion_length": 185.35714721679688, "epoch": 0.2706333973128599, "grad_norm": 0.002374320523813367, "kl": 0.0019762800075113773, "learning_rate": 3.3812949640287766e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 282 }, { "completion_length": 187.71429443359375, "epoch": 0.2715930902111324, "grad_norm": 1.0880811214447021, "kl": 0.0018399183172732592, "learning_rate": 3.393285371702638e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 283 }, { "completion_length": 183.21429443359375, "epoch": 0.272552783109405, "grad_norm": 0.002009395509958267, "kl": 0.002069524023681879, "learning_rate": 3.4052757793764985e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 284 }, { "completion_length": 177.00001525878906, "epoch": 0.27351247600767753, "grad_norm": 0.00203274330124259, "kl": 0.0017071508336812258, "learning_rate": 3.4172661870503596e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 285 }, { "completion_length": 180.7857208251953, "epoch": 0.2744721689059501, "grad_norm": 0.0019489711849018931, "kl": 0.0018624041695147753, "learning_rate": 3.4292565947242203e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 286 }, { "completion_length": 148.6428680419922, "epoch": 0.27543186180422263, "grad_norm": 0.9649380445480347, "kl": 0.0027292685117572546, "learning_rate": 3.4412470023980815e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 287 }, { "completion_length": 197.21429443359375, "epoch": 0.2763915547024952, "grad_norm": 0.0024312075693160295, "kl": 0.0022409697994589806, "learning_rate": 3.4532374100719426e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 288 }, { "completion_length": 192.21429443359375, "epoch": 0.27735124760076774, "grad_norm": 0.005323869176208973, "kl": 0.003771465504541993, "learning_rate": 3.4652278177458033e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 289 }, { "completion_length": 170.85714721679688, "epoch": 0.2783109404990403, "grad_norm": 0.002437053946778178, "kl": 0.0023432057350873947, "learning_rate": 3.477218225419664e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 290 }, { "completion_length": 189.7857208251953, "epoch": 0.27927063339731284, "grad_norm": 0.0026907450519502163, "kl": 0.002821230096742511, "learning_rate": 3.489208633093525e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 291 }, { "completion_length": 192.00001525878906, "epoch": 0.2802303262955854, "grad_norm": 0.0021529404912143946, "kl": 0.002294830046594143, "learning_rate": 3.501199040767386e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 292 }, { "completion_length": 221.2857208251953, "epoch": 0.28119001919385794, "grad_norm": 0.0021073021925985813, "kl": 0.002086169784888625, "learning_rate": 3.5131894484412464e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 293 }, { "completion_length": 161.92857360839844, "epoch": 0.2821497120921305, "grad_norm": 0.002413996960967779, "kl": 0.002601858926936984, "learning_rate": 3.5251798561151076e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 294 }, { "completion_length": 179.92857360839844, "epoch": 0.28310940499040305, "grad_norm": 0.0030513862147927284, "kl": 0.003169405274093151, "learning_rate": 3.537170263788969e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 295 }, { "completion_length": 152.21429443359375, "epoch": 0.2840690978886756, "grad_norm": 0.0042703901417553425, "kl": 0.004278010223060846, "learning_rate": 3.54916067146283e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 296 }, { "completion_length": 144.0, "epoch": 0.28502879078694815, "grad_norm": 0.0033138927537947893, "kl": 0.0035512058530002832, "learning_rate": 3.5611510791366906e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 297 }, { "completion_length": 137.92857360839844, "epoch": 0.28598848368522073, "grad_norm": 1.2848498821258545, "kl": 0.00452088937163353, "learning_rate": 3.573141486810551e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 298 }, { "completion_length": 221.35714721679688, "epoch": 0.2869481765834933, "grad_norm": 0.002212430816143751, "kl": 0.0026129719335585833, "learning_rate": 3.5851318944844124e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 299 }, { "completion_length": 215.7857208251953, "epoch": 0.28790786948176583, "grad_norm": 0.005539421923458576, "kl": 0.0051537128165364265, "learning_rate": 3.597122302158273e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 300 }, { "completion_length": 160.57144165039062, "epoch": 0.2888675623800384, "grad_norm": 0.9260180592536926, "kl": 0.0042287427932024, "learning_rate": 3.6091127098321337e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 301 }, { "completion_length": 125.28572082519531, "epoch": 0.28982725527831094, "grad_norm": 1.760083556175232, "kl": 0.004440846852958202, "learning_rate": 3.6211031175059954e-07, "loss": 0.0, "reward": 0.1428571492433548, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.1428571492433548, "step": 302 }, { "completion_length": 167.1428680419922, "epoch": 0.2907869481765835, "grad_norm": 0.004808115307241678, "kl": 0.005196878220885992, "learning_rate": 3.633093525179856e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 303 }, { "completion_length": 174.85714721679688, "epoch": 0.29174664107485604, "grad_norm": 0.0025805344339460135, "kl": 0.003404662711545825, "learning_rate": 3.645083932853717e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 304 }, { "completion_length": 206.35714721679688, "epoch": 0.2927063339731286, "grad_norm": 0.003948758821934462, "kl": 0.004938705824315548, "learning_rate": 3.657074340527578e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 305 }, { "completion_length": 173.71429443359375, "epoch": 0.29366602687140114, "grad_norm": 0.002694430062547326, "kl": 0.003633981104940176, "learning_rate": 3.6690647482014385e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 306 }, { "completion_length": 188.71429443359375, "epoch": 0.2946257197696737, "grad_norm": 0.0045383768156170845, "kl": 0.005452936515212059, "learning_rate": 3.6810551558752997e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 307 }, { "completion_length": 202.6428680419922, "epoch": 0.29558541266794625, "grad_norm": 0.0032155371736735106, "kl": 0.004345043562352657, "learning_rate": 3.6930455635491603e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 308 }, { "completion_length": 162.42857360839844, "epoch": 0.2965451055662188, "grad_norm": 0.5645195841789246, "kl": 0.004626518581062555, "learning_rate": 3.705035971223021e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 309 }, { "completion_length": 140.85714721679688, "epoch": 0.29750479846449135, "grad_norm": 0.005775155499577522, "kl": 0.005995224695652723, "learning_rate": 3.7170263788968827e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 310 }, { "completion_length": 196.50001525878906, "epoch": 0.29846449136276393, "grad_norm": 0.0036524387542158365, "kl": 0.005481414962559938, "learning_rate": 3.7290167865707433e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 311 }, { "completion_length": 156.0, "epoch": 0.29942418426103645, "grad_norm": 0.006153911352157593, "kl": 0.007799423299729824, "learning_rate": 3.741007194244604e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 312 }, { "completion_length": 205.85714721679688, "epoch": 0.30038387715930903, "grad_norm": 0.003954061772674322, "kl": 0.005708874668926001, "learning_rate": 3.752997601918465e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 313 }, { "completion_length": 196.7857208251953, "epoch": 0.30134357005758156, "grad_norm": 0.004097146913409233, "kl": 0.006166146136820316, "learning_rate": 3.764988009592326e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 314 }, { "completion_length": 161.07144165039062, "epoch": 0.30230326295585414, "grad_norm": 0.010332883335649967, "kl": 0.011528744362294674, "learning_rate": 3.776978417266187e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 315 }, { "completion_length": 189.2857208251953, "epoch": 0.30326295585412666, "grad_norm": 0.005150932352989912, "kl": 0.007517289835959673, "learning_rate": 3.7889688249400476e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 316 }, { "completion_length": 178.2857208251953, "epoch": 0.30422264875239924, "grad_norm": 0.008586912415921688, "kl": 0.0109346192330122, "learning_rate": 3.800959232613909e-07, "loss": 0.0, "reward": 0.1428571492433548, "reward_std": 0.0, "rewards/check_originality_func": 0.1428571492433548, "step": 317 }, { "completion_length": 148.2857208251953, "epoch": 0.30518234165067176, "grad_norm": 0.004604436922818422, "kl": 0.007145038805902004, "learning_rate": 3.81294964028777e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 318 }, { "completion_length": 148.07144165039062, "epoch": 0.30614203454894434, "grad_norm": 0.002955544274300337, "kl": 0.004868857096880674, "learning_rate": 3.8249400479616306e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 319 }, { "completion_length": 173.21429443359375, "epoch": 0.30710172744721687, "grad_norm": 0.00860007107257843, "kl": 0.01131271943449974, "learning_rate": 3.836930455635491e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 320 }, { "completion_length": 187.6428680419922, "epoch": 0.30806142034548945, "grad_norm": 0.003616205183789134, "kl": 0.006206669379025698, "learning_rate": 3.8489208633093524e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 321 }, { "completion_length": 174.71429443359375, "epoch": 0.30902111324376197, "grad_norm": 0.0030920489225536585, "kl": 0.005169415380805731, "learning_rate": 3.860911270983213e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 322 }, { "completion_length": 177.92857360839844, "epoch": 0.30998080614203455, "grad_norm": 0.004745262209326029, "kl": 0.007666883058845997, "learning_rate": 3.8729016786570737e-07, "loss": 0.0, "reward": 0.1428571492433548, "reward_std": 0.0, "rewards/check_originality_func": 0.1428571492433548, "step": 323 }, { "completion_length": 142.2857208251953, "epoch": 0.31094049904030713, "grad_norm": 0.004846208728849888, "kl": 0.007900618948042393, "learning_rate": 3.8848920863309354e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 324 }, { "completion_length": 145.42857360839844, "epoch": 0.31190019193857965, "grad_norm": 2.4237635135650635, "kl": 0.00980131234973669, "learning_rate": 3.896882494004796e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 325 }, { "completion_length": 175.21429443359375, "epoch": 0.31285988483685223, "grad_norm": 0.003972214180976152, "kl": 0.006522429641336203, "learning_rate": 3.908872901678657e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 326 }, { "completion_length": 204.6428680419922, "epoch": 0.31381957773512476, "grad_norm": 0.8406820893287659, "kl": 0.007290977984666824, "learning_rate": 3.920863309352518e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 327 }, { "completion_length": 233.35714721679688, "epoch": 0.31477927063339733, "grad_norm": 0.0049285851418972015, "kl": 0.008145470172166824, "learning_rate": 3.9328537170263785e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 328 }, { "completion_length": 133.92857360839844, "epoch": 0.31573896353166986, "grad_norm": 0.005321166943758726, "kl": 0.008546940982341766, "learning_rate": 3.9448441247002397e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 329 }, { "completion_length": 175.1428680419922, "epoch": 0.31669865642994244, "grad_norm": 0.005221081897616386, "kl": 0.00866087805479765, "learning_rate": 3.9568345323741003e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 330 }, { "completion_length": 129.92857360839844, "epoch": 0.31765834932821496, "grad_norm": 0.0059640719555318356, "kl": 0.00962324719876051, "learning_rate": 3.968824940047961e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 331 }, { "completion_length": 144.07144165039062, "epoch": 0.31861804222648754, "grad_norm": 0.0085787083953619, "kl": 0.012860535643994808, "learning_rate": 3.9808153477218227e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 332 }, { "completion_length": 218.1428680419922, "epoch": 0.31957773512476007, "grad_norm": 0.0044692037627100945, "kl": 0.008109544403851032, "learning_rate": 3.9928057553956833e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 333 }, { "completion_length": 110.21429443359375, "epoch": 0.32053742802303264, "grad_norm": 0.015507777221500874, "kl": 0.020224329084157944, "learning_rate": 4.0047961630695445e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 334 }, { "completion_length": 117.71429443359375, "epoch": 0.32149712092130517, "grad_norm": 0.009055322036147118, "kl": 0.015160443261265755, "learning_rate": 4.016786570743405e-07, "loss": 0.0, "reward": 0.1428571492433548, "reward_std": 0.0, "rewards/check_originality_func": 0.1428571492433548, "step": 335 }, { "completion_length": 203.85714721679688, "epoch": 0.32245681381957775, "grad_norm": 0.0059068528935313225, "kl": 0.010014813393354416, "learning_rate": 4.028776978417266e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 336 }, { "completion_length": 152.0, "epoch": 0.32341650671785027, "grad_norm": 0.006813621148467064, "kl": 0.011223774403333664, "learning_rate": 4.040767386091127e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 337 }, { "completion_length": 176.07144165039062, "epoch": 0.32437619961612285, "grad_norm": 0.0068233623169362545, "kl": 0.012045835144817829, "learning_rate": 4.0527577937649876e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 338 }, { "completion_length": 187.92857360839844, "epoch": 0.3253358925143954, "grad_norm": 0.006727190688252449, "kl": 0.010942983441054821, "learning_rate": 4.064748201438849e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 339 }, { "completion_length": 140.0, "epoch": 0.32629558541266795, "grad_norm": 0.007985416799783707, "kl": 0.013225625269114971, "learning_rate": 4.07673860911271e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 340 }, { "completion_length": 175.6428680419922, "epoch": 0.3272552783109405, "grad_norm": 1.867785930633545, "kl": 0.012738192453980446, "learning_rate": 4.0887290167865706e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 341 }, { "completion_length": 183.21429443359375, "epoch": 0.32821497120921306, "grad_norm": 0.007352407556027174, "kl": 0.01188686117529869, "learning_rate": 4.100719424460431e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 342 }, { "completion_length": 126.5714340209961, "epoch": 0.3291746641074856, "grad_norm": 0.006626503076404333, "kl": 0.013077978044748306, "learning_rate": 4.1127098321342924e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 343 }, { "completion_length": 212.35714721679688, "epoch": 0.33013435700575816, "grad_norm": 0.0039927843026816845, "kl": 0.007034569047391415, "learning_rate": 4.124700239808153e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 344 }, { "completion_length": 148.1428680419922, "epoch": 0.3310940499040307, "grad_norm": 0.01000019907951355, "kl": 0.014484604820609093, "learning_rate": 4.136690647482014e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 345 }, { "completion_length": 168.92857360839844, "epoch": 0.33205374280230326, "grad_norm": 0.008740787394344807, "kl": 0.012992286123335361, "learning_rate": 4.148681055155875e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 346 }, { "completion_length": 180.00001525878906, "epoch": 0.3330134357005758, "grad_norm": 0.005464079789817333, "kl": 0.008762363344430923, "learning_rate": 4.160671462829736e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 347 }, { "completion_length": 196.21429443359375, "epoch": 0.33397312859884837, "grad_norm": 0.004085875581949949, "kl": 0.006614582613110542, "learning_rate": 4.172661870503597e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 348 }, { "completion_length": 165.5, "epoch": 0.33493282149712095, "grad_norm": 0.004260022193193436, "kl": 0.007020771969109774, "learning_rate": 4.184652278177458e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 349 }, { "completion_length": 143.71429443359375, "epoch": 0.33589251439539347, "grad_norm": 0.003184996545314789, "kl": 0.005150003358721733, "learning_rate": 4.1966426858513185e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 350 }, { "completion_length": 150.85714721679688, "epoch": 0.33685220729366605, "grad_norm": 0.005969468038529158, "kl": 0.009847059845924377, "learning_rate": 4.2086330935251797e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 351 }, { "completion_length": 122.64286041259766, "epoch": 0.3378119001919386, "grad_norm": 0.007074362598359585, "kl": 0.010350339114665985, "learning_rate": 4.2206235011990404e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 352 }, { "completion_length": 133.5, "epoch": 0.33877159309021115, "grad_norm": 0.007989284582436085, "kl": 0.011157847009599209, "learning_rate": 4.232613908872901e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 353 }, { "completion_length": 217.7857208251953, "epoch": 0.3397312859884837, "grad_norm": 0.005063450895249844, "kl": 0.007295941933989525, "learning_rate": 4.2446043165467627e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 354 }, { "completion_length": 177.35714721679688, "epoch": 0.34069097888675626, "grad_norm": 0.005169340874999762, "kl": 0.00729595310986042, "learning_rate": 4.2565947242206233e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 355 }, { "completion_length": 173.1428680419922, "epoch": 0.3416506717850288, "grad_norm": 0.006186780985444784, "kl": 0.010443543083965778, "learning_rate": 4.2685851318944845e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 356 }, { "completion_length": 136.71429443359375, "epoch": 0.34261036468330136, "grad_norm": 0.006011091638356447, "kl": 0.007584834937006235, "learning_rate": 4.280575539568345e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 357 }, { "completion_length": 174.7857208251953, "epoch": 0.3435700575815739, "grad_norm": 0.003878444666042924, "kl": 0.005394139792770147, "learning_rate": 4.292565947242206e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 358 }, { "completion_length": 165.71429443359375, "epoch": 0.34452975047984646, "grad_norm": 0.009980314411222935, "kl": 0.009348583407700062, "learning_rate": 4.304556354916067e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 359 }, { "completion_length": 181.1428680419922, "epoch": 0.345489443378119, "grad_norm": 0.004425181075930595, "kl": 0.005862870719283819, "learning_rate": 4.3165467625899276e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 360 }, { "completion_length": 168.42857360839844, "epoch": 0.34644913627639157, "grad_norm": 0.003898108843713999, "kl": 0.005040816031396389, "learning_rate": 4.3285371702637883e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 361 }, { "completion_length": 141.57144165039062, "epoch": 0.3474088291746641, "grad_norm": 0.005947483237832785, "kl": 0.00805943738669157, "learning_rate": 4.34052757793765e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 362 }, { "completion_length": 153.1428680419922, "epoch": 0.34836852207293667, "grad_norm": 0.005342531483620405, "kl": 0.006359661463648081, "learning_rate": 4.3525179856115106e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 363 }, { "completion_length": 233.2857208251953, "epoch": 0.3493282149712092, "grad_norm": 0.0026397642213851213, "kl": 0.0041642519645392895, "learning_rate": 4.364508393285372e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 364 }, { "completion_length": 179.00001525878906, "epoch": 0.3502879078694818, "grad_norm": 0.004668551031500101, "kl": 0.006456525530666113, "learning_rate": 4.3764988009592324e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 365 }, { "completion_length": 196.50001525878906, "epoch": 0.3512476007677543, "grad_norm": 0.0038402073550969362, "kl": 0.005113447085022926, "learning_rate": 4.388489208633093e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 366 }, { "completion_length": 163.21429443359375, "epoch": 0.3522072936660269, "grad_norm": 0.007105072028934956, "kl": 0.009040369652211666, "learning_rate": 4.4004796163069543e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 367 }, { "completion_length": 134.7857208251953, "epoch": 0.3531669865642994, "grad_norm": 0.010568746365606785, "kl": 0.011442187242209911, "learning_rate": 4.412470023980815e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 368 }, { "completion_length": 176.35714721679688, "epoch": 0.354126679462572, "grad_norm": 0.005024742800742388, "kl": 0.0071365442126989365, "learning_rate": 4.424460431654676e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 369 }, { "completion_length": 187.07144165039062, "epoch": 0.3550863723608445, "grad_norm": 0.004224918782711029, "kl": 0.005730424541980028, "learning_rate": 4.436450839328537e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 370 }, { "completion_length": 150.2857208251953, "epoch": 0.3560460652591171, "grad_norm": 1.2522034645080566, "kl": 0.006983661558479071, "learning_rate": 4.448441247002398e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 371 }, { "completion_length": 139.5, "epoch": 0.3570057581573896, "grad_norm": 1.669086217880249, "kl": 0.008130360394716263, "learning_rate": 4.4604316546762586e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 372 }, { "completion_length": 150.85714721679688, "epoch": 0.3579654510556622, "grad_norm": 0.006010019686073065, "kl": 0.007658985909074545, "learning_rate": 4.4724220623501197e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 373 }, { "completion_length": 135.2857208251953, "epoch": 0.35892514395393477, "grad_norm": 1.0831830501556396, "kl": 0.005389953497797251, "learning_rate": 4.4844124700239804e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 374 }, { "completion_length": 196.35714721679688, "epoch": 0.3598848368522073, "grad_norm": 0.005350504536181688, "kl": 0.00724268751218915, "learning_rate": 4.4964028776978415e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 375 }, { "completion_length": 155.92857360839844, "epoch": 0.36084452975047987, "grad_norm": 0.007140668574720621, "kl": 0.009587417356669903, "learning_rate": 4.508393285371702e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 376 }, { "completion_length": 200.7857208251953, "epoch": 0.3618042226487524, "grad_norm": 0.004531811457127333, "kl": 0.006825300399214029, "learning_rate": 4.5203836930455634e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 377 }, { "completion_length": 172.2857208251953, "epoch": 0.362763915547025, "grad_norm": 0.004968650173395872, "kl": 0.007476368453353643, "learning_rate": 4.5323741007194245e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 378 }, { "completion_length": 129.5, "epoch": 0.3637236084452975, "grad_norm": 0.007496100850403309, "kl": 0.010479573160409927, "learning_rate": 4.544364508393285e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 379 }, { "completion_length": 205.7857208251953, "epoch": 0.3646833013435701, "grad_norm": 0.004206594545394182, "kl": 0.007105248514562845, "learning_rate": 4.556354916067146e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 380 }, { "completion_length": 153.2857208251953, "epoch": 0.3656429942418426, "grad_norm": 0.0072992704808712006, "kl": 0.010209056548774242, "learning_rate": 4.568345323741007e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 381 }, { "completion_length": 189.92857360839844, "epoch": 0.3666026871401152, "grad_norm": 0.006249314174056053, "kl": 0.010344824753701687, "learning_rate": 4.5803357314148677e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 382 }, { "completion_length": 174.7857208251953, "epoch": 0.3675623800383877, "grad_norm": 0.004801756702363491, "kl": 0.007716745138168335, "learning_rate": 4.592326139088729e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 383 }, { "completion_length": 169.42857360839844, "epoch": 0.3685220729366603, "grad_norm": 0.005430859979242086, "kl": 0.008168360218405724, "learning_rate": 4.60431654676259e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 384 }, { "completion_length": 154.42857360839844, "epoch": 0.3694817658349328, "grad_norm": 0.0054330346174538136, "kl": 0.010099723935127258, "learning_rate": 4.6163069544364507e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 385 }, { "completion_length": 218.85714721679688, "epoch": 0.3704414587332054, "grad_norm": 0.0036696011666208506, "kl": 0.006376377306878567, "learning_rate": 4.628297362110312e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 386 }, { "completion_length": 153.6428680419922, "epoch": 0.3714011516314779, "grad_norm": 0.00787197332829237, "kl": 0.012896696105599403, "learning_rate": 4.6402877697841725e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 387 }, { "completion_length": 138.21429443359375, "epoch": 0.3723608445297505, "grad_norm": 0.00975114293396473, "kl": 0.013940976932644844, "learning_rate": 4.652278177458033e-07, "loss": 0.0, "reward": 0.2857142984867096, "reward_std": 0.0, "rewards/check_originality_func": 0.2857142984867096, "step": 388 }, { "completion_length": 152.85714721679688, "epoch": 0.373320537428023, "grad_norm": 1.5980305671691895, "kl": 0.011837108992040157, "learning_rate": 4.6642685851318943e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 389 }, { "completion_length": 165.42857360839844, "epoch": 0.3742802303262956, "grad_norm": 0.0053258612751960754, "kl": 0.008874927647411823, "learning_rate": 4.676258992805755e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 390 }, { "completion_length": 196.21429443359375, "epoch": 0.3752399232245681, "grad_norm": 0.006782363634556532, "kl": 0.010904250666499138, "learning_rate": 4.688249400479616e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 391 }, { "completion_length": 156.21429443359375, "epoch": 0.3761996161228407, "grad_norm": 0.004746697377413511, "kl": 0.00783673394471407, "learning_rate": 4.7002398081534773e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 392 }, { "completion_length": 189.85714721679688, "epoch": 0.3771593090211132, "grad_norm": 0.004711447283625603, "kl": 0.00814561266452074, "learning_rate": 4.712230215827338e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 393 }, { "completion_length": 163.1428680419922, "epoch": 0.3781190019193858, "grad_norm": 0.0060174353420734406, "kl": 0.009941193275153637, "learning_rate": 4.724220623501199e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 394 }, { "completion_length": 134.6428680419922, "epoch": 0.3790786948176583, "grad_norm": 1.3802924156188965, "kl": 0.010883519425988197, "learning_rate": 4.73621103117506e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 395 }, { "completion_length": 154.07144165039062, "epoch": 0.3800383877159309, "grad_norm": 0.013116939924657345, "kl": 0.017497170716524124, "learning_rate": 4.7482014388489204e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 396 }, { "completion_length": 184.2857208251953, "epoch": 0.3809980806142035, "grad_norm": 0.005062079057097435, "kl": 0.00869723316282034, "learning_rate": 4.7601918465227816e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 397 }, { "completion_length": 165.57144165039062, "epoch": 0.381957773512476, "grad_norm": 0.0072804405353963375, "kl": 0.012167633511126041, "learning_rate": 4.772182254196643e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 398 }, { "completion_length": 131.6428680419922, "epoch": 0.3829174664107486, "grad_norm": 0.008052562363445759, "kl": 0.013356690295040607, "learning_rate": 4.784172661870504e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 399 }, { "completion_length": 147.1428680419922, "epoch": 0.3838771593090211, "grad_norm": 1.1468688249588013, "kl": 0.00990899559110403, "learning_rate": 4.796163069544364e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 400 }, { "completion_length": 227.50001525878906, "epoch": 0.3848368522072937, "grad_norm": 0.0030626929365098476, "kl": 0.0060234712436795235, "learning_rate": 4.808153477218225e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 401 }, { "completion_length": 190.21429443359375, "epoch": 0.3857965451055662, "grad_norm": 0.0041225748136639595, "kl": 0.008111919276416302, "learning_rate": 4.820143884892086e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 402 }, { "completion_length": 131.7857208251953, "epoch": 0.3867562380038388, "grad_norm": 1.2833592891693115, "kl": 0.010471985675394535, "learning_rate": 4.832134292565947e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 403 }, { "completion_length": 178.07144165039062, "epoch": 0.3877159309021113, "grad_norm": 0.8705030679702759, "kl": 0.011525952257215977, "learning_rate": 4.844124700239808e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 404 }, { "completion_length": 200.7857208251953, "epoch": 0.3886756238003839, "grad_norm": 0.003932827152311802, "kl": 0.006778443697839975, "learning_rate": 4.856115107913669e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 405 }, { "completion_length": 210.1428680419922, "epoch": 0.3896353166986564, "grad_norm": 0.004625941626727581, "kl": 0.008006430231034756, "learning_rate": 4.86810551558753e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 406 }, { "completion_length": 206.7857208251953, "epoch": 0.390595009596929, "grad_norm": 0.00439419224858284, "kl": 0.0075349523685872555, "learning_rate": 4.880095923261391e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 407 }, { "completion_length": 236.50001525878906, "epoch": 0.3915547024952015, "grad_norm": 0.003405099967494607, "kl": 0.00613864092156291, "learning_rate": 4.892086330935251e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 408 }, { "completion_length": 123.0714340209961, "epoch": 0.3925143953934741, "grad_norm": 0.006837570108473301, "kl": 0.011095196940004826, "learning_rate": 4.904076738609112e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 409 }, { "completion_length": 165.21429443359375, "epoch": 0.3934740882917466, "grad_norm": 0.00417932216078043, "kl": 0.0062383315525949, "learning_rate": 4.916067146282974e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 410 }, { "completion_length": 196.57144165039062, "epoch": 0.3944337811900192, "grad_norm": 0.010975129902362823, "kl": 0.014880065806210041, "learning_rate": 4.928057553956834e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 411 }, { "completion_length": 141.71429443359375, "epoch": 0.39539347408829173, "grad_norm": 1.2122067213058472, "kl": 0.011444865725934505, "learning_rate": 4.940047961630695e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 412 }, { "completion_length": 126.78572082519531, "epoch": 0.3963531669865643, "grad_norm": 1.1896779537200928, "kl": 0.010350048542022705, "learning_rate": 4.952038369304556e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 413 }, { "completion_length": 143.71429443359375, "epoch": 0.39731285988483683, "grad_norm": 0.005375595297664404, "kl": 0.007943677715957165, "learning_rate": 4.964028776978417e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 414 }, { "completion_length": 132.1428680419922, "epoch": 0.3982725527831094, "grad_norm": 0.004363033454865217, "kl": 0.006671765353530645, "learning_rate": 4.976019184652278e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 415 }, { "completion_length": 237.85714721679688, "epoch": 0.39923224568138194, "grad_norm": 0.0019939409103244543, "kl": 0.003266266081482172, "learning_rate": 4.988009592326139e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 416 }, { "completion_length": 179.92857360839844, "epoch": 0.4001919385796545, "grad_norm": 0.004206922370940447, "kl": 0.006261697970330715, "learning_rate": 5e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 417 }, { "completion_length": 138.85714721679688, "epoch": 0.40115163147792704, "grad_norm": 0.0040842569433152676, "kl": 0.005977509543299675, "learning_rate": 4.999999123169588e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 418 }, { "completion_length": 196.21429443359375, "epoch": 0.4021113243761996, "grad_norm": 0.004143781494349241, "kl": 0.005422282498329878, "learning_rate": 4.999996492678965e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 419 }, { "completion_length": 209.6428680419922, "epoch": 0.40307101727447214, "grad_norm": 0.0029916507191956043, "kl": 0.004605182912200689, "learning_rate": 4.999992108529978e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 420 }, { "completion_length": 189.85714721679688, "epoch": 0.4040307101727447, "grad_norm": 0.002500498201698065, "kl": 0.003872017143294215, "learning_rate": 4.999985970725702e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 421 }, { "completion_length": 176.7857208251953, "epoch": 0.4049904030710173, "grad_norm": 1.3243424892425537, "kl": 0.004181661643087864, "learning_rate": 4.999978079270442e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 422 }, { "completion_length": 179.57144165039062, "epoch": 0.4059500959692898, "grad_norm": 0.003065873868763447, "kl": 0.004730458837002516, "learning_rate": 4.999968434169733e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 423 }, { "completion_length": 127.00000762939453, "epoch": 0.4069097888675624, "grad_norm": 0.006228747311979532, "kl": 0.008223658427596092, "learning_rate": 4.999957035430342e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 424 }, { "completion_length": 191.00001525878906, "epoch": 0.40786948176583493, "grad_norm": 0.0027769785374403, "kl": 0.004309374373406172, "learning_rate": 4.999943883060264e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 425 }, { "completion_length": 160.1428680419922, "epoch": 0.4088291746641075, "grad_norm": 0.002410454908385873, "kl": 0.003909902181476355, "learning_rate": 4.999928977068724e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 426 }, { "completion_length": 159.07144165039062, "epoch": 0.40978886756238003, "grad_norm": 0.0025523179210722446, "kl": 0.004073550924658775, "learning_rate": 4.999912317466181e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 427 }, { "completion_length": 185.1428680419922, "epoch": 0.4107485604606526, "grad_norm": 0.0027732313610613346, "kl": 0.004125926643610001, "learning_rate": 4.999893904264319e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 428 }, { "completion_length": 162.1428680419922, "epoch": 0.41170825335892514, "grad_norm": 0.00394137017428875, "kl": 0.004912009462714195, "learning_rate": 4.999873737476054e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 429 }, { "completion_length": 198.07144165039062, "epoch": 0.4126679462571977, "grad_norm": 0.0036536199040710926, "kl": 0.004902256187051535, "learning_rate": 4.999851817115532e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 430 }, { "completion_length": 204.71429443359375, "epoch": 0.41362763915547024, "grad_norm": 0.0035679168067872524, "kl": 0.004561292938888073, "learning_rate": 4.999828143198131e-07, "loss": 0.0, "reward": 0.1428571492433548, "reward_std": 0.0, "rewards/check_originality_func": 0.1428571492433548, "step": 431 }, { "completion_length": 112.42857360839844, "epoch": 0.4145873320537428, "grad_norm": 0.00395613070577383, "kl": 0.006832596845924854, "learning_rate": 4.999802715740456e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 432 }, { "completion_length": 151.57144165039062, "epoch": 0.41554702495201534, "grad_norm": 0.0044822171330451965, "kl": 0.005916564725339413, "learning_rate": 4.999775534760344e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 433 }, { "completion_length": 133.1428680419922, "epoch": 0.4165067178502879, "grad_norm": 0.002623331733047962, "kl": 0.004062375519424677, "learning_rate": 4.999746600276862e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 434 }, { "completion_length": 155.6428680419922, "epoch": 0.41746641074856045, "grad_norm": 0.0028917782474309206, "kl": 0.0042557804845273495, "learning_rate": 4.999715912310305e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 435 }, { "completion_length": 142.5, "epoch": 0.418426103646833, "grad_norm": 0.0031870147213339806, "kl": 0.005205508321523666, "learning_rate": 4.999683470882201e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 436 }, { "completion_length": 205.6428680419922, "epoch": 0.41938579654510555, "grad_norm": 0.003445605281740427, "kl": 0.004326117690652609, "learning_rate": 4.999649276015306e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 437 }, { "completion_length": 191.71429443359375, "epoch": 0.42034548944337813, "grad_norm": 0.0026372415013611317, "kl": 0.004240455571562052, "learning_rate": 4.999613327733607e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 438 }, { "completion_length": 211.50001525878906, "epoch": 0.42130518234165065, "grad_norm": 0.0031795876566320658, "kl": 0.004581843968480825, "learning_rate": 4.999575626062319e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 439 }, { "completion_length": 169.0, "epoch": 0.42226487523992323, "grad_norm": 0.0054425569251179695, "kl": 0.007124512456357479, "learning_rate": 4.999536171027889e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 440 }, { "completion_length": 157.6428680419922, "epoch": 0.42322456813819576, "grad_norm": 0.004720823373645544, "kl": 0.005255846306681633, "learning_rate": 4.999494962657994e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 441 }, { "completion_length": 117.14286041259766, "epoch": 0.42418426103646834, "grad_norm": 0.003895180532708764, "kl": 0.005768100265413523, "learning_rate": 4.999452000981541e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 442 }, { "completion_length": 169.07144165039062, "epoch": 0.42514395393474086, "grad_norm": 2.294982671737671, "kl": 0.004330285359174013, "learning_rate": 4.999407286028663e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 443 }, { "completion_length": 132.85714721679688, "epoch": 0.42610364683301344, "grad_norm": 0.006053650751709938, "kl": 0.003938985988497734, "learning_rate": 4.999360817830728e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 444 }, { "completion_length": 203.42857360839844, "epoch": 0.42706333973128596, "grad_norm": 0.002088029868900776, "kl": 0.003099104156717658, "learning_rate": 4.999312596420333e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 445 }, { "completion_length": 153.35714721679688, "epoch": 0.42802303262955854, "grad_norm": 0.0037501670885831118, "kl": 0.0051498012617230415, "learning_rate": 4.999262621831301e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 446 }, { "completion_length": 182.85714721679688, "epoch": 0.4289827255278311, "grad_norm": 0.002414596499875188, "kl": 0.0038143573328852654, "learning_rate": 4.999210894098689e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 447 }, { "completion_length": 154.21429443359375, "epoch": 0.42994241842610365, "grad_norm": 1.980672001838684, "kl": 0.007976163178682327, "learning_rate": 4.999157413258781e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 448 }, { "completion_length": 188.2857208251953, "epoch": 0.4309021113243762, "grad_norm": 1.0294198989868164, "kl": 0.005801485385745764, "learning_rate": 4.999102179349093e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 449 }, { "completion_length": 169.92857360839844, "epoch": 0.43186180422264875, "grad_norm": 1.147885799407959, "kl": 0.004824282601475716, "learning_rate": 4.999045192408369e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 450 }, { "completion_length": 157.6428680419922, "epoch": 0.43282149712092133, "grad_norm": 0.00281134364195168, "kl": 0.00488896993920207, "learning_rate": 4.998986452476584e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 451 }, { "completion_length": 163.6428680419922, "epoch": 0.43378119001919385, "grad_norm": 0.9781846404075623, "kl": 0.0057422081008553505, "learning_rate": 4.998925959594941e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 452 }, { "completion_length": 144.6428680419922, "epoch": 0.43474088291746643, "grad_norm": 1.7419283390045166, "kl": 0.008463704958558083, "learning_rate": 4.998863713805874e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 453 }, { "completion_length": 167.5, "epoch": 0.43570057581573896, "grad_norm": 0.005509881768375635, "kl": 0.008514398708939552, "learning_rate": 4.998799715153047e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 454 }, { "completion_length": 178.92857360839844, "epoch": 0.43666026871401153, "grad_norm": 0.007744582835584879, "kl": 0.009362463839352131, "learning_rate": 4.998733963681353e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 455 }, { "completion_length": 133.42857360839844, "epoch": 0.43761996161228406, "grad_norm": 0.0057564061135053635, "kl": 0.009840648621320724, "learning_rate": 4.998666459436912e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 456 }, { "completion_length": 161.35714721679688, "epoch": 0.43857965451055664, "grad_norm": 1.991047739982605, "kl": 0.00807172991335392, "learning_rate": 4.998597202467077e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 457 }, { "completion_length": 156.1428680419922, "epoch": 0.43953934740882916, "grad_norm": 0.004656978417187929, "kl": 0.008416320197284222, "learning_rate": 4.998526192820431e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 458 }, { "completion_length": 190.21429443359375, "epoch": 0.44049904030710174, "grad_norm": 0.003084923606365919, "kl": 0.0066238585859537125, "learning_rate": 4.998453430546781e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 459 }, { "completion_length": 161.2857208251953, "epoch": 0.44145873320537427, "grad_norm": 0.007532692980021238, "kl": 0.010352435521781445, "learning_rate": 4.998378915697171e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 460 }, { "completion_length": 170.85714721679688, "epoch": 0.44241842610364684, "grad_norm": 0.003317995462566614, "kl": 0.006282078102231026, "learning_rate": 4.998302648323867e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 461 }, { "completion_length": 148.35714721679688, "epoch": 0.44337811900191937, "grad_norm": 0.007438851520419121, "kl": 0.011668642982840538, "learning_rate": 4.998224628480372e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 462 }, { "completion_length": 173.21429443359375, "epoch": 0.44433781190019195, "grad_norm": 0.0033865768928080797, "kl": 0.007138850167393684, "learning_rate": 4.998144856221411e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 463 }, { "completion_length": 142.0, "epoch": 0.44529750479846447, "grad_norm": 0.006007790565490723, "kl": 0.011366385035216808, "learning_rate": 4.998063331602943e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 464 }, { "completion_length": 183.50001525878906, "epoch": 0.44625719769673705, "grad_norm": 0.002520012203603983, "kl": 0.006085669621825218, "learning_rate": 4.997980054682153e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 465 }, { "completion_length": 204.92857360839844, "epoch": 0.4472168905950096, "grad_norm": 0.003791185561567545, "kl": 0.008847094140946865, "learning_rate": 4.997895025517458e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 466 }, { "completion_length": 127.5714340209961, "epoch": 0.44817658349328215, "grad_norm": 0.007656343746930361, "kl": 0.01387804001569748, "learning_rate": 4.997808244168502e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 467 }, { "completion_length": 176.71429443359375, "epoch": 0.4491362763915547, "grad_norm": 0.0028603060636669397, "kl": 0.006476906593888998, "learning_rate": 4.99771971069616e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 468 }, { "completion_length": 140.21429443359375, "epoch": 0.45009596928982726, "grad_norm": 0.003974410705268383, "kl": 0.00797268282622099, "learning_rate": 4.997629425162535e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 469 }, { "completion_length": 159.2857208251953, "epoch": 0.4510556621880998, "grad_norm": 0.0032311200629919767, "kl": 0.00814227107912302, "learning_rate": 4.997537387630958e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 470 }, { "completion_length": 168.07144165039062, "epoch": 0.45201535508637236, "grad_norm": 0.0038708937354385853, "kl": 0.008634654805064201, "learning_rate": 4.997443598165991e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 471 }, { "completion_length": 204.35714721679688, "epoch": 0.45297504798464494, "grad_norm": 0.0036690442357212305, "kl": 0.009388349018990993, "learning_rate": 4.997348056833425e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 472 }, { "completion_length": 178.2857208251953, "epoch": 0.45393474088291746, "grad_norm": 0.005971782375127077, "kl": 0.011267641559243202, "learning_rate": 4.997250763700276e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 473 }, { "completion_length": 142.71429443359375, "epoch": 0.45489443378119004, "grad_norm": 0.007430283818393946, "kl": 0.013017536140978336, "learning_rate": 4.997151718834794e-07, "loss": 0.0, "reward": 0.2857142984867096, "reward_std": 0.0, "rewards/check_originality_func": 0.2857142984867096, "step": 474 }, { "completion_length": 147.5, "epoch": 0.45585412667946257, "grad_norm": 0.003904519136995077, "kl": 0.009765729308128357, "learning_rate": 4.997050922306455e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 475 }, { "completion_length": 178.92857360839844, "epoch": 0.45681381957773515, "grad_norm": 0.0052666435949504375, "kl": 0.010981598868966103, "learning_rate": 4.996948374185963e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 476 }, { "completion_length": 190.42857360839844, "epoch": 0.45777351247600767, "grad_norm": 0.0036542771849781275, "kl": 0.007928331382572651, "learning_rate": 4.996844074545253e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 477 }, { "completion_length": 181.50001525878906, "epoch": 0.45873320537428025, "grad_norm": 0.00499703036621213, "kl": 0.010227108374238014, "learning_rate": 4.996738023457488e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 478 }, { "completion_length": 172.7857208251953, "epoch": 0.4596928982725528, "grad_norm": 0.003226473229005933, "kl": 0.007377480622380972, "learning_rate": 4.996630220997057e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 479 }, { "completion_length": 156.6428680419922, "epoch": 0.46065259117082535, "grad_norm": 0.004314056597650051, "kl": 0.009901567362248898, "learning_rate": 4.996520667239582e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 480 }, { "completion_length": 173.21429443359375, "epoch": 0.4616122840690979, "grad_norm": 0.0035559532698243856, "kl": 0.0089016268029809, "learning_rate": 4.996409362261909e-07, "loss": 0.0, "reward": 0.1428571492433548, "reward_std": 0.0, "rewards/check_originality_func": 0.1428571492433548, "step": 481 }, { "completion_length": 166.85714721679688, "epoch": 0.46257197696737046, "grad_norm": 0.004174004774540663, "kl": 0.008707933127880096, "learning_rate": 4.996296306142116e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 482 }, { "completion_length": 173.1428680419922, "epoch": 0.463531669865643, "grad_norm": 0.003399583511054516, "kl": 0.008225942961871624, "learning_rate": 4.996181498959507e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 483 }, { "completion_length": 141.35714721679688, "epoch": 0.46449136276391556, "grad_norm": 0.004938632249832153, "kl": 0.012155759148299694, "learning_rate": 4.996064940794615e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 484 }, { "completion_length": 171.7857208251953, "epoch": 0.4654510556621881, "grad_norm": 0.004433685913681984, "kl": 0.009662138298153877, "learning_rate": 4.995946631729203e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 485 }, { "completion_length": 127.92857360839844, "epoch": 0.46641074856046066, "grad_norm": 0.003751114010810852, "kl": 0.008576039224863052, "learning_rate": 4.995826571846258e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 486 }, { "completion_length": 153.6428680419922, "epoch": 0.4673704414587332, "grad_norm": 0.007126395590603352, "kl": 0.014598347246646881, "learning_rate": 4.99570476123e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 487 }, { "completion_length": 188.6428680419922, "epoch": 0.46833013435700577, "grad_norm": 0.0025787821505218744, "kl": 0.006688262335956097, "learning_rate": 4.995581199965873e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 488 }, { "completion_length": 141.71429443359375, "epoch": 0.4692898272552783, "grad_norm": 1.1156851053237915, "kl": 0.00935223326086998, "learning_rate": 4.995455888140551e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 489 }, { "completion_length": 160.35714721679688, "epoch": 0.47024952015355087, "grad_norm": 0.004353741649538279, "kl": 0.010059937834739685, "learning_rate": 4.995328825841939e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 490 }, { "completion_length": 192.92857360839844, "epoch": 0.4712092130518234, "grad_norm": 0.004319176543504, "kl": 0.010347171686589718, "learning_rate": 4.995200013159163e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 491 }, { "completion_length": 151.7857208251953, "epoch": 0.472168905950096, "grad_norm": 1.373595118522644, "kl": 0.013690197840332985, "learning_rate": 4.99506945018258e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 492 }, { "completion_length": 171.00001525878906, "epoch": 0.4731285988483685, "grad_norm": 0.005233554635196924, "kl": 0.010802817530930042, "learning_rate": 4.994937137003779e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 493 }, { "completion_length": 179.07144165039062, "epoch": 0.4740882917466411, "grad_norm": 1.0689971446990967, "kl": 0.009831391274929047, "learning_rate": 4.994803073715569e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 494 }, { "completion_length": 148.2857208251953, "epoch": 0.4750479846449136, "grad_norm": 1.2019357681274414, "kl": 0.011016573756933212, "learning_rate": 4.994667260411994e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 495 }, { "completion_length": 170.7857208251953, "epoch": 0.4760076775431862, "grad_norm": 0.00497909402474761, "kl": 0.010593735612928867, "learning_rate": 4.99452969718832e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 496 }, { "completion_length": 172.07144165039062, "epoch": 0.47696737044145876, "grad_norm": 1.3019663095474243, "kl": 0.009357011877000332, "learning_rate": 4.994390384141043e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 497 }, { "completion_length": 166.6428680419922, "epoch": 0.4779270633397313, "grad_norm": 0.00401948532089591, "kl": 0.008055298589169979, "learning_rate": 4.994249321367887e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 498 }, { "completion_length": 150.57144165039062, "epoch": 0.47888675623800386, "grad_norm": 0.003883990226313472, "kl": 0.008081617765128613, "learning_rate": 4.994106508967803e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 499 }, { "completion_length": 131.92857360839844, "epoch": 0.4798464491362764, "grad_norm": 0.004588898736983538, "kl": 0.009443467482924461, "learning_rate": 4.993961947040967e-07, "loss": 0.0, "reward": 0.1428571492433548, "reward_std": 0.0, "rewards/check_originality_func": 0.1428571492433548, "step": 500 }, { "completion_length": 167.57144165039062, "epoch": 0.48080614203454897, "grad_norm": 0.005044565536081791, "kl": 0.009146878495812416, "learning_rate": 4.993815635688784e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 501 }, { "completion_length": 150.92857360839844, "epoch": 0.4817658349328215, "grad_norm": 0.0027599316090345383, "kl": 0.00583203649148345, "learning_rate": 4.993667575013888e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 502 }, { "completion_length": 143.7857208251953, "epoch": 0.48272552783109407, "grad_norm": 0.005656755995005369, "kl": 0.008897591382265091, "learning_rate": 4.993517765120136e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 503 }, { "completion_length": 142.85714721679688, "epoch": 0.4836852207293666, "grad_norm": 1.3464373350143433, "kl": 0.008662533946335316, "learning_rate": 4.993366206112617e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 504 }, { "completion_length": 192.6428680419922, "epoch": 0.4846449136276392, "grad_norm": 0.002618917031213641, "kl": 0.006546409334987402, "learning_rate": 4.993212898097643e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 505 }, { "completion_length": 161.0, "epoch": 0.4856046065259117, "grad_norm": 0.004488400183618069, "kl": 0.008936384692788124, "learning_rate": 4.993057841182754e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 506 }, { "completion_length": 179.1428680419922, "epoch": 0.4865642994241843, "grad_norm": 0.0035327388904988766, "kl": 0.0076110102236270905, "learning_rate": 4.992901035476715e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 507 }, { "completion_length": 155.07144165039062, "epoch": 0.4875239923224568, "grad_norm": 0.003558242227882147, "kl": 0.009451834484934807, "learning_rate": 4.992742481089524e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 508 }, { "completion_length": 160.35714721679688, "epoch": 0.4884836852207294, "grad_norm": 0.005339170340448618, "kl": 0.012043198570609093, "learning_rate": 4.992582178132397e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 509 }, { "completion_length": 177.7857208251953, "epoch": 0.4894433781190019, "grad_norm": 0.005805180408060551, "kl": 0.010114185512065887, "learning_rate": 4.992420126717784e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 510 }, { "completion_length": 125.85714721679688, "epoch": 0.4904030710172745, "grad_norm": 0.003938076086342335, "kl": 0.009393743239343166, "learning_rate": 4.992256326959354e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 511 }, { "completion_length": 169.1428680419922, "epoch": 0.491362763915547, "grad_norm": 0.0033285394310951233, "kl": 0.008462810888886452, "learning_rate": 4.992090778972012e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 512 }, { "completion_length": 152.5, "epoch": 0.4923224568138196, "grad_norm": 0.003722703317180276, "kl": 0.01004018820822239, "learning_rate": 4.991923482871881e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 513 }, { "completion_length": 119.85714721679688, "epoch": 0.4932821497120921, "grad_norm": 1.4018694162368774, "kl": 0.010642263107001781, "learning_rate": 4.991754438776314e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 514 }, { "completion_length": 200.85714721679688, "epoch": 0.4942418426103647, "grad_norm": 1.0270565748214722, "kl": 0.007728993892669678, "learning_rate": 4.991583646803888e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 515 }, { "completion_length": 146.35714721679688, "epoch": 0.4952015355086372, "grad_norm": 0.0040282392874360085, "kl": 0.009974617511034012, "learning_rate": 4.991411107074409e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 516 }, { "completion_length": 171.1428680419922, "epoch": 0.4961612284069098, "grad_norm": 0.0039641414768993855, "kl": 0.010769071988761425, "learning_rate": 4.991236819708908e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 517 }, { "completion_length": 162.42857360839844, "epoch": 0.4971209213051823, "grad_norm": 0.006564635783433914, "kl": 0.014494695700705051, "learning_rate": 4.99106078482964e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 518 }, { "completion_length": 202.42857360839844, "epoch": 0.4980806142034549, "grad_norm": 0.003831567708402872, "kl": 0.011370914988219738, "learning_rate": 4.990883002560088e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 519 }, { "completion_length": 167.07144165039062, "epoch": 0.4990403071017274, "grad_norm": 1.3876044750213623, "kl": 0.01233643852174282, "learning_rate": 4.990703473024958e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 520 }, { "completion_length": 144.92857360839844, "epoch": 0.5, "grad_norm": 0.004739078693091869, "kl": 0.01495194248855114, "learning_rate": 4.990522196350187e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 521 }, { "completion_length": 133.42857360839844, "epoch": 0.5009596928982726, "grad_norm": 0.004989520646631718, "kl": 0.013663483783602715, "learning_rate": 4.990339172662932e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 522 }, { "completion_length": 162.85714721679688, "epoch": 0.5019193857965452, "grad_norm": 1.2278469800949097, "kl": 0.02137809619307518, "learning_rate": 4.990154402091577e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 523 }, { "completion_length": 127.42857360839844, "epoch": 0.5028790786948176, "grad_norm": 0.004123081918805838, "kl": 0.012065181508660316, "learning_rate": 4.989967884765734e-07, "loss": 0.0, "reward": 0.1428571492433548, "reward_std": 0.0, "rewards/check_originality_func": 0.1428571492433548, "step": 524 }, { "completion_length": 178.7857208251953, "epoch": 0.5038387715930902, "grad_norm": 0.004663968458771706, "kl": 0.011629972606897354, "learning_rate": 4.989779620816236e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 525 }, { "completion_length": 156.5, "epoch": 0.5047984644913628, "grad_norm": 1.195729374885559, "kl": 0.011659017764031887, "learning_rate": 4.989589610375145e-07, "loss": 0.0, "reward": 0.1428571492433548, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.1428571492433548, "step": 526 }, { "completion_length": 137.57144165039062, "epoch": 0.5057581573896354, "grad_norm": 0.004279721062630415, "kl": 0.011480689980089664, "learning_rate": 4.989397853575745e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 527 }, { "completion_length": 124.64286041259766, "epoch": 0.5067178502879078, "grad_norm": 0.0038034748286008835, "kl": 0.01048057060688734, "learning_rate": 4.989204350552548e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 528 }, { "completion_length": 125.85714721679688, "epoch": 0.5076775431861804, "grad_norm": 0.835983395576477, "kl": 0.014084087684750557, "learning_rate": 4.98900910144129e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 529 }, { "completion_length": 166.35714721679688, "epoch": 0.508637236084453, "grad_norm": 0.004846977069973946, "kl": 0.012231942266225815, "learning_rate": 4.98881210637893e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 530 }, { "completion_length": 173.1428680419922, "epoch": 0.5095969289827256, "grad_norm": 0.0035197811666876078, "kl": 0.010071267373859882, "learning_rate": 4.988613365503653e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 531 }, { "completion_length": 126.78572082519531, "epoch": 0.510556621880998, "grad_norm": 0.004718031268566847, "kl": 0.014152931980788708, "learning_rate": 4.98841287895487e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 532 }, { "completion_length": 166.92857360839844, "epoch": 0.5115163147792706, "grad_norm": 0.0033457798417657614, "kl": 0.008888877928256989, "learning_rate": 4.988210646873212e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 533 }, { "completion_length": 167.0, "epoch": 0.5124760076775432, "grad_norm": 0.004066879861056805, "kl": 0.01079594250768423, "learning_rate": 4.988006669400542e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 534 }, { "completion_length": 130.2857208251953, "epoch": 0.5134357005758158, "grad_norm": 0.0049894945695996284, "kl": 0.009917265735566616, "learning_rate": 4.98780094667994e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 535 }, { "completion_length": 193.6428680419922, "epoch": 0.5143953934740882, "grad_norm": 2.1574418544769287, "kl": 0.008312646299600601, "learning_rate": 4.987593478855714e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 536 }, { "completion_length": 146.21429443359375, "epoch": 0.5153550863723608, "grad_norm": 0.9228793382644653, "kl": 0.010036442428827286, "learning_rate": 4.987384266073395e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 537 }, { "completion_length": 138.42857360839844, "epoch": 0.5163147792706334, "grad_norm": 0.004111227113753557, "kl": 0.009612767957150936, "learning_rate": 4.987173308479737e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 538 }, { "completion_length": 178.85714721679688, "epoch": 0.517274472168906, "grad_norm": 0.0031744414009153843, "kl": 0.008631208911538124, "learning_rate": 4.986960606222722e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 539 }, { "completion_length": 120.71429443359375, "epoch": 0.5182341650671785, "grad_norm": 0.005836893804371357, "kl": 0.013994049280881882, "learning_rate": 4.986746159451553e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 540 }, { "completion_length": 168.7857208251953, "epoch": 0.519193857965451, "grad_norm": 0.0032577821984887123, "kl": 0.009109563194215298, "learning_rate": 4.986529968316653e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 541 }, { "completion_length": 109.5714340209961, "epoch": 0.5201535508637236, "grad_norm": 0.0041373553685843945, "kl": 0.010551022365689278, "learning_rate": 4.986312032969676e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 542 }, { "completion_length": 137.5, "epoch": 0.5211132437619962, "grad_norm": 0.004280523397028446, "kl": 0.011482583358883858, "learning_rate": 4.986092353563494e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 543 }, { "completion_length": 164.42857360839844, "epoch": 0.5220729366602687, "grad_norm": 0.003284461796283722, "kl": 0.008400504477322102, "learning_rate": 4.985870930252206e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 544 }, { "completion_length": 179.50001525878906, "epoch": 0.5230326295585412, "grad_norm": 0.0038915546610951424, "kl": 0.01003018394112587, "learning_rate": 4.985647763191131e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 545 }, { "completion_length": 136.0, "epoch": 0.5239923224568138, "grad_norm": 0.0038543075788766146, "kl": 0.010363508947193623, "learning_rate": 4.985422852536813e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 546 }, { "completion_length": 108.50000762939453, "epoch": 0.5249520153550864, "grad_norm": 1.3589282035827637, "kl": 0.014343147166073322, "learning_rate": 4.98519619844702e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 547 }, { "completion_length": 169.7857208251953, "epoch": 0.525911708253359, "grad_norm": 1.1147962808609009, "kl": 0.01053935568779707, "learning_rate": 4.984967801080739e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 548 }, { "completion_length": 166.57144165039062, "epoch": 0.5268714011516314, "grad_norm": 0.00406623212620616, "kl": 0.011052730493247509, "learning_rate": 4.984737660598186e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 549 }, { "completion_length": 136.71429443359375, "epoch": 0.527831094049904, "grad_norm": 0.005085871089249849, "kl": 0.012397916056215763, "learning_rate": 4.984505777160795e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 550 }, { "completion_length": 139.92857360839844, "epoch": 0.5287907869481766, "grad_norm": 0.0036912537179887295, "kl": 0.00944766029715538, "learning_rate": 4.984272150931223e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 551 }, { "completion_length": 168.6428680419922, "epoch": 0.5297504798464492, "grad_norm": 0.0034871737007051706, "kl": 0.01165189128369093, "learning_rate": 4.984036782073351e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 552 }, { "completion_length": 203.2857208251953, "epoch": 0.5307101727447217, "grad_norm": 0.0019996336195617914, "kl": 0.00710928812623024, "learning_rate": 4.983799670752282e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 553 }, { "completion_length": 136.92857360839844, "epoch": 0.5316698656429942, "grad_norm": 0.003274878952652216, "kl": 0.010461662895977497, "learning_rate": 4.983560817134341e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 554 }, { "completion_length": 175.21429443359375, "epoch": 0.5326295585412668, "grad_norm": 0.0027719277422875166, "kl": 0.008803843520581722, "learning_rate": 4.983320221387076e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 555 }, { "completion_length": 169.5, "epoch": 0.5335892514395394, "grad_norm": 0.0031191131565719843, "kl": 0.010882515460252762, "learning_rate": 4.983077883679256e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 556 }, { "completion_length": 200.71429443359375, "epoch": 0.5345489443378119, "grad_norm": 0.002566312672570348, "kl": 0.00874595157802105, "learning_rate": 4.982833804180871e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 557 }, { "completion_length": 188.7857208251953, "epoch": 0.5355086372360844, "grad_norm": 0.001955367624759674, "kl": 0.0075966059230268, "learning_rate": 4.982587983063137e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 558 }, { "completion_length": 149.07144165039062, "epoch": 0.536468330134357, "grad_norm": 0.003514951793476939, "kl": 0.011397313326597214, "learning_rate": 4.982340420498485e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 559 }, { "completion_length": 179.21429443359375, "epoch": 0.5374280230326296, "grad_norm": 0.0025192361790686846, "kl": 0.008648066781461239, "learning_rate": 4.982091116660574e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 560 }, { "completion_length": 185.07144165039062, "epoch": 0.5383877159309021, "grad_norm": 1.5459535121917725, "kl": 0.007795014884322882, "learning_rate": 4.981840071724281e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 561 }, { "completion_length": 193.1428680419922, "epoch": 0.5393474088291746, "grad_norm": 0.0023491517640650272, "kl": 0.008605689741671085, "learning_rate": 4.981587285865705e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 562 }, { "completion_length": 191.1428680419922, "epoch": 0.5403071017274472, "grad_norm": 0.002520696260035038, "kl": 0.008323675021529198, "learning_rate": 4.981332759262166e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 563 }, { "completion_length": 177.92857360839844, "epoch": 0.5412667946257198, "grad_norm": 0.003386122640222311, "kl": 0.009968424215912819, "learning_rate": 4.981076492092206e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 564 }, { "completion_length": 200.85714721679688, "epoch": 0.5422264875239923, "grad_norm": 0.002989916130900383, "kl": 0.00947828870266676, "learning_rate": 4.980818484535586e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 565 }, { "completion_length": 137.0, "epoch": 0.5431861804222649, "grad_norm": 0.004535728599876165, "kl": 0.013281760737299919, "learning_rate": 4.98055873677329e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 566 }, { "completion_length": 137.71429443359375, "epoch": 0.5441458733205374, "grad_norm": 1.6320366859436035, "kl": 0.014489902183413506, "learning_rate": 4.980297248987523e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 567 }, { "completion_length": 124.50000762939453, "epoch": 0.54510556621881, "grad_norm": 0.004490076098591089, "kl": 0.01215648464858532, "learning_rate": 4.980034021361708e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 568 }, { "completion_length": 177.57144165039062, "epoch": 0.5460652591170825, "grad_norm": 2.01159930229187, "kl": 0.015557147562503815, "learning_rate": 4.979769054080489e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 569 }, { "completion_length": 202.85714721679688, "epoch": 0.5470249520153551, "grad_norm": 0.004275185521692038, "kl": 0.011511662974953651, "learning_rate": 4.979502347329732e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 570 }, { "completion_length": 151.42857360839844, "epoch": 0.5479846449136276, "grad_norm": 0.00422873767092824, "kl": 0.012981298379600048, "learning_rate": 4.979233901296522e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 571 }, { "completion_length": 162.71429443359375, "epoch": 0.5489443378119002, "grad_norm": 0.0028415622655302286, "kl": 0.009324795566499233, "learning_rate": 4.978963716169165e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 572 }, { "completion_length": 172.7857208251953, "epoch": 0.5499040307101728, "grad_norm": 0.004657541401684284, "kl": 0.013272697106003761, "learning_rate": 4.978691792137186e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 573 }, { "completion_length": 175.57144165039062, "epoch": 0.5508637236084453, "grad_norm": 0.004683882463723421, "kl": 0.01397662702947855, "learning_rate": 4.97841812939133e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 574 }, { "completion_length": 215.85714721679688, "epoch": 0.5518234165067178, "grad_norm": 0.004057110752910376, "kl": 0.010532695800065994, "learning_rate": 4.978142728123562e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 575 }, { "completion_length": 152.57144165039062, "epoch": 0.5527831094049904, "grad_norm": 0.004842223133891821, "kl": 0.013419078662991524, "learning_rate": 4.977865588527064e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 576 }, { "completion_length": 152.42857360839844, "epoch": 0.553742802303263, "grad_norm": 0.003931504208594561, "kl": 0.013029252178966999, "learning_rate": 4.977586710796242e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 577 }, { "completion_length": 191.42857360839844, "epoch": 0.5547024952015355, "grad_norm": 0.004126745741814375, "kl": 0.01185076404362917, "learning_rate": 4.977306095126719e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 578 }, { "completion_length": 193.50001525878906, "epoch": 0.555662188099808, "grad_norm": 0.9970179200172424, "kl": 0.014389809221029282, "learning_rate": 4.977023741715334e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 579 }, { "completion_length": 168.07144165039062, "epoch": 0.5566218809980806, "grad_norm": 0.0051545859314501286, "kl": 0.016186736524105072, "learning_rate": 4.976739650760151e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 580 }, { "completion_length": 182.42857360839844, "epoch": 0.5575815738963532, "grad_norm": 0.008610285818576813, "kl": 0.019792476668953896, "learning_rate": 4.976453822460448e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 581 }, { "completion_length": 169.6428680419922, "epoch": 0.5585412667946257, "grad_norm": 0.004855492617934942, "kl": 0.015371189452707767, "learning_rate": 4.976166257016723e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 582 }, { "completion_length": 173.00001525878906, "epoch": 0.5595009596928983, "grad_norm": 0.007499069906771183, "kl": 0.020265091210603714, "learning_rate": 4.975876954630695e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 583 }, { "completion_length": 181.35714721679688, "epoch": 0.5604606525911708, "grad_norm": 0.015073015354573727, "kl": 0.025984114035964012, "learning_rate": 4.975585915505297e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 584 }, { "completion_length": 189.6428680419922, "epoch": 0.5614203454894434, "grad_norm": 0.8528309464454651, "kl": 0.011358657851815224, "learning_rate": 4.975293139844684e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 585 }, { "completion_length": 160.0, "epoch": 0.5623800383877159, "grad_norm": 0.008800397627055645, "kl": 0.022450508549809456, "learning_rate": 4.974998627854227e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 586 }, { "completion_length": 187.6428680419922, "epoch": 0.5633397312859885, "grad_norm": 0.0053908126428723335, "kl": 0.014568604528903961, "learning_rate": 4.974702379740515e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 587 }, { "completion_length": 192.71429443359375, "epoch": 0.564299424184261, "grad_norm": 0.007133249193429947, "kl": 0.019172661006450653, "learning_rate": 4.974404395711357e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 588 }, { "completion_length": 166.85714721679688, "epoch": 0.5652591170825336, "grad_norm": 0.010108179412782192, "kl": 0.02555897645652294, "learning_rate": 4.974104675975778e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 589 }, { "completion_length": 167.42857360839844, "epoch": 0.5662188099808061, "grad_norm": 0.007818725891411304, "kl": 0.022282354533672333, "learning_rate": 4.97380322074402e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 590 }, { "completion_length": 154.21429443359375, "epoch": 0.5671785028790787, "grad_norm": 0.013243420980870724, "kl": 0.028241179883480072, "learning_rate": 4.973500030227543e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 591 }, { "completion_length": 153.5, "epoch": 0.5681381957773513, "grad_norm": 0.007070672232657671, "kl": 0.022416528314352036, "learning_rate": 4.973195104639024e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 592 }, { "completion_length": 204.1428680419922, "epoch": 0.5690978886756238, "grad_norm": 0.011023720726370811, "kl": 0.021948201581835747, "learning_rate": 4.97288844419236e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 593 }, { "completion_length": 159.85714721679688, "epoch": 0.5700575815738963, "grad_norm": 1.2542513608932495, "kl": 0.01954692229628563, "learning_rate": 4.972580049102659e-07, "loss": 0.0, "reward": 0.1428571492433548, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.1428571492433548, "step": 594 }, { "completion_length": 175.85714721679688, "epoch": 0.5710172744721689, "grad_norm": 0.007302539888769388, "kl": 0.022564174607396126, "learning_rate": 4.972269919586249e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 595 }, { "completion_length": 103.64286041259766, "epoch": 0.5719769673704415, "grad_norm": 2.118541717529297, "kl": 0.03324838727712631, "learning_rate": 4.971958055860678e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 596 }, { "completion_length": 167.71429443359375, "epoch": 0.572936660268714, "grad_norm": 1.0096794366836548, "kl": 0.020319242030382156, "learning_rate": 4.971644458144706e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 597 }, { "completion_length": 179.92857360839844, "epoch": 0.5738963531669866, "grad_norm": 0.007809388916939497, "kl": 0.02398364432156086, "learning_rate": 4.971329126658309e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 598 }, { "completion_length": 110.21429443359375, "epoch": 0.5748560460652591, "grad_norm": 0.009256496094167233, "kl": 0.03536533564329147, "learning_rate": 4.971012061622683e-07, "loss": 0.0, "reward": 0.1428571492433548, "reward_std": 0.0, "rewards/check_originality_func": 0.1428571492433548, "step": 599 }, { "completion_length": 206.00001525878906, "epoch": 0.5758157389635317, "grad_norm": 0.006854698993265629, "kl": 0.01973828300833702, "learning_rate": 4.970693263260237e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 600 }, { "completion_length": 150.7857208251953, "epoch": 0.5767754318618042, "grad_norm": 0.013709831982851028, "kl": 0.03374452516436577, "learning_rate": 4.970372731794597e-07, "loss": 0.0, "reward": 0.1428571492433548, "reward_std": 0.0, "rewards/check_originality_func": 0.1428571492433548, "step": 601 }, { "completion_length": 179.7857208251953, "epoch": 0.5777351247600768, "grad_norm": 0.008148163557052612, "kl": 0.027171362191438675, "learning_rate": 4.970050467450603e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 602 }, { "completion_length": 170.85714721679688, "epoch": 0.5786948176583493, "grad_norm": 0.008823094889521599, "kl": 0.02840021811425686, "learning_rate": 4.969726470454313e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 603 }, { "completion_length": 162.92857360839844, "epoch": 0.5796545105566219, "grad_norm": 1.33676016330719, "kl": 0.03288481384515762, "learning_rate": 4.969400741032999e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 604 }, { "completion_length": 177.07144165039062, "epoch": 0.5806142034548945, "grad_norm": 0.007574094925075769, "kl": 0.024033285677433014, "learning_rate": 4.969073279415149e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 605 }, { "completion_length": 170.42857360839844, "epoch": 0.581573896353167, "grad_norm": 0.009103197604417801, "kl": 0.02871755324304104, "learning_rate": 4.968744085830465e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 606 }, { "completion_length": 141.71429443359375, "epoch": 0.5825335892514395, "grad_norm": 0.01120041124522686, "kl": 0.0335501991212368, "learning_rate": 4.968413160509865e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 607 }, { "completion_length": 134.85714721679688, "epoch": 0.5834932821497121, "grad_norm": 1.6394226551055908, "kl": 0.02879190444946289, "learning_rate": 4.968080503685482e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 608 }, { "completion_length": 196.71429443359375, "epoch": 0.5844529750479847, "grad_norm": 1.727159023284912, "kl": 0.02726820856332779, "learning_rate": 4.967746115590662e-07, "loss": 0.0, "reward": 0.2142857313156128, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.2142857313156128, "step": 609 }, { "completion_length": 179.71429443359375, "epoch": 0.5854126679462572, "grad_norm": 0.00735055934637785, "kl": 0.025701362639665604, "learning_rate": 4.967409996459966e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 610 }, { "completion_length": 167.5, "epoch": 0.5863723608445297, "grad_norm": 0.008853258565068245, "kl": 0.02850172482430935, "learning_rate": 4.96707214652917e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 611 }, { "completion_length": 198.1428680419922, "epoch": 0.5873320537428023, "grad_norm": 0.7082753777503967, "kl": 0.03089352697134018, "learning_rate": 4.966732566035265e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 612 }, { "completion_length": 174.1428680419922, "epoch": 0.5882917466410749, "grad_norm": 1.5916870832443237, "kl": 0.03299470245838165, "learning_rate": 4.966391255216451e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 613 }, { "completion_length": 158.85714721679688, "epoch": 0.5892514395393474, "grad_norm": 0.038652896881103516, "kl": 0.06410452723503113, "learning_rate": 4.96604821431215e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 614 }, { "completion_length": 155.7857208251953, "epoch": 0.5902111324376199, "grad_norm": 0.008962053805589676, "kl": 0.034887123852968216, "learning_rate": 4.965703443562989e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 615 }, { "completion_length": 125.64286041259766, "epoch": 0.5911708253358925, "grad_norm": 0.016899414360523224, "kl": 0.050006233155727386, "learning_rate": 4.965356943210815e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 616 }, { "completion_length": 183.50001525878906, "epoch": 0.5921305182341651, "grad_norm": 0.008435691706836224, "kl": 0.02952113002538681, "learning_rate": 4.965008713498686e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 617 }, { "completion_length": 177.57144165039062, "epoch": 0.5930902111324377, "grad_norm": 0.9799455404281616, "kl": 0.03829926997423172, "learning_rate": 4.964658754670868e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 618 }, { "completion_length": 165.07144165039062, "epoch": 0.5940499040307101, "grad_norm": 1.2217155694961548, "kl": 0.04146995767951012, "learning_rate": 4.964307066972851e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 619 }, { "completion_length": 175.7857208251953, "epoch": 0.5950095969289827, "grad_norm": 0.01453301403671503, "kl": 0.04094288870692253, "learning_rate": 4.963953650651326e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 620 }, { "completion_length": 168.92857360839844, "epoch": 0.5959692898272553, "grad_norm": 0.024150121957063675, "kl": 0.05796276777982712, "learning_rate": 4.963598505954207e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 621 }, { "completion_length": 176.42857360839844, "epoch": 0.5969289827255279, "grad_norm": 0.015133229084312916, "kl": 0.04665781930088997, "learning_rate": 4.963241633130612e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 622 }, { "completion_length": 159.85714721679688, "epoch": 0.5978886756238004, "grad_norm": 1.9957966804504395, "kl": 0.045066554099321365, "learning_rate": 4.962883032430874e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 623 }, { "completion_length": 198.1428680419922, "epoch": 0.5988483685220729, "grad_norm": 0.9289172887802124, "kl": 0.048174817115068436, "learning_rate": 4.962522704106541e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 624 }, { "completion_length": 170.42857360839844, "epoch": 0.5998080614203455, "grad_norm": 0.012114754877984524, "kl": 0.040465764701366425, "learning_rate": 4.962160648410368e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 625 }, { "completion_length": 191.35714721679688, "epoch": 0.6007677543186181, "grad_norm": 0.02383405901491642, "kl": 0.06151941418647766, "learning_rate": 4.961796865596327e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 626 }, { "completion_length": 164.1428680419922, "epoch": 0.6017274472168906, "grad_norm": 0.007235351018607616, "kl": 0.031972549855709076, "learning_rate": 4.961431355919597e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 627 }, { "completion_length": 184.50001525878906, "epoch": 0.6026871401151631, "grad_norm": 0.010864601470530033, "kl": 0.04833579435944557, "learning_rate": 4.961064119636569e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 628 }, { "completion_length": 158.92857360839844, "epoch": 0.6036468330134357, "grad_norm": 0.012480790726840496, "kl": 0.05491413548588753, "learning_rate": 4.960695157004849e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 629 }, { "completion_length": 126.85714721679688, "epoch": 0.6046065259117083, "grad_norm": 0.023834170773625374, "kl": 0.06052450090646744, "learning_rate": 4.960324468283248e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 630 }, { "completion_length": 177.71429443359375, "epoch": 0.6055662188099808, "grad_norm": 1.2194591760635376, "kl": 0.04470507800579071, "learning_rate": 4.959952053731793e-07, "loss": 0.0, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 631 }, { "completion_length": 157.07144165039062, "epoch": 0.6065259117082533, "grad_norm": 0.013994389213621616, "kl": 0.04751306027173996, "learning_rate": 4.959577913611718e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 632 }, { "completion_length": 151.07144165039062, "epoch": 0.6074856046065259, "grad_norm": 0.8711488246917725, "kl": 0.06959947943687439, "learning_rate": 4.95920204818547e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 633 }, { "completion_length": 161.1428680419922, "epoch": 0.6084452975047985, "grad_norm": 1.4518705606460571, "kl": 0.08069609105587006, "learning_rate": 4.958824457716706e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 634 }, { "completion_length": 159.5, "epoch": 0.6094049904030711, "grad_norm": 1.7045319080352783, "kl": 0.09534915536642075, "learning_rate": 4.958445142470291e-07, "loss": 0.0001, "reward": 0.1428571492433548, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.1428571492433548, "step": 635 }, { "completion_length": 197.00001525878906, "epoch": 0.6103646833013435, "grad_norm": 0.009810774587094784, "kl": 0.04284657910466194, "learning_rate": 4.9580641027123e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 636 }, { "completion_length": 157.42857360839844, "epoch": 0.6113243761996161, "grad_norm": 0.030873248353600502, "kl": 0.08960860222578049, "learning_rate": 4.957681338710022e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 637 }, { "completion_length": 185.21429443359375, "epoch": 0.6122840690978887, "grad_norm": 1.3540250062942505, "kl": 0.06389807909727097, "learning_rate": 4.95729685073195e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 638 }, { "completion_length": 165.35714721679688, "epoch": 0.6132437619961613, "grad_norm": 1.5390338897705078, "kl": 0.07061870396137238, "learning_rate": 4.956910639047788e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 639 }, { "completion_length": 159.71429443359375, "epoch": 0.6142034548944337, "grad_norm": 0.015698017552495003, "kl": 0.06505683064460754, "learning_rate": 4.956522703928451e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 640 }, { "completion_length": 174.92857360839844, "epoch": 0.6151631477927063, "grad_norm": 0.014989185146987438, "kl": 0.052321914583444595, "learning_rate": 4.956133045646063e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 641 }, { "completion_length": 136.85714721679688, "epoch": 0.6161228406909789, "grad_norm": 1.0908697843551636, "kl": 0.07090864330530167, "learning_rate": 4.955741664473952e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 642 }, { "completion_length": 170.1428680419922, "epoch": 0.6170825335892515, "grad_norm": 1.3772156238555908, "kl": 0.06756855547428131, "learning_rate": 4.955348560686661e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 643 }, { "completion_length": 223.2857208251953, "epoch": 0.6180422264875239, "grad_norm": 1.0914409160614014, "kl": 0.06972721964120865, "learning_rate": 4.954953734559936e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 644 }, { "completion_length": 118.21429443359375, "epoch": 0.6190019193857965, "grad_norm": 0.033186156302690506, "kl": 0.09548316895961761, "learning_rate": 4.954557186370735e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 645 }, { "completion_length": 198.71429443359375, "epoch": 0.6199616122840691, "grad_norm": 0.02661043405532837, "kl": 0.07880501449108124, "learning_rate": 4.954158916397223e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 646 }, { "completion_length": 105.71428680419922, "epoch": 0.6209213051823417, "grad_norm": 0.02718203142285347, "kl": 0.08113043755292892, "learning_rate": 4.95375892491877e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 647 }, { "completion_length": 196.71429443359375, "epoch": 0.6218809980806143, "grad_norm": 1.886481761932373, "kl": 0.08884266018867493, "learning_rate": 4.953357212215956e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 648 }, { "completion_length": 126.71429443359375, "epoch": 0.6228406909788867, "grad_norm": 1.5212676525115967, "kl": 0.07624548673629761, "learning_rate": 4.95295377857057e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 649 }, { "completion_length": 219.1428680419922, "epoch": 0.6238003838771593, "grad_norm": 1.8286648988723755, "kl": 0.05457339808344841, "learning_rate": 4.952548624265606e-07, "loss": 0.0001, "reward": 0.2857142984867096, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.2857142984867096, "step": 650 }, { "completion_length": 132.6428680419922, "epoch": 0.6247600767754319, "grad_norm": 0.024437060579657555, "kl": 0.08109697699546814, "learning_rate": 4.952141749585263e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 651 }, { "completion_length": 187.6428680419922, "epoch": 0.6257197696737045, "grad_norm": 0.0246318057179451, "kl": 0.09496822208166122, "learning_rate": 4.951733154814952e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 652 }, { "completion_length": 203.00001525878906, "epoch": 0.6266794625719769, "grad_norm": 0.015666376799345016, "kl": 0.07635675370693207, "learning_rate": 4.951322840241284e-07, "loss": 0.0001, "reward": 0.1428571492433548, "reward_std": 0.0, "rewards/check_originality_func": 0.1428571492433548, "step": 653 }, { "completion_length": 177.35714721679688, "epoch": 0.6276391554702495, "grad_norm": 0.017766976729035378, "kl": 0.08114824444055557, "learning_rate": 4.950910806152085e-07, "loss": 0.0001, "reward": 0.2857142984867096, "reward_std": 0.0, "rewards/check_originality_func": 0.2857142984867096, "step": 654 }, { "completion_length": 178.35714721679688, "epoch": 0.6285988483685221, "grad_norm": 0.022903334349393845, "kl": 0.08720146119594574, "learning_rate": 4.950497052836378e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 655 }, { "completion_length": 144.71429443359375, "epoch": 0.6295585412667947, "grad_norm": 0.026973498985171318, "kl": 0.11624173820018768, "learning_rate": 4.950081580584397e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 656 }, { "completion_length": 171.07144165039062, "epoch": 0.6305182341650671, "grad_norm": 0.03797221928834915, "kl": 0.11702224612236023, "learning_rate": 4.949664389687583e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 657 }, { "completion_length": 222.21429443359375, "epoch": 0.6314779270633397, "grad_norm": 0.033011846244335175, "kl": 0.11886974424123764, "learning_rate": 4.94924548043858e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 658 }, { "completion_length": 199.6428680419922, "epoch": 0.6324376199616123, "grad_norm": 0.7753989100456238, "kl": 0.11524917930364609, "learning_rate": 4.948824853131236e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 659 }, { "completion_length": 199.42857360839844, "epoch": 0.6333973128598849, "grad_norm": 1.0476491451263428, "kl": 0.08758648484945297, "learning_rate": 4.948402508060607e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 660 }, { "completion_length": 172.71429443359375, "epoch": 0.6343570057581573, "grad_norm": 2.652740240097046, "kl": 0.1278374195098877, "learning_rate": 4.947978445522954e-07, "loss": 0.0001, "reward": 0.1428571492433548, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.1428571492433548, "step": 661 }, { "completion_length": 174.50001525878906, "epoch": 0.6353166986564299, "grad_norm": 0.01937287673354149, "kl": 0.10539152473211288, "learning_rate": 4.94755266581574e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 662 }, { "completion_length": 149.6428680419922, "epoch": 0.6362763915547025, "grad_norm": 1.2107152938842773, "kl": 0.12826351821422577, "learning_rate": 4.947125169237636e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 663 }, { "completion_length": 161.42857360839844, "epoch": 0.6372360844529751, "grad_norm": 0.9705117344856262, "kl": 0.19711606204509735, "learning_rate": 4.946695956088514e-07, "loss": 0.0002, "reward": 0.2142857313156128, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.2142857313156128, "step": 664 }, { "completion_length": 163.6428680419922, "epoch": 0.6381957773512476, "grad_norm": 0.020351465791463852, "kl": 0.10060273855924606, "learning_rate": 4.946265026669454e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 665 }, { "completion_length": 160.92857360839844, "epoch": 0.6391554702495201, "grad_norm": 0.02866489812731743, "kl": 0.10173525661230087, "learning_rate": 4.945832381282735e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 666 }, { "completion_length": 194.35714721679688, "epoch": 0.6401151631477927, "grad_norm": 0.7198330163955688, "kl": 0.08691204339265823, "learning_rate": 4.945398020231843e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 667 }, { "completion_length": 169.6428680419922, "epoch": 0.6410748560460653, "grad_norm": 1.2809978723526, "kl": 0.16151106357574463, "learning_rate": 4.944961943821468e-07, "loss": 0.0002, "reward": 0.1428571492433548, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.1428571492433548, "step": 668 }, { "completion_length": 160.71429443359375, "epoch": 0.6420345489443378, "grad_norm": 0.0172633845359087, "kl": 0.09375422447919846, "learning_rate": 4.944524152357501e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 669 }, { "completion_length": 148.57144165039062, "epoch": 0.6429942418426103, "grad_norm": 0.952279269695282, "kl": 0.0987955778837204, "learning_rate": 4.944084646147038e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 670 }, { "completion_length": 164.0, "epoch": 0.6439539347408829, "grad_norm": 0.042983174324035645, "kl": 0.1423768252134323, "learning_rate": 4.943643425498376e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 671 }, { "completion_length": 214.6428680419922, "epoch": 0.6449136276391555, "grad_norm": 0.11515314131975174, "kl": 0.18873639404773712, "learning_rate": 4.943200490721015e-07, "loss": 0.0002, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 672 }, { "completion_length": 160.2857208251953, "epoch": 0.6458733205374281, "grad_norm": 0.020581480115652084, "kl": 0.10703858733177185, "learning_rate": 4.942755842125659e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 673 }, { "completion_length": 173.35714721679688, "epoch": 0.6468330134357005, "grad_norm": 0.04260466620326042, "kl": 0.11376498639583588, "learning_rate": 4.942309480024213e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 674 }, { "completion_length": 116.0714340209961, "epoch": 0.6477927063339731, "grad_norm": 1.784924864768982, "kl": 0.15121643245220184, "learning_rate": 4.941861404729784e-07, "loss": 0.0002, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 675 }, { "completion_length": 172.7857208251953, "epoch": 0.6487523992322457, "grad_norm": 1.160173773765564, "kl": 0.12914276123046875, "learning_rate": 4.941411616556682e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 676 }, { "completion_length": 115.00000762939453, "epoch": 0.6497120921305183, "grad_norm": 1.4545190334320068, "kl": 0.11069834977388382, "learning_rate": 4.940960115820415e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 677 }, { "completion_length": 192.1428680419922, "epoch": 0.6506717850287908, "grad_norm": 1.249598503112793, "kl": 0.16906961798667908, "learning_rate": 4.940506902837697e-07, "loss": 0.0002, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 678 }, { "completion_length": 163.5, "epoch": 0.6516314779270633, "grad_norm": 1.1113523244857788, "kl": 0.12103767693042755, "learning_rate": 4.940051977926439e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 679 }, { "completion_length": 210.71429443359375, "epoch": 0.6525911708253359, "grad_norm": 1.2970688343048096, "kl": 0.07775852084159851, "learning_rate": 4.939595341405754e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 680 }, { "completion_length": 192.57144165039062, "epoch": 0.6535508637236085, "grad_norm": 1.2056660652160645, "kl": 0.06420200318098068, "learning_rate": 4.93913699359596e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 681 }, { "completion_length": 174.7857208251953, "epoch": 0.654510556621881, "grad_norm": 1.479142665863037, "kl": 0.08080971240997314, "learning_rate": 4.938676934818567e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 682 }, { "completion_length": 189.35714721679688, "epoch": 0.6554702495201535, "grad_norm": 0.011146563105285168, "kl": 0.06463540345430374, "learning_rate": 4.938215165396292e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 683 }, { "completion_length": 197.21429443359375, "epoch": 0.6564299424184261, "grad_norm": 0.025410456582903862, "kl": 0.08083071559667587, "learning_rate": 4.937751685653051e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 684 }, { "completion_length": 150.92857360839844, "epoch": 0.6573896353166987, "grad_norm": 0.01665063574910164, "kl": 0.06985672563314438, "learning_rate": 4.937286495913956e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 685 }, { "completion_length": 208.42857360839844, "epoch": 0.6583493282149712, "grad_norm": 0.7306040525436401, "kl": 0.0781533345580101, "learning_rate": 4.936819596505322e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 686 }, { "completion_length": 171.57144165039062, "epoch": 0.6593090211132437, "grad_norm": 1.1967322826385498, "kl": 0.056119196116924286, "learning_rate": 4.936350987754664e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 687 }, { "completion_length": 160.0, "epoch": 0.6602687140115163, "grad_norm": 1.0922510623931885, "kl": 0.08159851282835007, "learning_rate": 4.935880669990692e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 688 }, { "completion_length": 190.42857360839844, "epoch": 0.6612284069097889, "grad_norm": 0.09800445288419724, "kl": 0.1185353696346283, "learning_rate": 4.935408643543317e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 689 }, { "completion_length": 182.57144165039062, "epoch": 0.6621880998080614, "grad_norm": 0.8919245600700378, "kl": 0.05321352183818817, "learning_rate": 4.93493490874365e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 690 }, { "completion_length": 130.57144165039062, "epoch": 0.663147792706334, "grad_norm": 0.020120002329349518, "kl": 0.1060774028301239, "learning_rate": 4.934459465923999e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 691 }, { "completion_length": 119.42857360839844, "epoch": 0.6641074856046065, "grad_norm": 1.6809455156326294, "kl": 0.19235211610794067, "learning_rate": 4.933982315417871e-07, "loss": 0.0002, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 692 }, { "completion_length": 209.35714721679688, "epoch": 0.6650671785028791, "grad_norm": 1.0304111242294312, "kl": 0.08384684473276138, "learning_rate": 4.933503457559968e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 693 }, { "completion_length": 153.6428680419922, "epoch": 0.6660268714011516, "grad_norm": 0.03501513600349426, "kl": 0.10883401334285736, "learning_rate": 4.933022892686192e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 694 }, { "completion_length": 202.35714721679688, "epoch": 0.6669865642994242, "grad_norm": 2.0346312522888184, "kl": 0.07893159240484238, "learning_rate": 4.932540621133644e-07, "loss": 0.0001, "reward": 0.2142857313156128, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.2142857313156128, "step": 695 }, { "completion_length": 170.1428680419922, "epoch": 0.6679462571976967, "grad_norm": 0.020199427381157875, "kl": 0.0886487290263176, "learning_rate": 4.932056643240618e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 696 }, { "completion_length": 121.78572082519531, "epoch": 0.6689059500959693, "grad_norm": 1.0353000164031982, "kl": 0.13079015910625458, "learning_rate": 4.931570959346608e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 697 }, { "completion_length": 214.42857360839844, "epoch": 0.6698656429942419, "grad_norm": 1.7259035110473633, "kl": 0.08333461731672287, "learning_rate": 4.931083569792306e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 698 }, { "completion_length": 152.5, "epoch": 0.6708253358925144, "grad_norm": 0.9897505640983582, "kl": 0.124901682138443, "learning_rate": 4.930594474919595e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 699 }, { "completion_length": 144.92857360839844, "epoch": 0.6717850287907869, "grad_norm": 1.3326154947280884, "kl": 0.08417586237192154, "learning_rate": 4.93010367507156e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 700 }, { "completion_length": 173.57144165039062, "epoch": 0.6727447216890595, "grad_norm": 0.014724793843925, "kl": 0.08971621096134186, "learning_rate": 4.929611170592479e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 701 }, { "completion_length": 155.1428680419922, "epoch": 0.6737044145873321, "grad_norm": 0.03200551122426987, "kl": 0.13965854048728943, "learning_rate": 4.929116961827826e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 702 }, { "completion_length": 187.6428680419922, "epoch": 0.6746641074856046, "grad_norm": 0.017592309042811394, "kl": 0.09832961857318878, "learning_rate": 4.928621049124271e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 703 }, { "completion_length": 198.92857360839844, "epoch": 0.6756238003838771, "grad_norm": 0.8736680150032043, "kl": 0.08665793389081955, "learning_rate": 4.928123432829678e-07, "loss": 0.0001, "reward": 0.2142857313156128, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.2142857313156128, "step": 704 }, { "completion_length": 173.85714721679688, "epoch": 0.6765834932821497, "grad_norm": 0.025242028757929802, "kl": 0.12328542023897171, "learning_rate": 4.927624113293109e-07, "loss": 0.0001, "reward": 0.1428571492433548, "reward_std": 0.0, "rewards/check_originality_func": 0.1428571492433548, "step": 705 }, { "completion_length": 159.0, "epoch": 0.6775431861804223, "grad_norm": 2.685668468475342, "kl": 0.10092766582965851, "learning_rate": 4.927123090864818e-07, "loss": 0.0001, "reward": 0.2142857313156128, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.2142857313156128, "step": 706 }, { "completion_length": 149.35714721679688, "epoch": 0.6785028790786948, "grad_norm": 1.777001976966858, "kl": 0.1309608519077301, "learning_rate": 4.926620365896253e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 707 }, { "completion_length": 130.42857360839844, "epoch": 0.6794625719769674, "grad_norm": 0.02355870231986046, "kl": 0.1216459572315216, "learning_rate": 4.92611593874006e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 708 }, { "completion_length": 197.7857208251953, "epoch": 0.6804222648752399, "grad_norm": 0.7763403058052063, "kl": 0.12093961983919144, "learning_rate": 4.925609809750076e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 709 }, { "completion_length": 197.7857208251953, "epoch": 0.6813819577735125, "grad_norm": 2.6362476348876953, "kl": 0.14627937972545624, "learning_rate": 4.925101979281332e-07, "loss": 0.0001, "reward": 0.2857142984867096, "reward_std": 0.4040610194206238, "rewards/check_originality_func": 0.2857142984867096, "step": 710 }, { "completion_length": 150.42857360839844, "epoch": 0.682341650671785, "grad_norm": 1.2048791646957397, "kl": 0.10813158750534058, "learning_rate": 4.924592447690053e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 711 }, { "completion_length": 192.1428680419922, "epoch": 0.6833013435700576, "grad_norm": 1.2897485494613647, "kl": 0.12419142574071884, "learning_rate": 4.924081215333657e-07, "loss": 0.0001, "reward": 0.2142857313156128, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.2142857313156128, "step": 712 }, { "completion_length": 197.57144165039062, "epoch": 0.6842610364683301, "grad_norm": 0.062515027821064, "kl": 0.13599205017089844, "learning_rate": 4.923568282570755e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 713 }, { "completion_length": 192.21429443359375, "epoch": 0.6852207293666027, "grad_norm": 1.0064507722854614, "kl": 0.1431538462638855, "learning_rate": 4.923053649761152e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 714 }, { "completion_length": 180.6428680419922, "epoch": 0.6861804222648752, "grad_norm": 0.018633795902132988, "kl": 0.1110287606716156, "learning_rate": 4.922537317265845e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 715 }, { "completion_length": 146.92857360839844, "epoch": 0.6871401151631478, "grad_norm": 0.020014416426420212, "kl": 0.1349213570356369, "learning_rate": 4.92201928544702e-07, "loss": 0.0001, "reward": 0.1428571492433548, "reward_std": 0.0, "rewards/check_originality_func": 0.1428571492433548, "step": 716 }, { "completion_length": 222.00001525878906, "epoch": 0.6880998080614203, "grad_norm": 0.016650892794132233, "kl": 0.09555627405643463, "learning_rate": 4.921499554668061e-07, "loss": 0.0001, "reward": 0.1428571492433548, "reward_std": 0.0, "rewards/check_originality_func": 0.1428571492433548, "step": 717 }, { "completion_length": 138.6428680419922, "epoch": 0.6890595009596929, "grad_norm": 1.1983387470245361, "kl": 0.15063071250915527, "learning_rate": 4.920978125293539e-07, "loss": 0.0002, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 718 }, { "completion_length": 165.42857360839844, "epoch": 0.6900191938579654, "grad_norm": 2.0008232593536377, "kl": 0.15917733311653137, "learning_rate": 4.920454997689219e-07, "loss": 0.0002, "reward": 0.2142857313156128, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.2142857313156128, "step": 719 }, { "completion_length": 185.35714721679688, "epoch": 0.690978886756238, "grad_norm": 2.950665235519409, "kl": 0.16828681528568268, "learning_rate": 4.919930172222054e-07, "loss": 0.0002, "reward": 0.1428571492433548, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.1428571492433548, "step": 720 }, { "completion_length": 165.21429443359375, "epoch": 0.6919385796545106, "grad_norm": 1.3637244701385498, "kl": 0.0998474732041359, "learning_rate": 4.919403649260194e-07, "loss": 0.0001, "reward": 0.1428571492433548, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.1428571492433548, "step": 721 }, { "completion_length": 175.42857360839844, "epoch": 0.6928982725527831, "grad_norm": 1.458745002746582, "kl": 0.12310447543859482, "learning_rate": 4.918875429172974e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 722 }, { "completion_length": 176.6428680419922, "epoch": 0.6938579654510557, "grad_norm": 0.018896834924817085, "kl": 0.1102587878704071, "learning_rate": 4.918345512330921e-07, "loss": 0.0001, "reward": 0.1428571492433548, "reward_std": 0.0, "rewards/check_originality_func": 0.1428571492433548, "step": 723 }, { "completion_length": 175.71429443359375, "epoch": 0.6948176583493282, "grad_norm": 2.256110191345215, "kl": 0.12192071974277496, "learning_rate": 4.917813899105755e-07, "loss": 0.0001, "reward": 0.2142857313156128, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.2142857313156128, "step": 724 }, { "completion_length": 149.5, "epoch": 0.6957773512476008, "grad_norm": 1.448610544204712, "kl": 0.1207086443901062, "learning_rate": 4.917280589870381e-07, "loss": 0.0001, "reward": 0.2142857313156128, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.2142857313156128, "step": 725 }, { "completion_length": 164.07144165039062, "epoch": 0.6967370441458733, "grad_norm": 0.01053556427359581, "kl": 0.07755140215158463, "learning_rate": 4.916745584998898e-07, "loss": 0.0001, "reward": 0.1428571492433548, "reward_std": 0.0, "rewards/check_originality_func": 0.1428571492433548, "step": 726 }, { "completion_length": 146.5, "epoch": 0.6976967370441459, "grad_norm": 2.1245484352111816, "kl": 0.11379069089889526, "learning_rate": 4.916208884866592e-07, "loss": 0.0001, "reward": 0.2857142984867096, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.2857142984867096, "step": 727 }, { "completion_length": 146.6428680419922, "epoch": 0.6986564299424184, "grad_norm": 1.4942641258239746, "kl": 0.10744833946228027, "learning_rate": 4.915670489849941e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 728 }, { "completion_length": 213.71429443359375, "epoch": 0.699616122840691, "grad_norm": 0.009136641398072243, "kl": 0.058895841240882874, "learning_rate": 4.915130400326607e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 729 }, { "completion_length": 169.42857360839844, "epoch": 0.7005758157389635, "grad_norm": 0.9595388174057007, "kl": 0.10022168606519699, "learning_rate": 4.914588616675445e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 730 }, { "completion_length": 162.5, "epoch": 0.7015355086372361, "grad_norm": 0.016772985458374023, "kl": 0.10188329964876175, "learning_rate": 4.914045139276497e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 731 }, { "completion_length": 169.7857208251953, "epoch": 0.7024952015355086, "grad_norm": 0.8285542726516724, "kl": 0.07851031422615051, "learning_rate": 4.913499968510994e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 732 }, { "completion_length": 167.1428680419922, "epoch": 0.7034548944337812, "grad_norm": 0.02646755427122116, "kl": 0.12272654473781586, "learning_rate": 4.912953104761351e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 733 }, { "completion_length": 149.71429443359375, "epoch": 0.7044145873320538, "grad_norm": 1.3382081985473633, "kl": 0.06484362483024597, "learning_rate": 4.912404548411176e-07, "loss": 0.0001, "reward": 0.1428571492433548, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.1428571492433548, "step": 734 }, { "completion_length": 168.07144165039062, "epoch": 0.7053742802303263, "grad_norm": 0.009884790517389774, "kl": 0.06611204147338867, "learning_rate": 4.91185429984526e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 735 }, { "completion_length": 138.57144165039062, "epoch": 0.7063339731285988, "grad_norm": 0.879690408706665, "kl": 0.09594187885522842, "learning_rate": 4.911302359449585e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 736 }, { "completion_length": 122.00000762939453, "epoch": 0.7072936660268714, "grad_norm": 1.982264518737793, "kl": 0.0894826278090477, "learning_rate": 4.910748727611316e-07, "loss": 0.0001, "reward": 0.2857142984867096, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.2857142984867096, "step": 737 }, { "completion_length": 116.14286041259766, "epoch": 0.708253358925144, "grad_norm": 0.015572501346468925, "kl": 0.09725014120340347, "learning_rate": 4.910193404718805e-07, "loss": 0.0001, "reward": 0.1428571492433548, "reward_std": 0.0, "rewards/check_originality_func": 0.1428571492433548, "step": 738 }, { "completion_length": 203.21429443359375, "epoch": 0.7092130518234165, "grad_norm": 0.009243465960025787, "kl": 0.05283233895897865, "learning_rate": 4.909636391161592e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 739 }, { "completion_length": 150.42857360839844, "epoch": 0.710172744721689, "grad_norm": 0.8038504123687744, "kl": 0.066309854388237, "learning_rate": 4.909077687330404e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 740 }, { "completion_length": 165.5, "epoch": 0.7111324376199616, "grad_norm": 0.9005822539329529, "kl": 0.058103129267692566, "learning_rate": 4.908517293617149e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 741 }, { "completion_length": 173.1428680419922, "epoch": 0.7120921305182342, "grad_norm": 0.010089167393743992, "kl": 0.06510969251394272, "learning_rate": 4.907955210414924e-07, "loss": 0.0001, "reward": 0.1428571492433548, "reward_std": 0.0, "rewards/check_originality_func": 0.1428571492433548, "step": 742 }, { "completion_length": 126.42857360839844, "epoch": 0.7130518234165067, "grad_norm": 0.009290485642850399, "kl": 0.06624267995357513, "learning_rate": 4.907391438118012e-07, "loss": 0.0001, "reward": 0.1428571492433548, "reward_std": 0.0, "rewards/check_originality_func": 0.1428571492433548, "step": 743 }, { "completion_length": 143.6428680419922, "epoch": 0.7140115163147792, "grad_norm": 1.267127275466919, "kl": 0.0922749862074852, "learning_rate": 4.906825977121877e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 744 }, { "completion_length": 130.35714721679688, "epoch": 0.7149712092130518, "grad_norm": 0.018063440918922424, "kl": 0.0997069701552391, "learning_rate": 4.906258827823171e-07, "loss": 0.0001, "reward": 0.1428571492433548, "reward_std": 0.0, "rewards/check_originality_func": 0.1428571492433548, "step": 745 }, { "completion_length": 150.6428680419922, "epoch": 0.7159309021113244, "grad_norm": 0.009262275882065296, "kl": 0.06073791906237602, "learning_rate": 4.90568999061973e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 746 }, { "completion_length": 162.5, "epoch": 0.716890595009597, "grad_norm": 1.4470521211624146, "kl": 0.08503665775060654, "learning_rate": 4.905119465910569e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 747 }, { "completion_length": 144.57144165039062, "epoch": 0.7178502879078695, "grad_norm": 1.8219925165176392, "kl": 0.07500538975000381, "learning_rate": 4.904547254095894e-07, "loss": 0.0001, "reward": 0.3571428656578064, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.3571428656578064, "step": 748 }, { "completion_length": 201.7857208251953, "epoch": 0.718809980806142, "grad_norm": 0.010054612532258034, "kl": 0.07546065002679825, "learning_rate": 4.903973355577091e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 749 }, { "completion_length": 155.1428680419922, "epoch": 0.7197696737044146, "grad_norm": 0.012460339814424515, "kl": 0.07734999060630798, "learning_rate": 4.903397770756729e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 750 }, { "completion_length": 148.7857208251953, "epoch": 0.7207293666026872, "grad_norm": 0.011618106625974178, "kl": 0.0768575519323349, "learning_rate": 4.90282050003856e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 751 }, { "completion_length": 199.50001525878906, "epoch": 0.7216890595009597, "grad_norm": 0.01950983703136444, "kl": 0.07416008412837982, "learning_rate": 4.902241543827519e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 752 }, { "completion_length": 159.7857208251953, "epoch": 0.7226487523992322, "grad_norm": 0.018460318446159363, "kl": 0.08175213634967804, "learning_rate": 4.901660902529723e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 753 }, { "completion_length": 177.35714721679688, "epoch": 0.7236084452975048, "grad_norm": 0.007826785556972027, "kl": 0.06131577864289284, "learning_rate": 4.901078576552471e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 754 }, { "completion_length": 160.07144165039062, "epoch": 0.7245681381957774, "grad_norm": 0.006535206455737352, "kl": 0.056212812662124634, "learning_rate": 4.900494566304244e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 755 }, { "completion_length": 161.7857208251953, "epoch": 0.72552783109405, "grad_norm": 1.9947409629821777, "kl": 0.076224684715271, "learning_rate": 4.899908872194704e-07, "loss": 0.0001, "reward": 0.1428571492433548, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.1428571492433548, "step": 756 }, { "completion_length": 144.21429443359375, "epoch": 0.7264875239923224, "grad_norm": 0.01050098892301321, "kl": 0.08187925815582275, "learning_rate": 4.899321494634696e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 757 }, { "completion_length": 154.6428680419922, "epoch": 0.727447216890595, "grad_norm": 0.010756145231425762, "kl": 0.07415325939655304, "learning_rate": 4.898732434036243e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 758 }, { "completion_length": 165.57144165039062, "epoch": 0.7284069097888676, "grad_norm": 0.7373768091201782, "kl": 0.05653765797615051, "learning_rate": 4.89814169081255e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 759 }, { "completion_length": 146.42857360839844, "epoch": 0.7293666026871402, "grad_norm": 1.017916202545166, "kl": 0.10195969045162201, "learning_rate": 4.897549265378004e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 760 }, { "completion_length": 177.07144165039062, "epoch": 0.7303262955854126, "grad_norm": 1.2543247938156128, "kl": 0.06101800873875618, "learning_rate": 4.896955158148167e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 761 }, { "completion_length": 219.7857208251953, "epoch": 0.7312859884836852, "grad_norm": 1.1786352396011353, "kl": 0.05282389372587204, "learning_rate": 4.896359369539788e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 762 }, { "completion_length": 123.85714721679688, "epoch": 0.7322456813819578, "grad_norm": 1.0929453372955322, "kl": 0.06613599509000778, "learning_rate": 4.895761899970789e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 763 }, { "completion_length": 170.5, "epoch": 0.7332053742802304, "grad_norm": 1.1585990190505981, "kl": 0.0703069418668747, "learning_rate": 4.895162749860272e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 764 }, { "completion_length": 154.2857208251953, "epoch": 0.7341650671785028, "grad_norm": 2.3628976345062256, "kl": 0.10783317685127258, "learning_rate": 4.894561919628524e-07, "loss": 0.0001, "reward": 0.1428571492433548, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.1428571492433548, "step": 765 }, { "completion_length": 143.0, "epoch": 0.7351247600767754, "grad_norm": 2.160891532897949, "kl": 0.09033627808094025, "learning_rate": 4.893959409697002e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 766 }, { "completion_length": 126.42857360839844, "epoch": 0.736084452975048, "grad_norm": 0.019126690924167633, "kl": 0.10157816857099533, "learning_rate": 4.893355220488348e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 767 }, { "completion_length": 200.2857208251953, "epoch": 0.7370441458733206, "grad_norm": 0.009737776592373848, "kl": 0.06415418535470963, "learning_rate": 4.892749352426376e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 768 }, { "completion_length": 158.7857208251953, "epoch": 0.738003838771593, "grad_norm": 1.2829113006591797, "kl": 0.08510610461235046, "learning_rate": 4.892141805936084e-07, "loss": 0.0001, "reward": 0.1428571492433548, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.1428571492433548, "step": 769 }, { "completion_length": 172.35714721679688, "epoch": 0.7389635316698656, "grad_norm": 0.01201514434069395, "kl": 0.0797138437628746, "learning_rate": 4.891532581443643e-07, "loss": 0.0001, "reward": 0.1428571492433548, "reward_std": 0.0, "rewards/check_originality_func": 0.1428571492433548, "step": 770 }, { "completion_length": 165.85714721679688, "epoch": 0.7399232245681382, "grad_norm": 1.0337027311325073, "kl": 0.08076802641153336, "learning_rate": 4.890921679376402e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 771 }, { "completion_length": 228.1428680419922, "epoch": 0.7408829174664108, "grad_norm": 1.3122227191925049, "kl": 0.06465241312980652, "learning_rate": 4.890309100162888e-07, "loss": 0.0001, "reward": 0.1428571492433548, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.1428571492433548, "step": 772 }, { "completion_length": 174.71429443359375, "epoch": 0.7418426103646834, "grad_norm": 0.010632685385644436, "kl": 0.08614736050367355, "learning_rate": 4.889694844232801e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 773 }, { "completion_length": 151.2857208251953, "epoch": 0.7428023032629558, "grad_norm": 0.049935758113861084, "kl": 0.13293954730033875, "learning_rate": 4.889078912017022e-07, "loss": 0.0001, "reward": 0.1428571492433548, "reward_std": 0.0, "rewards/check_originality_func": 0.1428571492433548, "step": 774 }, { "completion_length": 168.35714721679688, "epoch": 0.7437619961612284, "grad_norm": 0.01736142486333847, "kl": 0.08704431354999542, "learning_rate": 4.888461303947604e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 775 }, { "completion_length": 146.1428680419922, "epoch": 0.744721689059501, "grad_norm": 0.025002116337418556, "kl": 0.11226708441972733, "learning_rate": 4.887842020457779e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 776 }, { "completion_length": 140.0, "epoch": 0.7456813819577736, "grad_norm": 1.1139541864395142, "kl": 0.10910072177648544, "learning_rate": 4.88722106198195e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 777 }, { "completion_length": 162.1428680419922, "epoch": 0.746641074856046, "grad_norm": 0.022365523502230644, "kl": 0.10884088277816772, "learning_rate": 4.886598428955698e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 778 }, { "completion_length": 179.71429443359375, "epoch": 0.7476007677543186, "grad_norm": 2.0024256706237793, "kl": 0.10022048652172089, "learning_rate": 4.885974121815779e-07, "loss": 0.0001, "reward": 0.1428571492433548, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.1428571492433548, "step": 779 }, { "completion_length": 220.00001525878906, "epoch": 0.7485604606525912, "grad_norm": 0.9033368825912476, "kl": 0.06979133188724518, "learning_rate": 4.885348141000122e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 780 }, { "completion_length": 201.35714721679688, "epoch": 0.7495201535508638, "grad_norm": 0.009944288060069084, "kl": 0.06814754009246826, "learning_rate": 4.884720486947829e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 781 }, { "completion_length": 176.92857360839844, "epoch": 0.7504798464491362, "grad_norm": 0.015330747701227665, "kl": 0.08654240518808365, "learning_rate": 4.884091160099177e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 782 }, { "completion_length": 125.71429443359375, "epoch": 0.7514395393474088, "grad_norm": 2.058807849884033, "kl": 0.1578809767961502, "learning_rate": 4.883460160895617e-07, "loss": 0.0002, "reward": 0.2142857313156128, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.2142857313156128, "step": 783 }, { "completion_length": 159.42857360839844, "epoch": 0.7523992322456814, "grad_norm": 0.024418719112873077, "kl": 0.11039355397224426, "learning_rate": 4.882827489779772e-07, "loss": 0.0001, "reward": 0.1428571492433548, "reward_std": 0.0, "rewards/check_originality_func": 0.1428571492433548, "step": 784 }, { "completion_length": 137.92857360839844, "epoch": 0.753358925143954, "grad_norm": 1.3203753232955933, "kl": 0.1378057450056076, "learning_rate": 4.882193147195438e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 785 }, { "completion_length": 146.42857360839844, "epoch": 0.7543186180422264, "grad_norm": 0.040856052190065384, "kl": 0.1389281153678894, "learning_rate": 4.881557133587585e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 786 }, { "completion_length": 209.21429443359375, "epoch": 0.755278310940499, "grad_norm": 0.009155694395303726, "kl": 0.06994375586509705, "learning_rate": 4.880919449402353e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 787 }, { "completion_length": 204.50001525878906, "epoch": 0.7562380038387716, "grad_norm": 0.9115666747093201, "kl": 0.08514169603586197, "learning_rate": 4.880280095087054e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 788 }, { "completion_length": 143.0, "epoch": 0.7571976967370442, "grad_norm": 0.021706266328692436, "kl": 0.12595202028751373, "learning_rate": 4.879639071090173e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 789 }, { "completion_length": 169.6428680419922, "epoch": 0.7581573896353166, "grad_norm": 0.01603739708662033, "kl": 0.10657726228237152, "learning_rate": 4.878996377861367e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 790 }, { "completion_length": 149.85714721679688, "epoch": 0.7591170825335892, "grad_norm": 0.03440432623028755, "kl": 0.15433381497859955, "learning_rate": 4.878352015851459e-07, "loss": 0.0002, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 791 }, { "completion_length": 137.57144165039062, "epoch": 0.7600767754318618, "grad_norm": 0.05002276971936226, "kl": 0.1596015989780426, "learning_rate": 4.877705985512449e-07, "loss": 0.0002, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 792 }, { "completion_length": 154.1428680419922, "epoch": 0.7610364683301344, "grad_norm": 1.8400242328643799, "kl": 0.15554043650627136, "learning_rate": 4.877058287297502e-07, "loss": 0.0002, "reward": 0.1428571492433548, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.1428571492433548, "step": 793 }, { "completion_length": 163.21429443359375, "epoch": 0.761996161228407, "grad_norm": 0.01969468779861927, "kl": 0.1223905086517334, "learning_rate": 4.876408921660956e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 794 }, { "completion_length": 167.35714721679688, "epoch": 0.7629558541266794, "grad_norm": 1.3432449102401733, "kl": 0.11880765855312347, "learning_rate": 4.875757889058319e-07, "loss": 0.0001, "reward": 0.3571428656578064, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.3571428656578064, "step": 795 }, { "completion_length": 135.42857360839844, "epoch": 0.763915547024952, "grad_norm": 1.3241636753082275, "kl": 0.159468412399292, "learning_rate": 4.875105189946266e-07, "loss": 0.0002, "reward": 0.2142857313156128, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.2142857313156128, "step": 796 }, { "completion_length": 164.7857208251953, "epoch": 0.7648752399232246, "grad_norm": 1.5025544166564941, "kl": 0.1314813494682312, "learning_rate": 4.874450824782642e-07, "loss": 0.0001, "reward": 0.2142857313156128, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.2142857313156128, "step": 797 }, { "completion_length": 168.6428680419922, "epoch": 0.7658349328214972, "grad_norm": 1.0256328582763672, "kl": 0.10814754664897919, "learning_rate": 4.873794794026461e-07, "loss": 0.0001, "reward": 0.2142857313156128, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.2142857313156128, "step": 798 }, { "completion_length": 225.21429443359375, "epoch": 0.7667946257197696, "grad_norm": 1.6207802295684814, "kl": 0.12219558656215668, "learning_rate": 4.873137098137907e-07, "loss": 0.0001, "reward": 0.1428571492433548, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.1428571492433548, "step": 799 }, { "completion_length": 135.5, "epoch": 0.7677543186180422, "grad_norm": 1.4700626134872437, "kl": 0.14033739268779755, "learning_rate": 4.872477737578327e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 800 }, { "completion_length": 162.0, "epoch": 0.7687140115163148, "grad_norm": 1.1293710470199585, "kl": 0.13205845654010773, "learning_rate": 4.871816712810239e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 801 }, { "completion_length": 166.5, "epoch": 0.7696737044145874, "grad_norm": 0.03930402174592018, "kl": 0.13564346730709076, "learning_rate": 4.871154024297332e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 802 }, { "completion_length": 148.21429443359375, "epoch": 0.7706333973128598, "grad_norm": 0.027556084096431732, "kl": 0.1581900715827942, "learning_rate": 4.870489672504455e-07, "loss": 0.0002, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 803 }, { "completion_length": 143.5, "epoch": 0.7715930902111324, "grad_norm": 0.06675148010253906, "kl": 0.19051392376422882, "learning_rate": 4.86982365789763e-07, "loss": 0.0002, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 804 }, { "completion_length": 142.57144165039062, "epoch": 0.772552783109405, "grad_norm": 1.5992814302444458, "kl": 0.181320458650589, "learning_rate": 4.869155980944038e-07, "loss": 0.0002, "reward": 0.1428571492433548, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.1428571492433548, "step": 805 }, { "completion_length": 189.07144165039062, "epoch": 0.7735124760076776, "grad_norm": 1.410343885421753, "kl": 0.1130717545747757, "learning_rate": 4.868486642112035e-07, "loss": 0.0001, "reward": 0.1428571492433548, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.1428571492433548, "step": 806 }, { "completion_length": 164.21429443359375, "epoch": 0.77447216890595, "grad_norm": 1.0535041093826294, "kl": 0.11653467267751694, "learning_rate": 4.867815641871136e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 807 }, { "completion_length": 187.1428680419922, "epoch": 0.7754318618042226, "grad_norm": 0.014194350689649582, "kl": 0.09546775370836258, "learning_rate": 4.867142980692024e-07, "loss": 0.0001, "reward": 0.1428571492433548, "reward_std": 0.0, "rewards/check_originality_func": 0.1428571492433548, "step": 808 }, { "completion_length": 116.50000762939453, "epoch": 0.7763915547024952, "grad_norm": 0.03585396707057953, "kl": 0.16005778312683105, "learning_rate": 4.866468659046548e-07, "loss": 0.0002, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 809 }, { "completion_length": 169.92857360839844, "epoch": 0.7773512476007678, "grad_norm": 1.6012895107269287, "kl": 0.13453982770442963, "learning_rate": 4.865792677407718e-07, "loss": 0.0001, "reward": 0.1428571492433548, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.1428571492433548, "step": 810 }, { "completion_length": 121.28572082519531, "epoch": 0.7783109404990403, "grad_norm": 2.1051595211029053, "kl": 0.1720375418663025, "learning_rate": 4.865115036249714e-07, "loss": 0.0002, "reward": 0.5, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.5, "step": 811 }, { "completion_length": 137.1428680419922, "epoch": 0.7792706333973128, "grad_norm": 1.4102556705474854, "kl": 0.29032936692237854, "learning_rate": 4.864435736047876e-07, "loss": 0.0003, "reward": 0.2857142984867096, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.2857142984867096, "step": 812 }, { "completion_length": 126.28572082519531, "epoch": 0.7802303262955854, "grad_norm": 0.8970164656639099, "kl": 0.19698992371559143, "learning_rate": 4.863754777278708e-07, "loss": 0.0002, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 813 }, { "completion_length": 150.7857208251953, "epoch": 0.781190019193858, "grad_norm": 0.023102758452296257, "kl": 0.11701919883489609, "learning_rate": 4.863072160419879e-07, "loss": 0.0001, "reward": 0.1428571492433548, "reward_std": 0.0, "rewards/check_originality_func": 0.1428571492433548, "step": 814 }, { "completion_length": 170.92857360839844, "epoch": 0.7821497120921305, "grad_norm": 0.7202088236808777, "kl": 0.18308493494987488, "learning_rate": 4.86238788595022e-07, "loss": 0.0002, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 815 }, { "completion_length": 188.71429443359375, "epoch": 0.783109404990403, "grad_norm": 0.7758586406707764, "kl": 0.18117286264896393, "learning_rate": 4.861701954349726e-07, "loss": 0.0002, "reward": 0.2142857313156128, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.2142857313156128, "step": 816 }, { "completion_length": 171.35714721679688, "epoch": 0.7840690978886756, "grad_norm": 2.2239561080932617, "kl": 0.20659464597702026, "learning_rate": 4.861014366099553e-07, "loss": 0.0002, "reward": 0.2857142984867096, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.2857142984867096, "step": 817 }, { "completion_length": 192.2857208251953, "epoch": 0.7850287907869482, "grad_norm": 0.9310640692710876, "kl": 0.13383102416992188, "learning_rate": 4.86032512168202e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 818 }, { "completion_length": 152.21429443359375, "epoch": 0.7859884836852208, "grad_norm": 1.7143744230270386, "kl": 0.1612527221441269, "learning_rate": 4.859634221580607e-07, "loss": 0.0002, "reward": 0.2142857313156128, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.2142857313156128, "step": 819 }, { "completion_length": 151.6428680419922, "epoch": 0.7869481765834933, "grad_norm": 0.037865567952394485, "kl": 0.18964225053787231, "learning_rate": 4.858941666279955e-07, "loss": 0.0002, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 820 }, { "completion_length": 221.21429443359375, "epoch": 0.7879078694817658, "grad_norm": 1.0704631805419922, "kl": 0.15129685401916504, "learning_rate": 4.858247456265867e-07, "loss": 0.0002, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 821 }, { "completion_length": 148.42857360839844, "epoch": 0.7888675623800384, "grad_norm": 1.374784231185913, "kl": 0.1705753207206726, "learning_rate": 4.857551592025308e-07, "loss": 0.0002, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 822 }, { "completion_length": 114.64286041259766, "epoch": 0.789827255278311, "grad_norm": 0.06456493586301804, "kl": 0.2080933153629303, "learning_rate": 4.856854074046402e-07, "loss": 0.0002, "reward": 0.1428571492433548, "reward_std": 0.0, "rewards/check_originality_func": 0.1428571492433548, "step": 823 }, { "completion_length": 113.35714721679688, "epoch": 0.7907869481765835, "grad_norm": 3.1683733463287354, "kl": 0.14878588914871216, "learning_rate": 4.856154902818431e-07, "loss": 0.0001, "reward": 0.1428571492433548, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.1428571492433548, "step": 824 }, { "completion_length": 173.71429443359375, "epoch": 0.791746641074856, "grad_norm": 1.9457639455795288, "kl": 0.1315532922744751, "learning_rate": 4.85545407883184e-07, "loss": 0.0001, "reward": 0.2142857313156128, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.2142857313156128, "step": 825 }, { "completion_length": 195.57144165039062, "epoch": 0.7927063339731286, "grad_norm": 0.017426496371626854, "kl": 0.14729736745357513, "learning_rate": 4.854751602578232e-07, "loss": 0.0001, "reward": 0.1428571492433548, "reward_std": 0.0, "rewards/check_originality_func": 0.1428571492433548, "step": 826 }, { "completion_length": 176.50001525878906, "epoch": 0.7936660268714012, "grad_norm": 2.390385150909424, "kl": 0.10931770503520966, "learning_rate": 4.854047474550369e-07, "loss": 0.0001, "reward": 0.3571428656578064, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.3571428656578064, "step": 827 }, { "completion_length": 206.00001525878906, "epoch": 0.7946257197696737, "grad_norm": 1.0389877557754517, "kl": 0.12801414728164673, "learning_rate": 4.853341695242172e-07, "loss": 0.0001, "reward": 0.2142857313156128, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.2142857313156128, "step": 828 }, { "completion_length": 123.14286041259766, "epoch": 0.7955854126679462, "grad_norm": 2.6064887046813965, "kl": 0.17494124174118042, "learning_rate": 4.852634265148719e-07, "loss": 0.0002, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 829 }, { "completion_length": 161.7857208251953, "epoch": 0.7965451055662188, "grad_norm": 0.9943892359733582, "kl": 0.13882379233837128, "learning_rate": 4.851925184766247e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 830 }, { "completion_length": 186.1428680419922, "epoch": 0.7975047984644914, "grad_norm": 0.7639083862304688, "kl": 0.11922261863946915, "learning_rate": 4.851214454592152e-07, "loss": 0.0001, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 831 }, { "completion_length": 133.21429443359375, "epoch": 0.7984644913627639, "grad_norm": 0.04375765472650528, "kl": 0.20071092247962952, "learning_rate": 4.850502075124984e-07, "loss": 0.0002, "reward": 0.1428571492433548, "reward_std": 0.0, "rewards/check_originality_func": 0.1428571492433548, "step": 832 }, { "completion_length": 131.0, "epoch": 0.7994241842610365, "grad_norm": 1.25660240650177, "kl": 0.21137554943561554, "learning_rate": 4.849788046864453e-07, "loss": 0.0002, "reward": 0.2142857313156128, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.2142857313156128, "step": 833 }, { "completion_length": 154.92857360839844, "epoch": 0.800383877159309, "grad_norm": 0.02895703911781311, "kl": 0.17294560372829437, "learning_rate": 4.849072370311425e-07, "loss": 0.0002, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 834 }, { "completion_length": 135.42857360839844, "epoch": 0.8013435700575816, "grad_norm": 2.6720640659332275, "kl": 0.18687154352664948, "learning_rate": 4.84835504596792e-07, "loss": 0.0002, "reward": 0.2857142984867096, "reward_std": 0.4040610194206238, "rewards/check_originality_func": 0.2857142984867096, "step": 835 }, { "completion_length": 137.42857360839844, "epoch": 0.8023032629558541, "grad_norm": 1.200229287147522, "kl": 0.3357578217983246, "learning_rate": 4.847636074337116e-07, "loss": 0.0003, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 836 }, { "completion_length": 173.21429443359375, "epoch": 0.8032629558541267, "grad_norm": 1.255347728729248, "kl": 0.17965157330036163, "learning_rate": 4.846915455923347e-07, "loss": 0.0002, "reward": 0.2142857313156128, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.2142857313156128, "step": 837 }, { "completion_length": 169.35714721679688, "epoch": 0.8042226487523992, "grad_norm": 0.013211546465754509, "kl": 0.11323035508394241, "learning_rate": 4.8461931912321e-07, "loss": 0.0001, "reward": 0.1428571492433548, "reward_std": 0.0, "rewards/check_originality_func": 0.1428571492433548, "step": 838 }, { "completion_length": 154.71429443359375, "epoch": 0.8051823416506718, "grad_norm": 1.8666768074035645, "kl": 0.19888047873973846, "learning_rate": 4.845469280770017e-07, "loss": 0.0002, "reward": 0.2142857313156128, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.2142857313156128, "step": 839 }, { "completion_length": 199.57144165039062, "epoch": 0.8061420345489443, "grad_norm": 2.0301952362060547, "kl": 0.13254408538341522, "learning_rate": 4.844743725044897e-07, "loss": 0.0001, "reward": 0.1428571492433548, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.1428571492433548, "step": 840 }, { "completion_length": 153.71429443359375, "epoch": 0.8071017274472169, "grad_norm": 2.1375367641448975, "kl": 0.2524608075618744, "learning_rate": 4.844016524565692e-07, "loss": 0.0003, "reward": 0.2142857313156128, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.2142857313156128, "step": 841 }, { "completion_length": 185.92857360839844, "epoch": 0.8080614203454894, "grad_norm": 2.048834800720215, "kl": 0.17260262370109558, "learning_rate": 4.843287679842505e-07, "loss": 0.0002, "reward": 0.2142857313156128, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.2142857313156128, "step": 842 }, { "completion_length": 161.35714721679688, "epoch": 0.809021113243762, "grad_norm": 1.635056734085083, "kl": 0.21594257652759552, "learning_rate": 4.842557191386596e-07, "loss": 0.0002, "reward": 0.2142857313156128, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.2142857313156128, "step": 843 }, { "completion_length": 154.6428680419922, "epoch": 0.8099808061420346, "grad_norm": 1.0219424962997437, "kl": 0.16647952795028687, "learning_rate": 4.841825059710377e-07, "loss": 0.0002, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 844 }, { "completion_length": 121.78572082519531, "epoch": 0.8109404990403071, "grad_norm": 1.862230658531189, "kl": 0.3673359751701355, "learning_rate": 4.841091285327411e-07, "loss": 0.0004, "reward": 0.3571428656578064, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.3571428656578064, "step": 845 }, { "completion_length": 171.71429443359375, "epoch": 0.8119001919385797, "grad_norm": 2.488290548324585, "kl": 0.1842094361782074, "learning_rate": 4.840355868752414e-07, "loss": 0.0002, "reward": 0.6428571939468384, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.6428571939468384, "step": 846 }, { "completion_length": 196.6428680419922, "epoch": 0.8128598848368522, "grad_norm": 1.5591751337051392, "kl": 0.20815101265907288, "learning_rate": 4.839618810501258e-07, "loss": 0.0002, "reward": 0.4285714626312256, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.4285714626312256, "step": 847 }, { "completion_length": 143.92857360839844, "epoch": 0.8138195777351248, "grad_norm": 2.6518940925598145, "kl": 0.3428964912891388, "learning_rate": 4.838880111090959e-07, "loss": 0.0003, "reward": 0.2857142984867096, "reward_std": 0.4040610194206238, "rewards/check_originality_func": 0.2857142984867096, "step": 848 }, { "completion_length": 177.21429443359375, "epoch": 0.8147792706333973, "grad_norm": 0.8781049251556396, "kl": 0.1850660741329193, "learning_rate": 4.838139771039691e-07, "loss": 0.0002, "reward": 0.2142857313156128, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.2142857313156128, "step": 849 }, { "completion_length": 147.6428680419922, "epoch": 0.8157389635316699, "grad_norm": 0.06422825902700424, "kl": 0.26229801774024963, "learning_rate": 4.837397790866774e-07, "loss": 0.0003, "reward": 0.1428571492433548, "reward_std": 0.0, "rewards/check_originality_func": 0.1428571492433548, "step": 850 }, { "completion_length": 194.21429443359375, "epoch": 0.8166986564299424, "grad_norm": 1.1970500946044922, "kl": 0.25684210658073425, "learning_rate": 4.836654171092682e-07, "loss": 0.0003, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 851 }, { "completion_length": 168.5, "epoch": 0.817658349328215, "grad_norm": 0.13293419778347015, "kl": 0.21070025861263275, "learning_rate": 4.835908912239038e-07, "loss": 0.0002, "reward": 0.0, "reward_std": 0.0, "rewards/check_originality_func": 0.0, "step": 852 }, { "completion_length": 142.35714721679688, "epoch": 0.8186180422264875, "grad_norm": 1.6263096332550049, "kl": 0.24911034107208252, "learning_rate": 4.835162014828614e-07, "loss": 0.0002, "reward": 0.2142857313156128, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.2142857313156128, "step": 853 }, { "completion_length": 191.57144165039062, "epoch": 0.8195777351247601, "grad_norm": 2.094773769378662, "kl": 0.1838218867778778, "learning_rate": 4.834413479385332e-07, "loss": 0.0002, "reward": 0.2142857313156128, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.2142857313156128, "step": 854 }, { "completion_length": 168.0, "epoch": 0.8205374280230326, "grad_norm": 2.581845760345459, "kl": 0.18935467302799225, "learning_rate": 4.833663306434263e-07, "loss": 0.0002, "reward": 0.2142857313156128, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.2142857313156128, "step": 855 }, { "completion_length": 115.5714340209961, "epoch": 0.8214971209213052, "grad_norm": 2.5847702026367188, "kl": 0.3219936490058899, "learning_rate": 4.832911496501626e-07, "loss": 0.0003, "reward": 0.2142857313156128, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.2142857313156128, "step": 856 }, { "completion_length": 126.21429443359375, "epoch": 0.8224568138195777, "grad_norm": 1.6775190830230713, "kl": 0.21901282668113708, "learning_rate": 4.832158050114788e-07, "loss": 0.0002, "reward": 0.2142857313156128, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.2142857313156128, "step": 857 }, { "completion_length": 194.00001525878906, "epoch": 0.8234165067178503, "grad_norm": 1.2553455829620361, "kl": 0.1859927624464035, "learning_rate": 4.831402967802268e-07, "loss": 0.0002, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 858 }, { "completion_length": 165.2857208251953, "epoch": 0.8243761996161229, "grad_norm": 1.1464368104934692, "kl": 0.23158009350299835, "learning_rate": 4.830646250093726e-07, "loss": 0.0002, "reward": 0.2142857313156128, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.2142857313156128, "step": 859 }, { "completion_length": 127.00000762939453, "epoch": 0.8253358925143954, "grad_norm": 2.7241742610931396, "kl": 0.2748223543167114, "learning_rate": 4.829887897519974e-07, "loss": 0.0003, "reward": 0.3571428656578064, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.3571428656578064, "step": 860 }, { "completion_length": 159.35714721679688, "epoch": 0.8262955854126679, "grad_norm": 3.0851900577545166, "kl": 0.23344458639621735, "learning_rate": 4.82912791061297e-07, "loss": 0.0002, "reward": 0.0714285746216774, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.0714285746216774, "step": 861 }, { "completion_length": 181.35714721679688, "epoch": 0.8272552783109405, "grad_norm": 1.7084687948226929, "kl": 0.2433740794658661, "learning_rate": 4.828366289905816e-07, "loss": 0.0002, "reward": 0.1428571492433548, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.1428571492433548, "step": 862 }, { "completion_length": 151.42857360839844, "epoch": 0.8282149712092131, "grad_norm": 1.3571780920028687, "kl": 0.24143405258655548, "learning_rate": 4.827603035932762e-07, "loss": 0.0002, "reward": 0.1428571492433548, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.1428571492433548, "step": 863 }, { "completion_length": 168.5, "epoch": 0.8291746641074856, "grad_norm": 1.907912015914917, "kl": 0.5049967765808105, "learning_rate": 4.826838149229205e-07, "loss": 0.0005, "reward": 0.2857142984867096, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.2857142984867096, "step": 864 }, { "completion_length": 139.6428680419922, "epoch": 0.8301343570057581, "grad_norm": 1.7396739721298218, "kl": 0.31872785091400146, "learning_rate": 4.826071630331684e-07, "loss": 0.0003, "reward": 0.3571428656578064, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.3571428656578064, "step": 865 }, { "completion_length": 194.92857360839844, "epoch": 0.8310940499040307, "grad_norm": 2.527390718460083, "kl": 0.3241078853607178, "learning_rate": 4.825303479777886e-07, "loss": 0.0003, "reward": 0.5, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.5, "step": 866 }, { "completion_length": 108.42857360839844, "epoch": 0.8320537428023033, "grad_norm": 3.425982713699341, "kl": 0.49525028467178345, "learning_rate": 4.82453369810664e-07, "loss": 0.0005, "reward": 0.2857142984867096, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.2857142984867096, "step": 867 }, { "completion_length": 154.07144165039062, "epoch": 0.8330134357005758, "grad_norm": 1.2685915231704712, "kl": 0.3046738803386688, "learning_rate": 4.823762285857921e-07, "loss": 0.0003, "reward": 0.2142857313156128, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.2142857313156128, "step": 868 }, { "completion_length": 142.92857360839844, "epoch": 0.8339731285988484, "grad_norm": 3.153442859649658, "kl": 0.354380339384079, "learning_rate": 4.822989243572847e-07, "loss": 0.0004, "reward": 0.3571428656578064, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.3571428656578064, "step": 869 }, { "completion_length": 145.5, "epoch": 0.8349328214971209, "grad_norm": 2.6543426513671875, "kl": 0.38813385367393494, "learning_rate": 4.82221457179368e-07, "loss": 0.0004, "reward": 0.2857142984867096, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.2857142984867096, "step": 870 }, { "completion_length": 133.0, "epoch": 0.8358925143953935, "grad_norm": 2.7885632514953613, "kl": 0.4672565460205078, "learning_rate": 4.821438271063824e-07, "loss": 0.0005, "reward": 0.3571428656578064, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.3571428656578064, "step": 871 }, { "completion_length": 172.07144165039062, "epoch": 0.836852207293666, "grad_norm": 2.4132795333862305, "kl": 0.3413809537887573, "learning_rate": 4.820660341927828e-07, "loss": 0.0003, "reward": 0.2142857313156128, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.2142857313156128, "step": 872 }, { "completion_length": 168.35714721679688, "epoch": 0.8378119001919386, "grad_norm": 1.765296220779419, "kl": 0.38247790932655334, "learning_rate": 4.819880784931379e-07, "loss": 0.0004, "reward": 0.2857142984867096, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.2857142984867096, "step": 873 }, { "completion_length": 132.07144165039062, "epoch": 0.8387715930902111, "grad_norm": 2.7457892894744873, "kl": 0.39209482073783875, "learning_rate": 4.819099600621309e-07, "loss": 0.0004, "reward": 0.5714285969734192, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.5714285969734192, "step": 874 }, { "completion_length": 154.42857360839844, "epoch": 0.8397312859884837, "grad_norm": 2.54343318939209, "kl": 0.43926793336868286, "learning_rate": 4.818316789545592e-07, "loss": 0.0004, "reward": 0.2857142984867096, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.2857142984867096, "step": 875 }, { "completion_length": 155.21429443359375, "epoch": 0.8406909788867563, "grad_norm": 1.153052568435669, "kl": 0.3020021319389343, "learning_rate": 4.817532352253342e-07, "loss": 0.0003, "reward": 0.5, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.5, "step": 876 }, { "completion_length": 192.7857208251953, "epoch": 0.8416506717850288, "grad_norm": 1.7658169269561768, "kl": 0.3890196681022644, "learning_rate": 4.816746289294814e-07, "loss": 0.0004, "reward": 0.5714285969734192, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.5714285969734192, "step": 877 }, { "completion_length": 180.00001525878906, "epoch": 0.8426103646833013, "grad_norm": 2.9264862537384033, "kl": 0.41303548216819763, "learning_rate": 4.815958601221401e-07, "loss": 0.0004, "reward": 0.5714285969734192, "reward_std": 0.4040610194206238, "rewards/check_originality_func": 0.5714285969734192, "step": 878 }, { "completion_length": 119.21429443359375, "epoch": 0.8435700575815739, "grad_norm": 1.9217979907989502, "kl": 0.8544398546218872, "learning_rate": 4.81516928858564e-07, "loss": 0.0009, "reward": 0.2142857313156128, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.2142857313156128, "step": 879 }, { "completion_length": 144.21429443359375, "epoch": 0.8445297504798465, "grad_norm": 2.1743946075439453, "kl": 0.565416693687439, "learning_rate": 4.814378351941206e-07, "loss": 0.0006, "reward": 0.5714285969734192, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.5714285969734192, "step": 880 }, { "completion_length": 155.21429443359375, "epoch": 0.845489443378119, "grad_norm": 2.1831634044647217, "kl": 0.5408298969268799, "learning_rate": 4.813585791842911e-07, "loss": 0.0005, "reward": 0.1428571492433548, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.1428571492433548, "step": 881 }, { "completion_length": 162.85714721679688, "epoch": 0.8464491362763915, "grad_norm": 0.14294292032718658, "kl": 0.5603489875793457, "learning_rate": 4.812791608846709e-07, "loss": 0.0006, "reward": 0.8571429252624512, "reward_std": 0.0, "rewards/check_originality_func": 0.8571429252624512, "step": 882 }, { "completion_length": 117.21429443359375, "epoch": 0.8474088291746641, "grad_norm": 2.3136539459228516, "kl": 0.5756494998931885, "learning_rate": 4.811995803509691e-07, "loss": 0.0006, "reward": 0.3571428656578064, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.3571428656578064, "step": 883 }, { "completion_length": 175.42857360839844, "epoch": 0.8483685220729367, "grad_norm": 1.0727465152740479, "kl": 0.41480228304862976, "learning_rate": 4.811198376390085e-07, "loss": 0.0004, "reward": 0.5, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.5, "step": 884 }, { "completion_length": 156.35714721679688, "epoch": 0.8493282149712092, "grad_norm": 5.112655162811279, "kl": 1.1215019226074219, "learning_rate": 4.81039932804726e-07, "loss": 0.0011, "reward": 0.5714285969734192, "reward_std": 0.4040610194206238, "rewards/check_originality_func": 0.5714285969734192, "step": 885 }, { "completion_length": 188.7857208251953, "epoch": 0.8502879078694817, "grad_norm": 2.699171304702759, "kl": 1.1787227392196655, "learning_rate": 4.809598659041717e-07, "loss": 0.0012, "reward": 0.6428571939468384, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.6428571939468384, "step": 886 }, { "completion_length": 146.85714721679688, "epoch": 0.8512476007677543, "grad_norm": 3.363140344619751, "kl": 0.4905180037021637, "learning_rate": 4.808796369935099e-07, "loss": 0.0005, "reward": 0.785714328289032, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.785714328289032, "step": 887 }, { "completion_length": 160.6428680419922, "epoch": 0.8522072936660269, "grad_norm": 3.3138129711151123, "kl": 0.987316906452179, "learning_rate": 4.807992461290182e-07, "loss": 0.001, "reward": 0.5714285969734192, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.5714285969734192, "step": 888 }, { "completion_length": 151.85714721679688, "epoch": 0.8531669865642995, "grad_norm": 2.7816262245178223, "kl": 0.8270294070243835, "learning_rate": 4.807186933670879e-07, "loss": 0.0008, "reward": 0.5, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.5, "step": 889 }, { "completion_length": 174.57144165039062, "epoch": 0.8541266794625719, "grad_norm": 2.8278348445892334, "kl": 0.6089766621589661, "learning_rate": 4.806379787642241e-07, "loss": 0.0006, "reward": 0.6428571939468384, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.6428571939468384, "step": 890 }, { "completion_length": 140.5, "epoch": 0.8550863723608445, "grad_norm": 2.6965317726135254, "kl": 0.5781830549240112, "learning_rate": 4.805571023770449e-07, "loss": 0.0006, "reward": 0.5, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.5, "step": 891 }, { "completion_length": 169.71429443359375, "epoch": 0.8560460652591171, "grad_norm": 2.7575056552886963, "kl": 0.5150176882743835, "learning_rate": 4.804760642622825e-07, "loss": 0.0005, "reward": 0.785714328289032, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.785714328289032, "step": 892 }, { "completion_length": 166.21429443359375, "epoch": 0.8570057581573897, "grad_norm": 1.5014925003051758, "kl": 0.7404791116714478, "learning_rate": 4.803948644767819e-07, "loss": 0.0007, "reward": 0.6428571939468384, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.6428571939468384, "step": 893 }, { "completion_length": 162.92857360839844, "epoch": 0.8579654510556622, "grad_norm": 1.8689026832580566, "kl": 0.5340602397918701, "learning_rate": 4.803135030775022e-07, "loss": 0.0005, "reward": 0.5714285969734192, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.5714285969734192, "step": 894 }, { "completion_length": 174.35714721679688, "epoch": 0.8589251439539347, "grad_norm": 1.836107850074768, "kl": 0.44557416439056396, "learning_rate": 4.802319801215154e-07, "loss": 0.0004, "reward": 0.6428571939468384, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.6428571939468384, "step": 895 }, { "completion_length": 131.0, "epoch": 0.8598848368522073, "grad_norm": 2.5334603786468506, "kl": 0.7896557450294495, "learning_rate": 4.80150295666007e-07, "loss": 0.0008, "reward": 0.3571428656578064, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.3571428656578064, "step": 896 }, { "completion_length": 152.35714721679688, "epoch": 0.8608445297504799, "grad_norm": 1.3813453912734985, "kl": 0.4520536959171295, "learning_rate": 4.800684497682755e-07, "loss": 0.0005, "reward": 0.6428571939468384, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.6428571939468384, "step": 897 }, { "completion_length": 138.07144165039062, "epoch": 0.8618042226487524, "grad_norm": 2.2901666164398193, "kl": 0.5979324579238892, "learning_rate": 4.799864424857331e-07, "loss": 0.0006, "reward": 0.8571429252624512, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.8571429252624512, "step": 898 }, { "completion_length": 136.07144165039062, "epoch": 0.8627639155470249, "grad_norm": 4.319260120391846, "kl": 0.5759350657463074, "learning_rate": 4.799042738759048e-07, "loss": 0.0006, "reward": 0.6428571939468384, "reward_std": 0.5050762891769409, "rewards/check_originality_func": 0.6428571939468384, "step": 899 }, { "completion_length": 147.57144165039062, "epoch": 0.8637236084452975, "grad_norm": 2.4739160537719727, "kl": 0.5668299198150635, "learning_rate": 4.798219439964293e-07, "loss": 0.0006, "reward": 0.6428571939468384, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.6428571939468384, "step": 900 }, { "completion_length": 141.5, "epoch": 0.8646833013435701, "grad_norm": 2.5149903297424316, "kl": 0.5879033207893372, "learning_rate": 4.797394529050577e-07, "loss": 0.0006, "reward": 0.6428571939468384, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.6428571939468384, "step": 901 }, { "completion_length": 174.50001525878906, "epoch": 0.8656429942418427, "grad_norm": 2.4728682041168213, "kl": 0.42482006549835205, "learning_rate": 4.796568006596547e-07, "loss": 0.0004, "reward": 0.8571429252624512, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.8571429252624512, "step": 902 }, { "completion_length": 132.71429443359375, "epoch": 0.8666026871401151, "grad_norm": 2.612227439880371, "kl": 0.6533089876174927, "learning_rate": 4.795739873181979e-07, "loss": 0.0007, "reward": 0.6428571939468384, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.6428571939468384, "step": 903 }, { "completion_length": 151.7857208251953, "epoch": 0.8675623800383877, "grad_norm": 1.7876770496368408, "kl": 0.7349087595939636, "learning_rate": 4.794910129387779e-07, "loss": 0.0007, "reward": 0.7142857313156128, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.7142857313156128, "step": 904 }, { "completion_length": 172.50001525878906, "epoch": 0.8685220729366603, "grad_norm": 2.232642412185669, "kl": 0.49343860149383545, "learning_rate": 4.794078775795983e-07, "loss": 0.0005, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 905 }, { "completion_length": 185.2857208251953, "epoch": 0.8694817658349329, "grad_norm": 1.0167922973632812, "kl": 0.3967530131340027, "learning_rate": 4.793245812989756e-07, "loss": 0.0004, "reward": 0.6428571939468384, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.6428571939468384, "step": 906 }, { "completion_length": 132.85714721679688, "epoch": 0.8704414587332053, "grad_norm": 2.602766513824463, "kl": 0.4661983251571655, "learning_rate": 4.792411241553391e-07, "loss": 0.0005, "reward": 0.1428571492433548, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.1428571492433548, "step": 907 }, { "completion_length": 172.1428680419922, "epoch": 0.8714011516314779, "grad_norm": 4.251507759094238, "kl": 0.6887491941452026, "learning_rate": 4.79157506207231e-07, "loss": 0.0007, "reward": 0.5714285969734192, "reward_std": 0.4040610194206238, "rewards/check_originality_func": 0.5714285969734192, "step": 908 }, { "completion_length": 113.50000762939453, "epoch": 0.8723608445297505, "grad_norm": 2.72515606880188, "kl": 0.7950993180274963, "learning_rate": 4.790737275133064e-07, "loss": 0.0008, "reward": 0.785714328289032, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.785714328289032, "step": 909 }, { "completion_length": 201.35714721679688, "epoch": 0.8733205374280231, "grad_norm": 0.902843177318573, "kl": 0.5756837129592896, "learning_rate": 4.78989788132333e-07, "loss": 0.0006, "reward": 0.6428571939468384, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.6428571939468384, "step": 910 }, { "completion_length": 184.2857208251953, "epoch": 0.8742802303262955, "grad_norm": 2.6755573749542236, "kl": 0.7159241437911987, "learning_rate": 4.789056881231913e-07, "loss": 0.0007, "reward": 0.3571428656578064, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.3571428656578064, "step": 911 }, { "completion_length": 153.7857208251953, "epoch": 0.8752399232245681, "grad_norm": 3.5757830142974854, "kl": 0.5553666949272156, "learning_rate": 4.788214275448745e-07, "loss": 0.0006, "reward": 0.7142857313156128, "reward_std": 0.4040610194206238, "rewards/check_originality_func": 0.7142857313156128, "step": 912 }, { "completion_length": 125.85714721679688, "epoch": 0.8761996161228407, "grad_norm": 4.558701992034912, "kl": 0.8218669295310974, "learning_rate": 4.787370064564882e-07, "loss": 0.0008, "reward": 0.5, "reward_std": 0.5050762891769409, "rewards/check_originality_func": 0.5, "step": 913 }, { "completion_length": 140.6428680419922, "epoch": 0.8771593090211133, "grad_norm": 2.587376594543457, "kl": 0.4561639130115509, "learning_rate": 4.786524249172511e-07, "loss": 0.0005, "reward": 0.7142857313156128, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.7142857313156128, "step": 914 }, { "completion_length": 137.85714721679688, "epoch": 0.8781190019193857, "grad_norm": 3.649531602859497, "kl": 0.804486095905304, "learning_rate": 4.785676829864939e-07, "loss": 0.0008, "reward": 0.5, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.5, "step": 915 }, { "completion_length": 157.57144165039062, "epoch": 0.8790786948176583, "grad_norm": 3.7428252696990967, "kl": 0.48060935735702515, "learning_rate": 4.784827807236601e-07, "loss": 0.0005, "reward": 0.6428571939468384, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.6428571939468384, "step": 916 }, { "completion_length": 147.57144165039062, "epoch": 0.8800383877159309, "grad_norm": 1.3370546102523804, "kl": 0.698165237903595, "learning_rate": 4.783977181883055e-07, "loss": 0.0007, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 917 }, { "completion_length": 115.00000762939453, "epoch": 0.8809980806142035, "grad_norm": 4.100742816925049, "kl": 0.9257350564002991, "learning_rate": 4.783124954400987e-07, "loss": 0.0009, "reward": 0.5, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.5, "step": 918 }, { "completion_length": 164.7857208251953, "epoch": 0.8819577735124761, "grad_norm": 0.07091014087200165, "kl": 0.49472200870513916, "learning_rate": 4.782271125388202e-07, "loss": 0.0005, "reward": 0.7142857313156128, "reward_std": 0.0, "rewards/check_originality_func": 0.7142857313156128, "step": 919 }, { "completion_length": 195.6428680419922, "epoch": 0.8829174664107485, "grad_norm": 1.5378797054290771, "kl": 0.5948125720024109, "learning_rate": 4.781415695443631e-07, "loss": 0.0006, "reward": 0.785714328289032, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.785714328289032, "step": 920 }, { "completion_length": 213.92857360839844, "epoch": 0.8838771593090211, "grad_norm": 2.3839025497436523, "kl": 0.7300068140029907, "learning_rate": 4.780558665167329e-07, "loss": 0.0007, "reward": 0.5714285969734192, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.5714285969734192, "step": 921 }, { "completion_length": 141.5, "epoch": 0.8848368522072937, "grad_norm": 5.368049621582031, "kl": 0.6869112253189087, "learning_rate": 4.779700035160469e-07, "loss": 0.0007, "reward": 0.785714328289032, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.785714328289032, "step": 922 }, { "completion_length": 124.28572082519531, "epoch": 0.8857965451055663, "grad_norm": 3.1096692085266113, "kl": 0.7134397029876709, "learning_rate": 4.778839806025353e-07, "loss": 0.0007, "reward": 0.785714328289032, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.785714328289032, "step": 923 }, { "completion_length": 133.07144165039062, "epoch": 0.8867562380038387, "grad_norm": 4.343780040740967, "kl": 0.6489086151123047, "learning_rate": 4.777977978365399e-07, "loss": 0.0006, "reward": 0.5714285969734192, "reward_std": 0.4040610194206238, "rewards/check_originality_func": 0.5714285969734192, "step": 924 }, { "completion_length": 144.71429443359375, "epoch": 0.8877159309021113, "grad_norm": 5.038804054260254, "kl": 0.7560345530509949, "learning_rate": 4.777114552785149e-07, "loss": 0.0008, "reward": 0.5714285969734192, "reward_std": 0.4040610194206238, "rewards/check_originality_func": 0.5714285969734192, "step": 925 }, { "completion_length": 122.64286041259766, "epoch": 0.8886756238003839, "grad_norm": 1.6708567142486572, "kl": 0.5888431668281555, "learning_rate": 4.776249529890263e-07, "loss": 0.0006, "reward": 0.5, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.5, "step": 926 }, { "completion_length": 226.35714721679688, "epoch": 0.8896353166986565, "grad_norm": 2.4958391189575195, "kl": 0.40838778018951416, "learning_rate": 4.775382910287527e-07, "loss": 0.0004, "reward": 0.785714328289032, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.785714328289032, "step": 927 }, { "completion_length": 216.00001525878906, "epoch": 0.8905950095969289, "grad_norm": 2.9395296573638916, "kl": 0.6504149436950684, "learning_rate": 4.774514694584843e-07, "loss": 0.0007, "reward": 0.6428571939468384, "reward_std": 0.5050762891769409, "rewards/check_originality_func": 0.6428571939468384, "step": 928 }, { "completion_length": 169.2857208251953, "epoch": 0.8915547024952015, "grad_norm": 1.4439791440963745, "kl": 0.6328601241111755, "learning_rate": 4.773644883391232e-07, "loss": 0.0006, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 929 }, { "completion_length": 164.0, "epoch": 0.8925143953934741, "grad_norm": 2.512023687362671, "kl": 0.6588895320892334, "learning_rate": 4.772773477316836e-07, "loss": 0.0007, "reward": 0.6428571939468384, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.6428571939468384, "step": 930 }, { "completion_length": 171.71429443359375, "epoch": 0.8934740882917467, "grad_norm": 2.7392830848693848, "kl": 0.5339912176132202, "learning_rate": 4.771900476972917e-07, "loss": 0.0005, "reward": 0.6428571939468384, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.6428571939468384, "step": 931 }, { "completion_length": 141.7857208251953, "epoch": 0.8944337811900192, "grad_norm": 2.3056187629699707, "kl": 0.48650556802749634, "learning_rate": 4.771025882971851e-07, "loss": 0.0005, "reward": 0.8571429252624512, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.8571429252624512, "step": 932 }, { "completion_length": 200.21429443359375, "epoch": 0.8953934740882917, "grad_norm": 1.746018886566162, "kl": 0.47448521852493286, "learning_rate": 4.770149695927136e-07, "loss": 0.0005, "reward": 0.4285714626312256, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.4285714626312256, "step": 933 }, { "completion_length": 169.6428680419922, "epoch": 0.8963531669865643, "grad_norm": 2.7914226055145264, "kl": 0.6590645909309387, "learning_rate": 4.769271916453385e-07, "loss": 0.0007, "reward": 0.7142857313156128, "reward_std": 0.4040610194206238, "rewards/check_originality_func": 0.7142857313156128, "step": 934 }, { "completion_length": 197.71429443359375, "epoch": 0.8973128598848369, "grad_norm": 2.3273396492004395, "kl": 1.036407709121704, "learning_rate": 4.76839254516633e-07, "loss": 0.001, "reward": 0.5714285969734192, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.5714285969734192, "step": 935 }, { "completion_length": 185.2857208251953, "epoch": 0.8982725527831094, "grad_norm": 0.07301955670118332, "kl": 0.557611346244812, "learning_rate": 4.7675115826828185e-07, "loss": 0.0006, "reward": 0.7142857313156128, "reward_std": 0.0, "rewards/check_originality_func": 0.7142857313156128, "step": 936 }, { "completion_length": 144.92857360839844, "epoch": 0.8992322456813819, "grad_norm": 1.413051724433899, "kl": 0.5780062079429626, "learning_rate": 4.766629029620814e-07, "loss": 0.0006, "reward": 0.785714328289032, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.785714328289032, "step": 937 }, { "completion_length": 182.07144165039062, "epoch": 0.9001919385796545, "grad_norm": 2.096040964126587, "kl": 0.5069324374198914, "learning_rate": 4.7657448865993953e-07, "loss": 0.0005, "reward": 0.8571429252624512, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.8571429252624512, "step": 938 }, { "completion_length": 179.50001525878906, "epoch": 0.9011516314779271, "grad_norm": 2.4069414138793945, "kl": 0.5559120774269104, "learning_rate": 4.7648591542387583e-07, "loss": 0.0006, "reward": 0.7142857313156128, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.7142857313156128, "step": 939 }, { "completion_length": 145.57144165039062, "epoch": 0.9021113243761996, "grad_norm": 2.842611789703369, "kl": 0.5579888224601746, "learning_rate": 4.7639718331602117e-07, "loss": 0.0006, "reward": 0.7142857313156128, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.7142857313156128, "step": 940 }, { "completion_length": 154.71429443359375, "epoch": 0.9030710172744721, "grad_norm": 2.2552073001861572, "kl": 0.5744655728340149, "learning_rate": 4.7630829239861816e-07, "loss": 0.0006, "reward": 0.5714285969734192, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.5714285969734192, "step": 941 }, { "completion_length": 187.21429443359375, "epoch": 0.9040307101727447, "grad_norm": 2.1283798217773438, "kl": 0.5330450534820557, "learning_rate": 4.7621924273402024e-07, "loss": 0.0005, "reward": 0.8571429252624512, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.8571429252624512, "step": 942 }, { "completion_length": 172.50001525878906, "epoch": 0.9049904030710173, "grad_norm": 3.436983346939087, "kl": 0.6441758275032043, "learning_rate": 4.761300343846929e-07, "loss": 0.0006, "reward": 0.785714328289032, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.785714328289032, "step": 943 }, { "completion_length": 155.6428680419922, "epoch": 0.9059500959692899, "grad_norm": 2.119119644165039, "kl": 0.5775772929191589, "learning_rate": 4.7604066741321253e-07, "loss": 0.0006, "reward": 0.8571429252624512, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.8571429252624512, "step": 944 }, { "completion_length": 136.0, "epoch": 0.9069097888675623, "grad_norm": 0.06944773346185684, "kl": 0.5501778721809387, "learning_rate": 4.759511418822668e-07, "loss": 0.0006, "reward": 0.7142857313156128, "reward_std": 0.0, "rewards/check_originality_func": 0.7142857313156128, "step": 945 }, { "completion_length": 175.07144165039062, "epoch": 0.9078694817658349, "grad_norm": 1.3189332485198975, "kl": 0.4155198037624359, "learning_rate": 4.758614578546548e-07, "loss": 0.0004, "reward": 0.785714328289032, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.785714328289032, "step": 946 }, { "completion_length": 146.7857208251953, "epoch": 0.9088291746641075, "grad_norm": 2.1159911155700684, "kl": 0.5978903770446777, "learning_rate": 4.7577161539328646e-07, "loss": 0.0006, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 947 }, { "completion_length": 174.71429443359375, "epoch": 0.9097888675623801, "grad_norm": 2.4283671379089355, "kl": 0.5074416399002075, "learning_rate": 4.7568161456118327e-07, "loss": 0.0005, "reward": 0.7142857313156128, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.7142857313156128, "step": 948 }, { "completion_length": 140.2857208251953, "epoch": 0.9107485604606526, "grad_norm": 2.379621744155884, "kl": 0.5491951107978821, "learning_rate": 4.755914554214776e-07, "loss": 0.0005, "reward": 0.7142857313156128, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.7142857313156128, "step": 949 }, { "completion_length": 173.2857208251953, "epoch": 0.9117082533589251, "grad_norm": 2.6015756130218506, "kl": 0.44321900606155396, "learning_rate": 4.7550113803741275e-07, "loss": 0.0004, "reward": 0.785714328289032, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.785714328289032, "step": 950 }, { "completion_length": 125.5714340209961, "epoch": 0.9126679462571977, "grad_norm": 2.8672070503234863, "kl": 0.7231409549713135, "learning_rate": 4.7541066247234317e-07, "loss": 0.0007, "reward": 0.785714328289032, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.785714328289032, "step": 951 }, { "completion_length": 154.57144165039062, "epoch": 0.9136276391554703, "grad_norm": 0.9741955399513245, "kl": 0.5351475477218628, "learning_rate": 4.7532002878973434e-07, "loss": 0.0005, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 952 }, { "completion_length": 156.0, "epoch": 0.9145873320537428, "grad_norm": 2.116065740585327, "kl": 0.4656742811203003, "learning_rate": 4.752292370531624e-07, "loss": 0.0005, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 953 }, { "completion_length": 223.9285888671875, "epoch": 0.9155470249520153, "grad_norm": 2.3340420722961426, "kl": 0.46402183175086975, "learning_rate": 4.7513828732631466e-07, "loss": 0.0005, "reward": 0.7142857313156128, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.7142857313156128, "step": 954 }, { "completion_length": 171.6428680419922, "epoch": 0.9165067178502879, "grad_norm": 1.9541587829589844, "kl": 0.46876075863838196, "learning_rate": 4.75047179672989e-07, "loss": 0.0005, "reward": 0.6428571939468384, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.6428571939468384, "step": 955 }, { "completion_length": 231.50001525878906, "epoch": 0.9174664107485605, "grad_norm": 1.9142409563064575, "kl": 0.41474419832229614, "learning_rate": 4.749559141570943e-07, "loss": 0.0004, "reward": 0.7142857313156128, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.7142857313156128, "step": 956 }, { "completion_length": 176.21429443359375, "epoch": 0.918426103646833, "grad_norm": 2.19897198677063, "kl": 0.5869089365005493, "learning_rate": 4.7486449084265e-07, "loss": 0.0006, "reward": 0.8571429252624512, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.8571429252624512, "step": 957 }, { "completion_length": 218.7857208251953, "epoch": 0.9193857965451055, "grad_norm": 1.5966094732284546, "kl": 0.4046158194541931, "learning_rate": 4.7477290979378626e-07, "loss": 0.0004, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 958 }, { "completion_length": 168.85714721679688, "epoch": 0.9203454894433781, "grad_norm": 1.9081134796142578, "kl": 0.5629347562789917, "learning_rate": 4.746811710747439e-07, "loss": 0.0006, "reward": 0.7142857313156128, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.7142857313156128, "step": 959 }, { "completion_length": 164.42857360839844, "epoch": 0.9213051823416507, "grad_norm": 3.707995891571045, "kl": 0.5230230689048767, "learning_rate": 4.7458927474987454e-07, "loss": 0.0005, "reward": 0.5714285969734192, "reward_std": 0.4040610194206238, "rewards/check_originality_func": 0.5714285969734192, "step": 960 }, { "completion_length": 185.35714721679688, "epoch": 0.9222648752399232, "grad_norm": 1.9233053922653198, "kl": 0.41237136721611023, "learning_rate": 4.7449722088363996e-07, "loss": 0.0004, "reward": 0.6428571939468384, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.6428571939468384, "step": 961 }, { "completion_length": 190.07144165039062, "epoch": 0.9232245681381958, "grad_norm": 1.4365935325622559, "kl": 0.8154608011245728, "learning_rate": 4.744050095406128e-07, "loss": 0.0008, "reward": 0.785714328289032, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.785714328289032, "step": 962 }, { "completion_length": 183.7857208251953, "epoch": 0.9241842610364683, "grad_norm": 2.565647602081299, "kl": 0.4094181954860687, "learning_rate": 4.74312640785476e-07, "loss": 0.0004, "reward": 0.8571429252624512, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.8571429252624512, "step": 963 }, { "completion_length": 204.00001525878906, "epoch": 0.9251439539347409, "grad_norm": 0.14955644309520721, "kl": 0.6066585779190063, "learning_rate": 4.742201146830229e-07, "loss": 0.0006, "reward": 1.0, "reward_std": 0.0, "rewards/check_originality_func": 1.0, "step": 964 }, { "completion_length": 161.71429443359375, "epoch": 0.9261036468330134, "grad_norm": 2.336005926132202, "kl": 0.4798429608345032, "learning_rate": 4.7412743129815726e-07, "loss": 0.0005, "reward": 0.8571429252624512, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.8571429252624512, "step": 965 }, { "completion_length": 180.71429443359375, "epoch": 0.927063339731286, "grad_norm": 1.7383607625961304, "kl": 0.5032739639282227, "learning_rate": 4.7403459069589325e-07, "loss": 0.0005, "reward": 0.8571429252624512, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.8571429252624512, "step": 966 }, { "completion_length": 195.42857360839844, "epoch": 0.9280230326295585, "grad_norm": 2.0726964473724365, "kl": 0.41781705617904663, "learning_rate": 4.7394159294135517e-07, "loss": 0.0004, "reward": 0.785714328289032, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.785714328289032, "step": 967 }, { "completion_length": 118.5714340209961, "epoch": 0.9289827255278311, "grad_norm": 2.7632758617401123, "kl": 0.5380629897117615, "learning_rate": 4.738484380997776e-07, "loss": 0.0005, "reward": 0.8571429252624512, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.8571429252624512, "step": 968 }, { "completion_length": 124.0714340209961, "epoch": 0.9299424184261037, "grad_norm": 2.006035089492798, "kl": 0.5457581877708435, "learning_rate": 4.737551262365054e-07, "loss": 0.0005, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 969 }, { "completion_length": 218.07144165039062, "epoch": 0.9309021113243762, "grad_norm": 1.5539145469665527, "kl": 0.3765806555747986, "learning_rate": 4.7366165741699347e-07, "loss": 0.0004, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 970 }, { "completion_length": 187.57144165039062, "epoch": 0.9318618042226487, "grad_norm": 2.3409149646759033, "kl": 0.39740118384361267, "learning_rate": 4.7356803170680694e-07, "loss": 0.0004, "reward": 0.785714328289032, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.785714328289032, "step": 971 }, { "completion_length": 218.42857360839844, "epoch": 0.9328214971209213, "grad_norm": 1.623311161994934, "kl": 0.3792901933193207, "learning_rate": 4.734742491716208e-07, "loss": 0.0004, "reward": 0.5714285969734192, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.5714285969734192, "step": 972 }, { "completion_length": 184.71429443359375, "epoch": 0.9337811900191939, "grad_norm": 2.6382839679718018, "kl": 0.4517477750778198, "learning_rate": 4.733803098772202e-07, "loss": 0.0005, "reward": 0.6428571939468384, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.6428571939468384, "step": 973 }, { "completion_length": 174.2857208251953, "epoch": 0.9347408829174664, "grad_norm": 2.2549326419830322, "kl": 0.4309324324131012, "learning_rate": 4.732862138895003e-07, "loss": 0.0004, "reward": 0.7142857313156128, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.7142857313156128, "step": 974 }, { "completion_length": 119.71429443359375, "epoch": 0.935700575815739, "grad_norm": 2.324214458465576, "kl": 0.525473415851593, "learning_rate": 4.731919612744659e-07, "loss": 0.0005, "reward": 0.7142857313156128, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.7142857313156128, "step": 975 }, { "completion_length": 184.42857360839844, "epoch": 0.9366602687140115, "grad_norm": 3.006110429763794, "kl": 0.4913351535797119, "learning_rate": 4.7309755209823193e-07, "loss": 0.0005, "reward": 0.5714285969734192, "reward_std": 0.4040610194206238, "rewards/check_originality_func": 0.5714285969734192, "step": 976 }, { "completion_length": 144.35714721679688, "epoch": 0.9376199616122841, "grad_norm": 0.08966536819934845, "kl": 0.5570021867752075, "learning_rate": 4.730029864270231e-07, "loss": 0.0006, "reward": 0.8571429252624512, "reward_std": 0.0, "rewards/check_originality_func": 0.8571429252624512, "step": 977 }, { "completion_length": 112.28572082519531, "epoch": 0.9385796545105566, "grad_norm": 3.025169610977173, "kl": 0.6688101887702942, "learning_rate": 4.729082643271738e-07, "loss": 0.0007, "reward": 0.785714328289032, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.785714328289032, "step": 978 }, { "completion_length": 216.07144165039062, "epoch": 0.9395393474088292, "grad_norm": 2.907849073410034, "kl": 1.0423882007598877, "learning_rate": 4.728133858651281e-07, "loss": 0.001, "reward": 0.7142857313156128, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.7142857313156128, "step": 979 }, { "completion_length": 159.21429443359375, "epoch": 0.9404990403071017, "grad_norm": 4.093930721282959, "kl": 0.5541455745697021, "learning_rate": 4.727183511074401e-07, "loss": 0.0006, "reward": 0.8571429252624512, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.8571429252624512, "step": 980 }, { "completion_length": 117.00000762939453, "epoch": 0.9414587332053743, "grad_norm": 0.04566064476966858, "kl": 0.6181661486625671, "learning_rate": 4.7262316012077317e-07, "loss": 0.0006, "reward": 0.8571429252624512, "reward_std": 0.0, "rewards/check_originality_func": 0.8571429252624512, "step": 981 }, { "completion_length": 171.42857360839844, "epoch": 0.9424184261036468, "grad_norm": 1.6505250930786133, "kl": 0.5022579431533813, "learning_rate": 4.725278129719004e-07, "loss": 0.0005, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 982 }, { "completion_length": 185.00001525878906, "epoch": 0.9433781190019194, "grad_norm": 1.9682224988937378, "kl": 0.7400583028793335, "learning_rate": 4.7243230972770434e-07, "loss": 0.0007, "reward": 0.8571429252624512, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.8571429252624512, "step": 983 }, { "completion_length": 261.71429443359375, "epoch": 0.944337811900192, "grad_norm": 0.025403108447790146, "kl": 0.35206183791160583, "learning_rate": 4.723366504551772e-07, "loss": 0.0004, "reward": 0.8571429252624512, "reward_std": 0.0, "rewards/check_originality_func": 0.8571429252624512, "step": 984 }, { "completion_length": 174.00001525878906, "epoch": 0.9452975047984645, "grad_norm": 3.041748046875, "kl": 0.581104040145874, "learning_rate": 4.7224083522142056e-07, "loss": 0.0006, "reward": 0.6428571939468384, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.6428571939468384, "step": 985 }, { "completion_length": 150.6428680419922, "epoch": 0.946257197696737, "grad_norm": 2.959774971008301, "kl": 0.6435522437095642, "learning_rate": 4.7214486409364527e-07, "loss": 0.0006, "reward": 0.7142857313156128, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.7142857313156128, "step": 986 }, { "completion_length": 175.85714721679688, "epoch": 0.9472168905950096, "grad_norm": 4.238525390625, "kl": 0.5815308690071106, "learning_rate": 4.7204873713917177e-07, "loss": 0.0006, "reward": 0.8571429252624512, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.8571429252624512, "step": 987 }, { "completion_length": 167.2857208251953, "epoch": 0.9481765834932822, "grad_norm": 1.4913158416748047, "kl": 0.8324114084243774, "learning_rate": 4.7195245442542965e-07, "loss": 0.0008, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 988 }, { "completion_length": 161.21429443359375, "epoch": 0.9491362763915547, "grad_norm": 2.2882015705108643, "kl": 0.8077451586723328, "learning_rate": 4.7185601601995784e-07, "loss": 0.0008, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 989 }, { "completion_length": 201.85714721679688, "epoch": 0.9500959692898272, "grad_norm": 1.7933698892593384, "kl": 0.6332314014434814, "learning_rate": 4.717594219904043e-07, "loss": 0.0006, "reward": 0.785714328289032, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.785714328289032, "step": 990 }, { "completion_length": 230.7857208251953, "epoch": 0.9510556621880998, "grad_norm": 2.2297654151916504, "kl": 0.6131426692008972, "learning_rate": 4.716626724045265e-07, "loss": 0.0006, "reward": 0.785714328289032, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.785714328289032, "step": 991 }, { "completion_length": 166.6428680419922, "epoch": 0.9520153550863724, "grad_norm": 0.070705346763134, "kl": 0.6580756902694702, "learning_rate": 4.7156576733019073e-07, "loss": 0.0007, "reward": 1.0, "reward_std": 0.0, "rewards/check_originality_func": 1.0, "step": 992 }, { "completion_length": 282.4285888671875, "epoch": 0.9529750479846449, "grad_norm": 1.066772222518921, "kl": 0.4011736214160919, "learning_rate": 4.7146870683537236e-07, "loss": 0.0004, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 993 }, { "completion_length": 182.7857208251953, "epoch": 0.9539347408829175, "grad_norm": 1.661795973777771, "kl": 0.6564017534255981, "learning_rate": 4.7137149098815597e-07, "loss": 0.0007, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 994 }, { "completion_length": 192.57144165039062, "epoch": 0.95489443378119, "grad_norm": 0.6934019327163696, "kl": 1.0252406597137451, "learning_rate": 4.71274119856735e-07, "loss": 0.001, "reward": 0.8571429252624512, "reward_std": 0.0, "rewards/check_originality_func": 0.8571429252624512, "step": 995 }, { "completion_length": 206.42857360839844, "epoch": 0.9558541266794626, "grad_norm": 2.871472120285034, "kl": 0.6459137797355652, "learning_rate": 4.7117659350941175e-07, "loss": 0.0006, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 996 }, { "completion_length": 206.00001525878906, "epoch": 0.9568138195777351, "grad_norm": 1.2527952194213867, "kl": 0.5170645713806152, "learning_rate": 4.710789120145976e-07, "loss": 0.0005, "reward": 0.785714328289032, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.785714328289032, "step": 997 }, { "completion_length": 189.42857360839844, "epoch": 0.9577735124760077, "grad_norm": 1.019492268562317, "kl": 0.7305828332901001, "learning_rate": 4.7098107544081255e-07, "loss": 0.0007, "reward": 0.785714328289032, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.785714328289032, "step": 998 }, { "completion_length": 160.42857360839844, "epoch": 0.9587332053742802, "grad_norm": 0.03335558623075485, "kl": 0.6270844340324402, "learning_rate": 4.708830838566854e-07, "loss": 0.0006, "reward": 0.7142857313156128, "reward_std": 0.0, "rewards/check_originality_func": 0.7142857313156128, "step": 999 }, { "completion_length": 169.71429443359375, "epoch": 0.9596928982725528, "grad_norm": 1.7459685802459717, "kl": 0.580906331539154, "learning_rate": 4.7078493733095393e-07, "loss": 0.0006, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 1000 }, { "completion_length": 266.0, "epoch": 0.9606525911708254, "grad_norm": 1.8476630449295044, "kl": 0.5107110738754272, "learning_rate": 4.706866359324643e-07, "loss": 0.0005, "reward": 0.8571429252624512, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.8571429252624512, "step": 1001 }, { "completion_length": 139.85714721679688, "epoch": 0.9616122840690979, "grad_norm": 1.9665141105651855, "kl": 0.6823427081108093, "learning_rate": 4.705881797301714e-07, "loss": 0.0007, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 1002 }, { "completion_length": 134.6428680419922, "epoch": 0.9625719769673704, "grad_norm": 4.0630879402160645, "kl": 1.0166819095611572, "learning_rate": 4.704895687931389e-07, "loss": 0.001, "reward": 0.785714328289032, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.785714328289032, "step": 1003 }, { "completion_length": 196.1428680419922, "epoch": 0.963531669865643, "grad_norm": 1.2827757596969604, "kl": 0.6695514917373657, "learning_rate": 4.703908031905387e-07, "loss": 0.0007, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 1004 }, { "completion_length": 195.6428680419922, "epoch": 0.9644913627639156, "grad_norm": 1.4921987056732178, "kl": 0.5807093977928162, "learning_rate": 4.7029188299165144e-07, "loss": 0.0006, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 1005 }, { "completion_length": 180.07144165039062, "epoch": 0.9654510556621881, "grad_norm": 0.048924922943115234, "kl": 0.6378337144851685, "learning_rate": 4.7019280826586604e-07, "loss": 0.0006, "reward": 0.7142857313156128, "reward_std": 0.0, "rewards/check_originality_func": 0.7142857313156128, "step": 1006 }, { "completion_length": 238.35714721679688, "epoch": 0.9664107485604606, "grad_norm": 1.883095383644104, "kl": 0.5012056827545166, "learning_rate": 4.7009357908268e-07, "loss": 0.0005, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 1007 }, { "completion_length": 234.35714721679688, "epoch": 0.9673704414587332, "grad_norm": 1.5289462804794312, "kl": 0.552444577217102, "learning_rate": 4.6999419551169886e-07, "loss": 0.0006, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 1008 }, { "completion_length": 207.1428680419922, "epoch": 0.9683301343570058, "grad_norm": 3.4268507957458496, "kl": 0.5448492169380188, "learning_rate": 4.6989465762263686e-07, "loss": 0.0005, "reward": 0.785714328289032, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.785714328289032, "step": 1009 }, { "completion_length": 233.71429443359375, "epoch": 0.9692898272552783, "grad_norm": 1.9168721437454224, "kl": 0.4878080487251282, "learning_rate": 4.6979496548531614e-07, "loss": 0.0005, "reward": 0.785714328289032, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.785714328289032, "step": 1010 }, { "completion_length": 228.2857208251953, "epoch": 0.9702495201535508, "grad_norm": 0.10641372948884964, "kl": 0.6249610781669617, "learning_rate": 4.696951191696673e-07, "loss": 0.0006, "reward": 1.0, "reward_std": 0.0, "rewards/check_originality_func": 1.0, "step": 1011 }, { "completion_length": 166.57144165039062, "epoch": 0.9712092130518234, "grad_norm": 0.08914443850517273, "kl": 0.7231295704841614, "learning_rate": 4.6959511874572875e-07, "loss": 0.0007, "reward": 1.0, "reward_std": 0.0, "rewards/check_originality_func": 1.0, "step": 1012 }, { "completion_length": 216.07144165039062, "epoch": 0.972168905950096, "grad_norm": 2.6702067852020264, "kl": 0.7317110300064087, "learning_rate": 4.694949642836475e-07, "loss": 0.0007, "reward": 0.785714328289032, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.785714328289032, "step": 1013 }, { "completion_length": 182.6428680419922, "epoch": 0.9731285988483686, "grad_norm": 2.761906385421753, "kl": 0.6080414056777954, "learning_rate": 4.693946558536782e-07, "loss": 0.0006, "reward": 0.8571429252624512, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.8571429252624512, "step": 1014 }, { "completion_length": 207.07144165039062, "epoch": 0.974088291746641, "grad_norm": 1.0536783933639526, "kl": 0.6973207592964172, "learning_rate": 4.6929419352618354e-07, "loss": 0.0007, "reward": 0.5, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.5, "step": 1015 }, { "completion_length": 186.42857360839844, "epoch": 0.9750479846449136, "grad_norm": 2.857184886932373, "kl": 0.6753986477851868, "learning_rate": 4.691935773716344e-07, "loss": 0.0007, "reward": 0.8571429252624512, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.8571429252624512, "step": 1016 }, { "completion_length": 151.1428680419922, "epoch": 0.9760076775431862, "grad_norm": 3.808972120285034, "kl": 0.8129858374595642, "learning_rate": 4.6909280746060936e-07, "loss": 0.0008, "reward": 0.8571429252624512, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.8571429252624512, "step": 1017 }, { "completion_length": 220.50001525878906, "epoch": 0.9769673704414588, "grad_norm": 2.4237136840820312, "kl": 0.6947652697563171, "learning_rate": 4.6899188386379496e-07, "loss": 0.0007, "reward": 0.7142857313156128, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.7142857313156128, "step": 1018 }, { "completion_length": 191.71429443359375, "epoch": 0.9779270633397313, "grad_norm": 0.1763453185558319, "kl": 0.7699907422065735, "learning_rate": 4.688908066519854e-07, "loss": 0.0008, "reward": 1.0, "reward_std": 0.0, "rewards/check_originality_func": 1.0, "step": 1019 }, { "completion_length": 159.0, "epoch": 0.9788867562380038, "grad_norm": 1.2772748470306396, "kl": 0.7715043425559998, "learning_rate": 4.6878957589608293e-07, "loss": 0.0008, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 1020 }, { "completion_length": 135.7857208251953, "epoch": 0.9798464491362764, "grad_norm": 0.10654929280281067, "kl": 0.7626136541366577, "learning_rate": 4.6868819166709716e-07, "loss": 0.0008, "reward": 0.7142857313156128, "reward_std": 0.0, "rewards/check_originality_func": 0.7142857313156128, "step": 1021 }, { "completion_length": 190.50001525878906, "epoch": 0.980806142034549, "grad_norm": 0.09895492345094681, "kl": 0.8164024949073792, "learning_rate": 4.685866540361455e-07, "loss": 0.0008, "reward": 1.0, "reward_std": 0.0, "rewards/check_originality_func": 1.0, "step": 1022 }, { "completion_length": 143.5, "epoch": 0.9817658349328215, "grad_norm": 3.523005485534668, "kl": 0.9566010236740112, "learning_rate": 4.6848496307445306e-07, "loss": 0.001, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 1023 }, { "completion_length": 265.0, "epoch": 0.982725527831094, "grad_norm": 1.1807388067245483, "kl": 0.5083954334259033, "learning_rate": 4.683831188533524e-07, "loss": 0.0005, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 1024 }, { "completion_length": 201.42857360839844, "epoch": 0.9836852207293666, "grad_norm": 1.3579745292663574, "kl": 0.7622183561325073, "learning_rate": 4.6828112144428355e-07, "loss": 0.0008, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 1025 }, { "completion_length": 172.07144165039062, "epoch": 0.9846449136276392, "grad_norm": 0.2688550651073456, "kl": 1.1163619756698608, "learning_rate": 4.681789709187942e-07, "loss": 0.0011, "reward": 1.0, "reward_std": 0.0, "rewards/check_originality_func": 1.0, "step": 1026 }, { "completion_length": 225.35714721679688, "epoch": 0.9856046065259118, "grad_norm": 1.0787605047225952, "kl": 0.7350795865058899, "learning_rate": 4.680766673485391e-07, "loss": 0.0007, "reward": 0.785714328289032, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.785714328289032, "step": 1027 }, { "completion_length": 183.00001525878906, "epoch": 0.9865642994241842, "grad_norm": 0.09089431166648865, "kl": 0.7582247257232666, "learning_rate": 4.679742108052807e-07, "loss": 0.0008, "reward": 1.0, "reward_std": 0.0, "rewards/check_originality_func": 1.0, "step": 1028 }, { "completion_length": 207.6428680419922, "epoch": 0.9875239923224568, "grad_norm": 1.3696401119232178, "kl": 0.7119497656822205, "learning_rate": 4.6787160136088856e-07, "loss": 0.0007, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 1029 }, { "completion_length": 216.92857360839844, "epoch": 0.9884836852207294, "grad_norm": 0.05766696855425835, "kl": 0.6659984588623047, "learning_rate": 4.6776883908733956e-07, "loss": 0.0007, "reward": 1.0, "reward_std": 0.0, "rewards/check_originality_func": 1.0, "step": 1030 }, { "completion_length": 216.00001525878906, "epoch": 0.989443378119002, "grad_norm": 0.12378384172916412, "kl": 0.7684391140937805, "learning_rate": 4.6766592405671774e-07, "loss": 0.0008, "reward": 1.0, "reward_std": 0.0, "rewards/check_originality_func": 1.0, "step": 1031 }, { "completion_length": 214.71429443359375, "epoch": 0.9904030710172744, "grad_norm": 1.4756966829299927, "kl": 0.6215579509735107, "learning_rate": 4.6756285634121433e-07, "loss": 0.0006, "reward": 0.8571429252624512, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.8571429252624512, "step": 1032 }, { "completion_length": 234.7857208251953, "epoch": 0.991362763915547, "grad_norm": 2.3846795558929443, "kl": 0.5907265543937683, "learning_rate": 4.674596360131277e-07, "loss": 0.0006, "reward": 0.8571429252624512, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.8571429252624512, "step": 1033 }, { "completion_length": 233.4285888671875, "epoch": 0.9923224568138196, "grad_norm": 0.04051128402352333, "kl": 0.5566267967224121, "learning_rate": 4.673562631448632e-07, "loss": 0.0006, "reward": 1.0, "reward_std": 0.0, "rewards/check_originality_func": 1.0, "step": 1034 }, { "completion_length": 215.07144165039062, "epoch": 0.9932821497120922, "grad_norm": 0.0446697399020195, "kl": 0.5783951282501221, "learning_rate": 4.6725273780893323e-07, "loss": 0.0006, "reward": 1.0, "reward_std": 0.0, "rewards/check_originality_func": 1.0, "step": 1035 }, { "completion_length": 212.21429443359375, "epoch": 0.9942418426103646, "grad_norm": 2.7370519638061523, "kl": 0.6483216285705566, "learning_rate": 4.671490600779571e-07, "loss": 0.0006, "reward": 0.8571429252624512, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.8571429252624512, "step": 1036 }, { "completion_length": 190.6428680419922, "epoch": 0.9952015355086372, "grad_norm": 1.3498822450637817, "kl": 0.7377363443374634, "learning_rate": 4.6704523002466094e-07, "loss": 0.0007, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 1037 }, { "completion_length": 133.92857360839844, "epoch": 0.9961612284069098, "grad_norm": 2.90879487991333, "kl": 0.7516669034957886, "learning_rate": 4.6694124772187805e-07, "loss": 0.0008, "reward": 0.5714285969734192, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.5714285969734192, "step": 1038 }, { "completion_length": 169.92857360839844, "epoch": 0.9971209213051824, "grad_norm": 0.09781783819198608, "kl": 0.670177161693573, "learning_rate": 4.668371132425481e-07, "loss": 0.0007, "reward": 0.8571429252624512, "reward_std": 0.0, "rewards/check_originality_func": 0.8571429252624512, "step": 1039 }, { "completion_length": 185.42857360839844, "epoch": 0.9980806142034548, "grad_norm": 0.04361867159605026, "kl": 0.6423711180686951, "learning_rate": 4.667328266597178e-07, "loss": 0.0006, "reward": 1.0, "reward_std": 0.0, "rewards/check_originality_func": 1.0, "step": 1040 }, { "completion_length": 228.85714721679688, "epoch": 0.9990403071017274, "grad_norm": 1.543967843055725, "kl": 0.8174960613250732, "learning_rate": 4.6662838804654046e-07, "loss": 0.0008, "reward": 0.8571429252624512, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.8571429252624512, "step": 1041 }, { "completion_length": 218.45455932617188, "epoch": 1.0, "grad_norm": 2.283773422241211, "kl": 0.6711764931678772, "learning_rate": 4.665237974762761e-07, "loss": 0.0007, "reward": 0.6428571939468384, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.6428571939468384, "step": 1042 }, { "completion_length": 196.57144165039062, "epoch": 1.0009596928982725, "grad_norm": 1.8347153663635254, "kl": 0.6046781539916992, "learning_rate": 4.664190550222912e-07, "loss": 0.0006, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 1043 }, { "completion_length": 214.7857208251953, "epoch": 1.0019193857965452, "grad_norm": 1.7803683280944824, "kl": 0.544779360294342, "learning_rate": 4.6631416075805886e-07, "loss": 0.0005, "reward": 0.8571429252624512, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.8571429252624512, "step": 1044 }, { "completion_length": 130.21429443359375, "epoch": 1.0028790786948176, "grad_norm": 3.561687469482422, "kl": 2.912534236907959, "learning_rate": 4.6620911475715873e-07, "loss": 0.0029, "reward": 0.6428571939468384, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.6428571939468384, "step": 1045 }, { "completion_length": 193.1428680419922, "epoch": 1.0038387715930903, "grad_norm": 0.03597096726298332, "kl": 0.6084990501403809, "learning_rate": 4.661039170932767e-07, "loss": 0.0006, "reward": 1.0, "reward_std": 0.0, "rewards/check_originality_func": 1.0, "step": 1046 }, { "completion_length": 161.0, "epoch": 1.0047984644913628, "grad_norm": 2.5216116905212402, "kl": 3.5182764530181885, "learning_rate": 4.659985678402053e-07, "loss": 0.0035, "reward": 0.8571429252624512, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.8571429252624512, "step": 1047 }, { "completion_length": 172.85714721679688, "epoch": 1.0057581573896353, "grad_norm": 2.0068769454956055, "kl": 0.5522428750991821, "learning_rate": 4.658930670718433e-07, "loss": 0.0006, "reward": 0.6428571939468384, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.6428571939468384, "step": 1048 }, { "completion_length": 189.07144165039062, "epoch": 1.006717850287908, "grad_norm": 2.1906611919403076, "kl": 0.8987250328063965, "learning_rate": 4.6578741486219556e-07, "loss": 0.0009, "reward": 0.8571429252624512, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.8571429252624512, "step": 1049 }, { "completion_length": 230.7857208251953, "epoch": 1.0076775431861804, "grad_norm": 3.0883445739746094, "kl": 0.6262044310569763, "learning_rate": 4.6568161128537354e-07, "loss": 0.0006, "reward": 0.8571429252624512, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.8571429252624512, "step": 1050 }, { "completion_length": 188.50001525878906, "epoch": 1.0086372360844529, "grad_norm": 1.263244867324829, "kl": 0.5555644035339355, "learning_rate": 4.655756564155945e-07, "loss": 0.0006, "reward": 0.785714328289032, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.785714328289032, "step": 1051 }, { "completion_length": 186.1428680419922, "epoch": 1.0095969289827256, "grad_norm": 1.6303093433380127, "kl": 0.5648990869522095, "learning_rate": 4.6546955032718205e-07, "loss": 0.0006, "reward": 0.785714328289032, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.785714328289032, "step": 1052 }, { "completion_length": 226.57144165039062, "epoch": 1.010556621880998, "grad_norm": 2.490513324737549, "kl": 1.1083693504333496, "learning_rate": 4.653632930945659e-07, "loss": 0.0011, "reward": 0.7142857313156128, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.7142857313156128, "step": 1053 }, { "completion_length": 133.2857208251953, "epoch": 1.0115163147792707, "grad_norm": 1.268970012664795, "kl": 1.8183289766311646, "learning_rate": 4.6525688479228164e-07, "loss": 0.0018, "reward": 0.785714328289032, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.785714328289032, "step": 1054 }, { "completion_length": 192.2857208251953, "epoch": 1.0124760076775432, "grad_norm": 1.026919960975647, "kl": 0.572449803352356, "learning_rate": 4.651503254949709e-07, "loss": 0.0006, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 1055 }, { "completion_length": 184.85714721679688, "epoch": 1.0134357005758157, "grad_norm": 2.4330945014953613, "kl": 0.5947504043579102, "learning_rate": 4.6504361527738134e-07, "loss": 0.0006, "reward": 0.8571429252624512, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.8571429252624512, "step": 1056 }, { "completion_length": 210.7857208251953, "epoch": 1.0143953934740884, "grad_norm": 1.0829102993011475, "kl": 0.49785882234573364, "learning_rate": 4.6493675421436616e-07, "loss": 0.0005, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 1057 }, { "completion_length": 133.1428680419922, "epoch": 1.0153550863723608, "grad_norm": 1.610646367073059, "kl": 0.7612841129302979, "learning_rate": 4.648297423808848e-07, "loss": 0.0008, "reward": 0.785714328289032, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.785714328289032, "step": 1058 }, { "completion_length": 189.71429443359375, "epoch": 1.0163147792706333, "grad_norm": 1.4809596538543701, "kl": 0.5763490200042725, "learning_rate": 4.6472257985200205e-07, "loss": 0.0006, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 1059 }, { "completion_length": 200.21429443359375, "epoch": 1.017274472168906, "grad_norm": 2.258847236633301, "kl": 0.6763843894004822, "learning_rate": 4.6461526670288877e-07, "loss": 0.0007, "reward": 0.7142857313156128, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.7142857313156128, "step": 1060 }, { "completion_length": 214.2857208251953, "epoch": 1.0182341650671785, "grad_norm": 0.15683099627494812, "kl": 0.7772266864776611, "learning_rate": 4.6450780300882115e-07, "loss": 0.0008, "reward": 0.7142857313156128, "reward_std": 0.0, "rewards/check_originality_func": 0.7142857313156128, "step": 1061 }, { "completion_length": 222.07144165039062, "epoch": 1.0191938579654511, "grad_norm": 1.2052642107009888, "kl": 0.5012114644050598, "learning_rate": 4.644001888451813e-07, "loss": 0.0005, "reward": 0.785714328289032, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.785714328289032, "step": 1062 }, { "completion_length": 194.6428680419922, "epoch": 1.0201535508637236, "grad_norm": 0.028571326285600662, "kl": 0.524763822555542, "learning_rate": 4.642924242874565e-07, "loss": 0.0005, "reward": 0.8571429252624512, "reward_std": 0.0, "rewards/check_originality_func": 0.8571429252624512, "step": 1063 }, { "completion_length": 145.7857208251953, "epoch": 1.021113243761996, "grad_norm": 0.2643882632255554, "kl": 1.3214653730392456, "learning_rate": 4.6418450941123996e-07, "loss": 0.0013, "reward": 0.8571429252624512, "reward_std": 0.0, "rewards/check_originality_func": 0.8571429252624512, "step": 1064 }, { "completion_length": 182.85714721679688, "epoch": 1.0220729366602688, "grad_norm": 0.03715880587697029, "kl": 0.5998356938362122, "learning_rate": 4.6407644429223015e-07, "loss": 0.0006, "reward": 0.8571429252624512, "reward_std": 0.0, "rewards/check_originality_func": 0.8571429252624512, "step": 1065 }, { "completion_length": 200.35714721679688, "epoch": 1.0230326295585412, "grad_norm": 0.05521077662706375, "kl": 0.6264926791191101, "learning_rate": 4.639682290062307e-07, "loss": 0.0006, "reward": 0.8571429252624512, "reward_std": 0.0, "rewards/check_originality_func": 0.8571429252624512, "step": 1066 }, { "completion_length": 173.21429443359375, "epoch": 1.023992322456814, "grad_norm": 0.06973706930875778, "kl": 0.7183751463890076, "learning_rate": 4.6385986362915077e-07, "loss": 0.0007, "reward": 1.0, "reward_std": 0.0, "rewards/check_originality_func": 1.0, "step": 1067 }, { "completion_length": 245.9285888671875, "epoch": 1.0249520153550864, "grad_norm": 1.1947746276855469, "kl": 0.4653846323490143, "learning_rate": 4.6375134823700503e-07, "loss": 0.0005, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 1068 }, { "completion_length": 197.21429443359375, "epoch": 1.0259117082533589, "grad_norm": 1.9626741409301758, "kl": 0.6430882215499878, "learning_rate": 4.636426829059129e-07, "loss": 0.0006, "reward": 0.5714285969734192, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.5714285969734192, "step": 1069 }, { "completion_length": 249.71429443359375, "epoch": 1.0268714011516316, "grad_norm": 0.6698397994041443, "kl": 0.5783044695854187, "learning_rate": 4.635338677120994e-07, "loss": 0.0006, "reward": 0.6428571939468384, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.6428571939468384, "step": 1070 }, { "completion_length": 174.42857360839844, "epoch": 1.027831094049904, "grad_norm": 2.3094565868377686, "kl": 1.0601561069488525, "learning_rate": 4.6342490273189443e-07, "loss": 0.0011, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 1071 }, { "completion_length": 190.42857360839844, "epoch": 1.0287907869481765, "grad_norm": 1.052965521812439, "kl": 0.7785249948501587, "learning_rate": 4.633157880417331e-07, "loss": 0.0008, "reward": 0.785714328289032, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.785714328289032, "step": 1072 }, { "completion_length": 217.50001525878906, "epoch": 1.0297504798464492, "grad_norm": 2.612950086593628, "kl": 1.778611660003662, "learning_rate": 4.6320652371815547e-07, "loss": 0.0018, "reward": 0.6428571939468384, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.6428571939468384, "step": 1073 }, { "completion_length": 178.7857208251953, "epoch": 1.0307101727447217, "grad_norm": 1.234687328338623, "kl": 0.6807624697685242, "learning_rate": 4.630971098378065e-07, "loss": 0.0007, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 1074 }, { "completion_length": 186.35714721679688, "epoch": 1.0316698656429943, "grad_norm": 1.5585134029388428, "kl": 0.8476853966712952, "learning_rate": 4.6298754647743614e-07, "loss": 0.0008, "reward": 0.7142857313156128, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.7142857313156128, "step": 1075 }, { "completion_length": 221.85714721679688, "epoch": 1.0326295585412668, "grad_norm": 1.585559368133545, "kl": 0.6570377349853516, "learning_rate": 4.628778337138992e-07, "loss": 0.0007, "reward": 0.7142857313156128, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.7142857313156128, "step": 1076 }, { "completion_length": 217.35714721679688, "epoch": 1.0335892514395393, "grad_norm": 0.029403597116470337, "kl": 0.5138069987297058, "learning_rate": 4.6276797162415524e-07, "loss": 0.0005, "reward": 1.0, "reward_std": 0.0, "rewards/check_originality_func": 1.0, "step": 1077 }, { "completion_length": 257.8571472167969, "epoch": 1.034548944337812, "grad_norm": 0.0869363397359848, "kl": 0.6309928894042969, "learning_rate": 4.6265796028526873e-07, "loss": 0.0006, "reward": 0.7142857313156128, "reward_std": 0.0, "rewards/check_originality_func": 0.7142857313156128, "step": 1078 }, { "completion_length": 186.50001525878906, "epoch": 1.0355086372360844, "grad_norm": 2.515883207321167, "kl": 0.6917616724967957, "learning_rate": 4.625477997744085e-07, "loss": 0.0007, "reward": 0.785714328289032, "reward_std": 0.30304574966430664, "rewards/check_originality_func": 0.785714328289032, "step": 1079 }, { "completion_length": 154.71429443359375, "epoch": 1.036468330134357, "grad_norm": 4.344629764556885, "kl": 0.6822255253791809, "learning_rate": 4.6243749016884835e-07, "loss": 0.0007, "reward": 0.7142857313156128, "reward_std": 0.4040610194206238, "rewards/check_originality_func": 0.7142857313156128, "step": 1080 }, { "completion_length": 161.1428680419922, "epoch": 1.0374280230326296, "grad_norm": 2.5479464530944824, "kl": 0.6597902178764343, "learning_rate": 4.6232703154596654e-07, "loss": 0.0007, "reward": 0.7142857313156128, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.7142857313156128, "step": 1081 }, { "completion_length": 211.85714721679688, "epoch": 1.038387715930902, "grad_norm": 0.027059897780418396, "kl": 0.5298020839691162, "learning_rate": 4.6221642398324576e-07, "loss": 0.0005, "reward": 1.0, "reward_std": 0.0, "rewards/check_originality_func": 1.0, "step": 1082 }, { "completion_length": 230.2857208251953, "epoch": 1.0393474088291748, "grad_norm": 2.0713038444519043, "kl": 0.59115070104599, "learning_rate": 4.6210566755827333e-07, "loss": 0.0006, "reward": 0.5714285969734192, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.5714285969734192, "step": 1083 }, { "completion_length": 138.5, "epoch": 1.0403071017274472, "grad_norm": 0.048105303198099136, "kl": 0.6870853900909424, "learning_rate": 4.619947623487409e-07, "loss": 0.0007, "reward": 1.0, "reward_std": 0.0, "rewards/check_originality_func": 1.0, "step": 1084 }, { "completion_length": 205.1428680419922, "epoch": 1.0412667946257197, "grad_norm": 0.1456574946641922, "kl": 0.7572711110115051, "learning_rate": 4.6188370843244463e-07, "loss": 0.0008, "reward": 1.0, "reward_std": 0.0, "rewards/check_originality_func": 1.0, "step": 1085 }, { "completion_length": 188.35714721679688, "epoch": 1.0422264875239924, "grad_norm": 3.360332489013672, "kl": 1.3486582040786743, "learning_rate": 4.6177250588728466e-07, "loss": 0.0013, "reward": 0.5714285969734192, "reward_std": 0.4040610194206238, "rewards/check_originality_func": 0.5714285969734192, "step": 1086 }, { "completion_length": 142.7857208251953, "epoch": 1.0431861804222649, "grad_norm": 2.0247719287872314, "kl": 0.6359577178955078, "learning_rate": 4.616611547912658e-07, "loss": 0.0006, "reward": 0.785714328289032, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.785714328289032, "step": 1087 }, { "completion_length": 172.71429443359375, "epoch": 1.0441458733205373, "grad_norm": 1.6811720132827759, "kl": 0.5627303719520569, "learning_rate": 4.615496552224969e-07, "loss": 0.0006, "reward": 0.7142857313156128, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.7142857313156128, "step": 1088 }, { "completion_length": 178.42857360839844, "epoch": 1.04510556621881, "grad_norm": 2.37947678565979, "kl": 0.725833535194397, "learning_rate": 4.614380072591907e-07, "loss": 0.0007, "reward": 0.7142857313156128, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.7142857313156128, "step": 1089 }, { "completion_length": 215.07144165039062, "epoch": 1.0460652591170825, "grad_norm": 1.650094747543335, "kl": 0.5264923572540283, "learning_rate": 4.613262109796645e-07, "loss": 0.0005, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 1090 }, { "completion_length": 187.42857360839844, "epoch": 1.0470249520153552, "grad_norm": 5.134483814239502, "kl": 4.384420394897461, "learning_rate": 4.612142664623393e-07, "loss": 0.0044, "reward": 0.6428571939468384, "reward_std": 0.5050762891769409, "rewards/check_originality_func": 0.6428571939468384, "step": 1091 }, { "completion_length": 275.21429443359375, "epoch": 1.0479846449136276, "grad_norm": 1.1324676275253296, "kl": 0.4438019394874573, "learning_rate": 4.611021737857402e-07, "loss": 0.0004, "reward": 0.8571429252624512, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.8571429252624512, "step": 1092 }, { "completion_length": 128.35714721679688, "epoch": 1.0489443378119, "grad_norm": 1.7099486589431763, "kl": 0.6732638478279114, "learning_rate": 4.6098993302849617e-07, "loss": 0.0007, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 1093 }, { "completion_length": 243.50001525878906, "epoch": 1.0499040307101728, "grad_norm": 2.803877115249634, "kl": 0.4610106945037842, "learning_rate": 4.6087754426934014e-07, "loss": 0.0005, "reward": 0.6428571939468384, "reward_std": 0.5050762891769409, "rewards/check_originality_func": 0.6428571939468384, "step": 1094 }, { "completion_length": 152.42857360839844, "epoch": 1.0508637236084453, "grad_norm": 1.4918700456619263, "kl": 0.6793709397315979, "learning_rate": 4.607650075871088e-07, "loss": 0.0007, "reward": 0.9285714626312256, "reward_std": 0.10101525485515594, "rewards/check_originality_func": 0.9285714626312256, "step": 1095 }, { "completion_length": 198.42857360839844, "epoch": 1.051823416506718, "grad_norm": 1.5728017091751099, "kl": 0.5450882911682129, "learning_rate": 4.606523230607427e-07, "loss": 0.0005, "reward": 0.7142857313156128, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.7142857313156128, "step": 1096 }, { "completion_length": 139.0, "epoch": 1.0527831094049904, "grad_norm": 0.058316197246313095, "kl": 0.6780792474746704, "learning_rate": 4.605394907692859e-07, "loss": 0.0007, "reward": 0.8571429252624512, "reward_std": 0.0, "rewards/check_originality_func": 0.8571429252624512, "step": 1097 }, { "completion_length": 164.07144165039062, "epoch": 1.053742802303263, "grad_norm": 1.7601710557937622, "kl": 0.5871489644050598, "learning_rate": 4.6042651079188633e-07, "loss": 0.0006, "reward": 0.8571429252624512, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.8571429252624512, "step": 1098 }, { "completion_length": 186.35714721679688, "epoch": 1.0547024952015356, "grad_norm": 2.4066812992095947, "kl": 0.6427438855171204, "learning_rate": 4.603133832077953e-07, "loss": 0.0006, "reward": 0.8571429252624512, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.8571429252624512, "step": 1099 }, { "completion_length": 279.3571472167969, "epoch": 1.055662188099808, "grad_norm": 1.703138828277588, "kl": 0.4165450632572174, "learning_rate": 4.602001080963678e-07, "loss": 0.0004, "reward": 0.8571429252624512, "reward_std": 0.2020305097103119, "rewards/check_originality_func": 0.8571429252624512, "step": 1100 } ], "logging_steps": 1, "max_steps": 4168, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }