zerozeroz committed (verified)
Commit f30c802 · 1 Parent(s): 266dadc

Model save

README.md ADDED
@@ -0,0 +1,68 @@
+ ---
+ base_model: codellama/CodeLlama-7b-hf
+ library_name: transformers
+ model_name: CodeLlama-7b-hf
+ tags:
+ - generated_from_trainer
+ - trl
+ - grpo
+ licence: license
+ ---
+
+ # Model Card for CodeLlama-7b-hf
+
+ This model is a fine-tuned version of [codellama/CodeLlama-7b-hf](https://huggingface.co/codellama/CodeLlama-7b-hf).
+ It has been trained using [TRL](https://github.com/huggingface/trl).
+
+ ## Quick start
+
+ ```python
+ from transformers import pipeline
+
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+ generator = pipeline("text-generation", model="zerozeroz/CodeLlama-7b-hf", device="cuda")
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+ print(output["generated_text"])
+ ```
+
+ ## Training procedure
+
+ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
+
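+ The training script itself is not part of this commit. As a rough orientation, a minimal GRPO run with TRL's `GRPOTrainer` can look like the sketch below; the reward-function names mirror those logged in `trainer_state.json` (`correct_code_reward_func`, `len_reward_func`), but their bodies and the single-example dataset are illustrative placeholders, not the actual training setup.
+
+ ```python
+ from datasets import Dataset
+ from trl import GRPOConfig, GRPOTrainer
+
+ # Stand-in prompt data; the 374 real training samples are not included in this repo.
+ train_dataset = Dataset.from_dict(
+     {"prompt": ["Write a Python function that reverses a string."]}
+ )
+
+ def correct_code_reward_func(completions, **kwargs):
+     # Placeholder: reward completions that look like code.
+     return [1.0 if "def " in completion else 0.0 for completion in completions]
+
+ def len_reward_func(completions, **kwargs):
+     # Placeholder: mildly prefer shorter completions.
+     return [max(0.0, 1.0 - len(completion) / 2048) for completion in completions]
+
+ training_args = GRPOConfig(output_dir="CodeLlama-7b-hf", logging_steps=1)
+ trainer = GRPOTrainer(
+     model="codellama/CodeLlama-7b-hf",
+     reward_funcs=[correct_code_reward_func, len_reward_func],
+     args=training_args,
+     train_dataset=train_dataset,
+ )
+ trainer.train()
+ ```
+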
+ ### Framework versions
+
+ - TRL: 0.14.0
+ - Transformers: 4.48.1
+ - Pytorch: 2.5.1+cu121
+ - Datasets: 3.1.0
+ - Tokenizers: 0.21.0
+
+ ## Citations
+
+ Cite GRPO as:
+
+ ```bibtex
+ @article{zhihong2024deepseekmath,
+     title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
+     author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
+     year = 2024,
+     eprint = {arXiv:2402.03300},
+ }
+
+ ```
+
+ Cite TRL as:
+
+ ```bibtex
+ @misc{vonwerra2022trl,
+     title = {{TRL: Transformer Reinforcement Learning}},
+     author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
+     year = 2020,
+     journal = {GitHub repository},
+     publisher = {GitHub},
+     howpublished = {\url{https://github.com/huggingface/trl}}
+ }
+ ```
all_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "total_flos": 0.0,
+   "train_loss": 8.671089399598486e-06,
+   "train_runtime": 8102.573,
+   "train_samples": 374,
+   "train_samples_per_second": 0.093,
+   "train_steps_per_second": 0.015
+ }
config.json CHANGED
@@ -25,6 +25,6 @@
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.48.1",
- "use_cache": false,
+ "use_cache": true,
  "vocab_size": 32016
  }
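The only functional change to config.json is `use_cache` flipping from false to true. The KV cache is commonly disabled during training (for example when gradient checkpointing is enabled) and re-enabled in the saved config so inference uses normal cached decoding; whether that was the reason here is an assumption. A quick way to confirm the published value:

```python
from transformers import AutoConfig

# Sanity-check the released config; expected to print True, matching the diff above.
config = AutoConfig.from_pretrained("zerozeroz/CodeLlama-7b-hf")
print(config.use_cache)
```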
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "transformers_version": "4.48.1"
+ }
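For reference, `bos_token_id` 1 and `eos_token_id` 2 are the standard Llama/CodeLlama special-token ids. A minimal check against the repo's tokenizer (assuming the tokenizer files are published alongside the model):

```python
from transformers import AutoTokenizer

# Expected to print 1 2, matching generation_config.json.
tokenizer = AutoTokenizer.from_pretrained("zerozeroz/CodeLlama-7b-hf")
print(tokenizer.bos_token_id, tokenizer.eos_token_id)
```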
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "total_flos": 0.0,
+   "train_loss": 8.671089399598486e-06,
+   "train_runtime": 8102.573,
+   "train_samples": 374,
+   "train_samples_per_second": 0.093,
+   "train_steps_per_second": 0.015
+ }
trainer_state.json ADDED
@@ -0,0 +1,1667 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.992,
5
+ "eval_steps": 500,
6
+ "global_step": 125,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "completion_length": 178.3541717529297,
13
+ "epoch": 0.016,
14
+ "grad_norm": 0.7053135404726241,
15
+ "kl": 0.0,
16
+ "learning_rate": 1.25e-07,
17
+ "loss": 0.0,
18
+ "reward": 0.5438157171010971,
19
+ "reward_std": 0.4464171230792999,
20
+ "rewards/correct_code_reward_func": 0.2500000074505806,
21
+ "rewards/len_reward_func": 0.2938157171010971,
22
+ "step": 1
23
+ },
24
+ {
25
+ "completion_length": 176.89584350585938,
26
+ "epoch": 0.032,
27
+ "grad_norm": 2.3417507999605864,
28
+ "kl": 0.0,
29
+ "learning_rate": 2.5e-07,
30
+ "loss": -0.0,
31
+ "reward": 0.4928237199783325,
32
+ "reward_std": 0.3440842181444168,
33
+ "rewards/correct_code_reward_func": 0.1458333432674408,
34
+ "rewards/len_reward_func": 0.34699034690856934,
35
+ "step": 2
36
+ },
37
+ {
38
+ "completion_length": 192.25,
39
+ "epoch": 0.048,
40
+ "grad_norm": 0.48071888167986226,
41
+ "kl": -3.4809112548828125e-05,
42
+ "learning_rate": 3.75e-07,
43
+ "loss": -0.0,
44
+ "reward": 0.449026882648468,
45
+ "reward_std": 0.47666139900684357,
46
+ "rewards/correct_code_reward_func": 0.1666666716337204,
47
+ "rewards/len_reward_func": 0.2823602259159088,
48
+ "step": 3
49
+ },
50
+ {
51
+ "completion_length": 270.7291717529297,
52
+ "epoch": 0.064,
53
+ "grad_norm": 0.3569537292235904,
54
+ "kl": 4.172325134277344e-07,
55
+ "learning_rate": 5e-07,
56
+ "loss": 0.0,
57
+ "reward": 0.45451289415359497,
58
+ "reward_std": 0.3296015188097954,
59
+ "rewards/correct_code_reward_func": 0.1041666716337204,
60
+ "rewards/len_reward_func": 0.3503462225198746,
61
+ "step": 4
62
+ },
63
+ {
64
+ "completion_length": 232.58334350585938,
65
+ "epoch": 0.08,
66
+ "grad_norm": 0.4439935569873824,
67
+ "kl": -4.214048385620117e-05,
68
+ "learning_rate": 4.999157413258781e-07,
69
+ "loss": -0.0,
70
+ "reward": 0.2809216380119324,
71
+ "reward_std": 0.37323758006095886,
72
+ "rewards/correct_code_reward_func": 0.0,
73
+ "rewards/len_reward_func": 0.2809216380119324,
74
+ "step": 5
75
+ },
76
+ {
77
+ "completion_length": 261.9375,
78
+ "epoch": 0.096,
79
+ "grad_norm": 0.3541024748340305,
80
+ "kl": -3.1054019927978516e-05,
81
+ "learning_rate": 4.996630220997057e-07,
82
+ "loss": -0.0,
83
+ "reward": 0.4689294695854187,
84
+ "reward_std": 0.4829525202512741,
85
+ "rewards/correct_code_reward_func": 0.1875,
86
+ "rewards/len_reward_func": 0.2814294695854187,
87
+ "step": 6
88
+ },
89
+ {
90
+ "completion_length": 306.12501525878906,
91
+ "epoch": 0.112,
92
+ "grad_norm": 0.3947267586718111,
93
+ "kl": -2.968311309814453e-05,
94
+ "learning_rate": 4.992420126717784e-07,
95
+ "loss": -0.0,
96
+ "reward": 0.3299577385187149,
97
+ "reward_std": 0.30695800483226776,
98
+ "rewards/correct_code_reward_func": 0.0,
99
+ "rewards/len_reward_func": 0.3299577236175537,
100
+ "step": 7
101
+ },
102
+ {
103
+ "completion_length": 238.85417938232422,
104
+ "epoch": 0.128,
105
+ "grad_norm": 0.5118206169412205,
106
+ "kl": -4.1604042053222656e-05,
107
+ "learning_rate": 4.986529968316653e-07,
108
+ "loss": 0.0,
109
+ "reward": 0.4368641823530197,
110
+ "reward_std": 0.2371533028781414,
111
+ "rewards/correct_code_reward_func": 0.08333333395421505,
112
+ "rewards/len_reward_func": 0.3535308539867401,
113
+ "step": 8
114
+ },
115
+ {
116
+ "completion_length": 304.7708435058594,
117
+ "epoch": 0.144,
118
+ "grad_norm": 0.3493288643723073,
119
+ "kl": -5.060434341430664e-05,
120
+ "learning_rate": 4.978963716169165e-07,
121
+ "loss": -0.0,
122
+ "reward": 0.4636567234992981,
123
+ "reward_std": 0.46522316336631775,
124
+ "rewards/correct_code_reward_func": 0.1666666679084301,
125
+ "rewards/len_reward_func": 0.2969900518655777,
126
+ "step": 9
127
+ },
128
+ {
129
+ "completion_length": 264.4791717529297,
130
+ "epoch": 0.16,
131
+ "grad_norm": 0.8959331411324142,
132
+ "kl": -7.329881191253662e-05,
133
+ "learning_rate": 4.969726470454313e-07,
134
+ "loss": -0.0,
135
+ "reward": 0.5069085508584976,
136
+ "reward_std": 0.37757958471775055,
137
+ "rewards/correct_code_reward_func": 0.1666666716337204,
138
+ "rewards/len_reward_func": 0.3402418941259384,
139
+ "step": 10
140
+ },
141
+ {
142
+ "completion_length": 201.3541717529297,
143
+ "epoch": 0.176,
144
+ "grad_norm": 0.4500851901199424,
145
+ "kl": -4.0531158447265625e-05,
146
+ "learning_rate": 4.958824457716706e-07,
147
+ "loss": -0.0,
148
+ "reward": 0.5010976195335388,
149
+ "reward_std": 0.5051684230566025,
150
+ "rewards/correct_code_reward_func": 0.1666666716337204,
151
+ "rewards/len_reward_func": 0.334430992603302,
152
+ "step": 11
153
+ },
154
+ {
155
+ "completion_length": 297.37500762939453,
156
+ "epoch": 0.192,
157
+ "grad_norm": 0.43567877016900936,
158
+ "kl": -4.696846008300781e-05,
159
+ "learning_rate": 4.946265026669454e-07,
160
+ "loss": -0.0,
161
+ "reward": 0.3353367894887924,
162
+ "reward_std": 0.3315615653991699,
163
+ "rewards/correct_code_reward_func": 0.02083333395421505,
164
+ "rewards/len_reward_func": 0.3145034611225128,
165
+ "step": 12
166
+ },
167
+ {
168
+ "completion_length": 201.75,
169
+ "epoch": 0.208,
170
+ "grad_norm": 0.5760425272270367,
171
+ "kl": 2.5451183319091797e-05,
172
+ "learning_rate": 4.932056643240618e-07,
173
+ "loss": 0.0,
174
+ "reward": 0.6330613493919373,
175
+ "reward_std": 0.5388243198394775,
176
+ "rewards/correct_code_reward_func": 0.3541666716337204,
177
+ "rewards/len_reward_func": 0.27889466285705566,
178
+ "step": 13
179
+ },
180
+ {
181
+ "completion_length": 288.0416717529297,
182
+ "epoch": 0.224,
183
+ "grad_norm": 0.6576192766028296,
184
+ "kl": 7.191300392150879e-05,
185
+ "learning_rate": 4.916208884866592e-07,
186
+ "loss": 0.0,
187
+ "reward": 0.39246678352355957,
188
+ "reward_std": 0.44181104004383087,
189
+ "rewards/correct_code_reward_func": 0.08333333395421505,
190
+ "rewards/len_reward_func": 0.3091334402561188,
191
+ "step": 14
192
+ },
193
+ {
194
+ "completion_length": 183.75000762939453,
195
+ "epoch": 0.24,
196
+ "grad_norm": 0.6134701851816485,
197
+ "kl": 0.00014084577560424805,
198
+ "learning_rate": 4.898732434036243e-07,
199
+ "loss": 0.0,
200
+ "reward": 0.42976467311382294,
201
+ "reward_std": 0.3796156942844391,
202
+ "rewards/correct_code_reward_func": 0.1250000037252903,
203
+ "rewards/len_reward_func": 0.30476468801498413,
204
+ "step": 15
205
+ },
206
+ {
207
+ "completion_length": 207.77083587646484,
208
+ "epoch": 0.256,
209
+ "grad_norm": 0.7064422469677513,
210
+ "kl": 0.000293731689453125,
211
+ "learning_rate": 4.879639071090173e-07,
212
+ "loss": 0.0,
213
+ "reward": 0.35718174278736115,
214
+ "reward_std": 0.408719003200531,
215
+ "rewards/correct_code_reward_func": 0.0625,
216
+ "rewards/len_reward_func": 0.29468175768852234,
217
+ "step": 16
218
+ },
219
+ {
220
+ "completion_length": 194.00000762939453,
221
+ "epoch": 0.272,
222
+ "grad_norm": 0.4827506793012771,
223
+ "kl": 2.288818359375e-05,
224
+ "learning_rate": 4.858941666279955e-07,
225
+ "loss": -0.0,
226
+ "reward": 0.5703845322132111,
227
+ "reward_std": 0.39449170231819153,
228
+ "rewards/correct_code_reward_func": 0.2500000111758709,
229
+ "rewards/len_reward_func": 0.32038453221321106,
230
+ "step": 17
231
+ },
232
+ {
233
+ "completion_length": 238.18750762939453,
234
+ "epoch": 0.288,
235
+ "grad_norm": 0.41996129426983214,
236
+ "kl": 0.00011217594146728516,
237
+ "learning_rate": 4.836654171092682e-07,
238
+ "loss": 0.0,
239
+ "reward": 0.6033974885940552,
240
+ "reward_std": 0.43378083407878876,
241
+ "rewards/correct_code_reward_func": 0.2708333358168602,
242
+ "rewards/len_reward_func": 0.33256417512893677,
243
+ "step": 18
244
+ },
245
+ {
246
+ "completion_length": 233.95834350585938,
247
+ "epoch": 0.304,
248
+ "grad_norm": 0.5123314162893023,
249
+ "kl": 0.00013169646263122559,
250
+ "learning_rate": 4.812791608846709e-07,
251
+ "loss": 0.0,
252
+ "reward": 0.40910618007183075,
253
+ "reward_std": 0.3389303684234619,
254
+ "rewards/correct_code_reward_func": 0.0833333358168602,
255
+ "rewards/len_reward_func": 0.32577285170555115,
256
+ "step": 19
257
+ },
258
+ {
259
+ "completion_length": 316.3125,
260
+ "epoch": 0.32,
261
+ "grad_norm": 0.47220846544005757,
262
+ "kl": 0.0002579689025878906,
263
+ "learning_rate": 4.787370064564882e-07,
264
+ "loss": 0.0,
265
+ "reward": 0.41953830420970917,
266
+ "reward_std": 0.4723764508962631,
267
+ "rewards/correct_code_reward_func": 0.1458333358168602,
268
+ "rewards/len_reward_func": 0.2737049460411072,
269
+ "step": 20
270
+ },
271
+ {
272
+ "completion_length": 227.7916717529297,
273
+ "epoch": 0.336,
274
+ "grad_norm": 0.6008188696097762,
275
+ "kl": 0.0004267692565917969,
276
+ "learning_rate": 4.7604066741321253e-07,
277
+ "loss": 0.0,
278
+ "reward": 0.6169633865356445,
279
+ "reward_std": 0.48675188422203064,
280
+ "rewards/correct_code_reward_func": 0.3333333432674408,
281
+ "rewards/len_reward_func": 0.28363004326820374,
282
+ "step": 21
283
+ },
284
+ {
285
+ "completion_length": 252.27084350585938,
286
+ "epoch": 0.352,
287
+ "grad_norm": 0.5160855324353004,
288
+ "kl": 0.0001907944679260254,
289
+ "learning_rate": 4.731919612744659e-07,
290
+ "loss": 0.0,
291
+ "reward": 0.5648790299892426,
292
+ "reward_std": 0.31752249598503113,
293
+ "rewards/correct_code_reward_func": 0.18750000558793545,
294
+ "rewards/len_reward_func": 0.37737900018692017,
295
+ "step": 22
296
+ },
297
+ {
298
+ "completion_length": 199.27083587646484,
299
+ "epoch": 0.368,
300
+ "grad_norm": 0.4296758598414565,
301
+ "kl": 0.000273287296295166,
302
+ "learning_rate": 4.7019280826586604e-07,
303
+ "loss": 0.0,
304
+ "reward": 0.45347318053245544,
305
+ "reward_std": 0.3857118636369705,
306
+ "rewards/correct_code_reward_func": 0.1041666716337204,
307
+ "rewards/len_reward_func": 0.34930650889873505,
308
+ "step": 23
309
+ },
310
+ {
311
+ "completion_length": 221.77083587646484,
312
+ "epoch": 0.384,
313
+ "grad_norm": 1.2892124765727169,
314
+ "kl": 0.0007638931274414062,
315
+ "learning_rate": 4.6704523002466094e-07,
316
+ "loss": 0.0,
317
+ "reward": 0.4506646543741226,
318
+ "reward_std": 0.20875498466193676,
319
+ "rewards/correct_code_reward_func": 0.0416666679084301,
320
+ "rewards/len_reward_func": 0.40899796783924103,
321
+ "step": 24
322
+ },
323
+ {
324
+ "completion_length": 266.8333435058594,
325
+ "epoch": 0.4,
326
+ "grad_norm": 1.4838265312357841,
327
+ "kl": 0.0009393692016601562,
328
+ "learning_rate": 4.6375134823700503e-07,
329
+ "loss": 0.0,
330
+ "reward": 0.4240592122077942,
331
+ "reward_std": 0.36766907572746277,
332
+ "rewards/correct_code_reward_func": 0.1041666679084301,
333
+ "rewards/len_reward_func": 0.3198925405740738,
334
+ "step": 25
335
+ },
336
+ {
337
+ "completion_length": 225.20834350585938,
338
+ "epoch": 0.416,
339
+ "grad_norm": 0.45923128114507034,
340
+ "kl": 0.0007772445678710938,
341
+ "learning_rate": 4.603133832077953e-07,
342
+ "loss": 0.0,
343
+ "reward": 0.48156437277793884,
344
+ "reward_std": 0.4315789043903351,
345
+ "rewards/correct_code_reward_func": 0.1458333358168602,
346
+ "rewards/len_reward_func": 0.33573102951049805,
347
+ "step": 26
348
+ },
349
+ {
350
+ "completion_length": 127.33333587646484,
351
+ "epoch": 0.432,
352
+ "grad_norm": 0.5853476018540942,
353
+ "kl": 0.00115966796875,
354
+ "learning_rate": 4.5673365236403216e-07,
355
+ "loss": 0.0,
356
+ "reward": 0.5744044184684753,
357
+ "reward_std": 0.43071019649505615,
358
+ "rewards/correct_code_reward_func": 0.2708333432674408,
359
+ "rewards/len_reward_func": 0.30357107520103455,
360
+ "step": 27
361
+ },
362
+ {
363
+ "completion_length": 273.1666717529297,
364
+ "epoch": 0.448,
365
+ "grad_norm": 0.4925160720854453,
366
+ "kl": 0.0010128021240234375,
367
+ "learning_rate": 4.530145686927125e-07,
368
+ "loss": 0.0,
369
+ "reward": 0.5949445962905884,
370
+ "reward_std": 0.42838945984840393,
371
+ "rewards/correct_code_reward_func": 0.2708333432674408,
372
+ "rewards/len_reward_func": 0.3241112679243088,
373
+ "step": 28
374
+ },
375
+ {
376
+ "completion_length": 178.75000762939453,
377
+ "epoch": 0.464,
378
+ "grad_norm": 1.1463419828713735,
379
+ "kl": 0.0027008056640625,
380
+ "learning_rate": 4.4915863911430897e-07,
381
+ "loss": 0.0,
382
+ "reward": 0.3950851857662201,
383
+ "reward_std": 0.4124990254640579,
384
+ "rewards/correct_code_reward_func": 0.08333333395421505,
385
+ "rewards/len_reward_func": 0.3117518424987793,
386
+ "step": 29
387
+ },
388
+ {
389
+ "completion_length": 210.0,
390
+ "epoch": 0.48,
391
+ "grad_norm": 0.38472151178254205,
392
+ "kl": 0.0011115074157714844,
393
+ "learning_rate": 4.45168462792932e-07,
394
+ "loss": 0.0,
395
+ "reward": 0.48962171375751495,
396
+ "reward_std": 0.443721666932106,
397
+ "rewards/correct_code_reward_func": 0.1666666716337204,
398
+ "rewards/len_reward_func": 0.32295504212379456,
399
+ "step": 30
400
+ },
401
+ {
402
+ "completion_length": 204.95833587646484,
403
+ "epoch": 0.496,
404
+ "grad_norm": 1.5507179471403478,
405
+ "kl": 0.00183868408203125,
406
+ "learning_rate": 4.4104672938431223e-07,
407
+ "loss": 0.0,
408
+ "reward": 0.6203064322471619,
409
+ "reward_std": 0.39608660340309143,
410
+ "rewards/correct_code_reward_func": 0.2916666865348816,
411
+ "rewards/len_reward_func": 0.32863976061344147,
412
+ "step": 31
413
+ },
414
+ {
415
+ "completion_length": 159.83333587646484,
416
+ "epoch": 0.512,
417
+ "grad_norm": 0.9492121763047169,
418
+ "kl": 0.001636505126953125,
419
+ "learning_rate": 4.367962172227866e-07,
420
+ "loss": 0.0,
421
+ "reward": 0.5581120550632477,
422
+ "reward_std": 0.4106949418783188,
423
+ "rewards/correct_code_reward_func": 0.25,
424
+ "rewards/len_reward_func": 0.3081120699644089,
425
+ "step": 32
426
+ },
427
+ {
428
+ "completion_length": 214.52083587646484,
429
+ "epoch": 0.528,
430
+ "grad_norm": 1.1197088117962448,
431
+ "kl": 0.005927085876464844,
432
+ "learning_rate": 4.324197914485075e-07,
433
+ "loss": 0.0,
434
+ "reward": 0.4833623319864273,
435
+ "reward_std": 0.39532361924648285,
436
+ "rewards/correct_code_reward_func": 0.1250000037252903,
437
+ "rewards/len_reward_func": 0.3583623170852661,
438
+ "step": 33
439
+ },
440
+ {
441
+ "completion_length": 255.39583587646484,
442
+ "epoch": 0.544,
443
+ "grad_norm": 0.4856761942711223,
444
+ "kl": 0.00038909912109375,
445
+ "learning_rate": 4.2792040207614e-07,
446
+ "loss": 0.0,
447
+ "reward": 0.3419315367937088,
448
+ "reward_std": 0.23230206966400146,
449
+ "rewards/correct_code_reward_func": 0.02083333395421505,
450
+ "rewards/len_reward_func": 0.3210982233285904,
451
+ "step": 34
452
+ },
453
+ {
454
+ "completion_length": 215.93750762939453,
455
+ "epoch": 0.56,
456
+ "grad_norm": 0.6317607844858494,
457
+ "kl": 0.00643157958984375,
458
+ "learning_rate": 4.2330108200634723e-07,
459
+ "loss": 0.0,
460
+ "reward": 0.48397234082221985,
461
+ "reward_std": 0.36393553018569946,
462
+ "rewards/correct_code_reward_func": 0.1458333432674408,
463
+ "rewards/len_reward_func": 0.33813901245594025,
464
+ "step": 35
465
+ },
466
+ {
467
+ "completion_length": 170.52084350585938,
468
+ "epoch": 0.576,
469
+ "grad_norm": 0.47983643730367526,
470
+ "kl": 0.0009899139404296875,
471
+ "learning_rate": 4.185649449814045e-07,
472
+ "loss": 0.0,
473
+ "reward": 0.5446349233388901,
474
+ "reward_std": 0.475721538066864,
475
+ "rewards/correct_code_reward_func": 0.229166679084301,
476
+ "rewards/len_reward_func": 0.3154682517051697,
477
+ "step": 36
478
+ },
479
+ {
480
+ "completion_length": 155.41666793823242,
481
+ "epoch": 0.592,
482
+ "grad_norm": 0.7182646382516522,
483
+ "kl": 0.00705718994140625,
484
+ "learning_rate": 4.137151834863213e-07,
485
+ "loss": 0.0,
486
+ "reward": 0.659955769777298,
487
+ "reward_std": 0.4905228465795517,
488
+ "rewards/correct_code_reward_func": 0.2916666865348816,
489
+ "rewards/len_reward_func": 0.3682890981435776,
490
+ "step": 37
491
+ },
492
+ {
493
+ "completion_length": 234.35416793823242,
494
+ "epoch": 0.608,
495
+ "grad_norm": 0.5385402773601645,
496
+ "kl": 0.001377105712890625,
497
+ "learning_rate": 4.087550665968846e-07,
498
+ "loss": 0.0,
499
+ "reward": 0.34026579558849335,
500
+ "reward_std": 0.42601510882377625,
501
+ "rewards/correct_code_reward_func": 0.10416666977107525,
502
+ "rewards/len_reward_func": 0.23609913140535355,
503
+ "step": 38
504
+ },
505
+ {
506
+ "completion_length": 150.02083587646484,
507
+ "epoch": 0.624,
508
+ "grad_norm": 0.8263933056203022,
509
+ "kl": 0.0065155029296875,
510
+ "learning_rate": 4.036879377760752e-07,
511
+ "loss": 0.0,
512
+ "reward": 0.6042845845222473,
513
+ "reward_std": 0.4889480620622635,
514
+ "rewards/correct_code_reward_func": 0.2708333432674408,
515
+ "rewards/len_reward_func": 0.3334512561559677,
516
+ "step": 39
517
+ },
518
+ {
519
+ "completion_length": 168.77084350585938,
520
+ "epoch": 0.64,
521
+ "grad_norm": 0.6434934249206259,
522
+ "kl": 0.00261688232421875,
523
+ "learning_rate": 3.9851721262034157e-07,
524
+ "loss": 0.0,
525
+ "reward": 0.4757592976093292,
526
+ "reward_std": 0.3624489903450012,
527
+ "rewards/correct_code_reward_func": 0.125,
528
+ "rewards/len_reward_func": 0.3507593274116516,
529
+ "step": 40
530
+ },
531
+ {
532
+ "completion_length": 194.0833396911621,
533
+ "epoch": 0.656,
534
+ "grad_norm": 0.5493289883401115,
535
+ "kl": 0.003704071044921875,
536
+ "learning_rate": 3.932463765572505e-07,
537
+ "loss": 0.0,
538
+ "reward": 0.42005764693021774,
539
+ "reward_std": 0.37090964615345,
540
+ "rewards/correct_code_reward_func": 0.16666667722165585,
541
+ "rewards/len_reward_func": 0.25339096784591675,
542
+ "step": 41
543
+ },
544
+ {
545
+ "completion_length": 223.87500762939453,
546
+ "epoch": 0.672,
547
+ "grad_norm": 0.43766199341669654,
548
+ "kl": 0.003650665283203125,
549
+ "learning_rate": 3.8787898249606767e-07,
550
+ "loss": 0.0,
551
+ "reward": 0.35570722818374634,
552
+ "reward_std": 0.3742265850305557,
553
+ "rewards/correct_code_reward_func": 0.0416666679084301,
554
+ "rewards/len_reward_func": 0.31404057145118713,
555
+ "step": 42
556
+ },
557
+ {
558
+ "completion_length": 159.93750762939453,
559
+ "epoch": 0.688,
560
+ "grad_norm": 0.662510453229864,
561
+ "kl": 0.004608154296875,
562
+ "learning_rate": 3.8241864843284964e-07,
563
+ "loss": 0.0,
564
+ "reward": 0.6946325600147247,
565
+ "reward_std": 0.4132131338119507,
566
+ "rewards/correct_code_reward_func": 0.3125000074505806,
567
+ "rewards/len_reward_func": 0.38213254511356354,
568
+ "step": 43
569
+ },
570
+ {
571
+ "completion_length": 202.43750762939453,
572
+ "epoch": 0.704,
573
+ "grad_norm": 0.4902137587205668,
574
+ "kl": 0.006561279296875,
575
+ "learning_rate": 3.768690550116639e-07,
576
+ "loss": 0.0,
577
+ "reward": 0.4793711006641388,
578
+ "reward_std": 0.4331624209880829,
579
+ "rewards/correct_code_reward_func": 0.1875000074505806,
580
+ "rewards/len_reward_func": 0.2918711006641388,
581
+ "step": 44
582
+ },
583
+ {
584
+ "completion_length": 201.52084350585938,
585
+ "epoch": 0.72,
586
+ "grad_norm": 1.5244716072983422,
587
+ "kl": 0.0045623779296875,
588
+ "learning_rate": 3.712339430435792e-07,
589
+ "loss": 0.0,
590
+ "reward": 0.376946821808815,
591
+ "reward_std": 0.36263740062713623,
592
+ "rewards/correct_code_reward_func": 0.0833333358168602,
593
+ "rewards/len_reward_func": 0.2936134934425354,
594
+ "step": 45
595
+ },
596
+ {
597
+ "completion_length": 211.87501525878906,
598
+ "epoch": 0.736,
599
+ "grad_norm": 0.47019285013227063,
600
+ "kl": 0.00263214111328125,
601
+ "learning_rate": 3.65517110985099e-07,
602
+ "loss": 0.0,
603
+ "reward": 0.4974285364151001,
604
+ "reward_std": 0.3727869838476181,
605
+ "rewards/correct_code_reward_func": 0.12500000558793545,
606
+ "rewards/len_reward_func": 0.3724285215139389,
607
+ "step": 46
608
+ },
609
+ {
610
+ "completion_length": 128.29166793823242,
611
+ "epoch": 0.752,
612
+ "grad_norm": 0.5038135341546401,
613
+ "kl": 0.00347900390625,
614
+ "learning_rate": 3.597224123777389e-07,
615
+ "loss": 0.0,
616
+ "reward": 0.608631819486618,
617
+ "reward_std": 0.4802524596452713,
618
+ "rewards/correct_code_reward_func": 0.3541666865348816,
619
+ "rewards/len_reward_func": 0.25446511805057526,
620
+ "step": 47
621
+ },
622
+ {
623
+ "completion_length": 161.8541717529297,
624
+ "epoch": 0.768,
625
+ "grad_norm": 0.5836224104866172,
626
+ "kl": 0.0087890625,
627
+ "learning_rate": 3.5385375325047163e-07,
628
+ "loss": 0.0,
629
+ "reward": 0.5433830618858337,
630
+ "reward_std": 0.4709310829639435,
631
+ "rewards/correct_code_reward_func": 0.2291666679084301,
632
+ "rewards/len_reward_func": 0.31421639025211334,
633
+ "step": 48
634
+ },
635
+ {
636
+ "completion_length": 140.47916793823242,
637
+ "epoch": 0.784,
638
+ "grad_norm": 0.49678584086470384,
639
+ "kl": 0.0029144287109375,
640
+ "learning_rate": 3.479150894867926e-07,
641
+ "loss": 0.0,
642
+ "reward": 0.5527551472187042,
643
+ "reward_std": 0.5054349154233932,
644
+ "rewards/correct_code_reward_func": 0.22916667722165585,
645
+ "rewards/len_reward_func": 0.3235885202884674,
646
+ "step": 49
647
+ },
648
+ {
649
+ "completion_length": 190.87500762939453,
650
+ "epoch": 0.8,
651
+ "grad_norm": 1.1393985621399754,
652
+ "kl": 0.004364013671875,
653
+ "learning_rate": 3.4191042415818e-07,
654
+ "loss": 0.0,
655
+ "reward": 0.4087870866060257,
656
+ "reward_std": 0.35092872381210327,
657
+ "rewards/correct_code_reward_func": 0.1041666716337204,
658
+ "rewards/len_reward_func": 0.3046204149723053,
659
+ "step": 50
660
+ },
661
+ {
662
+ "completion_length": 189.95833587646484,
663
+ "epoch": 0.816,
664
+ "grad_norm": 1.1299817912717918,
665
+ "kl": 0.01027679443359375,
666
+ "learning_rate": 3.3584380482574717e-07,
667
+ "loss": 0.0,
668
+ "reward": 0.48216497898101807,
669
+ "reward_std": 0.35060346126556396,
670
+ "rewards/correct_code_reward_func": 0.1875,
671
+ "rewards/len_reward_func": 0.29466497898101807,
672
+ "step": 51
673
+ },
674
+ {
675
+ "completion_length": 155.20833587646484,
676
+ "epoch": 0.832,
677
+ "grad_norm": 1.0099136669800142,
678
+ "kl": 0.0089263916015625,
679
+ "learning_rate": 3.297193208119047e-07,
680
+ "loss": 0.0,
681
+ "reward": 0.7047297656536102,
682
+ "reward_std": 0.537945419549942,
683
+ "rewards/correct_code_reward_func": 0.3750000149011612,
684
+ "rewards/len_reward_func": 0.32972970604896545,
685
+ "step": 52
686
+ },
687
+ {
688
+ "completion_length": 119.43750381469727,
689
+ "epoch": 0.848,
690
+ "grad_norm": 0.6910875224957795,
691
+ "kl": 0.00537109375,
692
+ "learning_rate": 3.235411004438741e-07,
693
+ "loss": 0.0,
694
+ "reward": 0.37493598461151123,
695
+ "reward_std": 0.32232099026441574,
696
+ "rewards/correct_code_reward_func": 0.0416666679084301,
697
+ "rewards/len_reward_func": 0.33326931297779083,
698
+ "step": 53
699
+ },
700
+ {
701
+ "completion_length": 239.00001525878906,
702
+ "epoch": 0.864,
703
+ "grad_norm": 0.9366579226902805,
704
+ "kl": 0.01202392578125,
705
+ "learning_rate": 3.173133082709086e-07,
706
+ "loss": 0.0,
707
+ "reward": 0.5343351364135742,
708
+ "reward_std": 0.39477548003196716,
709
+ "rewards/correct_code_reward_func": 0.2083333358168602,
710
+ "rewards/len_reward_func": 0.32600177824497223,
711
+ "step": 54
712
+ },
713
+ {
714
+ "completion_length": 165.6458396911621,
715
+ "epoch": 0.88,
716
+ "grad_norm": 0.5543105836651998,
717
+ "kl": 0.0074310302734375,
718
+ "learning_rate": 3.1104014225709784e-07,
719
+ "loss": 0.0,
720
+ "reward": 0.5656554698944092,
721
+ "reward_std": 0.4120694398880005,
722
+ "rewards/correct_code_reward_func": 0.2500000111758709,
723
+ "rewards/len_reward_func": 0.315655454993248,
724
+ "step": 55
725
+ },
726
+ {
727
+ "completion_length": 224.8541717529297,
728
+ "epoch": 0.896,
729
+ "grad_norm": 0.4221238954900066,
730
+ "kl": 0.0029735565185546875,
731
+ "learning_rate": 3.0472583095164873e-07,
732
+ "loss": 0.0,
733
+ "reward": 0.4594677835702896,
734
+ "reward_std": 0.3312383443117142,
735
+ "rewards/correct_code_reward_func": 0.10416666977107525,
736
+ "rewards/len_reward_func": 0.3553011566400528,
737
+ "step": 56
738
+ },
739
+ {
740
+ "completion_length": 135.8541717529297,
741
+ "epoch": 0.912,
742
+ "grad_norm": 0.7883647712663692,
743
+ "kl": 0.005859375,
744
+ "learning_rate": 2.983746306385499e-07,
745
+ "loss": 0.0,
746
+ "reward": 0.5004815310239792,
747
+ "reward_std": 0.4576799273490906,
748
+ "rewards/correct_code_reward_func": 0.1458333395421505,
749
+ "rewards/len_reward_func": 0.3546481877565384,
750
+ "step": 57
751
+ },
752
+ {
753
+ "completion_length": 177.625,
754
+ "epoch": 0.928,
755
+ "grad_norm": 0.823591617621225,
756
+ "kl": 0.00780487060546875,
757
+ "learning_rate": 2.919908224675412e-07,
758
+ "loss": 0.0,
759
+ "reward": 0.47625844180583954,
760
+ "reward_std": 0.4644129127264023,
761
+ "rewards/correct_code_reward_func": 0.1666666716337204,
762
+ "rewards/len_reward_func": 0.30959178507328033,
763
+ "step": 58
764
+ },
765
+ {
766
+ "completion_length": 111.95833587646484,
767
+ "epoch": 0.944,
768
+ "grad_norm": 0.7895834198702301,
769
+ "kl": 0.01739501953125,
770
+ "learning_rate": 2.8557870956832133e-07,
771
+ "loss": 0.0,
772
+ "reward": 0.45759040117263794,
773
+ "reward_std": 0.33291806280612946,
774
+ "rewards/correct_code_reward_func": 0.125,
775
+ "rewards/len_reward_func": 0.3325904309749603,
776
+ "step": 59
777
+ },
778
+ {
779
+ "completion_length": 200.64583587646484,
780
+ "epoch": 0.96,
781
+ "grad_norm": 0.398008582749878,
782
+ "kl": 0.00305938720703125,
783
+ "learning_rate": 2.7914261414993976e-07,
784
+ "loss": 0.0,
785
+ "reward": 0.5845803320407867,
786
+ "reward_std": 0.3288002014160156,
787
+ "rewards/correct_code_reward_func": 0.2291666716337204,
788
+ "rewards/len_reward_func": 0.35541366040706635,
789
+ "step": 60
790
+ },
791
+ {
792
+ "completion_length": 123.75000762939453,
793
+ "epoch": 0.976,
794
+ "grad_norm": 0.6258506341186686,
795
+ "kl": 0.00853729248046875,
796
+ "learning_rate": 2.726868745873286e-07,
797
+ "loss": 0.0,
798
+ "reward": 0.5053079277276993,
799
+ "reward_std": 0.45688633620738983,
800
+ "rewards/correct_code_reward_func": 0.16666667722165585,
801
+ "rewards/len_reward_func": 0.3386412411928177,
802
+ "step": 61
803
+ },
804
+ {
805
+ "completion_length": 178.4166717529297,
806
+ "epoch": 0.992,
807
+ "grad_norm": 0.4226697044495165,
808
+ "kl": 0.004604339599609375,
809
+ "learning_rate": 2.662158424969357e-07,
810
+ "loss": 0.0,
811
+ "reward": 0.4181392341852188,
812
+ "reward_std": 0.39034655690193176,
813
+ "rewards/correct_code_reward_func": 0.1041666679084301,
814
+ "rewards/len_reward_func": 0.3139725774526596,
815
+ "step": 62
816
+ },
817
+ {
818
+ "completion_length": 150.375,
819
+ "epoch": 1.0,
820
+ "grad_norm": 0.4226697044495165,
821
+ "kl": 0.00823974609375,
822
+ "learning_rate": 2.597338798034344e-07,
823
+ "loss": 0.0,
824
+ "reward": 0.7250348925590515,
825
+ "reward_std": 0.3359350562095642,
826
+ "rewards/correct_code_reward_func": 0.375,
827
+ "rewards/len_reward_func": 0.3500348925590515,
828
+ "step": 63
829
+ },
830
+ {
831
+ "completion_length": 137.45833587646484,
832
+ "epoch": 1.016,
833
+ "grad_norm": 0.621781928712815,
834
+ "kl": 0.0059356689453125,
835
+ "learning_rate": 2.532453557994827e-07,
836
+ "loss": 0.0,
837
+ "reward": 0.5079408586025238,
838
+ "reward_std": 0.43556541204452515,
839
+ "rewards/correct_code_reward_func": 0.16666667722165585,
840
+ "rewards/len_reward_func": 0.3412741720676422,
841
+ "step": 64
842
+ },
843
+ {
844
+ "completion_length": 237.45834350585938,
845
+ "epoch": 1.032,
846
+ "grad_norm": 0.4700296188876399,
847
+ "kl": 0.00553131103515625,
848
+ "learning_rate": 2.467546442005173e-07,
849
+ "loss": 0.0,
850
+ "reward": 0.3794639855623245,
851
+ "reward_std": 0.3328210711479187,
852
+ "rewards/correct_code_reward_func": 0.0625,
853
+ "rewards/len_reward_func": 0.3169640153646469,
854
+ "step": 65
855
+ },
856
+ {
857
+ "completion_length": 217.33333587646484,
858
+ "epoch": 1.048,
859
+ "grad_norm": 0.47407157003975803,
860
+ "kl": 0.01959228515625,
861
+ "learning_rate": 2.4026612019656556e-07,
862
+ "loss": 0.0,
863
+ "reward": 0.5135317444801331,
864
+ "reward_std": 0.3767416924238205,
865
+ "rewards/correct_code_reward_func": 0.2083333395421505,
866
+ "rewards/len_reward_func": 0.30519840121269226,
867
+ "step": 66
868
+ },
869
+ {
870
+ "completion_length": 159.9791717529297,
871
+ "epoch": 1.064,
872
+ "grad_norm": 1.33408581993753,
873
+ "kl": 0.004638671875,
874
+ "learning_rate": 2.337841575030642e-07,
875
+ "loss": 0.0,
876
+ "reward": 0.5043017268180847,
877
+ "reward_std": 0.5010640621185303,
878
+ "rewards/correct_code_reward_func": 0.1875000111758709,
879
+ "rewards/len_reward_func": 0.3168017417192459,
880
+ "step": 67
881
+ },
882
+ {
883
+ "completion_length": 196.06250762939453,
884
+ "epoch": 1.08,
885
+ "grad_norm": 0.39338975551762256,
886
+ "kl": 0.0045166015625,
887
+ "learning_rate": 2.2731312541267143e-07,
888
+ "loss": 0.0,
889
+ "reward": 0.40096263587474823,
890
+ "reward_std": 0.38369233906269073,
891
+ "rewards/correct_code_reward_func": 0.1041666679084301,
892
+ "rewards/len_reward_func": 0.29679596424102783,
893
+ "step": 68
894
+ },
895
+ {
896
+ "completion_length": 168.20833587646484,
897
+ "epoch": 1.096,
898
+ "grad_norm": 0.5733635012795789,
899
+ "kl": 0.011474609375,
900
+ "learning_rate": 2.2085738585006021e-07,
901
+ "loss": 0.0,
902
+ "reward": 0.5634751617908478,
903
+ "reward_std": 0.42181093990802765,
904
+ "rewards/correct_code_reward_func": 0.229166679084301,
905
+ "rewards/len_reward_func": 0.3343084752559662,
906
+ "step": 69
907
+ },
908
+ {
909
+ "completion_length": 131.62500762939453,
910
+ "epoch": 1.112,
911
+ "grad_norm": 0.5768348851420809,
912
+ "kl": 0.0180511474609375,
913
+ "learning_rate": 2.1442129043167873e-07,
914
+ "loss": 0.0,
915
+ "reward": 0.44008754193782806,
916
+ "reward_std": 0.4175649434328079,
917
+ "rewards/correct_code_reward_func": 0.1458333358168602,
918
+ "rewards/len_reward_func": 0.29425420612096786,
919
+ "step": 70
920
+ },
921
+ {
922
+ "completion_length": 178.1041717529297,
923
+ "epoch": 1.1280000000000001,
924
+ "grad_norm": 0.8562478784181595,
925
+ "kl": 0.01507568359375,
926
+ "learning_rate": 2.0800917753245875e-07,
927
+ "loss": 0.0,
928
+ "reward": 0.42609627544879913,
929
+ "reward_std": 0.395632266998291,
930
+ "rewards/correct_code_reward_func": 0.0833333358168602,
931
+ "rewards/len_reward_func": 0.34276294708251953,
932
+ "step": 71
933
+ },
934
+ {
935
+ "completion_length": 142.43750381469727,
936
+ "epoch": 1.144,
937
+ "grad_norm": 0.5850086202761346,
938
+ "kl": 0.007049560546875,
939
+ "learning_rate": 2.0162536936145008e-07,
940
+ "loss": 0.0,
941
+ "reward": 0.6273844540119171,
942
+ "reward_std": 0.37064287066459656,
943
+ "rewards/correct_code_reward_func": 0.2708333432674408,
944
+ "rewards/len_reward_func": 0.3565511107444763,
945
+ "step": 72
946
+ },
947
+ {
948
+ "completion_length": 173.4166717529297,
949
+ "epoch": 1.16,
950
+ "grad_norm": 1.0060077647295973,
951
+ "kl": 0.01995849609375,
952
+ "learning_rate": 1.9527416904835132e-07,
953
+ "loss": 0.0,
954
+ "reward": 0.6327731013298035,
955
+ "reward_std": 0.40126484632492065,
956
+ "rewards/correct_code_reward_func": 0.3125,
957
+ "rewards/len_reward_func": 0.3202730864286423,
958
+ "step": 73
959
+ },
960
+ {
961
+ "completion_length": 196.5416717529297,
962
+ "epoch": 1.176,
963
+ "grad_norm": 0.5247248856581095,
964
+ "kl": 0.004150390625,
965
+ "learning_rate": 1.889598577429022e-07,
966
+ "loss": 0.0,
967
+ "reward": 0.4798154681921005,
968
+ "reward_std": 0.4057523310184479,
969
+ "rewards/correct_code_reward_func": 0.1458333395421505,
970
+ "rewards/len_reward_func": 0.33398209512233734,
971
+ "step": 74
972
+ },
973
+ {
974
+ "completion_length": 163.37500762939453,
975
+ "epoch": 1.192,
976
+ "grad_norm": 0.44940299985092946,
977
+ "kl": 0.00665283203125,
978
+ "learning_rate": 1.8268669172909136e-07,
979
+ "loss": 0.0,
980
+ "reward": 0.6056158542633057,
981
+ "reward_std": 0.4253086894750595,
982
+ "rewards/correct_code_reward_func": 0.2708333432674408,
983
+ "rewards/len_reward_func": 0.3347824960947037,
984
+ "step": 75
985
+ },
986
+ {
987
+ "completion_length": 127.60417175292969,
988
+ "epoch": 1.208,
989
+ "grad_norm": 0.5154656852815648,
990
+ "kl": 0.0311279296875,
991
+ "learning_rate": 1.7645889955612592e-07,
992
+ "loss": 0.0,
993
+ "reward": 0.5518045127391815,
994
+ "reward_std": 0.46641653776168823,
995
+ "rewards/correct_code_reward_func": 0.2083333358168602,
996
+ "rewards/len_reward_func": 0.3434711843729019,
997
+ "step": 76
998
+ },
999
+ {
1000
+ "completion_length": 126.47917175292969,
1001
+ "epoch": 1.224,
1002
+ "grad_norm": 0.7328993764471904,
1003
+ "kl": 0.01027679443359375,
1004
+ "learning_rate": 1.7028067918809535e-07,
1005
+ "loss": 0.0,
1006
+ "reward": 0.5090649425983429,
1007
+ "reward_std": 0.34914855659008026,
1008
+ "rewards/correct_code_reward_func": 0.20833333395421505,
1009
+ "rewards/len_reward_func": 0.3007315993309021,
1010
+ "step": 77
1011
+ },
1012
+ {
1013
+ "completion_length": 137.2291717529297,
1014
+ "epoch": 1.24,
1015
+ "grad_norm": 0.7418796199300344,
1016
+ "kl": 0.009124755859375,
1017
+ "learning_rate": 1.6415619517425294e-07,
1018
+ "loss": 0.0,
1019
+ "reward": 0.578598827123642,
1020
+ "reward_std": 0.4721776694059372,
1021
+ "rewards/correct_code_reward_func": 0.2500000074505806,
1022
+ "rewards/len_reward_func": 0.32859882712364197,
1023
+ "step": 78
1024
+ },
1025
+ {
1026
+ "completion_length": 158.95833587646484,
1027
+ "epoch": 1.256,
1028
+ "grad_norm": 2.068496131846233,
1029
+ "kl": 0.02191162109375,
1030
+ "learning_rate": 1.5808957584181994e-07,
1031
+ "loss": 0.0,
1032
+ "reward": 0.4869799315929413,
1033
+ "reward_std": 0.4040430933237076,
1034
+ "rewards/correct_code_reward_func": 0.1666666679084301,
1035
+ "rewards/len_reward_func": 0.3203132748603821,
1036
+ "step": 79
1037
+ },
1038
+ {
1039
+ "completion_length": 106.93750381469727,
1040
+ "epoch": 1.272,
1041
+ "grad_norm": 0.5762356445650323,
1042
+ "kl": 0.00872802734375,
1043
+ "learning_rate": 1.5208491051320744e-07,
1044
+ "loss": 0.0,
1045
+ "reward": 0.5087297856807709,
1046
+ "reward_std": 0.39580225944519043,
1047
+ "rewards/correct_code_reward_func": 0.1875000074505806,
1048
+ "rewards/len_reward_func": 0.3212297558784485,
1049
+ "step": 80
1050
+ },
1051
+ {
1052
+ "completion_length": 114.00000381469727,
1053
+ "epoch": 1.288,
1054
+ "grad_norm": 0.786665682978904,
1055
+ "kl": 0.013519287109375,
1056
+ "learning_rate": 1.461462467495284e-07,
1057
+ "loss": 0.0,
1058
+ "reward": 0.4894479066133499,
1059
+ "reward_std": 0.3821127265691757,
1060
+ "rewards/correct_code_reward_func": 0.125,
1061
+ "rewards/len_reward_func": 0.3644479066133499,
1062
+ "step": 81
1063
+ },
1064
+ {
1065
+ "completion_length": 154.83333587646484,
1066
+ "epoch": 1.304,
1067
+ "grad_norm": 0.7107423115065932,
1068
+ "kl": 0.0141754150390625,
1069
+ "learning_rate": 1.4027758762226107e-07,
1070
+ "loss": 0.0,
1071
+ "reward": 0.4811897426843643,
1072
+ "reward_std": 0.3424055427312851,
1073
+ "rewards/correct_code_reward_func": 0.1041666679084301,
1074
+ "rewards/len_reward_func": 0.3770230710506439,
1075
+ "step": 82
1076
+ },
1077
+ {
1078
+ "completion_length": 235.70834350585938,
1079
+ "epoch": 1.32,
1080
+ "grad_norm": 0.43515005998860457,
1081
+ "kl": 0.0047760009765625,
1082
+ "learning_rate": 1.3448288901490092e-07,
1083
+ "loss": 0.0,
1084
+ "reward": 0.4617680013179779,
1085
+ "reward_std": 0.3315645009279251,
1086
+ "rewards/correct_code_reward_func": 0.1250000037252903,
1087
+ "rewards/len_reward_func": 0.3367680013179779,
1088
+ "step": 83
1089
+ },
1090
+ {
1091
+ "completion_length": 143.1666717529297,
1092
+ "epoch": 1.336,
1093
+ "grad_norm": 0.65180392684865,
1094
+ "kl": 0.0086669921875,
1095
+ "learning_rate": 1.2876605695642084e-07,
1096
+ "loss": 0.0,
1097
+ "reward": 0.4678248018026352,
1098
+ "reward_std": 0.3193260580301285,
1099
+ "rewards/correct_code_reward_func": 0.1041666679084301,
1100
+ "rewards/len_reward_func": 0.3636581301689148,
1101
+ "step": 84
1102
+ },
1103
+ {
1104
+ "completion_length": 131.8333396911621,
1105
+ "epoch": 1.3519999999999999,
1106
+ "grad_norm": 0.4416459826072822,
1107
+ "kl": 0.01190185546875,
1108
+ "learning_rate": 1.231309449883361e-07,
1109
+ "loss": 0.0,
1110
+ "reward": 0.41201435029506683,
1111
+ "reward_std": 0.3407471626996994,
1112
+ "rewards/correct_code_reward_func": 0.0416666679084301,
1113
+ "rewards/len_reward_func": 0.37034766376018524,
1114
+ "step": 85
1115
+ },
1116
+ {
1117
+ "completion_length": 103.77083969116211,
1118
+ "epoch": 1.3679999999999999,
1119
+ "grad_norm": 0.43361682764693815,
1120
+ "kl": 0.0066986083984375,
1121
+ "learning_rate": 1.1758135156715041e-07,
1122
+ "loss": 0.0,
1123
+ "reward": 0.423152431845665,
1124
+ "reward_std": 0.3362526297569275,
1125
+ "rewards/correct_code_reward_func": 0.1041666679084301,
1126
+ "rewards/len_reward_func": 0.3189857602119446,
1127
+ "step": 86
1128
+ },
1129
+ {
1130
+ "completion_length": 176.375,
1131
+ "epoch": 1.384,
1132
+ "grad_norm": 1.352523990215239,
1133
+ "kl": 0.03180694580078125,
1134
+ "learning_rate": 1.1212101750393235e-07,
1135
+ "loss": 0.0,
1136
+ "reward": 0.4561140537261963,
1137
+ "reward_std": 0.39378371834754944,
1138
+ "rewards/correct_code_reward_func": 0.10416666977107525,
1139
+ "rewards/len_reward_func": 0.3519473969936371,
1140
+ "step": 87
1141
+ },
1142
+ {
1143
+ "completion_length": 79.97916793823242,
1144
+ "epoch": 1.4,
1145
+ "grad_norm": 2.141260004886994,
1146
+ "kl": 0.0297698974609375,
1147
+ "learning_rate": 1.0675362344274952e-07,
1148
+ "loss": 0.0,
1149
+ "reward": 0.42587006092071533,
1150
+ "reward_std": 0.4436161369085312,
1151
+ "rewards/correct_code_reward_func": 0.125,
1152
+ "rewards/len_reward_func": 0.30087001621723175,
1153
+ "step": 88
1154
+ },
1155
+ {
1156
+ "completion_length": 107.37500381469727,
1157
+ "epoch": 1.416,
1158
+ "grad_norm": 0.4125343610721507,
1159
+ "kl": 0.017974853515625,
1160
+ "learning_rate": 1.0148278737965844e-07,
1161
+ "loss": 0.0,
1162
+ "reward": 0.5323555767536163,
1163
+ "reward_std": 0.4073493778705597,
1164
+ "rewards/correct_code_reward_func": 0.1666666716337204,
1165
+ "rewards/len_reward_func": 0.36568886041641235,
1166
+ "step": 89
1167
+ },
1168
+ {
1169
+ "completion_length": 87.83333587646484,
1170
+ "epoch": 1.432,
1171
+ "grad_norm": 1.3465092335562199,
1172
+ "kl": 0.0328369140625,
1173
+ "learning_rate": 9.631206222392479e-08,
1174
+ "loss": 0.0,
1175
+ "reward": 0.5186317265033722,
1176
+ "reward_std": 0.4107673317193985,
1177
+ "rewards/correct_code_reward_func": 0.1666666716337204,
1178
+ "rewards/len_reward_func": 0.3519650846719742,
1179
+ "step": 90
1180
+ },
1181
+ {
1182
+ "completion_length": 124.33333969116211,
1183
+ "epoch": 1.448,
1184
+ "grad_norm": 0.7479671719828264,
1185
+ "kl": 0.02008056640625,
1186
+ "learning_rate": 9.124493340311537e-08,
1187
+ "loss": 0.0,
1188
+ "reward": 0.5086182951927185,
1189
+ "reward_std": 0.12347583472728729,
1190
+ "rewards/correct_code_reward_func": 0.0833333358168602,
1191
+ "rewards/len_reward_func": 0.4252849221229553,
1192
+ "step": 91
1193
+ },
1194
+ {
1195
+ "completion_length": 167.79166793823242,
1196
+ "epoch": 1.464,
1197
+ "grad_norm": 1.1543274217148969,
1198
+ "kl": 0.0083770751953125,
1199
+ "learning_rate": 8.628481651367875e-08,
1200
+ "loss": 0.0,
1201
+ "reward": 0.6109435856342316,
1202
+ "reward_std": 0.422005370259285,
1203
+ "rewards/correct_code_reward_func": 0.2708333432674408,
1204
+ "rewards/len_reward_func": 0.34011024236679077,
1205
+ "step": 92
1206
+ },
1207
+ {
1208
+ "completion_length": 115.33333969116211,
1209
+ "epoch": 1.48,
1210
+ "grad_norm": 0.3931440618018204,
1211
+ "kl": 0.0080108642578125,
1212
+ "learning_rate": 8.143505501859551e-08,
1213
+ "loss": 0.0,
1214
+ "reward": 0.44855794310569763,
1215
+ "reward_std": 0.32760028541088104,
1216
+ "rewards/correct_code_reward_func": 0.1250000037252903,
1217
+ "rewards/len_reward_func": 0.3235579580068588,
1218
+ "step": 93
1219
+ },
1220
+ {
1221
+ "completion_length": 151.39583587646484,
1222
+ "epoch": 1.496,
1223
+ "grad_norm": 0.5894110998499853,
1224
+ "kl": 0.009307861328125,
1225
+ "learning_rate": 7.669891799365282e-08,
1226
+ "loss": 0.0,
1227
+ "reward": 0.36864979565143585,
1228
+ "reward_std": 0.36151623725891113,
1229
+ "rewards/correct_code_reward_func": 0.0416666679084301,
1230
+ "rewards/len_reward_func": 0.32698309421539307,
1231
+ "step": 94
1232
+ },
1233
+ {
1234
+ "completion_length": 147.7916717529297,
1235
+ "epoch": 1.512,
1236
+ "grad_norm": 0.5972288037218009,
1237
+ "kl": 0.019500732421875,
1238
+ "learning_rate": 7.207959792385998e-08,
1239
+ "loss": 0.0,
1240
+ "reward": 0.4600509703159332,
1241
+ "reward_std": 0.4011112302541733,
1242
+ "rewards/correct_code_reward_func": 0.1041666679084301,
1243
+ "rewards/len_reward_func": 0.3558843284845352,
1244
+ "step": 95
1245
+ },
1246
+ {
1247
+ "completion_length": 149.9791717529297,
1248
+ "epoch": 1.528,
1249
+ "grad_norm": 0.86174582631064,
1250
+ "kl": 0.00836181640625,
1251
+ "learning_rate": 6.758020855149249e-08,
1252
+ "loss": 0.0,
1253
+ "reward": 0.44805125892162323,
1254
+ "reward_std": 0.33948560059070587,
1255
+ "rewards/correct_code_reward_func": 0.125,
1256
+ "rewards/len_reward_func": 0.32305125892162323,
1257
+ "step": 96
1258
+ },
1259
+ {
1260
+ "completion_length": 163.6041717529297,
1261
+ "epoch": 1.544,
1262
+ "grad_norm": 0.36337651628903067,
1263
+ "kl": 0.0181121826171875,
1264
+ "learning_rate": 6.320378277721342e-08,
1265
+ "loss": 0.0,
1266
+ "reward": 0.4679824113845825,
1267
+ "reward_std": 0.4093552529811859,
1268
+ "rewards/correct_code_reward_func": 0.10416666977107525,
1269
+ "rewards/len_reward_func": 0.3638157695531845,
1270
+ "step": 97
1271
+ },
1272
+ {
1273
+ "completion_length": 102.75000381469727,
1274
+ "epoch": 1.56,
1275
+ "grad_norm": 1.175706006208444,
1276
+ "kl": 0.023834228515625,
1277
+ "learning_rate": 5.895327061568775e-08,
1278
+ "loss": 0.0,
1279
+ "reward": 0.5549444258213043,
1280
+ "reward_std": 0.43229806423187256,
1281
+ "rewards/correct_code_reward_func": 0.2500000111758709,
1282
+ "rewards/len_reward_func": 0.3049444109201431,
1283
+ "step": 98
1284
+ },
1285
+ {
1286
+ "completion_length": 144.58333587646484,
1287
+ "epoch": 1.576,
1288
+ "grad_norm": 0.7970883391032116,
1289
+ "kl": 0.011474609375,
1290
+ "learning_rate": 5.483153720706798e-08,
1291
+ "loss": 0.0,
1292
+ "reward": 0.5082628130912781,
1293
+ "reward_std": 0.46417203545570374,
1294
+ "rewards/correct_code_reward_func": 0.2083333432674408,
1295
+ "rewards/len_reward_func": 0.2999294698238373,
1296
+ "step": 99
1297
+ },
1298
+ {
1299
+ "completion_length": 93.41667175292969,
1300
+ "epoch": 1.592,
1301
+ "grad_norm": 0.577298607650083,
1302
+ "kl": 0.0172119140625,
1303
+ "learning_rate": 5.0841360885690996e-08,
1304
+ "loss": 0.0,
1305
+ "reward": 0.43817608058452606,
1306
+ "reward_std": 0.4479677081108093,
1307
+ "rewards/correct_code_reward_func": 0.1458333358168602,
1308
+ "rewards/len_reward_func": 0.29234276711940765,
1309
+ "step": 100
1310
+ },
1311
+ {
1312
+ "completion_length": 112.79167175292969,
1313
+ "epoch": 1.608,
1314
+ "grad_norm": 0.6328737919939129,
1315
+ "kl": 0.0224151611328125,
1316
+ "learning_rate": 4.698543130728755e-08,
1317
+ "loss": 0.0,
1318
+ "reward": 0.6780606508255005,
1319
+ "reward_std": 0.37215377390384674,
1320
+ "rewards/correct_code_reward_func": 0.3333333358168602,
1321
+ "rewards/len_reward_func": 0.3447272926568985,
1322
+ "step": 101
1323
+ },
1324
+ {
1325
+ "completion_length": 125.33333587646484,
1326
+ "epoch": 1.624,
1327
+ "grad_norm": 0.8805964414937946,
1328
+ "kl": 0.007904052734375,
1329
+ "learning_rate": 4.326634763596784e-08,
1330
+ "loss": 0.0,
1331
+ "reward": 0.5231586992740631,
1332
+ "reward_std": 0.39199909567832947,
1333
+ "rewards/correct_code_reward_func": 0.125,
1334
+ "rewards/len_reward_func": 0.3981587141752243,
1335
+ "step": 102
1336
+ },
1337
+ {
1338
+ "completion_length": 143.81250762939453,
1339
+ "epoch": 1.6400000000000001,
1340
+ "grad_norm": 0.4566354927971138,
1341
+ "kl": 0.0088348388671875,
1342
+ "learning_rate": 3.968661679220467e-08,
1343
+ "loss": 0.0,
1344
+ "reward": 0.5204833149909973,
1345
+ "reward_std": 0.25895993411540985,
1346
+ "rewards/correct_code_reward_func": 0.1041666679084301,
1347
+ "rewards/len_reward_func": 0.4163166582584381,
1348
+ "step": 103
1349
+ },
1350
+ {
1351
+ "completion_length": 118.27083587646484,
1352
+ "epoch": 1.6560000000000001,
1353
+ "grad_norm": 0.41571642382372326,
1354
+ "kl": 0.00982666015625,
1355
+ "learning_rate": 3.624865176299499e-08,
1356
+ "loss": 0.0,
1357
+ "reward": 0.6309479027986526,
1358
+ "reward_std": 0.4369208961725235,
1359
+ "rewards/correct_code_reward_func": 0.29166667722165585,
1360
+ "rewards/len_reward_func": 0.33928124606609344,
1361
+ "step": 104
1362
+ },
1363
+ {
1364
+ "completion_length": 127.50000381469727,
1365
+ "epoch": 1.6720000000000002,
1366
+ "grad_norm": 0.44810915489413017,
1367
+ "kl": 0.021331787109375,
1368
+ "learning_rate": 3.295476997533905e-08,
1369
+ "loss": 0.0,
1370
+ "reward": 0.43531325459480286,
1371
+ "reward_std": 0.2511162757873535,
1372
+ "rewards/correct_code_reward_func": 0.0416666679084301,
1373
+ "rewards/len_reward_func": 0.39364662766456604,
1374
+ "step": 105
1375
+ },
1376
+ {
1377
+ "completion_length": 179.75000762939453,
1378
+ "epoch": 1.688,
1379
+ "grad_norm": 0.5235689630320457,
1380
+ "kl": 0.029052734375,
1381
+ "learning_rate": 2.980719173413396e-08,
1382
+ "loss": 0.0,
1383
+ "reward": 0.354349747300148,
1384
+ "reward_std": 0.2833855152130127,
1385
+ "rewards/correct_code_reward_func": 0.0416666679084301,
1386
+ "rewards/len_reward_func": 0.3126830607652664,
1387
+ "step": 106
1388
+ },
1389
+ {
1390
+ "completion_length": 120.50000381469727,
1391
+ "epoch": 1.704,
1392
+ "grad_norm": 1.2523814931919,
1393
+ "kl": 0.0085601806640625,
1394
+ "learning_rate": 2.680803872553408e-08,
1395
+ "loss": 0.0,
1396
+ "reward": 0.48751458525657654,
1397
+ "reward_std": 0.4157916307449341,
1398
+ "rewards/correct_code_reward_func": 0.1666666716337204,
1399
+ "rewards/len_reward_func": 0.32084792852401733,
1400
+ "step": 107
1401
+ },
1402
+ {
1403
+ "completion_length": 151.16666793823242,
1404
+ "epoch": 1.72,
1405
+ "grad_norm": 0.6447932351262547,
1406
+ "kl": 0.00946807861328125,
1407
+ "learning_rate": 2.395933258678745e-08,
1408
+ "loss": 0.0,
1409
+ "reward": 0.6216467022895813,
1410
+ "reward_std": 0.44435153901576996,
1411
+ "rewards/correct_code_reward_func": 0.2708333432674408,
1412
+ "rewards/len_reward_func": 0.3508133441209793,
1413
+ "step": 108
1414
+ },
1415
+ {
1416
+ "completion_length": 123.4375,
1417
+ "epoch": 1.736,
1418
+ "grad_norm": 0.5052351271825365,
1419
+ "kl": 0.0091552734375,
1420
+ "learning_rate": 2.1262993543511715e-08,
1421
+ "loss": 0.0,
1422
+ "reward": 0.6645375192165375,
1423
+ "reward_std": 0.4121406674385071,
1424
+ "rewards/correct_code_reward_func": 0.2916666716337204,
1425
+ "rewards/len_reward_func": 0.37287086248397827,
1426
+ "step": 109
1427
+ },
1428
+ {
1429
+ "completion_length": 153.47916793823242,
1430
+ "epoch": 1.752,
1431
+ "grad_norm": 0.6369640119331228,
1432
+ "kl": 0.0086212158203125,
1433
+ "learning_rate": 1.872083911532907e-08,
1434
+ "loss": 0.0,
1435
+ "reward": 0.40650297701358795,
1436
+ "reward_std": 0.27610746026039124,
1437
+ "rewards/correct_code_reward_func": 0.0416666679084301,
1438
+ "rewards/len_reward_func": 0.36483629047870636,
1439
+ "step": 110
1440
+ },
1441
+ {
1442
+ "completion_length": 132.1041717529297,
1443
+ "epoch": 1.768,
1444
+ "grad_norm": 0.44311314663042145,
1445
+ "kl": 0.01031494140625,
1446
+ "learning_rate": 1.6334582890731697e-08,
1447
+ "loss": 0.0,
1448
+ "reward": 0.7341299653053284,
1449
+ "reward_std": 0.504623532295227,
1450
+ "rewards/correct_code_reward_func": 0.4375,
1451
+ "rewards/len_reward_func": 0.29662999510765076,
1452
+ "step": 111
1453
+ },
1454
+ {
1455
+ "completion_length": 162.0625,
1456
+ "epoch": 1.784,
1457
+ "grad_norm": 0.6586562319299408,
1458
+ "kl": 0.0101318359375,
1459
+ "learning_rate": 1.4105833372004523e-08,
1460
+ "loss": 0.0,
1461
+ "reward": 0.4220695346593857,
1462
+ "reward_std": 0.2905489057302475,
1463
+ "rewards/correct_code_reward_func": 0.0416666679084301,
1464
+ "rewards/len_reward_func": 0.3804028630256653,
1465
+ "step": 112
1466
+ },
1467
+ {
1468
+ "completion_length": 106.89583587646484,
1469
+ "epoch": 1.8,
1470
+ "grad_norm": 0.7764527703399275,
1471
+ "kl": 0.019287109375,
1472
+ "learning_rate": 1.2036092890982619e-08,
1473
+ "loss": 0.0,
1474
+ "reward": 0.5292025506496429,
1475
+ "reward_std": 0.3711909055709839,
1476
+ "rewards/correct_code_reward_func": 0.1458333358168602,
1477
+ "rewards/len_reward_func": 0.38336920738220215,
1478
+ "step": 113
1479
+ },
1480
+ {
1481
+ "completion_length": 132.2916717529297,
1482
+ "epoch": 1.8159999999999998,
1483
+ "grad_norm": 0.6064692969361543,
1484
+ "kl": 0.009307861328125,
1485
+ "learning_rate": 1.0126756596375685e-08,
1486
+ "loss": 0.0,
1487
+ "reward": 0.48789724707603455,
1488
+ "reward_std": 0.3553258925676346,
1489
+ "rewards/correct_code_reward_func": 0.1458333358168602,
1490
+ "rewards/len_reward_func": 0.34206391870975494,
1491
+ "step": 114
1492
+ },
1493
+ {
1494
+ "completion_length": 120.22917175292969,
1495
+ "epoch": 1.8319999999999999,
1496
+ "grad_norm": 0.3843336433099292,
1497
+ "kl": 0.0174560546875,
1498
+ "learning_rate": 8.379111513340753e-09,
1499
+ "loss": 0.0,
1500
+ "reward": 0.47604209184646606,
1501
+ "reward_std": 0.3304741531610489,
1502
+ "rewards/correct_code_reward_func": 0.0833333358168602,
1503
+ "rewards/len_reward_func": 0.39270876348018646,
1504
+ "step": 115
1505
+ },
1506
+ {
1507
+ "completion_length": 145.31250762939453,
1508
+ "epoch": 1.8479999999999999,
1509
+ "grad_norm": 0.5532480666977985,
1510
+ "kl": 0.024169921875,
1511
+ "learning_rate": 6.7943356759381785e-09,
1512
+ "loss": 0.0,
1513
+ "reward": 0.46019650995731354,
1514
+ "reward_std": 0.3715391010046005,
1515
+ "rewards/correct_code_reward_func": 0.1041666679084301,
1516
+ "rewards/len_reward_func": 0.35602983832359314,
1517
+ "step": 116
1518
+ },
1519
+ {
1520
+ "completion_length": 149.93750381469727,
1521
+ "epoch": 1.8639999999999999,
1522
+ "grad_norm": 1.0372718715130624,
1523
+ "kl": 0.00921630859375,
1524
+ "learning_rate": 5.373497333054616e-09,
1525
+ "loss": 0.0,
1526
+ "reward": 0.4656513184309006,
1527
+ "reward_std": 0.44700081646442413,
1528
+ "rewards/correct_code_reward_func": 0.1666666716337204,
1529
+ "rewards/len_reward_func": 0.29898466169834137,
1530
+ "step": 117
1531
+ },
1532
+ {
1533
+ "completion_length": 144.89583587646484,
1534
+ "epoch": 1.88,
1535
+ "grad_norm": 0.4882633725665408,
1536
+ "kl": 0.014007568359375,
1537
+ "learning_rate": 4.117554228329406e-09,
1538
+ "loss": 0.0,
1539
+ "reward": 0.5118084400892258,
1540
+ "reward_std": 0.34739528596401215,
1541
+ "rewards/correct_code_reward_func": 0.1875000074505806,
1542
+ "rewards/len_reward_func": 0.32430844008922577,
1543
+ "step": 118
1544
+ },
1545
+ {
1546
+ "completion_length": 97.45833587646484,
1547
+ "epoch": 1.896,
1548
+ "grad_norm": 0.4820971241858704,
1549
+ "kl": 0.01251220703125,
1550
+ "learning_rate": 3.0273529545687125e-09,
1551
+ "loss": 0.0,
1552
+ "reward": 0.4883972406387329,
1553
+ "reward_std": 0.43138815462589264,
1554
+ "rewards/correct_code_reward_func": 0.12500000558793545,
1555
+ "rewards/len_reward_func": 0.3633972257375717,
1556
+ "step": 119
1557
+ },
1558
+ {
1559
+ "completion_length": 120.95833587646484,
1560
+ "epoch": 1.912,
1561
+ "grad_norm": 0.6000792918829387,
1562
+ "kl": 0.02130126953125,
1563
+ "learning_rate": 2.1036283830834224e-09,
1564
+ "loss": 0.0,
1565
+ "reward": 0.6599289178848267,
1566
+ "reward_std": 0.32202909141778946,
1567
+ "rewards/correct_code_reward_func": 0.2291666716337204,
1568
+ "rewards/len_reward_func": 0.4307622164487839,
1569
+ "step": 120
1570
+ },
1571
+ {
1572
+ "completion_length": 161.4791717529297,
1573
+ "epoch": 1.928,
1574
+ "grad_norm": 0.9160239570442854,
1575
+ "kl": 0.040740966796875,
1576
+ "learning_rate": 1.347003168334665e-09,
1577
+ "loss": 0.0,
1578
+ "reward": 0.6658598780632019,
1579
+ "reward_std": 0.45104770362377167,
1580
+ "rewards/correct_code_reward_func": 0.3750000149011612,
1581
+ "rewards/len_reward_func": 0.2908598631620407,
1582
+ "step": 121
1583
+ },
1584
+ {
1585
+ "completion_length": 197.33333587646484,
1586
+ "epoch": 1.944,
1587
+ "grad_norm": 1.2887565831293122,
1588
+ "kl": 0.0357666015625,
1589
+ "learning_rate": 7.579873282216598e-10,
1590
+ "loss": 0.0,
1591
+ "reward": 0.5509243905544281,
1592
+ "reward_std": 0.42525260150432587,
1593
+ "rewards/correct_code_reward_func": 0.229166679084301,
1594
+ "rewards/len_reward_func": 0.3217576891183853,
1595
+ "step": 122
1596
+ },
1597
+ {
1598
+ "completion_length": 238.39584350585938,
1599
+ "epoch": 1.96,
1600
+ "grad_norm": 0.36929428922486,
1601
+ "kl": 0.0052337646484375,
1602
+ "learning_rate": 3.3697790029424413e-10,
1603
+ "loss": 0.0,
1604
+ "reward": 0.5057644844055176,
1605
+ "reward_std": 0.4373111426830292,
1606
+ "rewards/correct_code_reward_func": 0.1458333395421505,
1607
+ "rewards/len_reward_func": 0.3599311411380768,
1608
+ "step": 123
1609
+ },
1610
+ {
1611
+ "completion_length": 126.41667175292969,
1612
+ "epoch": 1.976,
1613
+ "grad_norm": 0.4901359623088852,
1614
+ "kl": 0.007110595703125,
1615
+ "learning_rate": 8.425867412190091e-11,
1616
+ "loss": 0.0,
1617
+ "reward": 0.5390879511833191,
1618
+ "reward_std": 0.30444033443927765,
1619
+ "rewards/correct_code_reward_func": 0.1666666716337204,
1620
+ "rewards/len_reward_func": 0.3724212795495987,
1621
+ "step": 124
1622
+ },
1623
+ {
1624
+ "completion_length": 196.77083587646484,
1625
+ "epoch": 1.992,
1626
+ "grad_norm": 0.41890098929296765,
1627
+ "kl": 0.005767822265625,
1628
+ "learning_rate": 0.0,
1629
+ "loss": 0.0,
1630
+ "reward": 0.5094343423843384,
1631
+ "reward_std": 0.314863845705986,
1632
+ "rewards/correct_code_reward_func": 0.1041666679084301,
1633
+ "rewards/len_reward_func": 0.4052676856517792,
1634
+ "step": 125
1635
+ },
1636
+ {
1637
+ "epoch": 1.992,
1638
+ "step": 125,
1639
+ "total_flos": 0.0,
1640
+ "train_loss": 8.671089399598486e-06,
1641
+ "train_runtime": 8102.573,
1642
+ "train_samples_per_second": 0.093,
1643
+ "train_steps_per_second": 0.015
1644
+ }
1645
+ ],
1646
+ "logging_steps": 1,
1647
+ "max_steps": 125,
1648
+ "num_input_tokens_seen": 0,
1649
+ "num_train_epochs": 3,
1650
+ "save_steps": 25,
1651
+ "stateful_callbacks": {
1652
+ "TrainerControl": {
1653
+ "args": {
1654
+ "should_epoch_stop": false,
1655
+ "should_evaluate": false,
1656
+ "should_log": false,
1657
+ "should_save": true,
1658
+ "should_training_stop": true
1659
+ },
1660
+ "attributes": {}
1661
+ }
1662
+ },
1663
+ "total_flos": 0.0,
1664
+ "train_batch_size": 1,
1665
+ "trial_name": null,
1666
+ "trial_params": null
1667
+ }