zfz1 committed on
Commit
a912310
·
verified ·
1 Parent(s): 4889d13

Model save

Browse files
README.md ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - trl
4
+ - dpo
5
+ - generated_from_trainer
6
+ model-index:
7
+ - name: deepseek-8b-dpo-full
8
+ results: []
9
+ ---
10
+
11
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
12
+ should probably proofread and complete it, then remove this comment. -->
13
+
14
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/thuzfz1/huggingface/runs/cccqh2t0)
15
+ # deepseek-8b-dpo-full
16
+
17
+ This model was trained from scratch on an unknown dataset.
18
+
19
+ ## Model description
20
+
21
+ More information needed
22
+
23
+ ## Intended uses & limitations
24
+
25
+ More information needed
26
+
27
+ ## Training and evaluation data
28
+
29
+ More information needed
30
+
31
+ ## Training procedure
32
+
33
+ ### Training hyperparameters
34
+
35
+ The following hyperparameters were used during training:
36
+ - learning_rate: 3e-06
37
+ - train_batch_size: 1
38
+ - eval_batch_size: 1
39
+ - seed: 42
40
+ - distributed_type: multi-GPU
41
+ - num_devices: 4
42
+ - gradient_accumulation_steps: 4
43
+ - total_train_batch_size: 16
44
+ - total_eval_batch_size: 4
45
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
46
+ - lr_scheduler_type: cosine
47
+ - lr_scheduler_warmup_ratio: 0.1
48
+ - num_epochs: 1
49
+
50
+ ### Training results
51
+
52
+
53
+
54
+ ### Framework versions
55
+
56
+ - Transformers 4.42.3
57
+ - Pytorch 2.1.2
58
+ - Datasets 2.20.0
59
+ - Tokenizers 0.19.1
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.12455665428638459,
5
+ "train_runtime": 20866.0985,
6
+ "train_samples": 20000,
7
+ "train_samples_per_second": 0.958,
8
+ "train_steps_per_second": 0.06
9
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 100000,
4
+ "eos_token_id": 100001,
5
+ "transformers_version": "4.42.3"
6
+ }
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e2e836e130aff40bb9c45d3d9ea56c7424af699a35f25fec22070d63b9c9dab
3
+ size 4987202208
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a351df07208210518384b2860e2cd6367c544c79b7c6b91f490a510aafa0efc4
3
+ size 4980945440
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e575522f37ab339584105226b764ff60fd52146d38d127df6e97f44f6221fc6
3
+ size 3852615520
model.safetensors.index.json ADDED
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 13820731392
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00003-of-00003.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
13
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
14
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
15
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
16
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
17
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
18
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
19
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
20
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
21
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
22
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
23
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
24
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
25
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
26
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors",
27
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
28
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
29
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
30
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
31
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
32
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
33
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
34
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
35
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors",
36
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
37
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
38
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
39
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
40
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
41
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
42
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
43
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
44
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
45
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
46
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
47
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
48
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
49
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
50
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
51
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
52
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
53
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
54
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
55
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
56
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
57
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
58
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
59
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
60
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
61
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
62
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
63
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
64
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
65
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
66
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
67
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
68
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
69
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
70
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
71
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
72
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
73
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
74
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
75
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
76
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
77
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
78
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
79
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
80
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
81
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
82
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
83
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
84
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
85
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
86
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
87
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
88
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
89
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
90
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
91
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
92
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
93
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
94
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
95
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
96
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
97
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
98
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
99
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
100
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
101
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
102
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
103
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
104
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
105
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
106
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
107
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
108
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
109
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
110
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
111
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
112
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
113
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
114
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
115
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
116
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
117
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
118
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
119
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
120
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
121
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
122
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
123
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
124
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
125
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
126
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
127
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
128
+ "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
129
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
130
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
131
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
132
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
133
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
134
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
135
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
136
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
137
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
138
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
139
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
140
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
141
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
142
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
143
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00003.safetensors",
144
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
145
+ "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
146
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
147
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
148
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
149
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
150
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
151
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
152
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors",
153
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
154
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
155
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
156
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
157
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
158
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
159
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
160
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
161
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors",
162
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
163
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
164
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
165
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
166
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
167
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
168
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
169
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
170
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors",
171
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
172
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
173
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
174
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
175
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
176
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
177
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
178
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
179
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors",
180
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
181
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
182
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
183
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
184
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
185
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
186
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
187
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
188
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors",
189
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
190
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
191
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
192
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
193
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
194
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
195
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
196
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
197
+ "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors",
198
+ "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
199
+ "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
200
+ "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
201
+ "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
202
+ "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
203
+ "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
204
+ "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
205
+ "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
206
+ "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors",
207
+ "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
208
+ "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
209
+ "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
210
+ "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
211
+ "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
212
+ "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
213
+ "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
214
+ "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
215
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
216
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
217
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
218
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
219
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
220
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
221
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
222
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
223
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
224
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
225
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
226
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
227
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
228
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
229
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
230
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
231
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
232
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
233
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
234
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
235
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
236
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
237
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
238
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
239
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
240
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
241
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
242
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
243
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
244
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
245
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
246
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
247
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
248
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
249
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
250
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
251
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
252
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
253
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
254
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
255
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
256
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
257
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
258
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
259
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
260
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
261
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
262
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
263
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
264
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
265
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
266
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
267
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
268
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
269
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
270
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
271
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
272
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
273
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
274
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
275
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
276
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
277
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
278
+ "model.norm.weight": "model-00003-of-00003.safetensors"
279
+ }
280
+ }
runs/Jul16_18-34-08_phe108-jieyuzhao-01/events.out.tfevents.1721180793.phe108-jieyuzhao-01.284499.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:877e328f1817a0c908c80bba24f5f47d9cce0ce14088aa6439500b7fdca8205c
3
- size 92744
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2ac71b79cb67fce4b4d8ca874a27f54c2f0030a5c823d5044d806a0eb1c8615
3
+ size 93098
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.12455665428638459,
5
+ "train_runtime": 20866.0985,
6
+ "train_samples": 20000,
7
+ "train_samples_per_second": 0.958,
8
+ "train_steps_per_second": 0.06
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,1932 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 10000,
6
+ "global_step": 1250,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0008,
13
+ "grad_norm": 21.581922928906643,
14
+ "learning_rate": 2.4e-08,
15
+ "logits/chosen": 58.004119873046875,
16
+ "logits/rejected": 46.01157760620117,
17
+ "logps/chosen": -68.83617401123047,
18
+ "logps/rejected": -57.57984924316406,
19
+ "loss": 0.6931,
20
+ "rewards/accuracies": 0.0,
21
+ "rewards/chosen": 0.0,
22
+ "rewards/margins": 0.0,
23
+ "rewards/rejected": 0.0,
24
+ "step": 1
25
+ },
26
+ {
27
+ "epoch": 0.008,
28
+ "grad_norm": 20.4688268613938,
29
+ "learning_rate": 2.4000000000000003e-07,
30
+ "logits/chosen": 59.60597229003906,
31
+ "logits/rejected": 55.70063781738281,
32
+ "logps/chosen": -57.55007553100586,
33
+ "logps/rejected": -64.12437438964844,
34
+ "loss": 0.6943,
35
+ "rewards/accuracies": 0.5277777910232544,
36
+ "rewards/chosen": 0.0031741515267640352,
37
+ "rewards/margins": 0.004799725487828255,
38
+ "rewards/rejected": -0.0016255751252174377,
39
+ "step": 10
40
+ },
41
+ {
42
+ "epoch": 0.016,
43
+ "grad_norm": 24.0481332048613,
44
+ "learning_rate": 4.800000000000001e-07,
45
+ "logits/chosen": 58.38694381713867,
46
+ "logits/rejected": 57.88800811767578,
47
+ "logps/chosen": -61.777549743652344,
48
+ "logps/rejected": -76.17720031738281,
49
+ "loss": 0.6889,
50
+ "rewards/accuracies": 0.5249999761581421,
51
+ "rewards/chosen": -0.016177240759134293,
52
+ "rewards/margins": -0.016559984534978867,
53
+ "rewards/rejected": 0.0003827471227850765,
54
+ "step": 20
55
+ },
56
+ {
57
+ "epoch": 0.024,
58
+ "grad_norm": 18.48062338133383,
59
+ "learning_rate": 7.2e-07,
60
+ "logits/chosen": 57.91182327270508,
61
+ "logits/rejected": 55.304176330566406,
62
+ "logps/chosen": -64.21894073486328,
63
+ "logps/rejected": -70.75691986083984,
64
+ "loss": 0.6969,
65
+ "rewards/accuracies": 0.4000000059604645,
66
+ "rewards/chosen": -0.016543073579669,
67
+ "rewards/margins": -0.02169782668352127,
68
+ "rewards/rejected": 0.005154752172529697,
69
+ "step": 30
70
+ },
71
+ {
72
+ "epoch": 0.032,
73
+ "grad_norm": 23.114053471975932,
74
+ "learning_rate": 9.600000000000001e-07,
75
+ "logits/chosen": 56.66730880737305,
76
+ "logits/rejected": 58.83241653442383,
77
+ "logps/chosen": -55.49696731567383,
78
+ "logps/rejected": -65.36714172363281,
79
+ "loss": 0.6852,
80
+ "rewards/accuracies": 0.4749999940395355,
81
+ "rewards/chosen": -0.06515228003263474,
82
+ "rewards/margins": -0.007973963394761086,
83
+ "rewards/rejected": -0.0571783110499382,
84
+ "step": 40
85
+ },
86
+ {
87
+ "epoch": 0.04,
88
+ "grad_norm": 17.75345076566636,
89
+ "learning_rate": 1.2000000000000002e-06,
90
+ "logits/chosen": 56.38788986206055,
91
+ "logits/rejected": 56.39375686645508,
92
+ "logps/chosen": -56.10227584838867,
93
+ "logps/rejected": -68.6869125366211,
94
+ "loss": 0.658,
95
+ "rewards/accuracies": 0.6499999761581421,
96
+ "rewards/chosen": -0.12126936763525009,
97
+ "rewards/margins": 0.07603181153535843,
98
+ "rewards/rejected": -0.19730117917060852,
99
+ "step": 50
100
+ },
101
+ {
102
+ "epoch": 0.048,
103
+ "grad_norm": 19.63702779072802,
104
+ "learning_rate": 1.44e-06,
105
+ "logits/chosen": 55.504783630371094,
106
+ "logits/rejected": 58.70302200317383,
107
+ "logps/chosen": -57.737709045410156,
108
+ "logps/rejected": -81.11849975585938,
109
+ "loss": 0.604,
110
+ "rewards/accuracies": 0.675000011920929,
111
+ "rewards/chosen": -0.26448559761047363,
112
+ "rewards/margins": 0.15866820514202118,
113
+ "rewards/rejected": -0.4231537878513336,
114
+ "step": 60
115
+ },
116
+ {
117
+ "epoch": 0.056,
118
+ "grad_norm": 18.137331310839922,
119
+ "learning_rate": 1.6800000000000002e-06,
120
+ "logits/chosen": 57.54669952392578,
121
+ "logits/rejected": 54.60467529296875,
122
+ "logps/chosen": -67.18961334228516,
123
+ "logps/rejected": -77.70694732666016,
124
+ "loss": 0.5428,
125
+ "rewards/accuracies": 0.7749999761581421,
126
+ "rewards/chosen": -0.612514317035675,
127
+ "rewards/margins": 0.23503276705741882,
128
+ "rewards/rejected": -0.847547173500061,
129
+ "step": 70
130
+ },
131
+ {
132
+ "epoch": 0.064,
133
+ "grad_norm": 18.231347330767957,
134
+ "learning_rate": 1.9200000000000003e-06,
135
+ "logits/chosen": 54.9367561340332,
136
+ "logits/rejected": 54.68426513671875,
137
+ "logps/chosen": -62.68987274169922,
138
+ "logps/rejected": -77.52642059326172,
139
+ "loss": 0.5303,
140
+ "rewards/accuracies": 0.675000011920929,
141
+ "rewards/chosen": -1.0836379528045654,
142
+ "rewards/margins": 0.46581095457077026,
143
+ "rewards/rejected": -1.5494489669799805,
144
+ "step": 80
145
+ },
146
+ {
147
+ "epoch": 0.072,
148
+ "grad_norm": 17.73877224591989,
149
+ "learning_rate": 2.16e-06,
150
+ "logits/chosen": 53.1925163269043,
151
+ "logits/rejected": 54.84343338012695,
152
+ "logps/chosen": -75.93465423583984,
153
+ "logps/rejected": -97.27320861816406,
154
+ "loss": 0.4118,
155
+ "rewards/accuracies": 0.8500000238418579,
156
+ "rewards/chosen": -1.2428396940231323,
157
+ "rewards/margins": 1.071587324142456,
158
+ "rewards/rejected": -2.314426898956299,
159
+ "step": 90
160
+ },
161
+ {
162
+ "epoch": 0.08,
163
+ "grad_norm": 19.99662339162552,
164
+ "learning_rate": 2.4000000000000003e-06,
165
+ "logits/chosen": 53.5159797668457,
166
+ "logits/rejected": 50.565223693847656,
167
+ "logps/chosen": -77.43242645263672,
168
+ "logps/rejected": -100.10637664794922,
169
+ "loss": 0.4154,
170
+ "rewards/accuracies": 0.8500000238418579,
171
+ "rewards/chosen": -2.2108376026153564,
172
+ "rewards/margins": 1.270476222038269,
173
+ "rewards/rejected": -3.481314182281494,
174
+ "step": 100
175
+ },
176
+ {
177
+ "epoch": 0.088,
178
+ "grad_norm": 21.75426718419849,
179
+ "learning_rate": 2.64e-06,
180
+ "logits/chosen": 52.46582794189453,
181
+ "logits/rejected": 51.50734329223633,
182
+ "logps/chosen": -81.02312469482422,
183
+ "logps/rejected": -109.15274810791016,
184
+ "loss": 0.3175,
185
+ "rewards/accuracies": 0.8500000238418579,
186
+ "rewards/chosen": -2.748387336730957,
187
+ "rewards/margins": 1.5973384380340576,
188
+ "rewards/rejected": -4.3457255363464355,
189
+ "step": 110
190
+ },
191
+ {
192
+ "epoch": 0.096,
193
+ "grad_norm": 16.929821828111294,
194
+ "learning_rate": 2.88e-06,
195
+ "logits/chosen": 49.478553771972656,
196
+ "logits/rejected": 47.64826583862305,
197
+ "logps/chosen": -77.54215240478516,
198
+ "logps/rejected": -115.6530990600586,
199
+ "loss": 0.3657,
200
+ "rewards/accuracies": 0.824999988079071,
201
+ "rewards/chosen": -2.67846941947937,
202
+ "rewards/margins": 2.1004786491394043,
203
+ "rewards/rejected": -4.778947830200195,
204
+ "step": 120
205
+ },
206
+ {
207
+ "epoch": 0.104,
208
+ "grad_norm": 18.372474710811762,
209
+ "learning_rate": 2.9998537860139563e-06,
210
+ "logits/chosen": 48.161476135253906,
211
+ "logits/rejected": 49.45183563232422,
212
+ "logps/chosen": -99.06620025634766,
213
+ "logps/rejected": -118.9909439086914,
214
+ "loss": 0.3465,
215
+ "rewards/accuracies": 0.7250000238418579,
216
+ "rewards/chosen": -3.597534656524658,
217
+ "rewards/margins": 1.4235634803771973,
218
+ "rewards/rejected": -5.021098613739014,
219
+ "step": 130
220
+ },
221
+ {
222
+ "epoch": 0.112,
223
+ "grad_norm": 11.846677373240393,
224
+ "learning_rate": 2.9986842451482876e-06,
225
+ "logits/chosen": 47.42317199707031,
226
+ "logits/rejected": 47.59636306762695,
227
+ "logps/chosen": -87.74755859375,
228
+ "logps/rejected": -126.07084655761719,
229
+ "loss": 0.285,
230
+ "rewards/accuracies": 0.8500000238418579,
231
+ "rewards/chosen": -3.148099422454834,
232
+ "rewards/margins": 2.943894863128662,
233
+ "rewards/rejected": -6.091994285583496,
234
+ "step": 140
235
+ },
236
+ {
237
+ "epoch": 0.12,
238
+ "grad_norm": 36.496040238235864,
239
+ "learning_rate": 2.9963460753897363e-06,
240
+ "logits/chosen": 45.3277473449707,
241
+ "logits/rejected": 49.25115203857422,
242
+ "logps/chosen": -95.30039978027344,
243
+ "logps/rejected": -133.84915161132812,
244
+ "loss": 0.3391,
245
+ "rewards/accuracies": 0.75,
246
+ "rewards/chosen": -3.688962459564209,
247
+ "rewards/margins": 2.5285627841949463,
248
+ "rewards/rejected": -6.217525482177734,
249
+ "step": 150
250
+ },
251
+ {
252
+ "epoch": 0.128,
253
+ "grad_norm": 19.59471062026771,
254
+ "learning_rate": 2.9928410999727467e-06,
255
+ "logits/chosen": 48.085975646972656,
256
+ "logits/rejected": 47.303443908691406,
257
+ "logps/chosen": -94.16909790039062,
258
+ "logps/rejected": -124.5986557006836,
259
+ "loss": 0.2205,
260
+ "rewards/accuracies": 0.800000011920929,
261
+ "rewards/chosen": -3.4897732734680176,
262
+ "rewards/margins": 2.562234401702881,
263
+ "rewards/rejected": -6.05200719833374,
264
+ "step": 160
265
+ },
266
+ {
267
+ "epoch": 0.136,
268
+ "grad_norm": 12.523322391428595,
269
+ "learning_rate": 2.988172051971717e-06,
270
+ "logits/chosen": 45.95378112792969,
271
+ "logits/rejected": 47.941734313964844,
272
+ "logps/chosen": -95.47936248779297,
273
+ "logps/rejected": -132.12608337402344,
274
+ "loss": 0.2479,
275
+ "rewards/accuracies": 0.8999999761581421,
276
+ "rewards/chosen": -3.5358726978302,
277
+ "rewards/margins": 2.7687032222747803,
278
+ "rewards/rejected": -6.3045759201049805,
279
+ "step": 170
280
+ },
281
+ {
282
+ "epoch": 0.144,
283
+ "grad_norm": 22.507905467341416,
284
+ "learning_rate": 2.9823425721698293e-06,
285
+ "logits/chosen": 47.45555877685547,
286
+ "logits/rejected": 45.6512451171875,
287
+ "logps/chosen": -88.78406524658203,
288
+ "logps/rejected": -130.03347778320312,
289
+ "loss": 0.2046,
290
+ "rewards/accuracies": 0.875,
291
+ "rewards/chosen": -3.3888843059539795,
292
+ "rewards/margins": 3.680863857269287,
293
+ "rewards/rejected": -7.0697479248046875,
294
+ "step": 180
295
+ },
296
+ {
297
+ "epoch": 0.152,
298
+ "grad_norm": 20.747003935771083,
299
+ "learning_rate": 2.975357206220079e-06,
300
+ "logits/chosen": 43.333560943603516,
301
+ "logits/rejected": 47.98616409301758,
302
+ "logps/chosen": -103.88460540771484,
303
+ "logps/rejected": -156.9280242919922,
304
+ "loss": 0.1485,
305
+ "rewards/accuracies": 0.925000011920929,
306
+ "rewards/chosen": -4.8503522872924805,
307
+ "rewards/margins": 3.576326370239258,
308
+ "rewards/rejected": -8.426677703857422,
309
+ "step": 190
310
+ },
311
+ {
312
+ "epoch": 0.16,
313
+ "grad_norm": 17.041744365225764,
314
+ "learning_rate": 2.9672214011007086e-06,
315
+ "logits/chosen": 39.760108947753906,
316
+ "logits/rejected": 39.248085021972656,
317
+ "logps/chosen": -108.03792572021484,
318
+ "logps/rejected": -156.3753204345703,
319
+ "loss": 0.1781,
320
+ "rewards/accuracies": 0.925000011920929,
321
+ "rewards/chosen": -5.024051189422607,
322
+ "rewards/margins": 4.045407295227051,
323
+ "rewards/rejected": -9.0694580078125,
324
+ "step": 200
325
+ },
326
+ {
327
+ "epoch": 0.168,
328
+ "grad_norm": 36.70520837669934,
329
+ "learning_rate": 2.95794150086782e-06,
330
+ "logits/chosen": 37.43067169189453,
331
+ "logits/rejected": 37.81218719482422,
332
+ "logps/chosen": -122.63470458984375,
333
+ "logps/rejected": -180.85458374023438,
334
+ "loss": 0.1757,
335
+ "rewards/accuracies": 0.8999999761581421,
336
+ "rewards/chosen": -6.044098854064941,
337
+ "rewards/margins": 4.934633255004883,
338
+ "rewards/rejected": -10.978731155395508,
339
+ "step": 210
340
+ },
341
+ {
342
+ "epoch": 0.176,
343
+ "grad_norm": 9.451569716987374,
344
+ "learning_rate": 2.9475247417084673e-06,
345
+ "logits/chosen": 38.666202545166016,
346
+ "logits/rejected": 39.08936309814453,
347
+ "logps/chosen": -123.2136001586914,
348
+ "logps/rejected": -177.7836456298828,
349
+ "loss": 0.1577,
350
+ "rewards/accuracies": 0.9750000238418579,
351
+ "rewards/chosen": -5.986439228057861,
352
+ "rewards/margins": 4.965152263641357,
353
+ "rewards/rejected": -10.951591491699219,
354
+ "step": 220
355
+ },
356
+ {
357
+ "epoch": 0.184,
358
+ "grad_norm": 4.732586023204334,
359
+ "learning_rate": 2.9359792462981008e-06,
360
+ "logits/chosen": 37.19866180419922,
361
+ "logits/rejected": 39.60905075073242,
362
+ "logps/chosen": -121.33009338378906,
363
+ "logps/rejected": -176.44203186035156,
364
+ "loss": 0.271,
365
+ "rewards/accuracies": 0.8500000238418579,
366
+ "rewards/chosen": -6.191902160644531,
367
+ "rewards/margins": 4.402517318725586,
368
+ "rewards/rejected": -10.594419479370117,
369
+ "step": 230
370
+ },
371
+ {
372
+ "epoch": 0.192,
373
+ "grad_norm": 46.75079216151069,
374
+ "learning_rate": 2.9233140174667447e-06,
375
+ "logits/chosen": 35.31947326660156,
376
+ "logits/rejected": 39.77277374267578,
377
+ "logps/chosen": -115.7451400756836,
378
+ "logps/rejected": -193.0277557373047,
379
+ "loss": 0.1112,
380
+ "rewards/accuracies": 0.9750000238418579,
381
+ "rewards/chosen": -5.50314998626709,
382
+ "rewards/margins": 6.3514556884765625,
383
+ "rewards/rejected": -11.854605674743652,
384
+ "step": 240
385
+ },
386
+ {
387
+ "epoch": 0.2,
388
+ "grad_norm": 56.466288047816285,
389
+ "learning_rate": 2.9095389311788626e-06,
390
+ "logits/chosen": 33.861488342285156,
391
+ "logits/rejected": 37.11942672729492,
392
+ "logps/chosen": -114.38687896728516,
393
+ "logps/rejected": -182.79983520507812,
394
+ "loss": 0.1955,
395
+ "rewards/accuracies": 0.925000011920929,
396
+ "rewards/chosen": -5.901637077331543,
397
+ "rewards/margins": 5.603636741638184,
398
+ "rewards/rejected": -11.505274772644043,
399
+ "step": 250
400
+ },
401
+ {
402
+ "epoch": 0.208,
403
+ "grad_norm": 11.256333883583846,
404
+ "learning_rate": 2.894664728832377e-06,
405
+ "logits/chosen": 38.68649673461914,
406
+ "logits/rejected": 38.391868591308594,
407
+ "logps/chosen": -125.4790267944336,
408
+ "logps/rejected": -184.40927124023438,
409
+ "loss": 0.1671,
410
+ "rewards/accuracies": 0.875,
411
+ "rewards/chosen": -5.836948871612549,
412
+ "rewards/margins": 5.325386047363281,
413
+ "rewards/rejected": -11.162334442138672,
414
+ "step": 260
415
+ },
416
+ {
417
+ "epoch": 0.216,
418
+ "grad_norm": 26.666948743704236,
419
+ "learning_rate": 2.878703008882852e-06,
420
+ "logits/chosen": 36.955894470214844,
421
+ "logits/rejected": 39.17523193359375,
422
+ "logps/chosen": -107.0689468383789,
423
+ "logps/rejected": -169.14031982421875,
424
+ "loss": 0.21,
425
+ "rewards/accuracies": 0.8999999761581421,
426
+ "rewards/chosen": -5.4622063636779785,
427
+ "rewards/margins": 4.83941125869751,
428
+ "rewards/rejected": -10.301618576049805,
429
+ "step": 270
430
+ },
431
+ {
432
+ "epoch": 0.224,
433
+ "grad_norm": 14.65567488371947,
434
+ "learning_rate": 2.861666217799363e-06,
435
+ "logits/chosen": 38.019309997558594,
436
+ "logits/rejected": 41.11383056640625,
437
+ "logps/chosen": -110.4853515625,
438
+ "logps/rejected": -187.29208374023438,
439
+ "loss": 0.2127,
440
+ "rewards/accuracies": 0.9750000238418579,
441
+ "rewards/chosen": -4.756478309631348,
442
+ "rewards/margins": 6.638999938964844,
443
+ "rewards/rejected": -11.395478248596191,
444
+ "step": 280
445
+ },
446
+ {
447
+ "epoch": 0.232,
448
+ "grad_norm": 4.610753920962251,
449
+ "learning_rate": 2.8435676403591196e-06,
450
+ "logits/chosen": 38.20881271362305,
451
+ "logits/rejected": 35.70587921142578,
452
+ "logps/chosen": -113.76185607910156,
453
+ "logps/rejected": -169.40895080566406,
454
+ "loss": 0.1187,
455
+ "rewards/accuracies": 0.9750000238418579,
456
+ "rewards/chosen": -5.58528995513916,
457
+ "rewards/margins": 5.651334285736084,
458
+ "rewards/rejected": -11.236624717712402,
459
+ "step": 290
460
+ },
461
+ {
462
+ "epoch": 0.24,
463
+ "grad_norm": 16.792649400961125,
464
+ "learning_rate": 2.8244213892883906e-06,
465
+ "logits/chosen": 37.90871810913086,
466
+ "logits/rejected": 41.139713287353516,
467
+ "logps/chosen": -115.63899993896484,
468
+ "logps/rejected": -188.9228973388672,
469
+ "loss": 0.0712,
470
+ "rewards/accuracies": 0.949999988079071,
471
+ "rewards/chosen": -5.2915940284729,
472
+ "rewards/margins": 5.684304237365723,
473
+ "rewards/rejected": -10.975896835327148,
474
+ "step": 300
475
+ },
476
+ {
477
+ "epoch": 0.248,
478
+ "grad_norm": 3.4109197371862177,
479
+ "learning_rate": 2.8042423942578284e-06,
480
+ "logits/chosen": 32.42366027832031,
481
+ "logits/rejected": 37.181610107421875,
482
+ "logps/chosen": -111.64329528808594,
483
+ "logps/rejected": -204.35745239257812,
484
+ "loss": 0.1115,
485
+ "rewards/accuracies": 0.9750000238418579,
486
+ "rewards/chosen": -5.966713905334473,
487
+ "rewards/margins": 7.201415061950684,
488
+ "rewards/rejected": -13.168128967285156,
489
+ "step": 310
490
+ },
491
+ {
492
+ "epoch": 0.256,
493
+ "grad_norm": 34.61265439136052,
494
+ "learning_rate": 2.78304639024076e-06,
495
+ "logits/chosen": 33.11461639404297,
496
+ "logits/rejected": 34.98590087890625,
497
+ "logps/chosen": -129.79061889648438,
498
+ "logps/rejected": -214.4392547607422,
499
+ "loss": 0.1245,
500
+ "rewards/accuracies": 0.9750000238418579,
501
+ "rewards/chosen": -7.288665771484375,
502
+ "rewards/margins": 6.769256591796875,
503
+ "rewards/rejected": -14.05792236328125,
504
+ "step": 320
505
+ },
506
+ {
507
+ "epoch": 0.264,
508
+ "grad_norm": 40.604661066881384,
509
+ "learning_rate": 2.7608499052435266e-06,
510
+ "logits/chosen": 30.30582046508789,
511
+ "logits/rejected": 34.23260498046875,
512
+ "logps/chosen": -120.01988220214844,
513
+ "logps/rejected": -213.86123657226562,
514
+ "loss": 0.1025,
515
+ "rewards/accuracies": 0.949999988079071,
516
+ "rewards/chosen": -6.6563215255737305,
517
+ "rewards/margins": 7.752140998840332,
518
+ "rewards/rejected": -14.408462524414062,
519
+ "step": 330
520
+ },
521
+ {
522
+ "epoch": 0.272,
523
+ "grad_norm": 14.452169107352786,
524
+ "learning_rate": 2.7376702474174426e-06,
525
+ "logits/chosen": 33.35188293457031,
526
+ "logits/rejected": 31.421377182006836,
527
+ "logps/chosen": -136.9534912109375,
528
+ "logps/rejected": -193.56581115722656,
529
+ "loss": 0.121,
530
+ "rewards/accuracies": 0.9750000238418579,
531
+ "rewards/chosen": -7.1245574951171875,
532
+ "rewards/margins": 5.618683338165283,
533
+ "rewards/rejected": -12.743240356445312,
534
+ "step": 340
535
+ },
536
+ {
537
+ "epoch": 0.28,
538
+ "grad_norm": 0.6616288825225272,
539
+ "learning_rate": 2.713525491562421e-06,
540
+ "logits/chosen": 35.41117858886719,
541
+ "logits/rejected": 35.81652069091797,
542
+ "logps/chosen": -142.27957153320312,
543
+ "logps/rejected": -206.56591796875,
544
+ "loss": 0.1007,
545
+ "rewards/accuracies": 0.9750000238418579,
546
+ "rewards/chosen": -7.426796913146973,
547
+ "rewards/margins": 6.085718154907227,
548
+ "rewards/rejected": -13.5125150680542,
549
+ "step": 350
550
+ },
551
+ {
552
+ "epoch": 0.288,
553
+ "grad_norm": 17.254991119966604,
554
+ "learning_rate": 2.688434465032786e-06,
555
+ "logits/chosen": 27.68739891052246,
556
+ "logits/rejected": 32.76261520385742,
557
+ "logps/chosen": -134.08424377441406,
558
+ "logps/rejected": -243.40560913085938,
559
+ "loss": 0.147,
560
+ "rewards/accuracies": 0.9750000238418579,
561
+ "rewards/chosen": -7.258644104003906,
562
+ "rewards/margins": 9.23260498046875,
563
+ "rewards/rejected": -16.491247177124023,
564
+ "step": 360
565
+ },
566
+ {
567
+ "epoch": 0.296,
568
+ "grad_norm": 4.261139222241878,
569
+ "learning_rate": 2.6624167330562694e-06,
570
+ "logits/chosen": 29.108882904052734,
571
+ "logits/rejected": 31.266714096069336,
572
+ "logps/chosen": -134.42970275878906,
573
+ "logps/rejected": -220.4041748046875,
574
+ "loss": 0.0617,
575
+ "rewards/accuracies": 0.925000011920929,
576
+ "rewards/chosen": -7.372394561767578,
577
+ "rewards/margins": 8.266263008117676,
578
+ "rewards/rejected": -15.63865852355957,
579
+ "step": 370
580
+ },
581
+ {
582
+ "epoch": 0.304,
583
+ "grad_norm": 1.262878357266055,
584
+ "learning_rate": 2.6354925834776346e-06,
585
+ "logits/chosen": 27.302906036376953,
586
+ "logits/rejected": 31.609981536865234,
587
+ "logps/chosen": -149.15396118164062,
588
+ "logps/rejected": -241.90420532226562,
589
+ "loss": 0.1221,
590
+ "rewards/accuracies": 0.949999988079071,
591
+ "rewards/chosen": -8.688291549682617,
592
+ "rewards/margins": 8.208234786987305,
593
+ "rewards/rejected": -16.896526336669922,
594
+ "step": 380
595
+ },
596
+ {
597
+ "epoch": 0.312,
598
+ "grad_norm": 43.62912594375019,
599
+ "learning_rate": 2.607683010938826e-06,
600
+ "logits/chosen": 27.132095336914062,
601
+ "logits/rejected": 28.30078125,
602
+ "logps/chosen": -155.2683563232422,
603
+ "logps/rejected": -240.5152587890625,
604
+ "loss": 0.1539,
605
+ "rewards/accuracies": 0.8500000238418579,
606
+ "rewards/chosen": -9.518204689025879,
607
+ "rewards/margins": 7.853548526763916,
608
+ "rewards/rejected": -17.371753692626953,
609
+ "step": 390
610
+ },
611
+ {
612
+ "epoch": 0.32,
613
+ "grad_norm": 31.97742985431739,
614
+ "learning_rate": 2.5790097005079765e-06,
615
+ "logits/chosen": 27.687297821044922,
616
+ "logits/rejected": 30.38442611694336,
617
+ "logps/chosen": -152.99664306640625,
618
+ "logps/rejected": -259.11627197265625,
619
+ "loss": 0.0747,
620
+ "rewards/accuracies": 0.949999988079071,
621
+ "rewards/chosen": -9.019024848937988,
622
+ "rewards/margins": 9.441879272460938,
623
+ "rewards/rejected": -18.46090316772461,
624
+ "step": 400
625
+ },
626
+ {
627
+ "epoch": 0.328,
628
+ "grad_norm": 0.11384820974453211,
629
+ "learning_rate": 2.549495010770048e-06,
630
+ "logits/chosen": 27.36895751953125,
631
+ "logits/rejected": 28.704010009765625,
632
+ "logps/chosen": -144.52304077148438,
633
+ "logps/rejected": -247.2997589111328,
634
+ "loss": 0.1256,
635
+ "rewards/accuracies": 1.0,
636
+ "rewards/chosen": -8.224719047546387,
637
+ "rewards/margins": 9.54539680480957,
638
+ "rewards/rejected": -17.77011489868164,
639
+ "step": 410
640
+ },
641
+ {
642
+ "epoch": 0.336,
643
+ "grad_norm": 7.534465189988814,
644
+ "learning_rate": 2.519161956392275e-06,
645
+ "logits/chosen": 30.696910858154297,
646
+ "logits/rejected": 30.624948501586914,
647
+ "logps/chosen": -144.02389526367188,
648
+ "logps/rejected": -232.913818359375,
649
+ "loss": 0.1559,
650
+ "rewards/accuracies": 0.9750000238418579,
651
+ "rewards/chosen": -8.057051658630371,
652
+ "rewards/margins": 8.463793754577637,
653
+ "rewards/rejected": -16.520845413208008,
654
+ "step": 420
655
+ },
656
+ {
657
+ "epoch": 0.344,
658
+ "grad_norm": 23.270592204413173,
659
+ "learning_rate": 2.4880341901780208e-06,
660
+ "logits/chosen": 31.727802276611328,
661
+ "logits/rejected": 34.599830627441406,
662
+ "logps/chosen": -146.5982208251953,
663
+ "logps/rejected": -236.9903106689453,
664
+ "loss": 0.101,
665
+ "rewards/accuracies": 0.925000011920929,
666
+ "rewards/chosen": -8.658833503723145,
667
+ "rewards/margins": 7.560339450836182,
668
+ "rewards/rejected": -16.219173431396484,
669
+ "step": 430
670
+ },
671
+ {
672
+ "epoch": 0.352,
673
+ "grad_norm": 18.404989253529585,
674
+ "learning_rate": 2.456135984623035e-06,
675
+ "logits/chosen": 31.22390365600586,
676
+ "logits/rejected": 34.494503021240234,
677
+ "logps/chosen": -131.6576385498047,
678
+ "logps/rejected": -224.7453155517578,
679
+ "loss": 0.0794,
680
+ "rewards/accuracies": 0.949999988079071,
681
+ "rewards/chosen": -8.250141143798828,
682
+ "rewards/margins": 7.884213447570801,
683
+ "rewards/rejected": -16.134353637695312,
684
+ "step": 440
685
+ },
686
+ {
687
+ "epoch": 0.36,
688
+ "grad_norm": 5.835930327262679,
689
+ "learning_rate": 2.4234922129884873e-06,
690
+ "logits/chosen": 31.049453735351562,
691
+ "logits/rejected": 32.644561767578125,
692
+ "logps/chosen": -136.99464416503906,
693
+ "logps/rejected": -229.085693359375,
694
+ "loss": 0.0925,
695
+ "rewards/accuracies": 0.925000011920929,
696
+ "rewards/chosen": -7.603506565093994,
697
+ "rewards/margins": 8.22996997833252,
698
+ "rewards/rejected": -15.833475112915039,
699
+ "step": 450
700
+ },
701
+ {
702
+ "epoch": 0.368,
703
+ "grad_norm": 83.38465938371176,
704
+ "learning_rate": 2.3901283299055523e-06,
705
+ "logits/chosen": 29.210134506225586,
706
+ "logits/rejected": 33.91218185424805,
707
+ "logps/chosen": -133.32357788085938,
708
+ "logps/rejected": -240.31906127929688,
709
+ "loss": 0.0892,
710
+ "rewards/accuracies": 0.9750000238418579,
711
+ "rewards/chosen": -7.826475620269775,
712
+ "rewards/margins": 8.995865821838379,
713
+ "rewards/rejected": -16.82234001159668,
714
+ "step": 460
715
+ },
716
+ {
717
+ "epoch": 0.376,
718
+ "grad_norm": 2.502030553529166,
719
+ "learning_rate": 2.356070351526648e-06,
720
+ "logits/chosen": 30.35089111328125,
721
+ "logits/rejected": 29.741165161132812,
722
+ "logps/chosen": -145.46237182617188,
723
+ "logps/rejected": -234.22525024414062,
724
+ "loss": 0.0886,
725
+ "rewards/accuracies": 0.9750000238418579,
726
+ "rewards/chosen": -8.746350288391113,
727
+ "rewards/margins": 8.287931442260742,
728
+ "rewards/rejected": -17.034282684326172,
729
+ "step": 470
730
+ },
731
+ {
732
+ "epoch": 0.384,
733
+ "grad_norm": 14.091730335356385,
734
+ "learning_rate": 2.3213448352388254e-06,
735
+ "logits/chosen": 27.40460777282715,
736
+ "logits/rejected": 31.037439346313477,
737
+ "logps/chosen": -142.2796173095703,
738
+ "logps/rejected": -239.9733428955078,
739
+ "loss": 0.064,
740
+ "rewards/accuracies": 0.949999988079071,
741
+ "rewards/chosen": -9.121622085571289,
742
+ "rewards/margins": 8.313209533691406,
743
+ "rewards/rejected": -17.434831619262695,
744
+ "step": 480
745
+ },
746
+ {
747
+ "epoch": 0.392,
748
+ "grad_norm": 11.107798849175124,
749
+ "learning_rate": 2.285978858955119e-06,
750
+ "logits/chosen": 31.397411346435547,
751
+ "logits/rejected": 31.57785987854004,
752
+ "logps/chosen": -137.85682678222656,
753
+ "logps/rejected": -237.64340209960938,
754
+ "loss": 0.0405,
755
+ "rewards/accuracies": 0.9750000238418579,
756
+ "rewards/chosen": -7.711313724517822,
757
+ "rewards/margins": 8.816143035888672,
758
+ "rewards/rejected": -16.527456283569336,
759
+ "step": 490
760
+ },
761
+ {
762
+ "epoch": 0.4,
763
+ "grad_norm": 0.7122560435001203,
764
+ "learning_rate": 2.25e-06,
765
+ "logits/chosen": 29.285724639892578,
766
+ "logits/rejected": 33.19956970214844,
767
+ "logps/chosen": -150.15484619140625,
768
+ "logps/rejected": -258.1266784667969,
769
+ "loss": 0.0857,
770
+ "rewards/accuracies": 0.949999988079071,
771
+ "rewards/chosen": -8.766794204711914,
772
+ "rewards/margins": 9.55189323425293,
773
+ "rewards/rejected": -18.31868553161621,
774
+ "step": 500
775
+ },
776
+ {
777
+ "epoch": 0.408,
778
+ "grad_norm": 1.1236023709740472,
779
+ "learning_rate": 2.213436313605413e-06,
780
+ "logits/chosen": 27.258758544921875,
781
+ "logits/rejected": 32.38069152832031,
782
+ "logps/chosen": -140.72256469726562,
783
+ "logps/rejected": -257.39410400390625,
784
+ "loss": 0.0529,
785
+ "rewards/accuracies": 0.9750000238418579,
786
+ "rewards/chosen": -8.840039253234863,
787
+ "rewards/margins": 10.10526180267334,
788
+ "rewards/rejected": -18.945301055908203,
789
+ "step": 510
790
+ },
791
+ {
792
+ "epoch": 0.416,
793
+ "grad_norm": 42.2808961940555,
794
+ "learning_rate": 2.1763163110341462e-06,
795
+ "logits/chosen": 26.64520263671875,
796
+ "logits/rejected": 29.211254119873047,
797
+ "logps/chosen": -143.02182006835938,
798
+ "logps/rejected": -262.0480041503906,
799
+ "loss": 0.0919,
800
+ "rewards/accuracies": 1.0,
801
+ "rewards/chosen": -8.261417388916016,
802
+ "rewards/margins": 10.782800674438477,
803
+ "rewards/rejected": -19.044218063354492,
804
+ "step": 520
805
+ },
806
+ {
807
+ "epoch": 0.424,
808
+ "grad_norm": 5.223417864951368,
809
+ "learning_rate": 2.138668937347609e-06,
810
+ "logits/chosen": 25.72537612915039,
811
+ "logits/rejected": 28.29427146911621,
812
+ "logps/chosen": -153.34121704101562,
813
+ "logps/rejected": -274.3043212890625,
814
+ "loss": 0.1288,
815
+ "rewards/accuracies": 1.0,
816
+ "rewards/chosen": -9.276151657104492,
817
+ "rewards/margins": 11.748897552490234,
818
+ "rewards/rejected": -21.02505111694336,
819
+ "step": 530
820
+ },
821
+ {
822
+ "epoch": 0.432,
823
+ "grad_norm": 13.59100794053642,
824
+ "learning_rate": 2.100523548835343e-06,
825
+ "logits/chosen": 26.498031616210938,
826
+ "logits/rejected": 29.164413452148438,
827
+ "logps/chosen": -168.49325561523438,
828
+ "logps/rejected": -274.753173828125,
829
+ "loss": 0.0632,
830
+ "rewards/accuracies": 0.9750000238418579,
831
+ "rewards/chosen": -11.204986572265625,
832
+ "rewards/margins": 9.171854972839355,
833
+ "rewards/rejected": -20.376840591430664,
834
+ "step": 540
835
+ },
836
+ {
837
+ "epoch": 0.44,
838
+ "grad_norm": 6.137158026956021,
839
+ "learning_rate": 2.061909890123868e-06,
840
+ "logits/chosen": 23.507408142089844,
841
+ "logits/rejected": 27.2824764251709,
842
+ "logps/chosen": -153.0961456298828,
843
+ "logps/rejected": -278.5215148925781,
844
+ "loss": 0.0752,
845
+ "rewards/accuracies": 0.9750000238418579,
846
+ "rewards/chosen": -9.87879753112793,
847
+ "rewards/margins": 10.660184860229492,
848
+ "rewards/rejected": -20.538986206054688,
849
+ "step": 550
850
+ },
851
+ {
852
+ "epoch": 0.448,
853
+ "grad_norm": 3.456576564579184,
854
+ "learning_rate": 2.022858070982723e-06,
855
+ "logits/chosen": 24.48337745666504,
856
+ "logits/rejected": 27.21124839782715,
857
+ "logps/chosen": -173.62545776367188,
858
+ "logps/rejected": -283.33575439453125,
859
+ "loss": 0.0325,
860
+ "rewards/accuracies": 0.9750000238418579,
861
+ "rewards/chosen": -10.958311080932617,
862
+ "rewards/margins": 10.133281707763672,
863
+ "rewards/rejected": -21.09159278869629,
864
+ "step": 560
865
+ },
866
+ {
867
+ "epoch": 0.456,
868
+ "grad_norm": 40.06033927173472,
869
+ "learning_rate": 1.983398542845767e-06,
870
+ "logits/chosen": 23.440534591674805,
871
+ "logits/rejected": 24.654071807861328,
872
+ "logps/chosen": -153.51531982421875,
873
+ "logps/rejected": -272.8458251953125,
874
+ "loss": 0.0788,
875
+ "rewards/accuracies": 0.9750000238418579,
876
+ "rewards/chosen": -9.767447471618652,
877
+ "rewards/margins": 11.361773490905762,
878
+ "rewards/rejected": -21.129222869873047,
879
+ "step": 570
880
+ },
881
+ {
882
+ "epoch": 0.464,
883
+ "grad_norm": 0.1813350915194513,
884
+ "learning_rate": 1.9435620750660703e-06,
885
+ "logits/chosen": 20.848228454589844,
886
+ "logits/rejected": 24.540836334228516,
887
+ "logps/chosen": -139.9002227783203,
888
+ "logps/rejected": -272.963623046875,
889
+ "loss": 0.0687,
890
+ "rewards/accuracies": 0.9750000238418579,
891
+ "rewards/chosen": -8.545764923095703,
892
+ "rewards/margins": 12.114400863647461,
893
+ "rewards/rejected": -20.660165786743164,
894
+ "step": 580
895
+ },
896
+ {
897
+ "epoch": 0.472,
898
+ "grad_norm": 45.254112699453046,
899
+ "learning_rate": 1.9033797309228985e-06,
900
+ "logits/chosen": 19.232898712158203,
901
+ "logits/rejected": 23.08200454711914,
902
+ "logps/chosen": -163.60186767578125,
903
+ "logps/rejected": -292.57952880859375,
904
+ "loss": 0.3063,
905
+ "rewards/accuracies": 0.925000011920929,
906
+ "rewards/chosen": -10.544285774230957,
907
+ "rewards/margins": 11.296300888061523,
908
+ "rewards/rejected": -21.840587615966797,
909
+ "step": 590
910
+ },
911
+ {
912
+ "epoch": 0.48,
913
+ "grad_norm": 2.147234675582369,
914
+ "learning_rate": 1.8628828433995015e-06,
915
+ "logits/chosen": 24.288915634155273,
916
+ "logits/rejected": 26.083560943603516,
917
+ "logps/chosen": -165.03009033203125,
918
+ "logps/rejected": -274.15594482421875,
919
+ "loss": 0.0344,
920
+ "rewards/accuracies": 0.9750000238418579,
921
+ "rewards/chosen": -9.776161193847656,
922
+ "rewards/margins": 10.559597969055176,
923
+ "rewards/rejected": -20.335758209228516,
924
+ "step": 600
925
+ },
926
+ {
927
+ "epoch": 0.488,
928
+ "grad_norm": 32.88041793576487,
929
+ "learning_rate": 1.822102990750595e-06,
930
+ "logits/chosen": 21.8018798828125,
931
+ "logits/rejected": 26.132715225219727,
932
+ "logps/chosen": -159.8308563232422,
933
+ "logps/rejected": -299.31707763671875,
934
+ "loss": 0.059,
935
+ "rewards/accuracies": 0.9750000238418579,
936
+ "rewards/chosen": -9.872952461242676,
937
+ "rewards/margins": 12.362961769104004,
938
+ "rewards/rejected": -22.235912322998047,
939
+ "step": 610
940
+ },
941
+ {
942
+ "epoch": 0.496,
943
+ "grad_norm": 27.101516320021904,
944
+ "learning_rate": 1.7810719718785873e-06,
945
+ "logits/chosen": 22.754619598388672,
946
+ "logits/rejected": 26.49776268005371,
947
+ "logps/chosen": -159.6096954345703,
948
+ "logps/rejected": -283.2123718261719,
949
+ "loss": 0.0772,
950
+ "rewards/accuracies": 0.9750000238418579,
951
+ "rewards/chosen": -10.500214576721191,
952
+ "rewards/margins": 10.847715377807617,
953
+ "rewards/rejected": -21.34792709350586,
954
+ "step": 620
955
+ },
956
+ {
957
+ "epoch": 0.504,
958
+ "grad_norm": 12.207017549903293,
959
+ "learning_rate": 1.7398217815377524e-06,
960
+ "logits/chosen": 23.825117111206055,
961
+ "logits/rejected": 24.585908889770508,
962
+ "logps/chosen": -167.65447998046875,
963
+ "logps/rejected": -288.53851318359375,
964
+ "loss": 0.1162,
965
+ "rewards/accuracies": 1.0,
966
+ "rewards/chosen": -10.316202163696289,
967
+ "rewards/margins": 12.006393432617188,
968
+ "rewards/rejected": -22.322595596313477,
969
+ "step": 630
970
+ },
971
+ {
972
+ "epoch": 0.512,
973
+ "grad_norm": 5.163064270145757,
974
+ "learning_rate": 1.698384585385684e-06,
975
+ "logits/chosen": 23.767126083374023,
976
+ "logits/rejected": 23.51525115966797,
977
+ "logps/chosen": -180.27285766601562,
978
+ "logps/rejected": -302.7817077636719,
979
+ "loss": 0.0808,
980
+ "rewards/accuracies": 0.9750000238418579,
981
+ "rewards/chosen": -10.770536422729492,
982
+ "rewards/margins": 12.786243438720703,
983
+ "rewards/rejected": -23.556777954101562,
984
+ "step": 640
985
+ },
986
+ {
987
+ "epoch": 0.52,
988
+ "grad_norm": 19.70083585367663,
989
+ "learning_rate": 1.6567926949014804e-06,
990
+ "logits/chosen": 21.79136085510254,
991
+ "logits/rejected": 26.325298309326172,
992
+ "logps/chosen": -160.727294921875,
993
+ "logps/rejected": -299.2230224609375,
994
+ "loss": 0.0831,
995
+ "rewards/accuracies": 0.9750000238418579,
996
+ "rewards/chosen": -10.32923698425293,
997
+ "rewards/margins": 12.3799467086792,
998
+ "rewards/rejected": -22.709184646606445,
999
+ "step": 650
1000
+ },
1001
+ {
1002
+ "epoch": 0.528,
1003
+ "grad_norm": 58.109980325157714,
1004
+ "learning_rate": 1.615078542190228e-06,
1005
+ "logits/chosen": 18.619295120239258,
1006
+ "logits/rejected": 23.427764892578125,
1007
+ "logps/chosen": -153.34585571289062,
1008
+ "logps/rejected": -298.84844970703125,
1009
+ "loss": 0.0532,
1010
+ "rewards/accuracies": 0.9750000238418579,
1011
+ "rewards/chosen": -9.80422306060791,
1012
+ "rewards/margins": 12.987719535827637,
1013
+ "rewards/rejected": -22.791942596435547,
1014
+ "step": 660
1015
+ },
1016
+ {
1017
+ "epoch": 0.536,
1018
+ "grad_norm": 12.579940547510622,
1019
+ "learning_rate": 1.5732746546934201e-06,
1020
+ "logits/chosen": 18.508384704589844,
1021
+ "logits/rejected": 23.189361572265625,
1022
+ "logps/chosen": -153.68649291992188,
1023
+ "logps/rejected": -281.5841979980469,
1024
+ "loss": 0.0459,
1025
+ "rewards/accuracies": 0.949999988079071,
1026
+ "rewards/chosen": -10.447421073913574,
1027
+ "rewards/margins": 11.287437438964844,
1028
+ "rewards/rejected": -21.7348575592041,
1029
+ "step": 670
1030
+ },
1031
+ {
1032
+ "epoch": 0.544,
1033
+ "grad_norm": 3.9836905566738907,
1034
+ "learning_rate": 1.5314136298250356e-06,
1035
+ "logits/chosen": 19.70537567138672,
1036
+ "logits/rejected": 21.990859985351562,
1037
+ "logps/chosen": -177.63992309570312,
1038
+ "logps/rejected": -290.6033630371094,
1039
+ "loss": 0.0641,
1040
+ "rewards/accuracies": 1.0,
1041
+ "rewards/chosen": -11.616503715515137,
1042
+ "rewards/margins": 10.820137023925781,
1043
+ "rewards/rejected": -22.436641693115234,
1044
+ "step": 680
1045
+ },
1046
+ {
1047
+ "epoch": 0.552,
1048
+ "grad_norm": 0.03655701442918885,
1049
+ "learning_rate": 1.4895281095530578e-06,
1050
+ "logits/chosen": 19.422595977783203,
1051
+ "logits/rejected": 21.195425033569336,
1052
+ "logps/chosen": -176.49839782714844,
1053
+ "logps/rejected": -313.53497314453125,
1054
+ "loss": 0.0229,
1055
+ "rewards/accuracies": 1.0,
1056
+ "rewards/chosen": -11.55458927154541,
1057
+ "rewards/margins": 13.00189208984375,
1058
+ "rewards/rejected": -24.556480407714844,
1059
+ "step": 690
1060
+ },
1061
+ {
1062
+ "epoch": 0.56,
1063
+ "grad_norm": 14.189469953880016,
1064
+ "learning_rate": 1.4476507549462489e-06,
1065
+ "logits/chosen": 19.599300384521484,
1066
+ "logits/rejected": 22.10235023498535,
1067
+ "logps/chosen": -175.69180297851562,
1068
+ "logps/rejected": -306.8517150878906,
1069
+ "loss": 0.0487,
1070
+ "rewards/accuracies": 1.0,
1071
+ "rewards/chosen": -11.599604606628418,
1072
+ "rewards/margins": 12.865476608276367,
1073
+ "rewards/rejected": -24.46508026123047,
1074
+ "step": 700
1075
+ },
1076
+ {
1077
+ "epoch": 0.568,
1078
+ "grad_norm": 16.687171595570835,
1079
+ "learning_rate": 1.40581422070603e-06,
1080
+ "logits/chosen": 18.665790557861328,
1081
+ "logits/rejected": 22.680692672729492,
1082
+ "logps/chosen": -139.86900329589844,
1083
+ "logps/rejected": -305.75372314453125,
1084
+ "loss": 0.0259,
1085
+ "rewards/accuracies": 1.0,
1086
+ "rewards/chosen": -9.141308784484863,
1087
+ "rewards/margins": 14.639094352722168,
1088
+ "rewards/rejected": -23.78040313720703,
1089
+ "step": 710
1090
+ },
1091
+ {
1092
+ "epoch": 0.576,
1093
+ "grad_norm": 30.88419300499099,
1094
+ "learning_rate": 1.36405112970333e-06,
1095
+ "logits/chosen": 20.21243667602539,
1096
+ "logits/rejected": 22.893695831298828,
1097
+ "logps/chosen": -158.63064575195312,
1098
+ "logps/rejected": -314.7591247558594,
1099
+ "loss": 0.0607,
1100
+ "rewards/accuracies": 1.0,
1101
+ "rewards/chosen": -9.554333686828613,
1102
+ "rewards/margins": 14.731111526489258,
1103
+ "rewards/rejected": -24.285442352294922,
1104
+ "step": 720
1105
+ },
1106
+ {
1107
+ "epoch": 0.584,
1108
+ "grad_norm": 0.5428652197520796,
1109
+ "learning_rate": 1.3223940475402486e-06,
1110
+ "logits/chosen": 18.03298568725586,
1111
+ "logits/rejected": 20.249217987060547,
1112
+ "logps/chosen": -156.5733642578125,
1113
+ "logps/rejected": -343.110107421875,
1114
+ "loss": 0.0561,
1115
+ "rewards/accuracies": 1.0,
1116
+ "rewards/chosen": -9.43940258026123,
1117
+ "rewards/margins": 17.574020385742188,
1118
+ "rewards/rejected": -27.0134220123291,
1119
+ "step": 730
1120
+ },
1121
+ {
1122
+ "epoch": 0.592,
1123
+ "grad_norm": 1.2508849959109967,
1124
+ "learning_rate": 1.2808754571563827e-06,
1125
+ "logits/chosen": 20.323490142822266,
1126
+ "logits/rejected": 20.145631790161133,
1127
+ "logps/chosen": -172.84413146972656,
1128
+ "logps/rejected": -311.8101501464844,
1129
+ "loss": 0.0922,
1130
+ "rewards/accuracies": 1.0,
1131
+ "rewards/chosen": -11.189407348632812,
1132
+ "rewards/margins": 13.641016960144043,
1133
+ "rewards/rejected": -24.830425262451172,
1134
+ "step": 740
1135
+ },
1136
+ {
1137
+ "epoch": 0.6,
1138
+ "grad_norm": 2.991663806982443,
1139
+ "learning_rate": 1.2395277334996047e-06,
1140
+ "logits/chosen": 18.46148681640625,
1141
+ "logits/rejected": 19.215234756469727,
1142
+ "logps/chosen": -161.73716735839844,
1143
+ "logps/rejected": -280.82550048828125,
1144
+ "loss": 0.0407,
1145
+ "rewards/accuracies": 0.9750000238418579,
1146
+ "rewards/chosen": -9.929094314575195,
1147
+ "rewards/margins": 11.809922218322754,
1148
+ "rewards/rejected": -21.739017486572266,
1149
+ "step": 750
1150
+ },
1151
+ {
1152
+ "epoch": 0.608,
1153
+ "grad_norm": 20.4365745548988,
1154
+ "learning_rate": 1.1983831182810534e-06,
1155
+ "logits/chosen": 18.275842666625977,
1156
+ "logits/rejected": 21.843782424926758,
1157
+ "logps/chosen": -164.38780212402344,
1158
+ "logps/rejected": -306.2947998046875,
1159
+ "loss": 0.0164,
1160
+ "rewards/accuracies": 1.0,
1161
+ "rewards/chosen": -11.01513957977295,
1162
+ "rewards/margins": 12.679012298583984,
1163
+ "rewards/rejected": -23.694150924682617,
1164
+ "step": 760
1165
+ },
1166
+ {
1167
+ "epoch": 0.616,
1168
+ "grad_norm": 55.8762128914233,
1169
+ "learning_rate": 1.1574736948340164e-06,
1170
+ "logits/chosen": 17.70195960998535,
1171
+ "logits/rejected": 19.546524047851562,
1172
+ "logps/chosen": -178.3878173828125,
1173
+ "logps/rejected": -305.541259765625,
1174
+ "loss": 0.0884,
1175
+ "rewards/accuracies": 0.9750000238418579,
1176
+ "rewards/chosen": -11.889364242553711,
1177
+ "rewards/margins": 11.622591972351074,
1178
+ "rewards/rejected": -23.5119571685791,
1179
+ "step": 770
1180
+ },
1181
+ {
1182
+ "epoch": 0.624,
1183
+ "grad_norm": 16.837136970797925,
1184
+ "learning_rate": 1.1168313630963144e-06,
1185
+ "logits/chosen": 14.999873161315918,
1186
+ "logits/rejected": 17.94775390625,
1187
+ "logps/chosen": -173.88241577148438,
1188
+ "logps/rejected": -343.7441101074219,
1189
+ "loss": 0.0886,
1190
+ "rewards/accuracies": 1.0,
1191
+ "rewards/chosen": -11.541296005249023,
1192
+ "rewards/margins": 15.598657608032227,
1193
+ "rewards/rejected": -27.13995361328125,
1194
+ "step": 780
1195
+ },
1196
+ {
1197
+ "epoch": 0.632,
1198
+ "grad_norm": 20.598469671539753,
1199
+ "learning_rate": 1.0764878147356852e-06,
1200
+ "logits/chosen": 16.952680587768555,
1201
+ "logits/rejected": 19.94651222229004,
1202
+ "logps/chosen": -156.38262939453125,
1203
+ "logps/rejected": -301.9223327636719,
1204
+ "loss": 0.0413,
1205
+ "rewards/accuracies": 1.0,
1206
+ "rewards/chosen": -10.413267135620117,
1207
+ "rewards/margins": 13.252962112426758,
1208
+ "rewards/rejected": -23.66622543334961,
1209
+ "step": 790
1210
+ },
1211
+ {
1212
+ "epoch": 0.64,
1213
+ "grad_norm": 0.7362208350991403,
1214
+ "learning_rate": 1.036474508437579e-06,
1215
+ "logits/chosen": 19.5678653717041,
1216
+ "logits/rejected": 21.027923583984375,
1217
+ "logps/chosen": -173.03187561035156,
1218
+ "logps/rejected": -347.1485900878906,
1219
+ "loss": 0.0423,
1220
+ "rewards/accuracies": 0.9750000238418579,
1221
+ "rewards/chosen": -10.36100959777832,
1222
+ "rewards/margins": 16.25722885131836,
1223
+ "rewards/rejected": -26.618236541748047,
1224
+ "step": 800
1225
+ },
1226
+ {
1227
+ "epoch": 0.648,
1228
+ "grad_norm": 0.46643139845120424,
1229
+ "learning_rate": 9.968226453746177e-07,
1230
+ "logits/chosen": 14.546850204467773,
1231
+ "logits/rejected": 18.3260555267334,
1232
+ "logps/chosen": -182.55184936523438,
1233
+ "logps/rejected": -322.73321533203125,
1234
+ "loss": 0.0687,
1235
+ "rewards/accuracies": 1.0,
1236
+ "rewards/chosen": -12.82792854309082,
1237
+ "rewards/margins": 12.780898094177246,
1238
+ "rewards/rejected": -25.60882568359375,
1239
+ "step": 810
1240
+ },
1241
+ {
1242
+ "epoch": 0.656,
1243
+ "grad_norm": 0.007677469812290264,
1244
+ "learning_rate": 9.575631448768617e-07,
1245
+ "logits/chosen": 15.806520462036133,
1246
+ "logits/rejected": 19.548625946044922,
1247
+ "logps/chosen": -189.52545166015625,
1248
+ "logps/rejected": -327.7802429199219,
1249
+ "loss": 0.0064,
1250
+ "rewards/accuracies": 1.0,
1251
+ "rewards/chosen": -12.922433853149414,
1252
+ "rewards/margins": 12.425726890563965,
1253
+ "rewards/rejected": -25.348161697387695,
1254
+ "step": 820
1255
+ },
1256
+ {
1257
+ "epoch": 0.664,
1258
+ "grad_norm": 2.0558318532148774,
1259
+ "learning_rate": 9.187266203218456e-07,
1260
+ "logits/chosen": 17.683448791503906,
1261
+ "logits/rejected": 20.042156219482422,
1262
+ "logps/chosen": -159.53077697753906,
1263
+ "logps/rejected": -312.9973449707031,
1264
+ "loss": 0.0097,
1265
+ "rewards/accuracies": 1.0,
1266
+ "rewards/chosen": -10.393224716186523,
1267
+ "rewards/margins": 14.232980728149414,
1268
+ "rewards/rejected": -24.626201629638672,
1269
+ "step": 830
1270
+ },
1271
+ {
1272
+ "epoch": 0.672,
1273
+ "grad_norm": 17.198595889469797,
1274
+ "learning_rate": 8.803433552631875e-07,
1275
+ "logits/chosen": 17.60666275024414,
1276
+ "logits/rejected": 17.99045181274414,
1277
+ "logps/chosen": -170.3730926513672,
1278
+ "logps/rejected": -328.09185791015625,
1279
+ "loss": 0.0873,
1280
+ "rewards/accuracies": 0.9750000238418579,
1281
+ "rewards/chosen": -10.795900344848633,
1282
+ "rewards/margins": 15.03393840789795,
1283
+ "rewards/rejected": -25.8298397064209,
1284
+ "step": 840
1285
+ },
1286
+ {
1287
+ "epoch": 0.68,
1288
+ "grad_norm": 1.3072581313287024,
1289
+ "learning_rate": 8.424432798163837e-07,
1290
+ "logits/chosen": 18.38207244873047,
1291
+ "logits/rejected": 19.747913360595703,
1292
+ "logps/chosen": -165.44735717773438,
1293
+ "logps/rejected": -309.8100891113281,
1294
+ "loss": 0.0274,
1295
+ "rewards/accuracies": 0.949999988079071,
1296
+ "rewards/chosen": -10.688611030578613,
1297
+ "rewards/margins": 13.613398551940918,
1298
+ "rewards/rejected": -24.302011489868164,
1299
+ "step": 850
1300
+ },
1301
+ {
1302
+ "epoch": 0.688,
1303
+ "grad_norm": 0.4582264747297489,
1304
+ "learning_rate": 8.050559473202078e-07,
1305
+ "logits/chosen": 14.035835266113281,
1306
+ "logits/rejected": 18.153425216674805,
1307
+ "logps/chosen": -167.19036865234375,
1308
+ "logps/rejected": -312.13775634765625,
1309
+ "loss": 0.0353,
1310
+ "rewards/accuracies": 0.9750000238418579,
1311
+ "rewards/chosen": -11.431414604187012,
1312
+ "rewards/margins": 12.939321517944336,
1313
+ "rewards/rejected": -24.370737075805664,
1314
+ "step": 860
1315
+ },
1316
+ {
1317
+ "epoch": 0.696,
1318
+ "grad_norm": 3.136356793214795,
1319
+ "learning_rate": 7.682105112919007e-07,
1320
+ "logits/chosen": 19.952112197875977,
1321
+ "logits/rejected": 21.141132354736328,
1322
+ "logps/chosen": -154.23995971679688,
1323
+ "logps/rejected": -319.5997009277344,
1324
+ "loss": 0.0262,
1325
+ "rewards/accuracies": 1.0,
1326
+ "rewards/chosen": -9.864435195922852,
1327
+ "rewards/margins": 15.159929275512695,
1328
+ "rewards/rejected": -25.024364471435547,
1329
+ "step": 870
1330
+ },
1331
+ {
1332
+ "epoch": 0.704,
1333
+ "grad_norm": 3.519769878341393,
1334
+ "learning_rate": 7.319357026941429e-07,
1335
+ "logits/chosen": 19.199859619140625,
1336
+ "logits/rejected": 19.918563842773438,
1337
+ "logps/chosen": -194.5305938720703,
1338
+ "logps/rejected": -317.95941162109375,
1339
+ "loss": 0.0509,
1340
+ "rewards/accuracies": 0.9750000238418579,
1341
+ "rewards/chosen": -13.087428092956543,
1342
+ "rewards/margins": 12.253093719482422,
1343
+ "rewards/rejected": -25.34052276611328,
1344
+ "step": 880
1345
+ },
1346
+ {
1347
+ "epoch": 0.712,
1348
+ "grad_norm": 24.857735299978447,
1349
+ "learning_rate": 6.962598075315047e-07,
1350
+ "logits/chosen": 17.146230697631836,
1351
+ "logits/rejected": 20.964147567749023,
1352
+ "logps/chosen": -182.15980529785156,
1353
+ "logps/rejected": -345.76788330078125,
1354
+ "loss": 0.0118,
1355
+ "rewards/accuracies": 1.0,
1356
+ "rewards/chosen": -12.32874584197998,
1357
+ "rewards/margins": 14.388389587402344,
1358
+ "rewards/rejected": -26.71713638305664,
1359
+ "step": 890
1360
+ },
1361
+ {
1362
+ "epoch": 0.72,
1363
+ "grad_norm": 0.318353767436957,
1364
+ "learning_rate": 6.6121064479388e-07,
1365
+ "logits/chosen": 17.372299194335938,
1366
+ "logits/rejected": 20.967824935913086,
1367
+ "logps/chosen": -164.0208282470703,
1368
+ "logps/rejected": -318.7355651855469,
1369
+ "loss": 0.0267,
1370
+ "rewards/accuracies": 1.0,
1371
+ "rewards/chosen": -11.150172233581543,
1372
+ "rewards/margins": 13.733154296875,
1373
+ "rewards/rejected": -24.88332748413086,
1374
+ "step": 900
1375
+ },
1376
+ {
1377
+ "epoch": 0.728,
1378
+ "grad_norm": 3.8936657076963193,
1379
+ "learning_rate": 6.268155447640661e-07,
1380
+ "logits/chosen": 18.852811813354492,
1381
+ "logits/rejected": 21.377389907836914,
1382
+ "logps/chosen": -168.42544555664062,
1383
+ "logps/rejected": -329.688232421875,
1384
+ "loss": 0.0379,
1385
+ "rewards/accuracies": 0.9750000238418579,
1386
+ "rewards/chosen": -11.377861976623535,
1387
+ "rewards/margins": 15.148188591003418,
1388
+ "rewards/rejected": -26.526050567626953,
1389
+ "step": 910
1390
+ },
1391
+ {
1392
+ "epoch": 0.736,
1393
+ "grad_norm": 8.372162670348306,
1394
+ "learning_rate": 5.931013277064378e-07,
1395
+ "logits/chosen": 15.737344741821289,
1396
+ "logits/rejected": 17.821592330932617,
1397
+ "logps/chosen": -168.76051330566406,
1398
+ "logps/rejected": -316.5263366699219,
1399
+ "loss": 0.0359,
1400
+ "rewards/accuracies": 0.9750000238418579,
1401
+ "rewards/chosen": -11.1880521774292,
1402
+ "rewards/margins": 13.797701835632324,
1403
+ "rewards/rejected": -24.985754013061523,
1404
+ "step": 920
1405
+ },
1406
+ {
1407
+ "epoch": 0.744,
1408
+ "grad_norm": 19.290631890698684,
1409
+ "learning_rate": 5.600942829533097e-07,
1410
+ "logits/chosen": 16.14108657836914,
1411
+ "logits/rejected": 18.526290893554688,
1412
+ "logps/chosen": -183.9233856201172,
1413
+ "logps/rejected": -345.7073669433594,
1414
+ "loss": 0.0664,
1415
+ "rewards/accuracies": 1.0,
1416
+ "rewards/chosen": -12.947591781616211,
1417
+ "rewards/margins": 14.908193588256836,
1418
+ "rewards/rejected": -27.855789184570312,
1419
+ "step": 930
1420
+ },
1421
+ {
1422
+ "epoch": 0.752,
1423
+ "grad_norm": 28.437105122450188,
1424
+ "learning_rate": 5.278201484053037e-07,
1425
+ "logits/chosen": 14.238430976867676,
1426
+ "logits/rejected": 15.927212715148926,
1427
+ "logps/chosen": -175.82205200195312,
1428
+ "logps/rejected": -357.60577392578125,
1429
+ "loss": 0.0106,
1430
+ "rewards/accuracies": 1.0,
1431
+ "rewards/chosen": -11.766983032226562,
1432
+ "rewards/margins": 17.238004684448242,
1433
+ "rewards/rejected": -29.004989624023438,
1434
+ "step": 940
1435
+ },
1436
+ {
1437
+ "epoch": 0.76,
1438
+ "grad_norm": 47.398111998155244,
1439
+ "learning_rate": 4.963040904617131e-07,
1440
+ "logits/chosen": 14.369181632995605,
1441
+ "logits/rejected": 16.830841064453125,
1442
+ "logps/chosen": -195.03958129882812,
1443
+ "logps/rejected": -361.30755615234375,
1444
+ "loss": 0.0405,
1445
+ "rewards/accuracies": 1.0,
1446
+ "rewards/chosen": -13.507547378540039,
1447
+ "rewards/margins": 15.785783767700195,
1448
+ "rewards/rejected": -29.2933292388916,
1449
+ "step": 950
1450
+ },
1451
+ {
1452
+ "epoch": 0.768,
1453
+ "grad_norm": 18.929417271431557,
1454
+ "learning_rate": 4.6557068439649533e-07,
1455
+ "logits/chosen": 12.442909240722656,
1456
+ "logits/rejected": 15.81037712097168,
1457
+ "logps/chosen": -172.2083740234375,
1458
+ "logps/rejected": -346.9830627441406,
1459
+ "loss": 0.0164,
1460
+ "rewards/accuracies": 1.0,
1461
+ "rewards/chosen": -11.788956642150879,
1462
+ "rewards/margins": 15.845266342163086,
1463
+ "rewards/rejected": -27.63422203063965,
1464
+ "step": 960
1465
+ },
1466
+ {
1467
+ "epoch": 0.776,
1468
+ "grad_norm": 0.010537786096251401,
1469
+ "learning_rate": 4.3564389519521896e-07,
1470
+ "logits/chosen": 11.128179550170898,
1471
+ "logits/rejected": 15.002403259277344,
1472
+ "logps/chosen": -179.82070922851562,
1473
+ "logps/rejected": -359.905517578125,
1474
+ "loss": 0.0076,
1475
+ "rewards/accuracies": 1.0,
1476
+ "rewards/chosen": -12.77198314666748,
1477
+ "rewards/margins": 16.542465209960938,
1478
+ "rewards/rejected": -29.3144474029541,
1479
+ "step": 970
1480
+ },
1481
+ {
1482
+ "epoch": 0.784,
1483
+ "grad_norm": 3.2903365603646555,
1484
+ "learning_rate": 4.06547058867883e-07,
1485
+ "logits/chosen": 13.022871017456055,
1486
+ "logits/rejected": 17.119640350341797,
1487
+ "logps/chosen": -183.79476928710938,
1488
+ "logps/rejected": -351.65728759765625,
1489
+ "loss": 0.0185,
1490
+ "rewards/accuracies": 0.9750000238418579,
1491
+ "rewards/chosen": -12.853482246398926,
1492
+ "rewards/margins": 15.206174850463867,
1493
+ "rewards/rejected": -28.059656143188477,
1494
+ "step": 980
1495
+ },
1496
+ {
1497
+ "epoch": 0.792,
1498
+ "grad_norm": 8.653995083366096,
1499
+ "learning_rate": 3.7830286425220237e-07,
1500
+ "logits/chosen": 14.448740005493164,
1501
+ "logits/rejected": 16.771413803100586,
1502
+ "logps/chosen": -181.69955444335938,
1503
+ "logps/rejected": -360.3311767578125,
1504
+ "loss": 0.0337,
1505
+ "rewards/accuracies": 0.9750000238418579,
1506
+ "rewards/chosen": -12.328413009643555,
1507
+ "rewards/margins": 16.78951644897461,
1508
+ "rewards/rejected": -29.117929458618164,
1509
+ "step": 990
1510
+ },
1511
+ {
1512
+ "epoch": 0.8,
1513
+ "grad_norm": 0.028161198779737273,
1514
+ "learning_rate": 3.5093333532153313e-07,
1515
+ "logits/chosen": 12.604635238647461,
1516
+ "logits/rejected": 16.513111114501953,
1517
+ "logps/chosen": -170.95950317382812,
1518
+ "logps/rejected": -356.91705322265625,
1519
+ "loss": 0.0152,
1520
+ "rewards/accuracies": 0.9750000238418579,
1521
+ "rewards/chosen": -11.3618745803833,
1522
+ "rewards/margins": 17.1263370513916,
1523
+ "rewards/rejected": -28.488210678100586,
1524
+ "step": 1000
1525
+ },
1526
+ {
1527
+ "epoch": 0.808,
1528
+ "grad_norm": 1.7845230113365977,
1529
+ "learning_rate": 3.2445981401124044e-07,
1530
+ "logits/chosen": 12.725648880004883,
1531
+ "logits/rejected": 15.18701457977295,
1532
+ "logps/chosen": -182.36656188964844,
1533
+ "logps/rejected": -355.4485778808594,
1534
+ "loss": 0.0053,
1535
+ "rewards/accuracies": 1.0,
1536
+ "rewards/chosen": -12.828389167785645,
1537
+ "rewards/margins": 16.320749282836914,
1538
+ "rewards/rejected": -29.149139404296875,
1539
+ "step": 1010
1540
+ },
1541
+ {
1542
+ "epoch": 0.816,
1543
+ "grad_norm": 0.034587439255628315,
1544
+ "learning_rate": 2.9890294357689994e-07,
1545
+ "logits/chosen": 14.864664077758789,
1546
+ "logits/rejected": 18.851364135742188,
1547
+ "logps/chosen": -199.5797576904297,
1548
+ "logps/rejected": -365.7235412597656,
1549
+ "loss": 0.0171,
1550
+ "rewards/accuracies": 0.9750000238418579,
1551
+ "rewards/chosen": -13.424786567687988,
1552
+ "rewards/margins": 15.109170913696289,
1553
+ "rewards/rejected": -28.533960342407227,
1554
+ "step": 1020
1555
+ },
1556
+ {
1557
+ "epoch": 0.824,
1558
+ "grad_norm": 0.03435180433087091,
1559
+ "learning_rate": 2.7428265249730726e-07,
1560
+ "logits/chosen": 12.820098876953125,
1561
+ "logits/rejected": 15.737815856933594,
1562
+ "logps/chosen": -192.89608764648438,
1563
+ "logps/rejected": -351.27911376953125,
1564
+ "loss": 0.0322,
1565
+ "rewards/accuracies": 0.9750000238418579,
1566
+ "rewards/chosen": -13.299699783325195,
1567
+ "rewards/margins": 14.958763122558594,
1568
+ "rewards/rejected": -28.258464813232422,
1569
+ "step": 1030
1570
+ },
1571
+ {
1572
+ "epoch": 0.832,
1573
+ "grad_norm": 3.4002892431899436,
1574
+ "learning_rate": 2.5061813893485086e-07,
1575
+ "logits/chosen": 14.156018257141113,
1576
+ "logits/rejected": 17.030376434326172,
1577
+ "logps/chosen": -179.10986328125,
1578
+ "logps/rejected": -335.97186279296875,
1579
+ "loss": 0.0141,
1580
+ "rewards/accuracies": 1.0,
1581
+ "rewards/chosen": -11.678332328796387,
1582
+ "rewards/margins": 14.767659187316895,
1583
+ "rewards/rejected": -26.44598960876465,
1584
+ "step": 1040
1585
+ },
1586
+ {
1587
+ "epoch": 0.84,
1588
+ "grad_norm": 0.020870062099409628,
1589
+ "learning_rate": 2.2792785576536108e-07,
1590
+ "logits/chosen": 12.851526260375977,
1591
+ "logits/rejected": 17.06875228881836,
1592
+ "logps/chosen": -163.72132873535156,
1593
+ "logps/rejected": -354.77117919921875,
1594
+ "loss": 0.0303,
1595
+ "rewards/accuracies": 0.9750000238418579,
1596
+ "rewards/chosen": -10.758419036865234,
1597
+ "rewards/margins": 16.93534278869629,
1598
+ "rewards/rejected": -27.693761825561523,
1599
+ "step": 1050
1600
+ },
1601
+ {
1602
+ "epoch": 0.848,
1603
+ "grad_norm": 0.12145713143714221,
1604
+ "learning_rate": 2.062294961891138e-07,
1605
+ "logits/chosen": 13.231893539428711,
1606
+ "logits/rejected": 16.38172149658203,
1607
+ "logps/chosen": -175.502685546875,
1608
+ "logps/rejected": -341.85650634765625,
1609
+ "loss": 0.0067,
1610
+ "rewards/accuracies": 0.9750000238418579,
1611
+ "rewards/chosen": -11.950915336608887,
1612
+ "rewards/margins": 15.844401359558105,
1613
+ "rewards/rejected": -27.795318603515625,
1614
+ "step": 1060
1615
+ },
1616
+ {
1617
+ "epoch": 0.856,
1618
+ "grad_norm": 43.872269822135024,
1619
+ "learning_rate": 1.8553997993420495e-07,
1620
+ "logits/chosen": 11.042947769165039,
1621
+ "logits/rejected": 14.6314697265625,
1622
+ "logps/chosen": -167.84786987304688,
1623
+ "logps/rejected": -359.76239013671875,
1624
+ "loss": 0.0365,
1625
+ "rewards/accuracies": 0.949999988079071,
1626
+ "rewards/chosen": -11.415273666381836,
1627
+ "rewards/margins": 17.310102462768555,
1628
+ "rewards/rejected": -28.72537612915039,
1629
+ "step": 1070
1630
+ },
1631
+ {
1632
+ "epoch": 0.864,
1633
+ "grad_norm": 16.613133726739267,
1634
+ "learning_rate": 1.6587544006305372e-07,
1635
+ "logits/chosen": 14.675074577331543,
1636
+ "logits/rejected": 14.268391609191895,
1637
+ "logps/chosen": -200.9607391357422,
1638
+ "logps/rejected": -337.6705627441406,
1639
+ "loss": 0.0206,
1640
+ "rewards/accuracies": 0.9750000238418579,
1641
+ "rewards/chosen": -13.507387161254883,
1642
+ "rewards/margins": 13.960055351257324,
1643
+ "rewards/rejected": -27.46744155883789,
1644
+ "step": 1080
1645
+ },
1646
+ {
1647
+ "epoch": 0.872,
1648
+ "grad_norm": 0.10757306008079003,
1649
+ "learning_rate": 1.4725121039232948e-07,
1650
+ "logits/chosen": 14.780688285827637,
1651
+ "logits/rejected": 16.79671287536621,
1652
+ "logps/chosen": -167.3707733154297,
1653
+ "logps/rejected": -333.1465759277344,
1654
+ "loss": 0.0274,
1655
+ "rewards/accuracies": 0.9750000238418579,
1656
+ "rewards/chosen": -10.258435249328613,
1657
+ "rewards/margins": 16.166662216186523,
1658
+ "rewards/rejected": -26.425098419189453,
1659
+ "step": 1090
1660
+ },
1661
+ {
1662
+ "epoch": 0.88,
1663
+ "grad_norm": 53.565796368395205,
1664
+ "learning_rate": 1.2968181353609853e-07,
1665
+ "logits/chosen": 13.000600814819336,
1666
+ "logits/rejected": 15.201478958129883,
1667
+ "logps/chosen": -158.11436462402344,
1668
+ "logps/rejected": -326.98077392578125,
1669
+ "loss": 0.0248,
1670
+ "rewards/accuracies": 0.9750000238418579,
1671
+ "rewards/chosen": -10.076480865478516,
1672
+ "rewards/margins": 16.441701889038086,
1673
+ "rewards/rejected": -26.518178939819336,
1674
+ "step": 1100
1675
+ },
1676
+ {
1677
+ "epoch": 0.888,
1678
+ "grad_norm": 0.4931850288780545,
1679
+ "learning_rate": 1.1318094958153047e-07,
1680
+ "logits/chosen": 15.825027465820312,
1681
+ "logits/rejected": 18.127300262451172,
1682
+ "logps/chosen": -175.34991455078125,
1683
+ "logps/rejected": -349.7121276855469,
1684
+ "loss": 0.0138,
1685
+ "rewards/accuracies": 1.0,
1686
+ "rewards/chosen": -11.070804595947266,
1687
+ "rewards/margins": 16.37197494506836,
1688
+ "rewards/rejected": -27.44277572631836,
1689
+ "step": 1110
1690
+ },
1691
+ {
1692
+ "epoch": 0.896,
1693
+ "grad_norm": 0.12431725759551986,
1694
+ "learning_rate": 9.776148540597835e-08,
1695
+ "logits/chosen": 14.717602729797363,
1696
+ "logits/rejected": 16.464252471923828,
1697
+ "logps/chosen": -171.41806030273438,
1698
+ "logps/rejected": -341.33636474609375,
1699
+ "loss": 0.0055,
1700
+ "rewards/accuracies": 1.0,
1701
+ "rewards/chosen": -11.874577522277832,
1702
+ "rewards/margins": 15.349774360656738,
1703
+ "rewards/rejected": -27.224353790283203,
1704
+ "step": 1120
1705
+ },
1706
+ {
1707
+ "epoch": 0.904,
1708
+ "grad_norm": 44.43625690363336,
1709
+ "learning_rate": 8.34354446437785e-08,
1710
+ "logits/chosen": 13.284322738647461,
1711
+ "logits/rejected": 14.527437210083008,
1712
+ "logps/chosen": -176.1920166015625,
1713
+ "logps/rejected": -344.534912109375,
1714
+ "loss": 0.0616,
1715
+ "rewards/accuracies": 1.0,
1716
+ "rewards/chosen": -11.812251091003418,
1717
+ "rewards/margins": 16.68192481994629,
1718
+ "rewards/rejected": -28.49417495727539,
1719
+ "step": 1130
1720
+ },
1721
+ {
1722
+ "epoch": 0.912,
1723
+ "grad_norm": 0.3325545642344925,
1724
+ "learning_rate": 7.021399831057961e-08,
1725
+ "logits/chosen": 14.616548538208008,
1726
+ "logits/rejected": 18.06045913696289,
1727
+ "logps/chosen": -175.67092895507812,
1728
+ "logps/rejected": -326.31268310546875,
1729
+ "loss": 0.0184,
1730
+ "rewards/accuracies": 0.949999988079071,
1731
+ "rewards/chosen": -11.880111694335938,
1732
+ "rewards/margins": 14.142997741699219,
1733
+ "rewards/rejected": -26.023107528686523,
1734
+ "step": 1140
1735
+ },
1736
+ {
1737
+ "epoch": 0.92,
1738
+ "grad_norm": 9.535643407043993,
1739
+ "learning_rate": 5.810745609252166e-08,
1740
+ "logits/chosen": 15.016606330871582,
1741
+ "logits/rejected": 19.4204044342041,
1742
+ "logps/chosen": -171.228515625,
1743
+ "logps/rejected": -339.8002014160156,
1744
+ "loss": 0.016,
1745
+ "rewards/accuracies": 0.9750000238418579,
1746
+ "rewards/chosen": -11.794087409973145,
1747
+ "rewards/margins": 14.61567497253418,
1748
+ "rewards/rejected": -26.40976333618164,
1749
+ "step": 1150
1750
+ },
1751
+ {
1752
+ "epoch": 0.928,
1753
+ "grad_norm": 0.1185713320229559,
1754
+ "learning_rate": 4.712525830705339e-08,
1755
+ "logits/chosen": 14.084701538085938,
1756
+ "logits/rejected": 15.02185344696045,
1757
+ "logps/chosen": -175.77389526367188,
1758
+ "logps/rejected": -327.8313903808594,
1759
+ "loss": 0.0058,
1760
+ "rewards/accuracies": 0.9750000238418579,
1761
+ "rewards/chosen": -11.838964462280273,
1762
+ "rewards/margins": 15.071843147277832,
1763
+ "rewards/rejected": -26.91080665588379,
1764
+ "step": 1160
1765
+ },
1766
+ {
1767
+ "epoch": 0.936,
1768
+ "grad_norm": 49.630514506359575,
1769
+ "learning_rate": 3.72759685416551e-08,
1770
+ "logits/chosen": 12.91785717010498,
1771
+ "logits/rejected": 16.28598403930664,
1772
+ "logps/chosen": -193.53079223632812,
1773
+ "logps/rejected": -346.7781982421875,
1774
+ "loss": 0.0213,
1775
+ "rewards/accuracies": 1.0,
1776
+ "rewards/chosen": -13.229898452758789,
1777
+ "rewards/margins": 14.539329528808594,
1778
+ "rewards/rejected": -27.76923179626465,
1779
+ "step": 1170
1780
+ },
1781
+ {
1782
+ "epoch": 0.944,
1783
+ "grad_norm": 2.386421051511265,
1784
+ "learning_rate": 2.8567266976212704e-08,
1785
+ "logits/chosen": 13.009920120239258,
1786
+ "logits/rejected": 16.784725189208984,
1787
+ "logps/chosen": -174.90267944335938,
1788
+ "logps/rejected": -360.4608154296875,
1789
+ "loss": 0.0274,
1790
+ "rewards/accuracies": 0.9750000238418579,
1791
+ "rewards/chosen": -11.592721939086914,
1792
+ "rewards/margins": 16.40024185180664,
1793
+ "rewards/rejected": -27.992965698242188,
1794
+ "step": 1180
1795
+ },
1796
+ {
1797
+ "epoch": 0.952,
1798
+ "grad_norm": 0.09581452353121599,
1799
+ "learning_rate": 2.1005944394242692e-08,
1800
+ "logits/chosen": 14.088732719421387,
1801
+ "logits/rejected": 14.811144828796387,
1802
+ "logps/chosen": -190.23629760742188,
1803
+ "logps/rejected": -340.3313903808594,
1804
+ "loss": 0.0103,
1805
+ "rewards/accuracies": 1.0,
1806
+ "rewards/chosen": -12.186540603637695,
1807
+ "rewards/margins": 15.273702621459961,
1808
+ "rewards/rejected": -27.460241317749023,
1809
+ "step": 1190
1810
+ },
1811
+ {
1812
+ "epoch": 0.96,
1813
+ "grad_norm": 2.5596226732992444,
1814
+ "learning_rate": 1.4597896887644457e-08,
1815
+ "logits/chosen": 14.2628812789917,
1816
+ "logits/rejected": 15.2882661819458,
1817
+ "logps/chosen": -183.11183166503906,
1818
+ "logps/rejected": -330.42401123046875,
1819
+ "loss": 0.0051,
1820
+ "rewards/accuracies": 1.0,
1821
+ "rewards/chosen": -11.977328300476074,
1822
+ "rewards/margins": 14.7813720703125,
1823
+ "rewards/rejected": -26.75870132446289,
1824
+ "step": 1200
1825
+ },
1826
+ {
1827
+ "epoch": 0.968,
1828
+ "grad_norm": 8.966198799351712,
1829
+ "learning_rate": 9.348121259105447e-09,
1830
+ "logits/chosen": 13.10435962677002,
1831
+ "logits/rejected": 15.16510009765625,
1832
+ "logps/chosen": -194.73561096191406,
1833
+ "logps/rejected": -350.19671630859375,
1834
+ "loss": 0.0139,
1835
+ "rewards/accuracies": 0.9750000238418579,
1836
+ "rewards/chosen": -13.223831176757812,
1837
+ "rewards/margins": 14.7427978515625,
1838
+ "rewards/rejected": -27.966629028320312,
1839
+ "step": 1210
1840
+ },
1841
+ {
1842
+ "epoch": 0.976,
1843
+ "grad_norm": 2.1731183890345145,
1844
+ "learning_rate": 5.260711125743445e-09,
1845
+ "logits/chosen": 11.423822402954102,
1846
+ "logits/rejected": 15.956392288208008,
1847
+ "logps/chosen": -157.79640197753906,
1848
+ "logps/rejected": -352.1459655761719,
1849
+ "loss": 0.0088,
1850
+ "rewards/accuracies": 1.0,
1851
+ "rewards/chosen": -10.58418083190918,
1852
+ "rewards/margins": 17.188343048095703,
1853
+ "rewards/rejected": -27.772525787353516,
1854
+ "step": 1220
1855
+ },
1856
+ {
1857
+ "epoch": 0.984,
1858
+ "grad_norm": 0.000975641347440435,
1859
+ "learning_rate": 2.3388537270284673e-09,
1860
+ "logits/chosen": 14.252192497253418,
1861
+ "logits/rejected": 16.61027717590332,
1862
+ "logps/chosen": -179.70510864257812,
1863
+ "logps/rejected": -353.10711669921875,
1864
+ "loss": 0.0054,
1865
+ "rewards/accuracies": 1.0,
1866
+ "rewards/chosen": -12.220856666564941,
1867
+ "rewards/margins": 16.520856857299805,
1868
+ "rewards/rejected": -28.741714477539062,
1869
+ "step": 1230
1870
+ },
1871
+ {
1872
+ "epoch": 0.992,
1873
+ "grad_norm": 0.5744272246950218,
1874
+ "learning_rate": 5.848274394684716e-10,
1875
+ "logits/chosen": 14.274118423461914,
1876
+ "logits/rejected": 17.041919708251953,
1877
+ "logps/chosen": -176.65518188476562,
1878
+ "logps/rejected": -355.2174377441406,
1879
+ "loss": 0.0222,
1880
+ "rewards/accuracies": 0.9750000238418579,
1881
+ "rewards/chosen": -12.072566032409668,
1882
+ "rewards/margins": 16.45151710510254,
1883
+ "rewards/rejected": -28.524078369140625,
1884
+ "step": 1240
1885
+ },
1886
+ {
1887
+ "epoch": 1.0,
1888
+ "grad_norm": 0.016349832310295288,
1889
+ "learning_rate": 0.0,
1890
+ "logits/chosen": 12.271159172058105,
1891
+ "logits/rejected": 15.676365852355957,
1892
+ "logps/chosen": -191.03286743164062,
1893
+ "logps/rejected": -355.4150390625,
1894
+ "loss": 0.0361,
1895
+ "rewards/accuracies": 1.0,
1896
+ "rewards/chosen": -13.313619613647461,
1897
+ "rewards/margins": 14.883234024047852,
1898
+ "rewards/rejected": -28.196853637695312,
1899
+ "step": 1250
1900
+ },
1901
+ {
1902
+ "epoch": 1.0,
1903
+ "step": 1250,
1904
+ "total_flos": 0.0,
1905
+ "train_loss": 0.12455665428638459,
1906
+ "train_runtime": 20866.0985,
1907
+ "train_samples_per_second": 0.958,
1908
+ "train_steps_per_second": 0.06
1909
+ }
1910
+ ],
1911
+ "logging_steps": 10,
1912
+ "max_steps": 1250,
1913
+ "num_input_tokens_seen": 0,
1914
+ "num_train_epochs": 1,
1915
+ "save_steps": 500,
1916
+ "stateful_callbacks": {
1917
+ "TrainerControl": {
1918
+ "args": {
1919
+ "should_epoch_stop": false,
1920
+ "should_evaluate": false,
1921
+ "should_log": false,
1922
+ "should_save": true,
1923
+ "should_training_stop": true
1924
+ },
1925
+ "attributes": {}
1926
+ }
1927
+ },
1928
+ "total_flos": 0.0,
1929
+ "train_batch_size": 1,
1930
+ "trial_name": null,
1931
+ "trial_params": null
1932
+ }