lzc0525 commited on
Commit
455a6aa
·
verified ·
1 Parent(s): 3f0fdd7

Upload folder using huggingface_hub

Browse files
all_results.json CHANGED
@@ -1,24 +1,24 @@
1
  {
2
  "epoch": 0.9959925193694897,
3
- "eval_logits/chosen": -2.511686325073242,
4
- "eval_logits/rejected": -2.44095516204834,
5
- "eval_logps/chosen": -0.6983144879341125,
6
- "eval_logps/ref_chosen": -0.7951558232307434,
7
- "eval_logps/ref_rejected": -0.8243172764778137,
8
- "eval_logps/rejected": -0.8109735250473022,
9
- "eval_loss": 0.4769956171512604,
10
- "eval_rewards/accuracies": 0.6814516186714172,
11
- "eval_rewards/chosen": 0.24210312962532043,
12
- "eval_rewards/margins": 0.20874378085136414,
13
- "eval_rewards/rejected": 0.033359345048666,
14
- "eval_runtime": 145.1612,
15
  "eval_samples": 1961,
16
- "eval_samples_per_second": 13.509,
17
- "eval_steps_per_second": 0.427,
18
  "total_flos": 0.0,
19
- "train_loss": 0.4860667634931245,
20
- "train_runtime": 16529.3176,
21
  "train_samples": 59876,
22
- "train_samples_per_second": 3.622,
23
  "train_steps_per_second": 0.014
24
  }
 
1
  {
2
  "epoch": 0.9959925193694897,
3
+ "eval_logits/chosen": -2.370966672897339,
4
+ "eval_logits/rejected": -2.2963974475860596,
5
+ "eval_logps/chosen": -223.0048065185547,
6
+ "eval_logps/ref_chosen": -237.70138549804688,
7
+ "eval_logps/ref_rejected": -244.81951904296875,
8
+ "eval_logps/rejected": -232.80235290527344,
9
+ "eval_loss": 0.4936164915561676,
10
+ "eval_rewards/accuracies": 0.6572580933570862,
11
+ "eval_rewards/chosen": 0.14696598052978516,
12
+ "eval_rewards/margins": 0.026794558390975,
13
+ "eval_rewards/rejected": 0.12017140537500381,
14
+ "eval_runtime": 141.5135,
15
  "eval_samples": 1961,
16
+ "eval_samples_per_second": 13.857,
17
+ "eval_steps_per_second": 0.438,
18
  "total_flos": 0.0,
19
+ "train_loss": 0.49642937480124283,
20
+ "train_runtime": 16410.2083,
21
  "train_samples": 59876,
22
+ "train_samples_per_second": 3.649,
23
  "train_steps_per_second": 0.014
24
  }
eval_results.json CHANGED
@@ -1,18 +1,18 @@
1
  {
2
  "epoch": 0.9959925193694897,
3
- "eval_logits/chosen": -2.511686325073242,
4
- "eval_logits/rejected": -2.44095516204834,
5
- "eval_logps/chosen": -0.6983144879341125,
6
- "eval_logps/ref_chosen": -0.7951558232307434,
7
- "eval_logps/ref_rejected": -0.8243172764778137,
8
- "eval_logps/rejected": -0.8109735250473022,
9
- "eval_loss": 0.4769956171512604,
10
- "eval_rewards/accuracies": 0.6814516186714172,
11
- "eval_rewards/chosen": 0.24210312962532043,
12
- "eval_rewards/margins": 0.20874378085136414,
13
- "eval_rewards/rejected": 0.033359345048666,
14
- "eval_runtime": 145.1612,
15
  "eval_samples": 1961,
16
- "eval_samples_per_second": 13.509,
17
- "eval_steps_per_second": 0.427
18
  }
 
1
  {
2
  "epoch": 0.9959925193694897,
3
+ "eval_logits/chosen": -2.370966672897339,
4
+ "eval_logits/rejected": -2.2963974475860596,
5
+ "eval_logps/chosen": -223.0048065185547,
6
+ "eval_logps/ref_chosen": -237.70138549804688,
7
+ "eval_logps/ref_rejected": -244.81951904296875,
8
+ "eval_logps/rejected": -232.80235290527344,
9
+ "eval_loss": 0.4936164915561676,
10
+ "eval_rewards/accuracies": 0.6572580933570862,
11
+ "eval_rewards/chosen": 0.14696598052978516,
12
+ "eval_rewards/margins": 0.026794558390975,
13
+ "eval_rewards/rejected": 0.12017140537500381,
14
+ "eval_runtime": 141.5135,
15
  "eval_samples": 1961,
16
+ "eval_samples_per_second": 13.857,
17
+ "eval_steps_per_second": 0.438
18
  }
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f0fcc07c77caf38cf16eca94466b09a450f59047949056da639891d7db810679
3
  size 4976698672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:275ce3f1a86067d54cdf9833ffb6bc6744cc7f069a565035201c3b5532a681e0
3
  size 4976698672
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:716ddb06194501c683fce829c9c12c4360bffbe234af9790616bb8dea15c9a7c
3
  size 4999802720
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:357ef4d1e8076cf6ed25ef0832b4f778ccaac92509a671be2f0bd459bec774f9
3
  size 4999802720
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ac1c3d736e0f1ea37f82f87d7e8a5ce546e8731b7d6e863adb987c73e40b1482
3
  size 4915916176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f3bf313ea09793309e6446a9f39cb995390296792fb6bafc05f63c2e6b81cac
3
  size 4915916176
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1848cd805e8117e7c2d8bce0d5fa2d9a2ea04fcef2a0930d658566669435ea46
3
  size 1168138808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46028d9aa6fb09491ef46a7ca1c69c42eb69eb801c9c83cc2063a1dc0ccb22c2
3
  size 1168138808
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 0.9959925193694897,
3
  "total_flos": 0.0,
4
- "train_loss": 0.4860667634931245,
5
- "train_runtime": 16529.3176,
6
  "train_samples": 59876,
7
- "train_samples_per_second": 3.622,
8
  "train_steps_per_second": 0.014
9
  }
 
1
  {
2
  "epoch": 0.9959925193694897,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.49642937480124283,
5
+ "train_runtime": 16410.2083,
6
  "train_samples": 59876,
7
+ "train_samples_per_second": 3.649,
8
  "train_steps_per_second": 0.014
9
  }
trainer_state.json CHANGED
@@ -10,793 +10,793 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.02137323002938819,
13
- "grad_norm": 0.7561385035514832,
14
- "learning_rate": 2.0833333333333333e-07,
15
- "logits/chosen": -1.7804944515228271,
16
- "logits/rejected": -1.6545133590698242,
17
- "logps/chosen": -0.7810468673706055,
18
- "logps/ref_chosen": -0.7813535928726196,
19
- "logps/ref_rejected": -0.8060104250907898,
20
- "logps/rejected": -0.805632472038269,
21
  "loss": 0.5,
22
- "rewards/accuracies": 0.33125001192092896,
23
- "rewards/chosen": 0.0007667625322937965,
24
- "rewards/margins": -0.00017807073891162872,
25
- "rewards/rejected": 0.0009448332712054253,
26
  "step": 5
27
  },
28
  {
29
  "epoch": 0.04274646005877638,
30
- "grad_norm": 0.5518713593482971,
31
- "learning_rate": 4.1666666666666667e-07,
32
- "logits/chosen": -1.7383073568344116,
33
- "logits/rejected": -1.7030198574066162,
34
- "logps/chosen": -0.8675562143325806,
35
- "logps/ref_chosen": -0.8662088513374329,
36
- "logps/ref_rejected": -0.9053529500961304,
37
- "logps/rejected": -0.907278835773468,
38
  "loss": 0.5,
39
- "rewards/accuracies": 0.5375000238418579,
40
- "rewards/chosen": -0.003368514124304056,
41
- "rewards/margins": 0.0014460685197263956,
42
- "rewards/rejected": -0.004814582876861095,
43
  "step": 10
44
  },
45
  {
46
  "epoch": 0.06411969008816458,
47
- "grad_norm": 0.49651747941970825,
48
- "learning_rate": 6.249999999999999e-07,
49
- "logits/chosen": -1.928145170211792,
50
- "logits/rejected": -1.8129940032958984,
51
- "logps/chosen": -0.855307400226593,
52
- "logps/ref_chosen": -0.8494969606399536,
53
- "logps/ref_rejected": -0.8662179708480835,
54
- "logps/rejected": -0.8723622560501099,
55
- "loss": 0.4998,
56
- "rewards/accuracies": 0.512499988079071,
57
- "rewards/chosen": -0.014526228420436382,
58
- "rewards/margins": 0.0008344631642103195,
59
- "rewards/rejected": -0.015360690653324127,
60
  "step": 15
61
  },
62
  {
63
  "epoch": 0.08549292011755276,
64
- "grad_norm": 0.27864935994148254,
65
- "learning_rate": 8.333333333333333e-07,
66
- "logits/chosen": -1.7597744464874268,
67
- "logits/rejected": -1.6725934743881226,
68
- "logps/chosen": -0.9120359420776367,
69
- "logps/ref_chosen": -0.8935796618461609,
70
- "logps/ref_rejected": -0.8952409029006958,
71
- "logps/rejected": -0.9148454666137695,
72
- "loss": 0.4993,
73
- "rewards/accuracies": 0.5687500238418579,
74
- "rewards/chosen": -0.0461406409740448,
75
- "rewards/margins": 0.0028707808814942837,
76
- "rewards/rejected": -0.04901142045855522,
77
  "step": 20
78
  },
79
  {
80
  "epoch": 0.10686615014694095,
81
- "grad_norm": 0.4763440787792206,
82
- "learning_rate": 9.999435142363483e-07,
83
- "logits/chosen": -1.624091386795044,
84
- "logits/rejected": -1.5722483396530151,
85
- "logps/chosen": -0.9541429281234741,
86
- "logps/ref_chosen": -0.8983734846115112,
87
- "logps/ref_rejected": -0.9594888687133789,
88
- "logps/rejected": -1.035103440284729,
89
- "loss": 0.4976,
90
- "rewards/accuracies": 0.53125,
91
- "rewards/chosen": -0.1394234150648117,
92
- "rewards/margins": 0.04961305111646652,
93
- "rewards/rejected": -0.18903647363185883,
94
  "step": 25
95
  },
96
  {
97
  "epoch": 0.12823938017632916,
98
- "grad_norm": 0.3390841782093048,
99
- "learning_rate": 9.97967852255038e-07,
100
- "logits/chosen": -1.6577975749969482,
101
- "logits/rejected": -1.5775320529937744,
102
- "logps/chosen": -0.8317171931266785,
103
- "logps/ref_chosen": -0.7469085454940796,
104
- "logps/ref_rejected": -0.79144287109375,
105
- "logps/rejected": -0.8856765031814575,
106
- "loss": 0.4975,
107
- "rewards/accuracies": 0.4749999940395355,
108
- "rewards/chosen": -0.2120215892791748,
109
- "rewards/margins": 0.023562394082546234,
110
- "rewards/rejected": -0.23558397591114044,
111
  "step": 30
112
  },
113
  {
114
  "epoch": 0.14961261020571734,
115
- "grad_norm": 0.3006940484046936,
116
- "learning_rate": 9.931806517013612e-07,
117
- "logits/chosen": -1.6243362426757812,
118
- "logits/rejected": -1.6471837759017944,
119
- "logps/chosen": -0.9220904111862183,
120
- "logps/ref_chosen": -0.7822158336639404,
121
- "logps/ref_rejected": -0.8102364540100098,
122
- "logps/rejected": -0.9670912027359009,
123
- "loss": 0.4963,
124
- "rewards/accuracies": 0.5,
125
- "rewards/chosen": -0.34968677163124084,
126
- "rewards/margins": 0.04244992882013321,
127
- "rewards/rejected": -0.39213672280311584,
128
  "step": 35
129
  },
130
  {
131
  "epoch": 0.17098584023510552,
132
- "grad_norm": 0.27084407210350037,
133
- "learning_rate": 9.856089412257604e-07,
134
- "logits/chosen": -1.6578766107559204,
135
- "logits/rejected": -1.6355148553848267,
136
- "logps/chosen": -1.05038583278656,
137
- "logps/ref_chosen": -0.8560595512390137,
138
- "logps/ref_rejected": -0.914546012878418,
139
- "logps/rejected": -1.1552728414535522,
140
- "loss": 0.4937,
141
  "rewards/accuracies": 0.574999988079071,
142
- "rewards/chosen": -0.48581594228744507,
143
- "rewards/margins": 0.11600111424922943,
144
- "rewards/rejected": -0.6018170118331909,
145
  "step": 40
146
  },
147
  {
148
  "epoch": 0.19235907026449373,
149
- "grad_norm": 0.49249762296676636,
150
- "learning_rate": 9.752954708892377e-07,
151
- "logits/chosen": -1.5577068328857422,
152
- "logits/rejected": -1.465714931488037,
153
- "logps/chosen": -1.0501822233200073,
154
- "logps/ref_chosen": -0.8724653124809265,
155
- "logps/ref_rejected": -0.8607926368713379,
156
- "logps/rejected": -1.0398765802383423,
157
- "loss": 0.4975,
158
  "rewards/accuracies": 0.543749988079071,
159
- "rewards/chosen": -0.4442923665046692,
160
- "rewards/margins": 0.003417615545913577,
161
- "rewards/rejected": -0.4477098882198334,
162
  "step": 45
163
  },
164
  {
165
  "epoch": 0.2137323002938819,
166
- "grad_norm": 0.32699063420295715,
167
- "learning_rate": 9.62298470795473e-07,
168
- "logits/chosen": -1.7691097259521484,
169
- "logits/rejected": -1.7416681051254272,
170
- "logps/chosen": -0.9927698969841003,
171
- "logps/ref_chosen": -0.8696678280830383,
172
- "logps/ref_rejected": -0.8965504765510559,
173
- "logps/rejected": -1.030956506729126,
174
- "loss": 0.4967,
175
- "rewards/accuracies": 0.5375000238418579,
176
- "rewards/chosen": -0.3077549934387207,
177
- "rewards/margins": 0.028260568156838417,
178
- "rewards/rejected": -0.33601561188697815,
179
  "step": 50
180
  },
181
  {
182
  "epoch": 0.2351055303232701,
183
- "grad_norm": 0.31701868772506714,
184
- "learning_rate": 9.466913223222465e-07,
185
- "logits/chosen": -1.5519920587539673,
186
- "logits/rejected": -1.4699208736419678,
187
- "logps/chosen": -0.8616452217102051,
188
- "logps/ref_chosen": -0.7731812596321106,
189
- "logps/ref_rejected": -0.7838868498802185,
190
- "logps/rejected": -0.8863974809646606,
191
- "loss": 0.4956,
192
- "rewards/accuracies": 0.5562499761581421,
193
- "rewards/chosen": -0.22115974128246307,
194
- "rewards/margins": 0.03511647880077362,
195
- "rewards/rejected": -0.2562762200832367,
196
  "step": 55
197
  },
198
  {
199
  "epoch": 0.2564787603526583,
200
- "grad_norm": 0.40317973494529724,
201
- "learning_rate": 9.285621438083997e-07,
202
- "logits/chosen": -1.601485252380371,
203
- "logits/rejected": -1.5545583963394165,
204
- "logps/chosen": -0.8779473304748535,
205
- "logps/ref_chosen": -0.7888692617416382,
206
- "logps/ref_rejected": -0.8163660168647766,
207
- "logps/rejected": -0.9202233552932739,
208
- "loss": 0.4936,
209
- "rewards/accuracies": 0.550000011920929,
210
- "rewards/chosen": -0.22269515693187714,
211
- "rewards/margins": 0.03694819286465645,
212
- "rewards/rejected": -0.2596433460712433,
213
  "step": 60
214
  },
215
  {
216
  "epoch": 0.2778519903820465,
217
- "grad_norm": 0.5152870416641235,
218
- "learning_rate": 9.080132930355566e-07,
219
- "logits/chosen": -1.6490017175674438,
220
- "logits/rejected": -1.6716206073760986,
221
- "logps/chosen": -0.9653270840644836,
222
- "logps/ref_chosen": -0.8533055186271667,
223
- "logps/ref_rejected": -0.9036076664924622,
224
- "logps/rejected": -1.0383471250534058,
225
- "loss": 0.4942,
226
- "rewards/accuracies": 0.612500011920929,
227
- "rewards/chosen": -0.28005388379096985,
228
- "rewards/margins": 0.05679459124803543,
229
- "rewards/rejected": -0.33684849739074707,
230
  "step": 65
231
  },
232
  {
233
  "epoch": 0.2992252204114347,
234
- "grad_norm": 0.4174951910972595,
235
- "learning_rate": 8.851607893136064e-07,
236
- "logits/chosen": -1.728899598121643,
237
- "logits/rejected": -1.6759620904922485,
238
- "logps/chosen": -0.9261114001274109,
239
- "logps/ref_chosen": -0.8132463693618774,
240
- "logps/ref_rejected": -0.8208681344985962,
241
- "logps/rejected": -0.9566439390182495,
242
- "loss": 0.4926,
243
  "rewards/accuracies": 0.5625,
244
- "rewards/chosen": -0.28216248750686646,
245
- "rewards/margins": 0.057276882231235504,
246
- "rewards/rejected": -0.33943939208984375,
247
  "step": 70
248
  },
249
  {
250
  "epoch": 0.32059845044082286,
251
- "grad_norm": 0.36295104026794434,
252
- "learning_rate": 8.601336584328658e-07,
253
- "logits/chosen": -1.7176015377044678,
254
- "logits/rejected": -1.7168267965316772,
255
- "logps/chosen": -0.9694639444351196,
256
- "logps/ref_chosen": -0.8283951878547668,
257
- "logps/ref_rejected": -0.8723212480545044,
258
- "logps/rejected": -1.0357882976531982,
259
- "loss": 0.4939,
260
- "rewards/accuracies": 0.5687500238418579,
261
- "rewards/chosen": -0.3526715338230133,
262
- "rewards/margins": 0.05599608272314072,
263
- "rewards/rejected": -0.4086676239967346,
264
  "step": 75
265
  },
266
  {
267
  "epoch": 0.34197168047021104,
268
- "grad_norm": 0.6786319017410278,
269
- "learning_rate": 8.330732041813366e-07,
270
- "logits/chosen": -1.8271814584732056,
271
- "logits/rejected": -1.7772512435913086,
272
- "logps/chosen": -0.8929777145385742,
273
- "logps/ref_chosen": -0.8355891108512878,
274
- "logps/ref_rejected": -0.9002590179443359,
275
- "logps/rejected": -0.9975314140319824,
276
- "loss": 0.4901,
277
- "rewards/accuracies": 0.53125,
278
- "rewards/chosen": -0.14347167313098907,
279
- "rewards/margins": 0.09970954060554504,
280
- "rewards/rejected": -0.24318119883537292,
281
  "step": 80
282
  },
283
  {
284
  "epoch": 0.36334491049959927,
285
- "grad_norm": 0.8407174348831177,
286
- "learning_rate": 8.041322105400921e-07,
287
- "logits/chosen": -1.706368088722229,
288
- "logits/rejected": -1.650854468345642,
289
- "logps/chosen": -0.8318307995796204,
290
- "logps/ref_chosen": -0.8256785273551941,
291
- "logps/ref_rejected": -0.8488883972167969,
292
- "logps/rejected": -0.9011926651000977,
293
- "loss": 0.4878,
294
- "rewards/accuracies": 0.6187499761581421,
295
- "rewards/chosen": -0.01538090594112873,
296
- "rewards/margins": 0.1153799295425415,
297
- "rewards/rejected": -0.13076083362102509,
298
  "step": 85
299
  },
300
  {
301
  "epoch": 0.38471814052898745,
302
- "grad_norm": 1.0095113515853882,
303
- "learning_rate": 7.734740790612136e-07,
304
- "logits/chosen": -1.8660595417022705,
305
- "logits/rejected": -1.8641777038574219,
306
- "logps/chosen": -0.8596251606941223,
307
- "logps/ref_chosen": -0.9228288531303406,
308
- "logps/ref_rejected": -0.9406684637069702,
309
- "logps/rejected": -0.9123810529708862,
310
- "loss": 0.4896,
311
- "rewards/accuracies": 0.59375,
312
- "rewards/chosen": 0.15800921618938446,
313
- "rewards/margins": 0.08729076385498047,
314
- "rewards/rejected": 0.07071846723556519,
315
  "step": 90
316
  },
317
  {
318
  "epoch": 0.40609137055837563,
319
- "grad_norm": 0.8767968416213989,
320
- "learning_rate": 7.412719062986631e-07,
321
- "logits/chosen": -1.9249579906463623,
322
- "logits/rejected": -1.8531602621078491,
323
- "logps/chosen": -0.8149619102478027,
324
- "logps/ref_chosen": -0.9041957855224609,
325
- "logps/ref_rejected": -0.914394736289978,
326
- "logps/rejected": -0.8845084309577942,
327
- "loss": 0.4859,
328
- "rewards/accuracies": 0.6312500238418579,
329
- "rewards/chosen": 0.22308464348316193,
330
- "rewards/margins": 0.14836890995502472,
331
- "rewards/rejected": 0.0747157484292984,
332
  "step": 95
333
  },
334
  {
335
  "epoch": 0.4274646005877638,
336
- "grad_norm": 0.6044840812683105,
337
- "learning_rate": 7.077075065009433e-07,
338
- "logits/chosen": -1.731792688369751,
339
- "logits/rejected": -1.7363353967666626,
340
- "logps/chosen": -0.7217603921890259,
341
- "logps/ref_chosen": -0.8257284164428711,
342
- "logps/ref_rejected": -0.8479409217834473,
343
- "logps/rejected": -0.784611701965332,
344
- "loss": 0.4852,
345
- "rewards/accuracies": 0.6312500238418579,
346
- "rewards/chosen": 0.2599199414253235,
347
- "rewards/margins": 0.10159693658351898,
348
- "rewards/rejected": 0.1583230048418045,
349
  "step": 100
350
  },
351
  {
352
  "epoch": 0.448837830617152,
353
- "grad_norm": 0.7751099467277527,
354
- "learning_rate": 6.72970385083438e-07,
355
- "logits/chosen": -1.9043171405792236,
356
- "logits/rejected": -1.789009690284729,
357
- "logps/chosen": -0.7011796236038208,
358
- "logps/ref_chosen": -0.8166704177856445,
359
- "logps/ref_rejected": -0.8361040949821472,
360
- "logps/rejected": -0.7531259655952454,
361
- "loss": 0.4905,
362
- "rewards/accuracies": 0.6312500238418579,
363
- "rewards/chosen": 0.2887269854545593,
364
- "rewards/margins": 0.0812816247344017,
365
- "rewards/rejected": 0.20744535326957703,
366
  "step": 105
367
  },
368
  {
369
  "epoch": 0.4702110606465402,
370
- "grad_norm": 0.7444783449172974,
371
- "learning_rate": 6.372566686762426e-07,
372
- "logits/chosen": -1.8429124355316162,
373
- "logits/rejected": -1.760053277015686,
374
- "logps/chosen": -0.7318671941757202,
375
- "logps/ref_chosen": -0.8331576585769653,
376
- "logps/ref_rejected": -0.8635438084602356,
377
- "logps/rejected": -0.7953906059265137,
378
- "loss": 0.4817,
379
- "rewards/accuracies": 0.668749988079071,
380
- "rewards/chosen": 0.253226101398468,
381
- "rewards/margins": 0.08284299075603485,
382
- "rewards/rejected": 0.17038312554359436,
383
  "step": 110
384
  },
385
  {
386
  "epoch": 0.4915842906759284,
387
- "grad_norm": 0.8039557337760925,
388
- "learning_rate": 6.00767997788451e-07,
389
- "logits/chosen": -1.9033887386322021,
390
- "logits/rejected": -1.770939588546753,
391
- "logps/chosen": -0.7566145062446594,
392
- "logps/ref_chosen": -0.8713752627372742,
393
- "logps/ref_rejected": -0.8939735293388367,
394
- "logps/rejected": -0.8082603216171265,
395
- "loss": 0.4844,
396
- "rewards/accuracies": 0.625,
397
- "rewards/chosen": 0.28690171241760254,
398
- "rewards/margins": 0.07261888682842255,
399
- "rewards/rejected": 0.2142828404903412,
400
  "step": 115
401
  },
402
  {
403
  "epoch": 0.5129575207053166,
404
- "grad_norm": 1.1172226667404175,
405
- "learning_rate": 5.637103883409525e-07,
406
- "logits/chosen": -1.9406812191009521,
407
- "logits/rejected": -1.867462158203125,
408
- "logps/chosen": -0.8249004483222961,
409
- "logps/ref_chosen": -0.873686671257019,
410
- "logps/ref_rejected": -0.9026174545288086,
411
- "logps/rejected": -0.8801124691963196,
412
- "loss": 0.4836,
413
- "rewards/accuracies": 0.643750011920929,
414
- "rewards/chosen": 0.1219654530286789,
415
- "rewards/margins": 0.06570279598236084,
416
- "rewards/rejected": 0.05626266077160835,
417
  "step": 120
418
  },
419
  {
420
  "epoch": 0.5343307507347048,
421
- "grad_norm": 0.8093484044075012,
422
- "learning_rate": 5.262930684955438e-07,
423
- "logits/chosen": -2.0165348052978516,
424
- "logits/rejected": -1.9574447870254517,
425
- "logps/chosen": -0.8015368580818176,
426
- "logps/ref_chosen": -0.815376877784729,
427
- "logps/ref_rejected": -0.8817696571350098,
428
- "logps/rejected": -0.9249275326728821,
429
- "loss": 0.4825,
430
- "rewards/accuracies": 0.6312500238418579,
431
- "rewards/chosen": 0.034600116312503815,
432
- "rewards/margins": 0.14249476790428162,
433
- "rewards/rejected": -0.1078946590423584,
434
  "step": 125
435
  },
436
  {
437
  "epoch": 0.555703980764093,
438
- "grad_norm": 0.8614518642425537,
439
- "learning_rate": 4.88727297347654e-07,
440
- "logits/chosen": -1.951319694519043,
441
- "logits/rejected": -1.933098554611206,
442
- "logps/chosen": -0.7757576107978821,
443
- "logps/ref_chosen": -0.7751168608665466,
444
- "logps/ref_rejected": -0.8734749555587769,
445
- "logps/rejected": -0.9476582407951355,
446
- "loss": 0.4794,
447
- "rewards/accuracies": 0.6625000238418579,
448
- "rewards/chosen": -0.0016015321016311646,
449
- "rewards/margins": 0.18385668098926544,
450
- "rewards/rejected": -0.1854582130908966,
451
  "step": 130
452
  },
453
  {
454
  "epoch": 0.5770772107934812,
455
- "grad_norm": 1.054673194885254,
456
- "learning_rate": 4.512251721523659e-07,
457
- "logits/chosen": -2.005443811416626,
458
- "logits/rejected": -2.0154759883880615,
459
- "logps/chosen": -0.7493831515312195,
460
- "logps/ref_chosen": -0.7740285992622375,
461
- "logps/ref_rejected": -0.8138446807861328,
462
- "logps/rejected": -0.851770281791687,
463
- "loss": 0.4847,
464
- "rewards/accuracies": 0.5687500238418579,
465
- "rewards/chosen": 0.06161379814147949,
466
- "rewards/margins": 0.15642789006233215,
467
- "rewards/rejected": -0.09481407701969147,
468
  "step": 135
469
  },
470
  {
471
  "epoch": 0.5984504408228694,
472
- "grad_norm": 1.669247031211853,
473
- "learning_rate": 4.139984308181708e-07,
474
- "logits/chosen": -1.9584558010101318,
475
- "logits/rejected": -1.8885042667388916,
476
- "logps/chosen": -0.7844404578208923,
477
- "logps/ref_chosen": -0.8231161236763,
478
- "logps/ref_rejected": -0.83356112241745,
479
- "logps/rejected": -0.8187839388847351,
480
- "loss": 0.481,
481
- "rewards/accuracies": 0.5249999761581421,
482
- "rewards/chosen": 0.09668895602226257,
483
- "rewards/margins": 0.0597461462020874,
484
- "rewards/rejected": 0.03694281354546547,
485
  "step": 140
486
  },
487
  {
488
  "epoch": 0.6198236708522575,
489
- "grad_norm": 1.0050734281539917,
490
- "learning_rate": 3.772572564296004e-07,
491
- "logits/chosen": -1.8883240222930908,
492
- "logits/rejected": -1.8240330219268799,
493
- "logps/chosen": -0.7662582397460938,
494
- "logps/ref_chosen": -0.8861669301986694,
495
- "logps/ref_rejected": -0.924543023109436,
496
- "logps/rejected": -0.8522801399230957,
497
- "loss": 0.4785,
498
- "rewards/accuracies": 0.59375,
499
- "rewards/chosen": 0.29977160692214966,
500
- "rewards/margins": 0.11911455541849136,
501
- "rewards/rejected": 0.18065707385540009,
502
  "step": 145
503
  },
504
  {
505
  "epoch": 0.6411969008816457,
506
- "grad_norm": 0.9123177528381348,
507
- "learning_rate": 3.412090905484337e-07,
508
- "logits/chosen": -1.9726581573486328,
509
- "logits/rejected": -1.9151983261108398,
510
- "logps/chosen": -0.7370959520339966,
511
- "logps/ref_chosen": -0.866258978843689,
512
- "logps/ref_rejected": -0.8657606840133667,
513
- "logps/rejected": -0.8145822286605835,
514
- "loss": 0.4817,
515
- "rewards/accuracies": 0.6499999761581421,
516
- "rewards/chosen": 0.3229079246520996,
517
- "rewards/margins": 0.19496168196201324,
518
- "rewards/rejected": 0.12794628739356995,
519
  "step": 150
520
  },
521
  {
522
  "epoch": 0.6625701309110339,
523
- "grad_norm": 0.7277486324310303,
524
- "learning_rate": 3.060574619936075e-07,
525
- "logits/chosen": -1.8861163854599,
526
- "logits/rejected": -1.9004647731781006,
527
- "logps/chosen": -0.7181011438369751,
528
- "logps/ref_chosen": -0.8273455500602722,
529
- "logps/ref_rejected": -0.8894198536872864,
530
- "logps/rejected": -0.8557901382446289,
531
- "loss": 0.4828,
532
- "rewards/accuracies": 0.675000011920929,
533
- "rewards/chosen": 0.27311110496520996,
534
- "rewards/margins": 0.18903681635856628,
535
- "rewards/rejected": 0.08407425880432129,
536
  "step": 155
537
  },
538
  {
539
  "epoch": 0.6839433609404221,
540
- "grad_norm": 1.0556169748306274,
541
- "learning_rate": 2.720008377125682e-07,
542
- "logits/chosen": -2.1498093605041504,
543
- "logits/rejected": -2.08402419090271,
544
- "logps/chosen": -0.7090437412261963,
545
- "logps/ref_chosen": -0.8103801012039185,
546
- "logps/ref_rejected": -0.8715206980705261,
547
- "logps/rejected": -0.7982600927352905,
548
- "loss": 0.4815,
549
- "rewards/accuracies": 0.606249988079071,
550
- "rewards/chosen": 0.25334107875823975,
551
- "rewards/margins": 0.0701896995306015,
552
- "rewards/rejected": 0.18315134942531586,
553
  "step": 160
554
  },
555
  {
556
  "epoch": 0.7053165909698104,
557
- "grad_norm": 0.9068896770477295,
558
- "learning_rate": 2.3923150223207173e-07,
559
- "logits/chosen": -1.9448268413543701,
560
- "logits/rejected": -1.904314637184143,
561
- "logps/chosen": -0.7294695377349854,
562
- "logps/ref_chosen": -0.8327474594116211,
563
- "logps/ref_rejected": -0.9134753346443176,
564
- "logps/rejected": -0.8374517560005188,
565
- "loss": 0.4781,
566
- "rewards/accuracies": 0.59375,
567
- "rewards/chosen": 0.25819462537765503,
568
- "rewards/margins": 0.06813579052686691,
569
- "rewards/rejected": 0.1900588572025299,
570
  "step": 165
571
  },
572
  {
573
  "epoch": 0.7266898209991985,
574
- "grad_norm": 0.9222660660743713,
575
- "learning_rate": 2.0793447201508286e-07,
576
- "logits/chosen": -1.9369819164276123,
577
- "logits/rejected": -1.9469242095947266,
578
- "logps/chosen": -0.6714679002761841,
579
- "logps/ref_chosen": -0.7705163359642029,
580
- "logps/ref_rejected": -0.8333786129951477,
581
- "logps/rejected": -0.7605674862861633,
582
- "loss": 0.4845,
583
- "rewards/accuracies": 0.574999988079071,
584
- "rewards/chosen": 0.24762126803398132,
585
- "rewards/margins": 0.06559363007545471,
586
- "rewards/rejected": 0.18202762305736542,
587
  "step": 170
588
  },
589
  {
590
  "epoch": 0.7480630510285867,
591
- "grad_norm": 0.7805910110473633,
592
- "learning_rate": 1.7828645085333644e-07,
593
- "logits/chosen": -1.9515256881713867,
594
- "logits/rejected": -1.9057369232177734,
595
- "logps/chosen": -0.7692245244979858,
596
- "logps/ref_chosen": -0.8767744302749634,
597
- "logps/ref_rejected": -0.8914516568183899,
598
- "logps/rejected": -0.8368504643440247,
599
- "loss": 0.4772,
600
- "rewards/accuracies": 0.637499988079071,
601
- "rewards/chosen": 0.2688748240470886,
602
- "rewards/margins": 0.13237187266349792,
603
- "rewards/rejected": 0.1365029364824295,
604
  "step": 175
605
  },
606
  {
607
  "epoch": 0.7694362810579749,
608
- "grad_norm": 0.8946753740310669,
609
- "learning_rate": 1.5045483219344385e-07,
610
- "logits/chosen": -2.008927583694458,
611
- "logits/rejected": -1.9914191961288452,
612
- "logps/chosen": -0.7557133436203003,
613
- "logps/ref_chosen": -0.8390854597091675,
614
- "logps/ref_rejected": -0.8787837028503418,
615
- "logps/rejected": -0.853840708732605,
616
- "loss": 0.479,
617
- "rewards/accuracies": 0.574999988079071,
618
- "rewards/chosen": 0.2084302008152008,
619
- "rewards/margins": 0.14607290923595428,
620
- "rewards/rejected": 0.062357254326343536,
621
  "step": 180
622
  },
623
  {
624
  "epoch": 0.7908095110873631,
625
- "grad_norm": 1.0278949737548828,
626
- "learning_rate": 1.2459675402943288e-07,
627
- "logits/chosen": -2.0313353538513184,
628
- "logits/rejected": -1.9398431777954102,
629
- "logps/chosen": -0.7753286361694336,
630
- "logps/ref_chosen": -0.8660305142402649,
631
- "logps/ref_rejected": -0.8604307174682617,
632
- "logps/rejected": -0.7984707951545715,
633
- "loss": 0.4765,
634
- "rewards/accuracies": 0.625,
635
- "rewards/chosen": 0.22675485908985138,
636
- "rewards/margins": 0.07185501605272293,
637
- "rewards/rejected": 0.15489983558654785,
638
  "step": 185
639
  },
640
  {
641
  "epoch": 0.8121827411167513,
642
- "grad_norm": 0.8263163566589355,
643
- "learning_rate": 1.0085821169782199e-07,
644
- "logits/chosen": -2.0978431701660156,
645
- "logits/rejected": -2.055168628692627,
646
- "logps/chosen": -0.7470763921737671,
647
- "logps/ref_chosen": -0.8603025674819946,
648
- "logps/ref_rejected": -0.9167188405990601,
649
- "logps/rejected": -0.8492987751960754,
650
- "loss": 0.4776,
651
- "rewards/accuracies": 0.675000011920929,
652
- "rewards/chosen": 0.2830653786659241,
653
- "rewards/margins": 0.11451487243175507,
654
- "rewards/rejected": 0.1685505211353302,
655
  "step": 190
656
  },
657
  {
658
  "epoch": 0.8335559711461394,
659
- "grad_norm": 0.9788782596588135,
660
- "learning_rate": 7.937323358440934e-08,
661
- "logits/chosen": -2.13584041595459,
662
- "logits/rejected": -2.0809855461120605,
663
- "logps/chosen": -0.7344987988471985,
664
- "logps/ref_chosen": -0.8153272867202759,
665
- "logps/ref_rejected": -0.8524506688117981,
666
- "logps/rejected": -0.8127390742301941,
667
- "loss": 0.475,
668
- "rewards/accuracies": 0.59375,
669
- "rewards/chosen": 0.20207130908966064,
670
- "rewards/margins": 0.10279206931591034,
671
- "rewards/rejected": 0.09927921742200851,
672
  "step": 195
673
  },
674
  {
675
  "epoch": 0.8549292011755276,
676
- "grad_norm": 1.1459167003631592,
677
- "learning_rate": 6.026312439675551e-08,
678
- "logits/chosen": -1.9530729055404663,
679
- "logits/rejected": -1.8388773202896118,
680
- "logps/chosen": -0.7524019479751587,
681
- "logps/ref_chosen": -0.832992672920227,
682
- "logps/ref_rejected": -0.8378564715385437,
683
- "logps/rejected": -0.7900499701499939,
684
- "loss": 0.4783,
685
- "rewards/accuracies": 0.59375,
686
- "rewards/chosen": 0.20147652924060822,
687
- "rewards/margins": 0.08196047693490982,
688
- "rewards/rejected": 0.11951601505279541,
689
  "step": 200
690
  },
691
  {
692
  "epoch": 0.8763024312049158,
693
- "grad_norm": 1.2217403650283813,
694
- "learning_rate": 4.3635780274861864e-08,
695
- "logits/chosen": -1.9760059118270874,
696
- "logits/rejected": -1.881757378578186,
697
- "logps/chosen": -0.754524827003479,
698
- "logps/ref_chosen": -0.8361877202987671,
699
- "logps/ref_rejected": -0.8636215329170227,
700
- "logps/rejected": -0.8430477380752563,
701
- "loss": 0.4761,
702
  "rewards/accuracies": 0.606249988079071,
703
- "rewards/chosen": 0.20415742695331573,
704
- "rewards/margins": 0.15272292494773865,
705
- "rewards/rejected": 0.05143451690673828,
706
  "step": 205
707
  },
708
  {
709
  "epoch": 0.897675661234304,
710
- "grad_norm": 0.9312232732772827,
711
- "learning_rate": 2.958507960694784e-08,
712
- "logits/chosen": -1.9826923608779907,
713
- "logits/rejected": -1.963322401046753,
714
- "logps/chosen": -0.7218400239944458,
715
- "logps/ref_chosen": -0.775153636932373,
716
- "logps/ref_rejected": -0.82710200548172,
717
- "logps/rejected": -0.8231655359268188,
718
- "loss": 0.4756,
719
- "rewards/accuracies": 0.550000011920929,
720
- "rewards/chosen": 0.13328400254249573,
721
- "rewards/margins": 0.12344253063201904,
722
- "rewards/rejected": 0.009841480292379856,
723
  "step": 210
724
  },
725
  {
726
  "epoch": 0.9190488912636923,
727
- "grad_norm": 0.9846080541610718,
728
- "learning_rate": 1.8190352989793322e-08,
729
- "logits/chosen": -1.9855095148086548,
730
- "logits/rejected": -1.907576560974121,
731
- "logps/chosen": -0.7199736833572388,
732
- "logps/ref_chosen": -0.803063690662384,
733
- "logps/ref_rejected": -0.8516971468925476,
734
- "logps/rejected": -0.8365123867988586,
735
- "loss": 0.4776,
736
- "rewards/accuracies": 0.574999988079071,
737
- "rewards/chosen": 0.2077248990535736,
738
- "rewards/margins": 0.16976311802864075,
739
- "rewards/rejected": 0.03796178475022316,
740
  "step": 215
741
  },
742
  {
743
  "epoch": 0.9404221212930804,
744
- "grad_norm": 0.7335969805717468,
745
- "learning_rate": 9.515935326265378e-09,
746
- "logits/chosen": -2.0024361610412598,
747
- "logits/rejected": -1.9636704921722412,
748
- "logps/chosen": -0.7521845698356628,
749
- "logps/ref_chosen": -0.8253963589668274,
750
- "logps/ref_rejected": -0.849533200263977,
751
- "logps/rejected": -0.8330303430557251,
752
- "loss": 0.4757,
753
- "rewards/accuracies": 0.612500011920929,
754
- "rewards/chosen": 0.1830292046070099,
755
- "rewards/margins": 0.14177197217941284,
756
- "rewards/rejected": 0.04125722497701645,
757
  "step": 220
758
  },
759
  {
760
  "epoch": 0.9617953513224686,
761
- "grad_norm": 1.1543854475021362,
762
- "learning_rate": 3.6108025888958447e-09,
763
- "logits/chosen": -1.9360746145248413,
764
- "logits/rejected": -1.911627173423767,
765
- "logps/chosen": -0.7084980010986328,
766
- "logps/ref_chosen": -0.7970255613327026,
767
- "logps/ref_rejected": -0.8132475018501282,
768
- "logps/rejected": -0.7666771411895752,
769
- "loss": 0.4765,
770
- "rewards/accuracies": 0.5874999761581421,
771
- "rewards/chosen": 0.22131893038749695,
772
- "rewards/margins": 0.10489317029714584,
773
- "rewards/rejected": 0.1164257749915123,
774
  "step": 225
775
  },
776
  {
777
  "epoch": 0.9831685813518568,
778
- "grad_norm": 1.1417807340621948,
779
- "learning_rate": 5.082953003528456e-10,
780
- "logits/chosen": -2.013756513595581,
781
- "logits/rejected": -2.0383520126342773,
782
- "logps/chosen": -0.8249381184577942,
783
- "logps/ref_chosen": -0.8979822993278503,
784
- "logps/ref_rejected": -0.9172071218490601,
785
- "logps/rejected": -0.9182927012443542,
786
- "loss": 0.4774,
787
- "rewards/accuracies": 0.5874999761581421,
788
- "rewards/chosen": 0.1826106607913971,
789
- "rewards/margins": 0.18532457947731018,
790
- "rewards/rejected": -0.0027139366138726473,
791
  "step": 230
792
  },
793
  {
794
  "epoch": 0.9959925193694897,
795
  "step": 233,
796
  "total_flos": 0.0,
797
- "train_loss": 0.4860667634931245,
798
- "train_runtime": 16529.3176,
799
- "train_samples_per_second": 3.622,
800
  "train_steps_per_second": 0.014
801
  }
802
  ],
 
10
  "log_history": [
11
  {
12
  "epoch": 0.02137323002938819,
13
+ "grad_norm": 0.4608515202999115,
14
+ "learning_rate": 1.0416666666666667e-07,
15
+ "logits/chosen": -1.7747037410736084,
16
+ "logits/rejected": -1.6486629247665405,
17
+ "logps/chosen": -247.47836303710938,
18
+ "logps/ref_chosen": -247.4757537841797,
19
+ "logps/ref_rejected": -250.2177734375,
20
+ "logps/rejected": -250.17874145507812,
21
  "loss": 0.5,
22
+ "rewards/accuracies": 0.29374998807907104,
23
+ "rewards/chosen": -2.605724148452282e-05,
24
+ "rewards/margins": -0.00041639525443315506,
25
+ "rewards/rejected": 0.0003903379547409713,
26
  "step": 5
27
  },
28
  {
29
  "epoch": 0.04274646005877638,
30
+ "grad_norm": 0.426495224237442,
31
+ "learning_rate": 2.0833333333333333e-07,
32
+ "logits/chosen": -1.7335236072540283,
33
+ "logits/rejected": -1.6989978551864624,
34
+ "logps/chosen": -222.6909637451172,
35
+ "logps/ref_chosen": -222.6491241455078,
36
+ "logps/ref_rejected": -223.95663452148438,
37
+ "logps/rejected": -223.9930877685547,
38
  "loss": 0.5,
39
+ "rewards/accuracies": 0.5,
40
+ "rewards/chosen": -0.00041838129982352257,
41
+ "rewards/margins": -5.400222653406672e-05,
42
+ "rewards/rejected": -0.00036437893868424,
43
  "step": 10
44
  },
45
  {
46
  "epoch": 0.06411969008816458,
47
+ "grad_norm": 0.4453659653663635,
48
+ "learning_rate": 3.1249999999999997e-07,
49
+ "logits/chosen": -1.9023773670196533,
50
+ "logits/rejected": -1.789849042892456,
51
+ "logps/chosen": -218.5724334716797,
52
+ "logps/ref_chosen": -218.7084503173828,
53
+ "logps/ref_rejected": -224.755615234375,
54
+ "logps/rejected": -224.6824493408203,
55
+ "loss": 0.5,
56
+ "rewards/accuracies": 0.5625,
57
+ "rewards/chosen": 0.0013600520323961973,
58
+ "rewards/margins": 0.0006284656701609492,
59
+ "rewards/rejected": 0.000731586420442909,
60
  "step": 15
61
  },
62
  {
63
  "epoch": 0.08549292011755276,
64
+ "grad_norm": 0.5101017951965332,
65
+ "learning_rate": 4.1666666666666667e-07,
66
+ "logits/chosen": -1.7127611637115479,
67
+ "logits/rejected": -1.6293315887451172,
68
+ "logps/chosen": -226.1074676513672,
69
+ "logps/ref_chosen": -226.7457275390625,
70
+ "logps/ref_rejected": -235.77908325195312,
71
+ "logps/rejected": -235.2657928466797,
72
+ "loss": 0.4999,
73
+ "rewards/accuracies": 0.5562499761581421,
74
+ "rewards/chosen": 0.006382433231920004,
75
+ "rewards/margins": 0.0012494683032855392,
76
+ "rewards/rejected": 0.005132964812219143,
77
  "step": 20
78
  },
79
  {
80
  "epoch": 0.10686615014694095,
81
+ "grad_norm": 0.4738335609436035,
82
+ "learning_rate": 4.999717571181741e-07,
83
+ "logits/chosen": -1.6099249124526978,
84
+ "logits/rejected": -1.5539109706878662,
85
+ "logps/chosen": -229.36843872070312,
86
+ "logps/ref_chosen": -230.34494018554688,
87
+ "logps/ref_rejected": -231.64236450195312,
88
+ "logps/rejected": -230.74813842773438,
89
+ "loss": 0.4999,
90
+ "rewards/accuracies": 0.606249988079071,
91
+ "rewards/chosen": 0.009765096008777618,
92
+ "rewards/margins": 0.000822968955617398,
93
+ "rewards/rejected": 0.008942126296460629,
94
  "step": 25
95
  },
96
  {
97
  "epoch": 0.12823938017632916,
98
+ "grad_norm": 0.4367460608482361,
99
+ "learning_rate": 4.98983926127519e-07,
100
+ "logits/chosen": -1.6448577642440796,
101
+ "logits/rejected": -1.560329794883728,
102
+ "logps/chosen": -239.9384002685547,
103
+ "logps/ref_chosen": -241.2040557861328,
104
+ "logps/ref_rejected": -253.18862915039062,
105
+ "logps/rejected": -251.95547485351562,
106
+ "loss": 0.4998,
107
+ "rewards/accuracies": 0.550000011920929,
108
+ "rewards/chosen": 0.012656150385737419,
109
+ "rewards/margins": 0.00032438611378893256,
110
+ "rewards/rejected": 0.012331764213740826,
111
  "step": 30
112
  },
113
  {
114
  "epoch": 0.14961261020571734,
115
+ "grad_norm": 0.5036317706108093,
116
+ "learning_rate": 4.965903258506806e-07,
117
+ "logits/chosen": -1.65009343624115,
118
+ "logits/rejected": -1.6685165166854858,
119
+ "logps/chosen": -240.6787109375,
120
+ "logps/ref_chosen": -242.33291625976562,
121
+ "logps/ref_rejected": -237.6911163330078,
122
+ "logps/rejected": -236.1189422607422,
123
+ "loss": 0.4997,
124
+ "rewards/accuracies": 0.543749988079071,
125
+ "rewards/chosen": 0.016541726887226105,
126
+ "rewards/margins": 0.0008201012387871742,
127
+ "rewards/rejected": 0.015721624717116356,
128
  "step": 35
129
  },
130
  {
131
  "epoch": 0.17098584023510552,
132
+ "grad_norm": 0.5212914347648621,
133
+ "learning_rate": 4.928044706128802e-07,
134
+ "logits/chosen": -1.6572792530059814,
135
+ "logits/rejected": -1.6342990398406982,
136
+ "logps/chosen": -224.078857421875,
137
+ "logps/ref_chosen": -226.43637084960938,
138
+ "logps/ref_rejected": -224.00546264648438,
139
+ "logps/rejected": -221.7003173828125,
140
+ "loss": 0.4996,
141
  "rewards/accuracies": 0.574999988079071,
142
+ "rewards/chosen": 0.02357516996562481,
143
+ "rewards/margins": 0.0005238516023382545,
144
+ "rewards/rejected": 0.023051317781209946,
145
  "step": 40
146
  },
147
  {
148
  "epoch": 0.19235907026449373,
149
+ "grad_norm": 0.5110143423080444,
150
+ "learning_rate": 4.876477354446189e-07,
151
+ "logits/chosen": -1.4905364513397217,
152
+ "logits/rejected": -1.3957011699676514,
153
+ "logps/chosen": -216.25308227539062,
154
+ "logps/ref_chosen": -219.16494750976562,
155
+ "logps/ref_rejected": -227.38040161132812,
156
+ "logps/rejected": -224.87564086914062,
157
+ "loss": 0.4994,
158
  "rewards/accuracies": 0.543749988079071,
159
+ "rewards/chosen": 0.02911846712231636,
160
+ "rewards/margins": 0.004071122966706753,
161
+ "rewards/rejected": 0.025047341361641884,
162
  "step": 45
163
  },
164
  {
165
  "epoch": 0.2137323002938819,
166
+ "grad_norm": 0.48523762822151184,
167
+ "learning_rate": 4.811492353977365e-07,
168
+ "logits/chosen": -1.7010364532470703,
169
+ "logits/rejected": -1.6736198663711548,
170
+ "logps/chosen": -218.8837127685547,
171
+ "logps/ref_chosen": -221.23171997070312,
172
+ "logps/ref_rejected": -223.6177215576172,
173
+ "logps/rejected": -221.6636199951172,
174
+ "loss": 0.4993,
175
+ "rewards/accuracies": 0.5562499761581421,
176
+ "rewards/chosen": 0.023480093106627464,
177
+ "rewards/margins": 0.0039388458244502544,
178
+ "rewards/rejected": 0.019541248679161072,
179
  "step": 50
180
  },
181
  {
182
  "epoch": 0.2351055303232701,
183
+ "grad_norm": 0.4816797971725464,
184
+ "learning_rate": 4.7334566116112327e-07,
185
+ "logits/chosen": -1.62349534034729,
186
+ "logits/rejected": -1.5281016826629639,
187
+ "logps/chosen": -237.206787109375,
188
+ "logps/ref_chosen": -239.38412475585938,
189
+ "logps/ref_rejected": -245.71304321289062,
190
+ "logps/rejected": -244.2113800048828,
191
+ "loss": 0.4989,
192
+ "rewards/accuracies": 0.625,
193
+ "rewards/chosen": 0.021773329004645348,
194
+ "rewards/margins": 0.006756873335689306,
195
+ "rewards/rejected": 0.015016456134617329,
196
  "step": 55
197
  },
198
  {
199
  "epoch": 0.2564787603526583,
200
+ "grad_norm": 0.5273976922035217,
201
+ "learning_rate": 4.6428107190419983e-07,
202
+ "logits/chosen": -1.6468950510025024,
203
+ "logits/rejected": -1.599461317062378,
204
+ "logps/chosen": -228.3268585205078,
205
+ "logps/ref_chosen": -231.1789093017578,
206
+ "logps/ref_rejected": -231.9095001220703,
207
+ "logps/rejected": -229.9440460205078,
208
+ "loss": 0.4988,
209
+ "rewards/accuracies": 0.6312500238418579,
210
+ "rewards/chosen": 0.028520625084638596,
211
+ "rewards/margins": 0.008865959011018276,
212
+ "rewards/rejected": 0.019654670730233192,
213
  "step": 60
214
  },
215
  {
216
  "epoch": 0.2778519903820465,
217
+ "grad_norm": 0.47698166966438293,
218
+ "learning_rate": 4.540066465177783e-07,
219
+ "logits/chosen": -1.7030376195907593,
220
+ "logits/rejected": -1.7270011901855469,
221
+ "logps/chosen": -218.37466430664062,
222
+ "logps/ref_chosen": -222.1732635498047,
223
+ "logps/ref_rejected": -221.90371704101562,
224
+ "logps/rejected": -219.0262451171875,
225
+ "loss": 0.4985,
226
+ "rewards/accuracies": 0.65625,
227
+ "rewards/chosen": 0.03798612207174301,
228
+ "rewards/margins": 0.009211419150233269,
229
+ "rewards/rejected": 0.028774702921509743,
230
  "step": 65
231
  },
232
  {
233
  "epoch": 0.2992252204114347,
234
+ "grad_norm": 0.4908115863800049,
235
+ "learning_rate": 4.425803946568032e-07,
236
+ "logits/chosen": -1.701042890548706,
237
+ "logits/rejected": -1.642853021621704,
238
+ "logps/chosen": -237.1160430908203,
239
+ "logps/ref_chosen": -241.13235473632812,
240
+ "logps/ref_rejected": -247.3893585205078,
241
+ "logps/rejected": -243.56692504882812,
242
+ "loss": 0.4985,
243
  "rewards/accuracies": 0.5625,
244
+ "rewards/chosen": 0.040162790566682816,
245
+ "rewards/margins": 0.0019384495681151748,
246
+ "rewards/rejected": 0.038224343210458755,
247
  "step": 70
248
  },
249
  {
250
  "epoch": 0.32059845044082286,
251
+ "grad_norm": 0.48811107873916626,
252
+ "learning_rate": 4.300668292164329e-07,
253
+ "logits/chosen": -1.6175544261932373,
254
+ "logits/rejected": -1.6155774593353271,
255
+ "logps/chosen": -223.8777618408203,
256
+ "logps/ref_chosen": -228.91860961914062,
257
+ "logps/ref_rejected": -227.78170776367188,
258
+ "logps/rejected": -223.22732543945312,
259
+ "loss": 0.4981,
260
+ "rewards/accuracies": 0.5625,
261
+ "rewards/chosen": 0.05040856450796127,
262
+ "rewards/margins": 0.004864625167101622,
263
+ "rewards/rejected": 0.04554395005106926,
264
  "step": 75
265
  },
266
  {
267
  "epoch": 0.34197168047021104,
268
+ "grad_norm": 0.5498376488685608,
269
+ "learning_rate": 4.165366020906683e-07,
270
+ "logits/chosen": -1.721421480178833,
271
+ "logits/rejected": -1.6703542470932007,
272
+ "logps/chosen": -220.573486328125,
273
+ "logps/ref_chosen": -226.90060424804688,
274
+ "logps/ref_rejected": -232.0827178955078,
275
+ "logps/rejected": -227.0341339111328,
276
+ "loss": 0.4975,
277
+ "rewards/accuracies": 0.6499999761581421,
278
+ "rewards/chosen": 0.06327112019062042,
279
+ "rewards/margins": 0.012785114347934723,
280
+ "rewards/rejected": 0.0504860058426857,
281
  "step": 80
282
  },
283
  {
284
  "epoch": 0.36334491049959927,
285
+ "grad_norm": 0.5343174338340759,
286
+ "learning_rate": 4.0206610527004607e-07,
287
+ "logits/chosen": -1.630051612854004,
288
+ "logits/rejected": -1.571542739868164,
289
+ "logps/chosen": -231.68496704101562,
290
+ "logps/ref_chosen": -237.4697723388672,
291
+ "logps/ref_rejected": -240.751953125,
292
+ "logps/rejected": -236.31600952148438,
293
+ "loss": 0.4978,
294
+ "rewards/accuracies": 0.6499999761581421,
295
+ "rewards/chosen": 0.05784807354211807,
296
+ "rewards/margins": 0.013488592579960823,
297
+ "rewards/rejected": 0.0443594828248024,
298
  "step": 85
299
  },
300
  {
301
  "epoch": 0.38471814052898745,
302
+ "grad_norm": 0.5112692713737488,
303
+ "learning_rate": 3.867370395306068e-07,
304
+ "logits/chosen": -1.7595088481903076,
305
+ "logits/rejected": -1.7580636739730835,
306
+ "logps/chosen": -211.63906860351562,
307
+ "logps/ref_chosen": -217.63436889648438,
308
+ "logps/ref_rejected": -222.6137237548828,
309
+ "logps/rejected": -217.2650909423828,
310
+ "loss": 0.4977,
311
+ "rewards/accuracies": 0.612500011920929,
312
+ "rewards/chosen": 0.05995314195752144,
313
+ "rewards/margins": 0.00646712351590395,
314
+ "rewards/rejected": 0.053486019372940063,
315
  "step": 90
316
  },
317
  {
318
  "epoch": 0.40609137055837563,
319
+ "grad_norm": 0.4654058516025543,
320
+ "learning_rate": 3.7063595314933156e-07,
321
+ "logits/chosen": -1.8619199991226196,
322
+ "logits/rejected": -1.786892294883728,
323
+ "logps/chosen": -208.5725555419922,
324
+ "logps/ref_chosen": -213.7164306640625,
325
+ "logps/ref_rejected": -228.556396484375,
326
+ "logps/rejected": -224.4815216064453,
327
+ "loss": 0.498,
328
+ "rewards/accuracies": 0.543749988079071,
329
+ "rewards/chosen": 0.051438819617033005,
330
+ "rewards/margins": 0.010690188966691494,
331
+ "rewards/rejected": 0.04074862599372864,
332
  "step": 95
333
  },
334
  {
335
  "epoch": 0.4274646005877638,
336
+ "grad_norm": 0.5265087485313416,
337
+ "learning_rate": 3.5385375325047163e-07,
338
+ "logits/chosen": -1.6727230548858643,
339
+ "logits/rejected": -1.677062749862671,
340
+ "logps/chosen": -239.5093994140625,
341
+ "logps/ref_chosen": -245.71194458007812,
342
+ "logps/ref_rejected": -240.1134490966797,
343
+ "logps/rejected": -235.6671142578125,
344
+ "loss": 0.4967,
345
+ "rewards/accuracies": 0.643750011920929,
346
+ "rewards/chosen": 0.06202547624707222,
347
+ "rewards/margins": 0.01756184920668602,
348
+ "rewards/rejected": 0.0444636233150959,
349
  "step": 100
350
  },
351
  {
352
  "epoch": 0.448837830617152,
353
+ "grad_norm": 0.53775554895401,
354
+ "learning_rate": 3.36485192541719e-07,
355
+ "logits/chosen": -1.8463099002838135,
356
+ "logits/rejected": -1.7264705896377563,
357
+ "logps/chosen": -224.50320434570312,
358
+ "logps/ref_chosen": -232.00527954101562,
359
+ "logps/ref_rejected": -232.0154266357422,
360
+ "logps/rejected": -225.75454711914062,
361
+ "loss": 0.4968,
362
+ "rewards/accuracies": 0.574999988079071,
363
+ "rewards/chosen": 0.0750209242105484,
364
+ "rewards/margins": 0.012411920353770256,
365
+ "rewards/rejected": 0.062609001994133,
366
  "step": 105
367
  },
368
  {
369
  "epoch": 0.4702110606465402,
370
+ "grad_norm": 0.5438077449798584,
371
+ "learning_rate": 3.186283343381213e-07,
372
+ "logits/chosen": -1.7997539043426514,
373
+ "logits/rejected": -1.7138378620147705,
374
+ "logps/chosen": -220.4825897216797,
375
+ "logps/ref_chosen": -229.9724578857422,
376
+ "logps/ref_rejected": -238.1800079345703,
377
+ "logps/rejected": -230.29736328125,
378
+ "loss": 0.4966,
379
+ "rewards/accuracies": 0.612500011920929,
380
+ "rewards/chosen": 0.09489865601062775,
381
+ "rewards/margins": 0.016072329133749008,
382
+ "rewards/rejected": 0.07882632315158844,
383
  "step": 110
384
  },
385
  {
386
  "epoch": 0.4915842906759284,
387
+ "grad_norm": 0.5453912019729614,
388
+ "learning_rate": 3.003839988942255e-07,
389
+ "logits/chosen": -1.8438644409179688,
390
+ "logits/rejected": -1.7028881311416626,
391
+ "logps/chosen": -203.79205322265625,
392
+ "logps/ref_chosen": -214.1478729248047,
393
+ "logps/ref_rejected": -226.24618530273438,
394
+ "logps/rejected": -217.4800567626953,
395
+ "loss": 0.4968,
396
+ "rewards/accuracies": 0.6499999761581421,
397
+ "rewards/chosen": 0.1035580188035965,
398
+ "rewards/margins": 0.015896398574113846,
399
+ "rewards/rejected": 0.08766160905361176,
400
  "step": 115
401
  },
402
  {
403
  "epoch": 0.5129575207053166,
404
+ "grad_norm": 0.5030398964881897,
405
+ "learning_rate": 2.8185519417047623e-07,
406
+ "logits/chosen": -1.8514922857284546,
407
+ "logits/rejected": -1.7740070819854736,
408
+ "logps/chosen": -214.818359375,
409
+ "logps/ref_chosen": -227.9495086669922,
410
+ "logps/ref_rejected": -230.5752410888672,
411
+ "logps/rejected": -218.9449005126953,
412
+ "loss": 0.496,
413
+ "rewards/accuracies": 0.6187499761581421,
414
+ "rewards/chosen": 0.13131138682365417,
415
+ "rewards/margins": 0.015008069574832916,
416
+ "rewards/rejected": 0.11630330979824066,
417
  "step": 120
418
  },
419
  {
420
  "epoch": 0.5343307507347048,
421
+ "grad_norm": 0.5339066982269287,
422
+ "learning_rate": 2.631465342477719e-07,
423
+ "logits/chosen": -1.9007892608642578,
424
+ "logits/rejected": -1.8334102630615234,
425
+ "logps/chosen": -218.14743041992188,
426
+ "logps/ref_chosen": -232.6212158203125,
427
+ "logps/ref_rejected": -234.5932159423828,
428
+ "logps/rejected": -222.1468505859375,
429
+ "loss": 0.4958,
430
+ "rewards/accuracies": 0.643750011920929,
431
+ "rewards/chosen": 0.1447378695011139,
432
+ "rewards/margins": 0.020274382084608078,
433
+ "rewards/rejected": 0.12446349859237671,
434
  "step": 125
435
  },
436
  {
437
  "epoch": 0.555703980764093,
438
+ "grad_norm": 0.5313855409622192,
439
+ "learning_rate": 2.44363648673827e-07,
440
+ "logits/chosen": -1.7636210918426514,
441
+ "logits/rejected": -1.7406389713287354,
442
+ "logps/chosen": -211.9698944091797,
443
+ "logps/ref_chosen": -226.790771484375,
444
+ "logps/ref_rejected": -231.8648223876953,
445
+ "logps/rejected": -219.543212890625,
446
+ "loss": 0.4945,
447
+ "rewards/accuracies": 0.6937500238418579,
448
+ "rewards/chosen": 0.1482090801000595,
449
+ "rewards/margins": 0.024992961436510086,
450
+ "rewards/rejected": 0.12321610748767853,
451
  "step": 130
452
  },
453
  {
454
  "epoch": 0.5770772107934812,
455
+ "grad_norm": 0.5537051558494568,
456
+ "learning_rate": 2.2561258607618294e-07,
457
+ "logits/chosen": -1.8008477687835693,
458
+ "logits/rejected": -1.8080832958221436,
459
+ "logps/chosen": -234.68893432617188,
460
+ "logps/ref_chosen": -247.26119995117188,
461
+ "logps/ref_rejected": -241.82345581054688,
462
+ "logps/rejected": -231.585693359375,
463
+ "loss": 0.4949,
464
+ "rewards/accuracies": 0.6812499761581421,
465
+ "rewards/chosen": 0.12572243809700012,
466
+ "rewards/margins": 0.023345012217760086,
467
+ "rewards/rejected": 0.10237739980220795,
468
  "step": 135
469
  },
470
  {
471
  "epoch": 0.5984504408228694,
472
+ "grad_norm": 0.5528976321220398,
473
+ "learning_rate": 2.069992154090854e-07,
474
+ "logits/chosen": -1.775397539138794,
475
+ "logits/rejected": -1.6931631565093994,
476
+ "logps/chosen": -219.74072265625,
477
+ "logps/ref_chosen": -230.71826171875,
478
+ "logps/ref_rejected": -227.7001953125,
479
+ "logps/rejected": -218.38241577148438,
480
+ "loss": 0.495,
481
+ "rewards/accuracies": 0.543749988079071,
482
+ "rewards/chosen": 0.10977540910243988,
483
+ "rewards/margins": 0.01659761555492878,
484
+ "rewards/rejected": 0.09317778795957565,
485
  "step": 140
486
  },
487
  {
488
  "epoch": 0.6198236708522575,
489
+ "grad_norm": 0.5473525524139404,
490
+ "learning_rate": 1.886286282148002e-07,
491
+ "logits/chosen": -1.7711913585662842,
492
+ "logits/rejected": -1.7026926279067993,
493
+ "logps/chosen": -195.3854217529297,
494
+ "logps/ref_chosen": -208.07254028320312,
495
+ "logps/ref_rejected": -210.4279022216797,
496
+ "logps/rejected": -199.79165649414062,
497
+ "loss": 0.4946,
498
+ "rewards/accuracies": 0.65625,
499
+ "rewards/chosen": 0.12687113881111145,
500
+ "rewards/margins": 0.020508771762251854,
501
+ "rewards/rejected": 0.10636236518621445,
502
  "step": 145
503
  },
504
  {
505
  "epoch": 0.6411969008816457,
506
+ "grad_norm": 0.5966719388961792,
507
+ "learning_rate": 1.7060454527421686e-07,
508
+ "logits/chosen": -1.8688771724700928,
509
+ "logits/rejected": -1.810694932937622,
510
+ "logps/chosen": -211.9062042236328,
511
+ "logps/ref_chosen": -224.8968505859375,
512
+ "logps/ref_rejected": -226.1548309326172,
513
+ "logps/rejected": -215.7084503173828,
514
+ "loss": 0.4943,
515
+ "rewards/accuracies": 0.625,
516
+ "rewards/chosen": 0.12990659475326538,
517
+ "rewards/margins": 0.02544253133237362,
518
+ "rewards/rejected": 0.10446406900882721,
519
  "step": 150
520
  },
521
  {
522
  "epoch": 0.6625701309110339,
523
+ "grad_norm": 0.5334843993186951,
524
+ "learning_rate": 1.5302873099680374e-07,
525
+ "logits/chosen": -1.786595344543457,
526
+ "logits/rejected": -1.7971456050872803,
527
+ "logps/chosen": -225.0083465576172,
528
+ "logps/ref_chosen": -237.4626922607422,
529
+ "logps/ref_rejected": -234.39547729492188,
530
+ "logps/rejected": -223.2943572998047,
531
+ "loss": 0.4955,
532
+ "rewards/accuracies": 0.5687500238418579,
533
+ "rewards/chosen": 0.12454362213611603,
534
+ "rewards/margins": 0.013532285578548908,
535
+ "rewards/rejected": 0.1110113263130188,
536
  "step": 155
537
  },
538
  {
539
  "epoch": 0.6839433609404221,
540
+ "grad_norm": 0.5639063715934753,
541
+ "learning_rate": 1.360004188562841e-07,
542
+ "logits/chosen": -2.0527145862579346,
543
+ "logits/rejected": -1.9811140298843384,
544
+ "logps/chosen": -217.0570068359375,
545
+ "logps/ref_chosen": -231.03369140625,
546
+ "logps/ref_rejected": -232.6383819580078,
547
+ "logps/rejected": -220.0625457763672,
548
+ "loss": 0.4952,
549
+ "rewards/accuracies": 0.581250011920929,
550
+ "rewards/chosen": 0.1397666186094284,
551
+ "rewards/margins": 0.014008410274982452,
552
+ "rewards/rejected": 0.12575821578502655,
553
  "step": 160
554
  },
555
  {
556
  "epoch": 0.7053165909698104,
557
+ "grad_norm": 0.5417853593826294,
558
+ "learning_rate": 1.1961575111603586e-07,
559
+ "logits/chosen": -1.8371235132217407,
560
+ "logits/rejected": -1.7954612970352173,
561
+ "logps/chosen": -220.7694854736328,
562
+ "logps/ref_chosen": -234.5041046142578,
563
+ "logps/ref_rejected": -235.61181640625,
564
+ "logps/rejected": -224.56640625,
565
+ "loss": 0.4944,
566
+ "rewards/accuracies": 0.6187499761581421,
567
+ "rewards/chosen": 0.1373465657234192,
568
+ "rewards/margins": 0.026892542839050293,
569
+ "rewards/rejected": 0.1104540079832077,
570
  "step": 165
571
  },
572
  {
573
  "epoch": 0.7266898209991985,
574
+ "grad_norm": 0.565830409526825,
575
+ "learning_rate": 1.0396723600754143e-07,
576
+ "logits/chosen": -1.8288425207138062,
577
+ "logits/rejected": -1.83499276638031,
578
+ "logps/chosen": -213.2861785888672,
579
+ "logps/ref_chosen": -227.1809844970703,
580
+ "logps/ref_rejected": -230.8953094482422,
581
+ "logps/rejected": -218.4414520263672,
582
+ "loss": 0.4954,
583
+ "rewards/accuracies": 0.5874999761581421,
584
+ "rewards/chosen": 0.13894793391227722,
585
+ "rewards/margins": 0.014409348368644714,
586
+ "rewards/rejected": 0.12453857809305191,
587
  "step": 170
588
  },
589
  {
590
  "epoch": 0.7480630510285867,
591
+ "grad_norm": 0.5855058431625366,
592
+ "learning_rate": 8.914322542666822e-08,
593
+ "logits/chosen": -1.8145122528076172,
594
+ "logits/rejected": -1.7646887302398682,
595
+ "logps/chosen": -212.070068359375,
596
+ "logps/ref_chosen": -224.17794799804688,
597
+ "logps/ref_rejected": -225.526123046875,
598
+ "logps/rejected": -214.7656707763672,
599
+ "loss": 0.4947,
600
+ "rewards/accuracies": 0.5625,
601
+ "rewards/chosen": 0.12107895314693451,
602
+ "rewards/margins": 0.013474419713020325,
603
+ "rewards/rejected": 0.10760452598333359,
604
  "step": 175
605
  },
606
  {
607
  "epoch": 0.7694362810579749,
608
+ "grad_norm": 0.6223751902580261,
609
+ "learning_rate": 7.522741609672193e-08,
610
+ "logits/chosen": -1.8675405979156494,
611
+ "logits/rejected": -1.8476943969726562,
612
+ "logps/chosen": -216.3776092529297,
613
+ "logps/ref_chosen": -230.77182006835938,
614
+ "logps/ref_rejected": -227.00619506835938,
615
+ "logps/rejected": -214.32931518554688,
616
+ "loss": 0.4945,
617
+ "rewards/accuracies": 0.612500011920929,
618
+ "rewards/chosen": 0.1439422070980072,
619
+ "rewards/margins": 0.017173700034618378,
620
+ "rewards/rejected": 0.12676851451396942,
621
  "step": 180
622
  },
623
  {
624
  "epoch": 0.7908095110873631,
625
+ "grad_norm": 0.5778200030326843,
626
+ "learning_rate": 6.229837701471644e-08,
627
+ "logits/chosen": -1.9124794006347656,
628
+ "logits/rejected": -1.8135532140731812,
629
+ "logps/chosen": -216.97702026367188,
630
+ "logps/ref_chosen": -229.8362274169922,
631
+ "logps/ref_rejected": -233.65390014648438,
632
+ "logps/rejected": -222.93417358398438,
633
+ "loss": 0.4945,
634
+ "rewards/accuracies": 0.6000000238418579,
635
+ "rewards/chosen": 0.1285921037197113,
636
+ "rewards/margins": 0.021394768729805946,
637
+ "rewards/rejected": 0.10719730705022812,
638
  "step": 185
639
  },
640
  {
641
  "epoch": 0.8121827411167513,
642
+ "grad_norm": 0.5558175444602966,
643
+ "learning_rate": 5.0429105848910996e-08,
644
+ "logits/chosen": -1.9621855020523071,
645
+ "logits/rejected": -1.9175077676773071,
646
+ "logps/chosen": -215.39450073242188,
647
+ "logps/ref_chosen": -229.72836303710938,
648
+ "logps/ref_rejected": -233.65237426757812,
649
+ "logps/rejected": -222.21798706054688,
650
+ "loss": 0.4937,
651
+ "rewards/accuracies": 0.6312500238418579,
652
+ "rewards/chosen": 0.14333853125572205,
653
+ "rewards/margins": 0.028994807973504066,
654
+ "rewards/rejected": 0.11434372514486313,
655
  "step": 190
656
  },
657
  {
658
  "epoch": 0.8335559711461394,
659
+ "grad_norm": 0.5308636426925659,
660
+ "learning_rate": 3.968661679220467e-08,
661
+ "logits/chosen": -1.971208930015564,
662
+ "logits/rejected": -1.9112732410430908,
663
+ "logps/chosen": -210.79598999023438,
664
+ "logps/ref_chosen": -224.2023468017578,
665
+ "logps/ref_rejected": -224.3248748779297,
666
+ "logps/rejected": -212.8175811767578,
667
+ "loss": 0.4932,
668
+ "rewards/accuracies": 0.6000000238418579,
669
+ "rewards/chosen": 0.1340634524822235,
670
+ "rewards/margins": 0.018990488722920418,
671
+ "rewards/rejected": 0.11507296562194824,
672
  "step": 195
673
  },
674
  {
675
  "epoch": 0.8549292011755276,
676
+ "grad_norm": 0.615912675857544,
677
+ "learning_rate": 3.013156219837776e-08,
678
+ "logits/chosen": -1.7899879217147827,
679
+ "logits/rejected": -1.6696176528930664,
680
+ "logps/chosen": -215.92288208007812,
681
+ "logps/ref_chosen": -228.88381958007812,
682
+ "logps/ref_rejected": -231.0583953857422,
683
+ "logps/rejected": -220.5959930419922,
684
+ "loss": 0.4932,
685
+ "rewards/accuracies": 0.612500011920929,
686
+ "rewards/chosen": 0.12960924208164215,
687
+ "rewards/margins": 0.024985069409012794,
688
+ "rewards/rejected": 0.1046241745352745,
689
  "step": 200
690
  },
691
  {
692
  "epoch": 0.8763024312049158,
693
+ "grad_norm": 0.590220034122467,
694
+ "learning_rate": 2.1817890137430932e-08,
695
+ "logits/chosen": -1.81471848487854,
696
+ "logits/rejected": -1.714023232460022,
697
+ "logps/chosen": -205.69888305664062,
698
+ "logps/ref_chosen": -221.30752563476562,
699
+ "logps/ref_rejected": -224.98486328125,
700
+ "logps/rejected": -211.78884887695312,
701
+ "loss": 0.4937,
702
  "rewards/accuracies": 0.606249988079071,
703
+ "rewards/chosen": 0.15608620643615723,
704
+ "rewards/margins": 0.024126073345541954,
705
+ "rewards/rejected": 0.13196012377738953,
706
  "step": 205
707
  },
708
  {
709
  "epoch": 0.897675661234304,
710
+ "grad_norm": 0.5369106531143188,
711
+ "learning_rate": 1.479253980347392e-08,
712
+ "logits/chosen": -1.8037662506103516,
713
+ "logits/rejected": -1.7787643671035767,
714
+ "logps/chosen": -225.9608612060547,
715
+ "logps/ref_chosen": -241.4657440185547,
716
+ "logps/ref_rejected": -241.3707733154297,
717
+ "logps/rejected": -228.4087371826172,
718
+ "loss": 0.4931,
719
+ "rewards/accuracies": 0.6625000238418579,
720
+ "rewards/chosen": 0.15504886209964752,
721
+ "rewards/margins": 0.025428583845496178,
722
+ "rewards/rejected": 0.1296202689409256,
723
  "step": 210
724
  },
725
  {
726
  "epoch": 0.9190488912636923,
727
+ "grad_norm": 0.5737273097038269,
728
+ "learning_rate": 9.095176494896661e-09,
729
+ "logits/chosen": -1.8023388385772705,
730
+ "logits/rejected": -1.7160924673080444,
731
+ "logps/chosen": -218.32034301757812,
732
+ "logps/ref_chosen": -231.6717071533203,
733
+ "logps/ref_rejected": -236.741943359375,
734
+ "logps/rejected": -225.2128448486328,
735
+ "loss": 0.4933,
736
+ "rewards/accuracies": 0.5874999761581421,
737
+ "rewards/chosen": 0.13351376354694366,
738
+ "rewards/margins": 0.018222931772470474,
739
+ "rewards/rejected": 0.11529083549976349,
740
  "step": 215
741
  },
742
  {
743
  "epoch": 0.9404221212930804,
744
+ "grad_norm": 0.6087775826454163,
745
+ "learning_rate": 4.757967663132689e-09,
746
+ "logits/chosen": -1.833620309829712,
747
+ "logits/rejected": -1.7870299816131592,
748
+ "logps/chosen": -221.86032104492188,
749
+ "logps/ref_chosen": -236.0878448486328,
750
+ "logps/ref_rejected": -230.54141235351562,
751
+ "logps/rejected": -218.8464813232422,
752
+ "loss": 0.4935,
753
+ "rewards/accuracies": 0.6000000238418579,
754
+ "rewards/chosen": 0.14227530360221863,
755
+ "rewards/margins": 0.025325754657387733,
756
+ "rewards/rejected": 0.11694953590631485,
757
  "step": 220
758
  },
759
  {
760
  "epoch": 0.9617953513224686,
761
+ "grad_norm": 0.6274195909500122,
762
+ "learning_rate": 1.8054012944479224e-09,
763
+ "logits/chosen": -1.7650978565216064,
764
+ "logits/rejected": -1.7383601665496826,
765
+ "logps/chosen": -231.64111328125,
766
+ "logps/ref_chosen": -244.44155883789062,
767
+ "logps/ref_rejected": -240.8953094482422,
768
+ "logps/rejected": -230.3839874267578,
769
+ "loss": 0.4932,
770
+ "rewards/accuracies": 0.574999988079071,
771
+ "rewards/chosen": 0.12800416350364685,
772
+ "rewards/margins": 0.022890925407409668,
773
+ "rewards/rejected": 0.10511324554681778,
774
  "step": 225
775
  },
776
  {
777
  "epoch": 0.9831685813518568,
778
+ "grad_norm": 0.5350868105888367,
779
+ "learning_rate": 2.541476501764228e-10,
780
+ "logits/chosen": -1.8503191471099854,
781
+ "logits/rejected": -1.878313660621643,
782
+ "logps/chosen": -206.16665649414062,
783
+ "logps/ref_chosen": -219.6629638671875,
784
+ "logps/ref_rejected": -212.42172241210938,
785
+ "logps/rejected": -200.54551696777344,
786
+ "loss": 0.494,
787
+ "rewards/accuracies": 0.550000011920929,
788
+ "rewards/chosen": 0.13496311008930206,
789
+ "rewards/margins": 0.016201000660657883,
790
+ "rewards/rejected": 0.11876209825277328,
791
  "step": 230
792
  },
793
  {
794
  "epoch": 0.9959925193694897,
795
  "step": 233,
796
  "total_flos": 0.0,
797
+ "train_loss": 0.49642937480124283,
798
+ "train_runtime": 16410.2083,
799
+ "train_samples_per_second": 3.649,
800
  "train_steps_per_second": 0.014
801
  }
802
  ],