dzanbek commited on
Commit
06c713e
·
verified ·
1 Parent(s): 30f4177

Training in progress, step 25, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -20,12 +20,12 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "down_proj",
24
  "o_proj",
25
- "k_proj",
26
- "gate_proj",
27
- "up_proj",
28
  "v_proj",
 
 
 
 
29
  "q_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
23
  "o_proj",
 
 
 
24
  "v_proj",
25
+ "up_proj",
26
+ "gate_proj",
27
+ "k_proj",
28
+ "down_proj",
29
  "q_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d0f66c6a47a25f385173d726904c546f03df4d695f3eb12ed84839a91227b0de
3
  size 200068512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5a1958c57fa8a3e075e39de100457523fe983665ef747cd214c2060014226c0
3
  size 200068512
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7406bfe3f0274b2db7be65887c5911aeba64eced0bd24782e7d038f2c70e82c1
3
  size 400361770
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7bab43b300f4949a336e8d5f11c091fb566f7cad43eb09ab5f37c35196c0d97
3
  size 400361770
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0aa299d6b618ba4f73d7d6c4d99f3591b3ed1a9520f2ccb1ca96d7f9f3c7936d
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e87cdd914cacce0bf0f2d3377e1ea0cf41243706dad7f4209963b8c2a7ac5a75
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e69e2b49ea642509f0c688c16fb190b7cf27dac0a18903a5e2d1467d0343d8b8
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:040b95482c646935022d148ebc91f462fbf2195cfa3365adbbe3bd6ca1f35a74
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.04081632653061224,
5
  "eval_steps": 5,
6
- "global_step": 50,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0008163265306122449,
13
- "grad_norm": 0.7370944619178772,
14
  "learning_rate": 2e-05,
15
  "loss": 0.5845,
16
  "step": 1
@@ -18,433 +18,218 @@
18
  {
19
  "epoch": 0.0008163265306122449,
20
  "eval_loss": 0.6241262555122375,
21
- "eval_runtime": 303.1464,
22
- "eval_samples_per_second": 3.404,
23
- "eval_steps_per_second": 1.702,
24
  "step": 1
25
  },
26
  {
27
  "epoch": 0.0016326530612244899,
28
- "grad_norm": 0.5706467628479004,
29
  "learning_rate": 4e-05,
30
  "loss": 0.5365,
31
  "step": 2
32
  },
33
  {
34
  "epoch": 0.0024489795918367346,
35
- "grad_norm": 0.633495569229126,
36
  "learning_rate": 6e-05,
37
- "loss": 0.6398,
38
  "step": 3
39
  },
40
  {
41
  "epoch": 0.0032653061224489797,
42
- "grad_norm": 0.6009469032287598,
43
  "learning_rate": 8e-05,
44
- "loss": 0.5877,
45
  "step": 4
46
  },
47
  {
48
  "epoch": 0.004081632653061225,
49
- "grad_norm": 0.5381947159767151,
50
  "learning_rate": 0.0001,
51
- "loss": 0.5462,
52
  "step": 5
53
  },
54
  {
55
  "epoch": 0.004081632653061225,
56
- "eval_loss": 0.5509580969810486,
57
- "eval_runtime": 305.7326,
58
- "eval_samples_per_second": 3.375,
59
- "eval_steps_per_second": 1.688,
60
  "step": 5
61
  },
62
  {
63
  "epoch": 0.004897959183673469,
64
- "grad_norm": 0.5985301733016968,
65
  "learning_rate": 0.00012,
66
- "loss": 0.5205,
67
  "step": 6
68
  },
69
  {
70
  "epoch": 0.005714285714285714,
71
- "grad_norm": 0.5854198336601257,
72
  "learning_rate": 0.00014,
73
- "loss": 0.4624,
74
  "step": 7
75
  },
76
  {
77
  "epoch": 0.006530612244897959,
78
- "grad_norm": 0.7701842784881592,
79
  "learning_rate": 0.00016,
80
- "loss": 0.4551,
81
  "step": 8
82
  },
83
  {
84
  "epoch": 0.007346938775510204,
85
- "grad_norm": 0.656694769859314,
86
  "learning_rate": 0.00018,
87
- "loss": 0.4233,
88
  "step": 9
89
  },
90
  {
91
  "epoch": 0.00816326530612245,
92
- "grad_norm": 0.8429622054100037,
93
  "learning_rate": 0.0002,
94
- "loss": 0.487,
95
  "step": 10
96
  },
97
  {
98
  "epoch": 0.00816326530612245,
99
- "eval_loss": 0.4371676445007324,
100
- "eval_runtime": 305.7536,
101
- "eval_samples_per_second": 3.375,
102
- "eval_steps_per_second": 1.688,
103
  "step": 10
104
  },
105
  {
106
  "epoch": 0.008979591836734694,
107
- "grad_norm": 0.6876599192619324,
108
  "learning_rate": 0.0001996917333733128,
109
- "loss": 0.4084,
110
  "step": 11
111
  },
112
  {
113
  "epoch": 0.009795918367346938,
114
- "grad_norm": 0.5299339890480042,
115
  "learning_rate": 0.00019876883405951377,
116
- "loss": 0.4388,
117
  "step": 12
118
  },
119
  {
120
  "epoch": 0.010612244897959184,
121
- "grad_norm": 0.5224944353103638,
122
  "learning_rate": 0.00019723699203976766,
123
- "loss": 0.3741,
124
  "step": 13
125
  },
126
  {
127
  "epoch": 0.011428571428571429,
128
- "grad_norm": 0.5660629868507385,
129
  "learning_rate": 0.00019510565162951537,
130
- "loss": 0.3746,
131
  "step": 14
132
  },
133
  {
134
  "epoch": 0.012244897959183673,
135
- "grad_norm": 0.4487418532371521,
136
  "learning_rate": 0.0001923879532511287,
137
- "loss": 0.3684,
138
  "step": 15
139
  },
140
  {
141
  "epoch": 0.012244897959183673,
142
- "eval_loss": 0.3932255804538727,
143
- "eval_runtime": 306.0955,
144
- "eval_samples_per_second": 3.371,
145
- "eval_steps_per_second": 1.686,
146
  "step": 15
147
  },
148
  {
149
  "epoch": 0.013061224489795919,
150
- "grad_norm": 0.4456219971179962,
151
  "learning_rate": 0.0001891006524188368,
152
- "loss": 0.4175,
153
  "step": 16
154
  },
155
  {
156
  "epoch": 0.013877551020408163,
157
- "grad_norm": 0.46617552638053894,
158
  "learning_rate": 0.00018526401643540922,
159
- "loss": 0.3896,
160
  "step": 17
161
  },
162
  {
163
  "epoch": 0.014693877551020407,
164
- "grad_norm": 0.39636608958244324,
165
  "learning_rate": 0.00018090169943749476,
166
- "loss": 0.3915,
167
  "step": 18
168
  },
169
  {
170
  "epoch": 0.015510204081632653,
171
- "grad_norm": 0.42811569571495056,
172
  "learning_rate": 0.0001760405965600031,
173
- "loss": 0.3479,
174
  "step": 19
175
  },
176
  {
177
  "epoch": 0.0163265306122449,
178
- "grad_norm": 0.4594975411891937,
179
  "learning_rate": 0.00017071067811865476,
180
- "loss": 0.3663,
181
  "step": 20
182
  },
183
  {
184
  "epoch": 0.0163265306122449,
185
- "eval_loss": 0.3825583755970001,
186
- "eval_runtime": 305.9194,
187
- "eval_samples_per_second": 3.373,
188
- "eval_steps_per_second": 1.687,
189
  "step": 20
190
  },
191
  {
192
  "epoch": 0.017142857142857144,
193
- "grad_norm": 0.42077338695526123,
194
  "learning_rate": 0.00016494480483301836,
195
- "loss": 0.3459,
196
  "step": 21
197
  },
198
  {
199
  "epoch": 0.017959183673469388,
200
- "grad_norm": 0.448321670293808,
201
  "learning_rate": 0.00015877852522924732,
202
- "loss": 0.3624,
203
  "step": 22
204
  },
205
  {
206
  "epoch": 0.018775510204081632,
207
- "grad_norm": 0.40496572852134705,
208
  "learning_rate": 0.0001522498564715949,
209
- "loss": 0.2865,
210
  "step": 23
211
  },
212
  {
213
  "epoch": 0.019591836734693877,
214
- "grad_norm": 0.4883009195327759,
215
  "learning_rate": 0.00014539904997395468,
216
- "loss": 0.3692,
217
  "step": 24
218
  },
219
  {
220
  "epoch": 0.02040816326530612,
221
- "grad_norm": 0.47144022583961487,
222
  "learning_rate": 0.000138268343236509,
223
- "loss": 0.3643,
224
  "step": 25
225
  },
226
  {
227
  "epoch": 0.02040816326530612,
228
- "eval_loss": 0.3697856366634369,
229
- "eval_runtime": 306.009,
230
- "eval_samples_per_second": 3.372,
231
- "eval_steps_per_second": 1.686,
232
  "step": 25
233
- },
234
- {
235
- "epoch": 0.02122448979591837,
236
- "grad_norm": 0.4498879015445709,
237
- "learning_rate": 0.00013090169943749476,
238
- "loss": 0.3263,
239
- "step": 26
240
- },
241
- {
242
- "epoch": 0.022040816326530613,
243
- "grad_norm": 0.46243250370025635,
244
- "learning_rate": 0.00012334453638559057,
245
- "loss": 0.4412,
246
- "step": 27
247
- },
248
- {
249
- "epoch": 0.022857142857142857,
250
- "grad_norm": 0.48129433393478394,
251
- "learning_rate": 0.0001156434465040231,
252
- "loss": 0.3595,
253
- "step": 28
254
- },
255
- {
256
- "epoch": 0.0236734693877551,
257
- "grad_norm": 0.41295281052589417,
258
- "learning_rate": 0.0001078459095727845,
259
- "loss": 0.3386,
260
- "step": 29
261
- },
262
- {
263
- "epoch": 0.024489795918367346,
264
- "grad_norm": 0.42476311326026917,
265
- "learning_rate": 0.0001,
266
- "loss": 0.342,
267
- "step": 30
268
- },
269
- {
270
- "epoch": 0.024489795918367346,
271
- "eval_loss": 0.3653227388858795,
272
- "eval_runtime": 305.8828,
273
- "eval_samples_per_second": 3.374,
274
- "eval_steps_per_second": 1.687,
275
- "step": 30
276
- },
277
- {
278
- "epoch": 0.025306122448979593,
279
- "grad_norm": 0.40272751450538635,
280
- "learning_rate": 9.215409042721552e-05,
281
- "loss": 0.4082,
282
- "step": 31
283
- },
284
- {
285
- "epoch": 0.026122448979591838,
286
- "grad_norm": 0.38805681467056274,
287
- "learning_rate": 8.435655349597689e-05,
288
- "loss": 0.2459,
289
- "step": 32
290
- },
291
- {
292
- "epoch": 0.026938775510204082,
293
- "grad_norm": 0.4278540015220642,
294
- "learning_rate": 7.66554636144095e-05,
295
- "loss": 0.3213,
296
- "step": 33
297
- },
298
- {
299
- "epoch": 0.027755102040816326,
300
- "grad_norm": 0.574360728263855,
301
- "learning_rate": 6.909830056250527e-05,
302
- "loss": 0.42,
303
- "step": 34
304
- },
305
- {
306
- "epoch": 0.02857142857142857,
307
- "grad_norm": 0.4692784547805786,
308
- "learning_rate": 6.173165676349103e-05,
309
- "loss": 0.3466,
310
- "step": 35
311
- },
312
- {
313
- "epoch": 0.02857142857142857,
314
- "eval_loss": 0.3594253957271576,
315
- "eval_runtime": 305.9204,
316
- "eval_samples_per_second": 3.373,
317
- "eval_steps_per_second": 1.687,
318
- "step": 35
319
- },
320
- {
321
- "epoch": 0.029387755102040815,
322
- "grad_norm": 0.40394076704978943,
323
- "learning_rate": 5.4600950026045326e-05,
324
- "loss": 0.3218,
325
- "step": 36
326
- },
327
- {
328
- "epoch": 0.030204081632653063,
329
- "grad_norm": 0.40415486693382263,
330
- "learning_rate": 4.7750143528405126e-05,
331
- "loss": 0.3449,
332
- "step": 37
333
- },
334
- {
335
- "epoch": 0.031020408163265307,
336
- "grad_norm": 0.5078446865081787,
337
- "learning_rate": 4.12214747707527e-05,
338
- "loss": 0.3804,
339
- "step": 38
340
- },
341
- {
342
- "epoch": 0.03183673469387755,
343
- "grad_norm": 0.4428996443748474,
344
- "learning_rate": 3.5055195166981645e-05,
345
- "loss": 0.4278,
346
- "step": 39
347
- },
348
- {
349
- "epoch": 0.0326530612244898,
350
- "grad_norm": 0.5538510084152222,
351
- "learning_rate": 2.9289321881345254e-05,
352
- "loss": 0.53,
353
- "step": 40
354
- },
355
- {
356
- "epoch": 0.0326530612244898,
357
- "eval_loss": 0.3550981879234314,
358
- "eval_runtime": 305.9407,
359
- "eval_samples_per_second": 3.373,
360
- "eval_steps_per_second": 1.687,
361
- "step": 40
362
- },
363
- {
364
- "epoch": 0.03346938775510204,
365
- "grad_norm": 0.5039154887199402,
366
- "learning_rate": 2.3959403439996907e-05,
367
- "loss": 0.3739,
368
- "step": 41
369
- },
370
- {
371
- "epoch": 0.03428571428571429,
372
- "grad_norm": 0.4173552095890045,
373
- "learning_rate": 1.9098300562505266e-05,
374
- "loss": 0.3184,
375
- "step": 42
376
- },
377
- {
378
- "epoch": 0.03510204081632653,
379
- "grad_norm": 0.3404304087162018,
380
- "learning_rate": 1.4735983564590783e-05,
381
- "loss": 0.3103,
382
- "step": 43
383
- },
384
- {
385
- "epoch": 0.035918367346938776,
386
- "grad_norm": 0.4408942759037018,
387
- "learning_rate": 1.0899347581163221e-05,
388
- "loss": 0.3103,
389
- "step": 44
390
- },
391
- {
392
- "epoch": 0.036734693877551024,
393
- "grad_norm": 0.4228384792804718,
394
- "learning_rate": 7.612046748871327e-06,
395
- "loss": 0.328,
396
- "step": 45
397
- },
398
- {
399
- "epoch": 0.036734693877551024,
400
- "eval_loss": 0.3556883931159973,
401
- "eval_runtime": 306.0718,
402
- "eval_samples_per_second": 3.372,
403
- "eval_steps_per_second": 1.686,
404
- "step": 45
405
- },
406
- {
407
- "epoch": 0.037551020408163265,
408
- "grad_norm": 0.36199188232421875,
409
- "learning_rate": 4.8943483704846475e-06,
410
- "loss": 0.2883,
411
- "step": 46
412
- },
413
- {
414
- "epoch": 0.03836734693877551,
415
- "grad_norm": 0.4111153483390808,
416
- "learning_rate": 2.7630079602323442e-06,
417
- "loss": 0.2967,
418
- "step": 47
419
- },
420
- {
421
- "epoch": 0.03918367346938775,
422
- "grad_norm": 0.3962281346321106,
423
- "learning_rate": 1.231165940486234e-06,
424
- "loss": 0.3148,
425
- "step": 48
426
- },
427
- {
428
- "epoch": 0.04,
429
- "grad_norm": 0.41017216444015503,
430
- "learning_rate": 3.0826662668720364e-07,
431
- "loss": 0.3113,
432
- "step": 49
433
- },
434
- {
435
- "epoch": 0.04081632653061224,
436
- "grad_norm": 0.4169033467769623,
437
- "learning_rate": 0.0,
438
- "loss": 0.3195,
439
- "step": 50
440
- },
441
- {
442
- "epoch": 0.04081632653061224,
443
- "eval_loss": 0.35500943660736084,
444
- "eval_runtime": 306.0822,
445
- "eval_samples_per_second": 3.372,
446
- "eval_steps_per_second": 1.686,
447
- "step": 50
448
  }
449
  ],
450
  "logging_steps": 1,
@@ -459,12 +244,12 @@
459
  "should_evaluate": false,
460
  "should_log": false,
461
  "should_save": true,
462
- "should_training_stop": true
463
  },
464
  "attributes": {}
465
  }
466
  },
467
- "total_flos": 7.707288333503693e+16,
468
  "train_batch_size": 2,
469
  "trial_name": null,
470
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.02040816326530612,
5
  "eval_steps": 5,
6
+ "global_step": 25,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0008163265306122449,
13
+ "grad_norm": 0.7544206976890564,
14
  "learning_rate": 2e-05,
15
  "loss": 0.5845,
16
  "step": 1
 
18
  {
19
  "epoch": 0.0008163265306122449,
20
  "eval_loss": 0.6241262555122375,
21
+ "eval_runtime": 303.7147,
22
+ "eval_samples_per_second": 3.398,
23
+ "eval_steps_per_second": 1.699,
24
  "step": 1
25
  },
26
  {
27
  "epoch": 0.0016326530612244899,
28
+ "grad_norm": 0.5853733420372009,
29
  "learning_rate": 4e-05,
30
  "loss": 0.5365,
31
  "step": 2
32
  },
33
  {
34
  "epoch": 0.0024489795918367346,
35
+ "grad_norm": 0.6549996733665466,
36
  "learning_rate": 6e-05,
37
+ "loss": 0.6425,
38
  "step": 3
39
  },
40
  {
41
  "epoch": 0.0032653061224489797,
42
+ "grad_norm": 0.614408552646637,
43
  "learning_rate": 8e-05,
44
+ "loss": 0.5795,
45
  "step": 4
46
  },
47
  {
48
  "epoch": 0.004081632653061225,
49
+ "grad_norm": 0.5475334525108337,
50
  "learning_rate": 0.0001,
51
+ "loss": 0.5454,
52
  "step": 5
53
  },
54
  {
55
  "epoch": 0.004081632653061225,
56
+ "eval_loss": 0.5514864325523376,
57
+ "eval_runtime": 306.4705,
58
+ "eval_samples_per_second": 3.367,
59
+ "eval_steps_per_second": 1.684,
60
  "step": 5
61
  },
62
  {
63
  "epoch": 0.004897959183673469,
64
+ "grad_norm": 0.6220462918281555,
65
  "learning_rate": 0.00012,
66
+ "loss": 0.5158,
67
  "step": 6
68
  },
69
  {
70
  "epoch": 0.005714285714285714,
71
+ "grad_norm": 0.6376144289970398,
72
  "learning_rate": 0.00014,
73
+ "loss": 0.4644,
74
  "step": 7
75
  },
76
  {
77
  "epoch": 0.006530612244897959,
78
+ "grad_norm": 0.7968347668647766,
79
  "learning_rate": 0.00016,
80
+ "loss": 0.4495,
81
  "step": 8
82
  },
83
  {
84
  "epoch": 0.007346938775510204,
85
+ "grad_norm": 0.6648410558700562,
86
  "learning_rate": 0.00018,
87
+ "loss": 0.4173,
88
  "step": 9
89
  },
90
  {
91
  "epoch": 0.00816326530612245,
92
+ "grad_norm": 0.8778969645500183,
93
  "learning_rate": 0.0002,
94
+ "loss": 0.4878,
95
  "step": 10
96
  },
97
  {
98
  "epoch": 0.00816326530612245,
99
+ "eval_loss": 0.43802592158317566,
100
+ "eval_runtime": 306.6738,
101
+ "eval_samples_per_second": 3.365,
102
+ "eval_steps_per_second": 1.683,
103
  "step": 10
104
  },
105
  {
106
  "epoch": 0.008979591836734694,
107
+ "grad_norm": 0.7390214800834656,
108
  "learning_rate": 0.0001996917333733128,
109
+ "loss": 0.4016,
110
  "step": 11
111
  },
112
  {
113
  "epoch": 0.009795918367346938,
114
+ "grad_norm": 0.5307807922363281,
115
  "learning_rate": 0.00019876883405951377,
116
+ "loss": 0.439,
117
  "step": 12
118
  },
119
  {
120
  "epoch": 0.010612244897959184,
121
+ "grad_norm": 0.515744686126709,
122
  "learning_rate": 0.00019723699203976766,
123
+ "loss": 0.37,
124
  "step": 13
125
  },
126
  {
127
  "epoch": 0.011428571428571429,
128
+ "grad_norm": 0.5767286419868469,
129
  "learning_rate": 0.00019510565162951537,
130
+ "loss": 0.3815,
131
  "step": 14
132
  },
133
  {
134
  "epoch": 0.012244897959183673,
135
+ "grad_norm": 0.4507094919681549,
136
  "learning_rate": 0.0001923879532511287,
137
+ "loss": 0.3675,
138
  "step": 15
139
  },
140
  {
141
  "epoch": 0.012244897959183673,
142
+ "eval_loss": 0.3925292193889618,
143
+ "eval_runtime": 306.4671,
144
+ "eval_samples_per_second": 3.367,
145
+ "eval_steps_per_second": 1.684,
146
  "step": 15
147
  },
148
  {
149
  "epoch": 0.013061224489795919,
150
+ "grad_norm": 0.45942962169647217,
151
  "learning_rate": 0.0001891006524188368,
152
+ "loss": 0.4134,
153
  "step": 16
154
  },
155
  {
156
  "epoch": 0.013877551020408163,
157
+ "grad_norm": 0.4870969355106354,
158
  "learning_rate": 0.00018526401643540922,
159
+ "loss": 0.389,
160
  "step": 17
161
  },
162
  {
163
  "epoch": 0.014693877551020407,
164
+ "grad_norm": 0.4066920876502991,
165
  "learning_rate": 0.00018090169943749476,
166
+ "loss": 0.4022,
167
  "step": 18
168
  },
169
  {
170
  "epoch": 0.015510204081632653,
171
+ "grad_norm": 0.44875219464302063,
172
  "learning_rate": 0.0001760405965600031,
173
+ "loss": 0.3507,
174
  "step": 19
175
  },
176
  {
177
  "epoch": 0.0163265306122449,
178
+ "grad_norm": 0.49346503615379333,
179
  "learning_rate": 0.00017071067811865476,
180
+ "loss": 0.3737,
181
  "step": 20
182
  },
183
  {
184
  "epoch": 0.0163265306122449,
185
+ "eval_loss": 0.38162362575531006,
186
+ "eval_runtime": 306.1047,
187
+ "eval_samples_per_second": 3.371,
188
+ "eval_steps_per_second": 1.686,
189
  "step": 20
190
  },
191
  {
192
  "epoch": 0.017142857142857144,
193
+ "grad_norm": 0.42135682702064514,
194
  "learning_rate": 0.00016494480483301836,
195
+ "loss": 0.3401,
196
  "step": 21
197
  },
198
  {
199
  "epoch": 0.017959183673469388,
200
+ "grad_norm": 0.45447424054145813,
201
  "learning_rate": 0.00015877852522924732,
202
+ "loss": 0.3611,
203
  "step": 22
204
  },
205
  {
206
  "epoch": 0.018775510204081632,
207
+ "grad_norm": 0.4188750982284546,
208
  "learning_rate": 0.0001522498564715949,
209
+ "loss": 0.2922,
210
  "step": 23
211
  },
212
  {
213
  "epoch": 0.019591836734693877,
214
+ "grad_norm": 0.5015071630477905,
215
  "learning_rate": 0.00014539904997395468,
216
+ "loss": 0.3682,
217
  "step": 24
218
  },
219
  {
220
  "epoch": 0.02040816326530612,
221
+ "grad_norm": 0.4825016260147095,
222
  "learning_rate": 0.000138268343236509,
223
+ "loss": 0.367,
224
  "step": 25
225
  },
226
  {
227
  "epoch": 0.02040816326530612,
228
+ "eval_loss": 0.3707207441329956,
229
+ "eval_runtime": 306.4625,
230
+ "eval_samples_per_second": 3.367,
231
+ "eval_steps_per_second": 1.684,
232
  "step": 25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
  }
234
  ],
235
  "logging_steps": 1,
 
244
  "should_evaluate": false,
245
  "should_log": false,
246
  "should_save": true,
247
+ "should_training_stop": false
248
  },
249
  "attributes": {}
250
  }
251
  },
252
+ "total_flos": 3.83447180771328e+16,
253
  "train_batch_size": 2,
254
  "trial_name": null,
255
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ab4f2c6f49088c35f2e82a3466d0efce6faeea0c8f475325fa63f3479bc8ed25
3
  size 6776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe1e904a16c4b011350bab7b53595b43a35f758347a5d7fc1ee6fa65abe253d8
3
  size 6776