gamallo committed on
Commit
42ea7bf
1 Parent(s): 71f9158

Upload 7 files

Browse files
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ac7e7acf92fc5a1ee58fb029355df9e9b627ae91c30477ca7530b2e5edc4b66
3
+ size 627
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "unk_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "bos_token": {
5
+ "__type": "AddedToken",
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "clean_up_tokenization_spaces": true,
13
+ "eos_token": {
14
+ "__type": "AddedToken",
15
+ "content": "<|endoftext|>",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false
20
+ },
21
+ "errors": "replace",
22
+ "model_max_length": 1000000000000000019884624838656,
23
+ "pad_token": null,
24
+ "tokenizer_class": "GPT2Tokenizer",
25
+ "unk_token": {
26
+ "__type": "AddedToken",
27
+ "content": "<|endoftext|>",
28
+ "lstrip": false,
29
+ "normalized": true,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
trainer_state.json ADDED
@@ -0,0 +1,556 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.774057048004438,
5
+ "global_step": 180000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.02,
12
+ "learning_rate": 4.892535079835384e-05,
13
+ "loss": 3.2017,
14
+ "step": 5000
15
+ },
16
+ {
17
+ "epoch": 0.02,
18
+ "eval_accuracy": 0.4179681193308354,
19
+ "eval_loss": 2.969822883605957,
20
+ "eval_runtime": 1929.4803,
21
+ "eval_samples_per_second": 19.496,
22
+ "eval_steps_per_second": 2.437,
23
+ "step": 5000
24
+ },
25
+ {
26
+ "epoch": 0.04,
27
+ "learning_rate": 4.785070159670768e-05,
28
+ "loss": 2.9116,
29
+ "step": 10000
30
+ },
31
+ {
32
+ "epoch": 0.04,
33
+ "eval_accuracy": 0.4297956163670618,
34
+ "eval_loss": 2.8731179237365723,
35
+ "eval_runtime": 1919.4124,
36
+ "eval_samples_per_second": 19.598,
37
+ "eval_steps_per_second": 2.45,
38
+ "step": 10000
39
+ },
40
+ {
41
+ "epoch": 0.06,
42
+ "learning_rate": 4.677605239506152e-05,
43
+ "loss": 2.8412,
44
+ "step": 15000
45
+ },
46
+ {
47
+ "epoch": 0.06,
48
+ "eval_accuracy": 0.4368075611910975,
49
+ "eval_loss": 2.81874942779541,
50
+ "eval_runtime": 2015.9454,
51
+ "eval_samples_per_second": 18.66,
52
+ "eval_steps_per_second": 2.333,
53
+ "step": 15000
54
+ },
55
+ {
56
+ "epoch": 0.09,
57
+ "learning_rate": 4.5701403193415356e-05,
58
+ "loss": 2.7937,
59
+ "step": 20000
60
+ },
61
+ {
62
+ "epoch": 0.09,
63
+ "eval_accuracy": 0.44204520475458375,
64
+ "eval_loss": 2.778646230697632,
65
+ "eval_runtime": 1919.08,
66
+ "eval_samples_per_second": 19.602,
67
+ "eval_steps_per_second": 2.451,
68
+ "step": 20000
69
+ },
70
+ {
71
+ "epoch": 0.11,
72
+ "learning_rate": 4.4626753991769196e-05,
73
+ "loss": 2.7614,
74
+ "step": 25000
75
+ },
76
+ {
77
+ "epoch": 0.11,
78
+ "eval_accuracy": 0.44610035327770187,
79
+ "eval_loss": 2.7471842765808105,
80
+ "eval_runtime": 1919.775,
81
+ "eval_samples_per_second": 19.594,
82
+ "eval_steps_per_second": 2.45,
83
+ "step": 25000
84
+ },
85
+ {
86
+ "epoch": 0.13,
87
+ "learning_rate": 4.355188977427636e-05,
88
+ "loss": 2.7295,
89
+ "step": 30000
90
+ },
91
+ {
92
+ "epoch": 0.13,
93
+ "eval_accuracy": 0.4501440419543679,
94
+ "eval_loss": 2.7196261882781982,
95
+ "eval_runtime": 1928.9026,
96
+ "eval_samples_per_second": 19.502,
97
+ "eval_steps_per_second": 2.438,
98
+ "step": 30000
99
+ },
100
+ {
101
+ "epoch": 0.15,
102
+ "learning_rate": 4.24772405726302e-05,
103
+ "loss": 2.7008,
104
+ "step": 35000
105
+ },
106
+ {
107
+ "epoch": 0.15,
108
+ "eval_accuracy": 0.45296033170252703,
109
+ "eval_loss": 2.6961302757263184,
110
+ "eval_runtime": 1926.8386,
111
+ "eval_samples_per_second": 19.523,
112
+ "eval_steps_per_second": 2.441,
113
+ "step": 35000
114
+ },
115
+ {
116
+ "epoch": 0.17,
117
+ "learning_rate": 4.140237635513737e-05,
118
+ "loss": 2.6827,
119
+ "step": 40000
120
+ },
121
+ {
122
+ "epoch": 0.17,
123
+ "eval_accuracy": 0.4561378015092748,
124
+ "eval_loss": 2.6738195419311523,
125
+ "eval_runtime": 1920.5318,
126
+ "eval_samples_per_second": 19.587,
127
+ "eval_steps_per_second": 2.449,
128
+ "step": 40000
129
+ },
130
+ {
131
+ "epoch": 0.19,
132
+ "learning_rate": 4.032772715349121e-05,
133
+ "loss": 2.6545,
134
+ "step": 45000
135
+ },
136
+ {
137
+ "epoch": 0.19,
138
+ "eval_accuracy": 0.4590067649734393,
139
+ "eval_loss": 2.6545751094818115,
140
+ "eval_runtime": 1933.7083,
141
+ "eval_samples_per_second": 19.453,
142
+ "eval_steps_per_second": 2.432,
143
+ "step": 45000
144
+ },
145
+ {
146
+ "epoch": 0.22,
147
+ "learning_rate": 3.9252862935998384e-05,
148
+ "loss": 2.6419,
149
+ "step": 50000
150
+ },
151
+ {
152
+ "epoch": 0.22,
153
+ "eval_accuracy": 0.4616434651550895,
154
+ "eval_loss": 2.635937452316284,
155
+ "eval_runtime": 1931.385,
156
+ "eval_samples_per_second": 19.477,
157
+ "eval_steps_per_second": 2.435,
158
+ "step": 50000
159
+ },
160
+ {
161
+ "epoch": 0.24,
162
+ "learning_rate": 3.8178213734352224e-05,
163
+ "loss": 2.6208,
164
+ "step": 55000
165
+ },
166
+ {
167
+ "epoch": 0.24,
168
+ "eval_accuracy": 0.4644954077588773,
169
+ "eval_loss": 2.617478847503662,
170
+ "eval_runtime": 1935.6188,
171
+ "eval_samples_per_second": 19.434,
172
+ "eval_steps_per_second": 2.43,
173
+ "step": 55000
174
+ },
175
+ {
176
+ "epoch": 0.26,
177
+ "learning_rate": 3.7103564532706064e-05,
178
+ "loss": 2.604,
179
+ "step": 60000
180
+ },
181
+ {
182
+ "epoch": 0.26,
183
+ "eval_accuracy": 0.4666273030036154,
184
+ "eval_loss": 2.6017534732818604,
185
+ "eval_runtime": 1934.3532,
186
+ "eval_samples_per_second": 19.447,
187
+ "eval_steps_per_second": 2.431,
188
+ "step": 60000
189
+ },
190
+ {
191
+ "epoch": 0.28,
192
+ "learning_rate": 3.60289153310599e-05,
193
+ "loss": 2.5883,
194
+ "step": 65000
195
+ },
196
+ {
197
+ "epoch": 0.28,
198
+ "eval_accuracy": 0.4690363654190064,
199
+ "eval_loss": 2.5858962535858154,
200
+ "eval_runtime": 1937.9051,
201
+ "eval_samples_per_second": 19.411,
202
+ "eval_steps_per_second": 2.427,
203
+ "step": 65000
204
+ },
205
+ {
206
+ "epoch": 0.3,
207
+ "learning_rate": 3.495426612941374e-05,
208
+ "loss": 2.5733,
209
+ "step": 70000
210
+ },
211
+ {
212
+ "epoch": 0.3,
213
+ "eval_accuracy": 0.47110511457104925,
214
+ "eval_loss": 2.5711781978607178,
215
+ "eval_runtime": 1929.1817,
216
+ "eval_samples_per_second": 19.499,
217
+ "eval_steps_per_second": 2.438,
218
+ "step": 70000
219
+ },
220
+ {
221
+ "epoch": 0.32,
222
+ "learning_rate": 3.387940191192091e-05,
223
+ "loss": 2.5603,
224
+ "step": 75000
225
+ },
226
+ {
227
+ "epoch": 0.32,
228
+ "eval_accuracy": 0.4735584312234197,
229
+ "eval_loss": 2.5554990768432617,
230
+ "eval_runtime": 1924.3705,
231
+ "eval_samples_per_second": 19.548,
232
+ "eval_steps_per_second": 2.444,
233
+ "step": 75000
234
+ },
235
+ {
236
+ "epoch": 0.34,
237
+ "learning_rate": 3.280475271027475e-05,
238
+ "loss": 2.5462,
239
+ "step": 80000
240
+ },
241
+ {
242
+ "epoch": 0.34,
243
+ "eval_accuracy": 0.4755388538038284,
244
+ "eval_loss": 2.541663646697998,
245
+ "eval_runtime": 1932.1982,
246
+ "eval_samples_per_second": 19.469,
247
+ "eval_steps_per_second": 2.434,
248
+ "step": 80000
249
+ },
250
+ {
251
+ "epoch": 0.37,
252
+ "learning_rate": 3.173010350862859e-05,
253
+ "loss": 2.5267,
254
+ "step": 85000
255
+ },
256
+ {
257
+ "epoch": 0.37,
258
+ "eval_accuracy": 0.47766700705788817,
259
+ "eval_loss": 2.528374433517456,
260
+ "eval_runtime": 1933.9592,
261
+ "eval_samples_per_second": 19.451,
262
+ "eval_steps_per_second": 2.432,
263
+ "step": 85000
264
+ },
265
+ {
266
+ "epoch": 0.39,
267
+ "learning_rate": 3.0655454306982425e-05,
268
+ "loss": 2.5153,
269
+ "step": 90000
270
+ },
271
+ {
272
+ "epoch": 0.39,
273
+ "eval_accuracy": 0.47989050311610376,
274
+ "eval_loss": 2.514660596847534,
275
+ "eval_runtime": 1929.164,
276
+ "eval_samples_per_second": 19.499,
277
+ "eval_steps_per_second": 2.438,
278
+ "step": 90000
279
+ },
280
+ {
281
+ "epoch": 0.41,
282
+ "learning_rate": 2.9580805105336262e-05,
283
+ "loss": 2.5028,
284
+ "step": 95000
285
+ },
286
+ {
287
+ "epoch": 0.41,
288
+ "eval_accuracy": 0.481903668114947,
289
+ "eval_loss": 2.501615285873413,
290
+ "eval_runtime": 1929.1634,
291
+ "eval_samples_per_second": 19.499,
292
+ "eval_steps_per_second": 2.438,
293
+ "step": 95000
294
+ },
295
+ {
296
+ "epoch": 0.43,
297
+ "learning_rate": 2.8505940887843435e-05,
298
+ "loss": 2.4881,
299
+ "step": 100000
300
+ },
301
+ {
302
+ "epoch": 0.43,
303
+ "eval_accuracy": 0.4839338799602133,
304
+ "eval_loss": 2.488723039627075,
305
+ "eval_runtime": 1930.4001,
306
+ "eval_samples_per_second": 19.487,
307
+ "eval_steps_per_second": 2.436,
308
+ "step": 100000
309
+ },
310
+ {
311
+ "epoch": 0.45,
312
+ "learning_rate": 2.7431291686197276e-05,
313
+ "loss": 2.4758,
314
+ "step": 105000
315
+ },
316
+ {
317
+ "epoch": 0.45,
318
+ "eval_accuracy": 0.48596689829848827,
319
+ "eval_loss": 2.475806951522827,
320
+ "eval_runtime": 1923.9804,
321
+ "eval_samples_per_second": 19.552,
322
+ "eval_steps_per_second": 2.444,
323
+ "step": 105000
324
+ },
325
+ {
326
+ "epoch": 0.47,
327
+ "learning_rate": 2.6356642484551112e-05,
328
+ "loss": 2.4632,
329
+ "step": 110000
330
+ },
331
+ {
332
+ "epoch": 0.47,
333
+ "eval_accuracy": 0.4880078683669545,
334
+ "eval_loss": 2.463679552078247,
335
+ "eval_runtime": 1943.6457,
336
+ "eval_samples_per_second": 19.354,
337
+ "eval_steps_per_second": 2.42,
338
+ "step": 110000
339
+ },
340
+ {
341
+ "epoch": 0.49,
342
+ "learning_rate": 2.5281993282904953e-05,
343
+ "loss": 2.4543,
344
+ "step": 115000
345
+ },
346
+ {
347
+ "epoch": 0.49,
348
+ "eval_accuracy": 0.4898118976645586,
349
+ "eval_loss": 2.4515490531921387,
350
+ "eval_runtime": 1940.3072,
351
+ "eval_samples_per_second": 19.387,
352
+ "eval_steps_per_second": 2.424,
353
+ "step": 115000
354
+ },
355
+ {
356
+ "epoch": 0.52,
357
+ "learning_rate": 2.4206914049565453e-05,
358
+ "loss": 2.4416,
359
+ "step": 120000
360
+ },
361
+ {
362
+ "epoch": 0.52,
363
+ "eval_accuracy": 0.49199028194626443,
364
+ "eval_loss": 2.4388370513916016,
365
+ "eval_runtime": 1928.5903,
366
+ "eval_samples_per_second": 19.505,
367
+ "eval_steps_per_second": 2.439,
368
+ "step": 120000
369
+ },
370
+ {
371
+ "epoch": 0.54,
372
+ "learning_rate": 2.313247986376596e-05,
373
+ "loss": 2.4256,
374
+ "step": 125000
375
+ },
376
+ {
377
+ "epoch": 0.54,
378
+ "eval_accuracy": 0.49386564294117247,
379
+ "eval_loss": 2.4272964000701904,
380
+ "eval_runtime": 1927.1096,
381
+ "eval_samples_per_second": 19.52,
382
+ "eval_steps_per_second": 2.44,
383
+ "step": 125000
384
+ },
385
+ {
386
+ "epoch": 0.56,
387
+ "learning_rate": 2.20578306621198e-05,
388
+ "loss": 2.4166,
389
+ "step": 130000
390
+ },
391
+ {
392
+ "epoch": 0.56,
393
+ "eval_accuracy": 0.4959010260096677,
394
+ "eval_loss": 2.4158496856689453,
395
+ "eval_runtime": 1932.6805,
396
+ "eval_samples_per_second": 19.464,
397
+ "eval_steps_per_second": 2.433,
398
+ "step": 130000
399
+ },
400
+ {
401
+ "epoch": 0.58,
402
+ "learning_rate": 2.0982751428780303e-05,
403
+ "loss": 2.4052,
404
+ "step": 135000
405
+ },
406
+ {
407
+ "epoch": 0.58,
408
+ "eval_accuracy": 0.4977953568184306,
409
+ "eval_loss": 2.4038643836975098,
410
+ "eval_runtime": 1925.4197,
411
+ "eval_samples_per_second": 19.537,
412
+ "eval_steps_per_second": 2.443,
413
+ "step": 135000
414
+ },
415
+ {
416
+ "epoch": 0.6,
417
+ "learning_rate": 1.990831724298081e-05,
418
+ "loss": 2.3936,
419
+ "step": 140000
420
+ },
421
+ {
422
+ "epoch": 0.6,
423
+ "eval_accuracy": 0.49973316228278164,
424
+ "eval_loss": 2.392191171646118,
425
+ "eval_runtime": 1928.0691,
426
+ "eval_samples_per_second": 19.51,
427
+ "eval_steps_per_second": 2.439,
428
+ "step": 140000
429
+ },
430
+ {
431
+ "epoch": 0.62,
432
+ "learning_rate": 1.8833668041334647e-05,
433
+ "loss": 2.379,
434
+ "step": 145000
435
+ },
436
+ {
437
+ "epoch": 0.62,
438
+ "eval_accuracy": 0.5019234221876816,
439
+ "eval_loss": 2.38037371635437,
440
+ "eval_runtime": 1923.4067,
441
+ "eval_samples_per_second": 19.557,
442
+ "eval_steps_per_second": 2.445,
443
+ "step": 145000
444
+ },
445
+ {
446
+ "epoch": 0.65,
447
+ "learning_rate": 1.7759018839688484e-05,
448
+ "loss": 2.3659,
449
+ "step": 150000
450
+ },
451
+ {
452
+ "epoch": 0.65,
453
+ "eval_accuracy": 0.5037539832386363,
454
+ "eval_loss": 2.3692569732666016,
455
+ "eval_runtime": 1925.9734,
456
+ "eval_samples_per_second": 19.531,
457
+ "eval_steps_per_second": 2.442,
458
+ "step": 150000
459
+ },
460
+ {
461
+ "epoch": 0.67,
462
+ "learning_rate": 1.6684154622195657e-05,
463
+ "loss": 2.3588,
464
+ "step": 155000
465
+ },
466
+ {
467
+ "epoch": 0.67,
468
+ "eval_accuracy": 0.5057738006653519,
469
+ "eval_loss": 2.357806444168091,
470
+ "eval_runtime": 1927.4777,
471
+ "eval_samples_per_second": 19.516,
472
+ "eval_steps_per_second": 2.44,
473
+ "step": 155000
474
+ },
475
+ {
476
+ "epoch": 0.69,
477
+ "learning_rate": 1.5609505420549494e-05,
478
+ "loss": 2.3529,
479
+ "step": 160000
480
+ },
481
+ {
482
+ "epoch": 0.69,
483
+ "eval_accuracy": 0.5079495603563737,
484
+ "eval_loss": 2.3464465141296387,
485
+ "eval_runtime": 1926.6683,
486
+ "eval_samples_per_second": 19.524,
487
+ "eval_steps_per_second": 2.441,
488
+ "step": 160000
489
+ },
490
+ {
491
+ "epoch": 0.71,
492
+ "learning_rate": 1.4535071234750002e-05,
493
+ "loss": 2.3338,
494
+ "step": 165000
495
+ },
496
+ {
497
+ "epoch": 0.71,
498
+ "eval_accuracy": 0.5098386939558613,
499
+ "eval_loss": 2.335519313812256,
500
+ "eval_runtime": 1925.6719,
501
+ "eval_samples_per_second": 19.534,
502
+ "eval_steps_per_second": 2.442,
503
+ "step": 165000
504
+ },
505
+ {
506
+ "epoch": 0.73,
507
+ "learning_rate": 1.3460422033103839e-05,
508
+ "loss": 2.3238,
509
+ "step": 170000
510
+ },
511
+ {
512
+ "epoch": 0.73,
513
+ "eval_accuracy": 0.5118405290384843,
514
+ "eval_loss": 2.3250160217285156,
515
+ "eval_runtime": 1925.0126,
516
+ "eval_samples_per_second": 19.541,
517
+ "eval_steps_per_second": 2.443,
518
+ "step": 170000
519
+ },
520
+ {
521
+ "epoch": 0.75,
522
+ "learning_rate": 1.2385557815611011e-05,
523
+ "loss": 2.3139,
524
+ "step": 175000
525
+ },
526
+ {
527
+ "epoch": 0.75,
528
+ "eval_accuracy": 0.5137200997729053,
529
+ "eval_loss": 2.314518451690674,
530
+ "eval_runtime": 1927.2965,
531
+ "eval_samples_per_second": 19.518,
532
+ "eval_steps_per_second": 2.44,
533
+ "step": 175000
534
+ },
535
+ {
536
+ "epoch": 0.77,
537
+ "learning_rate": 1.1311123629811518e-05,
538
+ "loss": 2.3035,
539
+ "step": 180000
540
+ },
541
+ {
542
+ "epoch": 0.77,
543
+ "eval_accuracy": 0.5153991621734844,
544
+ "eval_loss": 2.3050577640533447,
545
+ "eval_runtime": 1927.482,
546
+ "eval_samples_per_second": 19.516,
547
+ "eval_steps_per_second": 2.44,
548
+ "step": 180000
549
+ }
550
+ ],
551
+ "max_steps": 232541,
552
+ "num_train_epochs": 1,
553
+ "total_flos": 2.138588502294528e+19,
554
+ "trial_name": null,
555
+ "trial_params": null
556
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b33031977e3e2319a3bd5e94fe11042151c004762041fcd9d3e2684f231264c
3
+ size 3643
vocab.json ADDED
The diff for this file is too large to render. See raw diff