KakashiH commited on
Commit
3bc4573
·
verified ·
1 Parent(s): 7c60d89

Upload 8 files

Browse files
config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/vit-base-patch16-224",
3
+ "architectures": [
4
+ "ViTForImageClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.0,
7
+ "encoder_stride": 16,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.0,
10
+ "hidden_size": 768,
11
+ "id2label": {
12
+ "0": "Gatak",
13
+ "1": "Kelihos_ver1",
14
+ "2": "Kelihos_ver3",
15
+ "3": "Lollipop",
16
+ "4": "Obfuscator.ACY",
17
+ "5": "Ramnit",
18
+ "6": "Simda",
19
+ "7": "Tracur",
20
+ "8": "Vundo"
21
+ },
22
+ "image_size": 224,
23
+ "initializer_range": 0.02,
24
+ "intermediate_size": 3072,
25
+ "label2id": {
26
+ "Gatak": 0,
27
+ "Kelihos_ver1": 1,
28
+ "Kelihos_ver3": 2,
29
+ "Lollipop": 3,
30
+ "Obfuscator.ACY": 4,
31
+ "Ramnit": 5,
32
+ "Simda": 6,
33
+ "Tracur": 7,
34
+ "Vundo": 8
35
+ },
36
+ "layer_norm_eps": 1e-12,
37
+ "model_type": "vit",
38
+ "num_attention_heads": 12,
39
+ "num_channels": 3,
40
+ "num_hidden_layers": 12,
41
+ "patch_size": 16,
42
+ "problem_type": "single_label_classification",
43
+ "qkv_bias": true,
44
+ "torch_dtype": "float32",
45
+ "transformers_version": "4.38.2"
46
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d639cfa6c0c0a6dff1bc1dc022fa0318fa4467a430429788e3e7f6ce1274c8fe
3
+ size 343245508
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:149980f70f4056542cce52de73abd4d087fd4b61c033dd4d6bad54dca6eabe68
3
+ size 686611898
preprocessor_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "do_rescale": true,
4
+ "do_resize": true,
5
+ "image_mean": [
6
+ 0.5,
7
+ 0.5,
8
+ 0.5
9
+ ],
10
+ "image_processor_type": "ViTImageProcessor",
11
+ "image_std": [
12
+ 0.5,
13
+ 0.5,
14
+ 0.5
15
+ ],
16
+ "resample": 2,
17
+ "rescale_factor": 0.00392156862745098,
18
+ "size": {
19
+ "height": 224,
20
+ "width": 224
21
+ }
22
+ }
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d245e05e72192c132e0f2edb6fdcae0c578c890f0fe912f17ec7b0bba2d38cc3
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27a2701b91920a744cb212eaaa2227a252769180fa2250531b99f54a7a77e59c
3
+ size 1064
trainer_state.json ADDED
@@ -0,0 +1,682 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.983640081799591,
3
+ "best_model_checkpoint": "/home/user/Desktop/ViT/microsoft/vit_finetuned/checkpoint-4405",
4
+ "epoch": 5.0,
5
+ "eval_steps": 500,
6
+ "global_step": 4405,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.06,
13
+ "grad_norm": 7.11489200592041,
14
+ "learning_rate": 1.9772985244040862e-05,
15
+ "loss": 1.3281,
16
+ "step": 50
17
+ },
18
+ {
19
+ "epoch": 0.11,
20
+ "grad_norm": 7.982043266296387,
21
+ "learning_rate": 1.9545970488081726e-05,
22
+ "loss": 0.855,
23
+ "step": 100
24
+ },
25
+ {
26
+ "epoch": 0.17,
27
+ "grad_norm": 10.429923057556152,
28
+ "learning_rate": 1.9318955732122587e-05,
29
+ "loss": 0.5551,
30
+ "step": 150
31
+ },
32
+ {
33
+ "epoch": 0.23,
34
+ "grad_norm": 8.25513744354248,
35
+ "learning_rate": 1.909194097616345e-05,
36
+ "loss": 0.4069,
37
+ "step": 200
38
+ },
39
+ {
40
+ "epoch": 0.28,
41
+ "grad_norm": 8.956159591674805,
42
+ "learning_rate": 1.8864926220204315e-05,
43
+ "loss": 0.3274,
44
+ "step": 250
45
+ },
46
+ {
47
+ "epoch": 0.34,
48
+ "grad_norm": 1.4499558210372925,
49
+ "learning_rate": 1.863791146424518e-05,
50
+ "loss": 0.2836,
51
+ "step": 300
52
+ },
53
+ {
54
+ "epoch": 0.4,
55
+ "grad_norm": 8.211179733276367,
56
+ "learning_rate": 1.841089670828604e-05,
57
+ "loss": 0.2992,
58
+ "step": 350
59
+ },
60
+ {
61
+ "epoch": 0.45,
62
+ "grad_norm": 8.798752784729004,
63
+ "learning_rate": 1.8183881952326903e-05,
64
+ "loss": 0.2276,
65
+ "step": 400
66
+ },
67
+ {
68
+ "epoch": 0.51,
69
+ "grad_norm": 0.3845123052597046,
70
+ "learning_rate": 1.7956867196367764e-05,
71
+ "loss": 0.2877,
72
+ "step": 450
73
+ },
74
+ {
75
+ "epoch": 0.57,
76
+ "grad_norm": 15.287618637084961,
77
+ "learning_rate": 1.7729852440408628e-05,
78
+ "loss": 0.3295,
79
+ "step": 500
80
+ },
81
+ {
82
+ "epoch": 0.62,
83
+ "grad_norm": 19.116310119628906,
84
+ "learning_rate": 1.750283768444949e-05,
85
+ "loss": 0.2301,
86
+ "step": 550
87
+ },
88
+ {
89
+ "epoch": 0.68,
90
+ "grad_norm": 8.03623104095459,
91
+ "learning_rate": 1.7275822928490352e-05,
92
+ "loss": 0.1888,
93
+ "step": 600
94
+ },
95
+ {
96
+ "epoch": 0.74,
97
+ "grad_norm": 6.62972354888916,
98
+ "learning_rate": 1.7048808172531216e-05,
99
+ "loss": 0.2183,
100
+ "step": 650
101
+ },
102
+ {
103
+ "epoch": 0.79,
104
+ "grad_norm": 2.2878506183624268,
105
+ "learning_rate": 1.6821793416572077e-05,
106
+ "loss": 0.1921,
107
+ "step": 700
108
+ },
109
+ {
110
+ "epoch": 0.85,
111
+ "grad_norm": 11.028101921081543,
112
+ "learning_rate": 1.659477866061294e-05,
113
+ "loss": 0.2508,
114
+ "step": 750
115
+ },
116
+ {
117
+ "epoch": 0.91,
118
+ "grad_norm": 19.02913475036621,
119
+ "learning_rate": 1.6367763904653805e-05,
120
+ "loss": 0.2718,
121
+ "step": 800
122
+ },
123
+ {
124
+ "epoch": 0.96,
125
+ "grad_norm": 2.5358214378356934,
126
+ "learning_rate": 1.6140749148694666e-05,
127
+ "loss": 0.1693,
128
+ "step": 850
129
+ },
130
+ {
131
+ "epoch": 1.0,
132
+ "eval_accuracy": 0.9611451942740287,
133
+ "eval_loss": 0.14811581373214722,
134
+ "eval_runtime": 75.4555,
135
+ "eval_samples_per_second": 12.961,
136
+ "eval_steps_per_second": 3.247,
137
+ "step": 881
138
+ },
139
+ {
140
+ "epoch": 1.02,
141
+ "grad_norm": 0.04142925515770912,
142
+ "learning_rate": 1.591373439273553e-05,
143
+ "loss": 0.0968,
144
+ "step": 900
145
+ },
146
+ {
147
+ "epoch": 1.08,
148
+ "grad_norm": 12.202880859375,
149
+ "learning_rate": 1.568671963677639e-05,
150
+ "loss": 0.1501,
151
+ "step": 950
152
+ },
153
+ {
154
+ "epoch": 1.14,
155
+ "grad_norm": 0.7614470720291138,
156
+ "learning_rate": 1.5459704880817254e-05,
157
+ "loss": 0.1543,
158
+ "step": 1000
159
+ },
160
+ {
161
+ "epoch": 1.19,
162
+ "grad_norm": 13.012730598449707,
163
+ "learning_rate": 1.5232690124858116e-05,
164
+ "loss": 0.2138,
165
+ "step": 1050
166
+ },
167
+ {
168
+ "epoch": 1.25,
169
+ "grad_norm": 25.198537826538086,
170
+ "learning_rate": 1.500567536889898e-05,
171
+ "loss": 0.1441,
172
+ "step": 1100
173
+ },
174
+ {
175
+ "epoch": 1.31,
176
+ "grad_norm": 10.802690505981445,
177
+ "learning_rate": 1.4778660612939841e-05,
178
+ "loss": 0.1082,
179
+ "step": 1150
180
+ },
181
+ {
182
+ "epoch": 1.36,
183
+ "grad_norm": 32.39842224121094,
184
+ "learning_rate": 1.4551645856980705e-05,
185
+ "loss": 0.1193,
186
+ "step": 1200
187
+ },
188
+ {
189
+ "epoch": 1.42,
190
+ "grad_norm": 0.05127358064055443,
191
+ "learning_rate": 1.4324631101021567e-05,
192
+ "loss": 0.1431,
193
+ "step": 1250
194
+ },
195
+ {
196
+ "epoch": 1.48,
197
+ "grad_norm": 15.374510765075684,
198
+ "learning_rate": 1.409761634506243e-05,
199
+ "loss": 0.1306,
200
+ "step": 1300
201
+ },
202
+ {
203
+ "epoch": 1.53,
204
+ "grad_norm": 22.718175888061523,
205
+ "learning_rate": 1.3870601589103292e-05,
206
+ "loss": 0.1379,
207
+ "step": 1350
208
+ },
209
+ {
210
+ "epoch": 1.59,
211
+ "grad_norm": 0.05202079191803932,
212
+ "learning_rate": 1.3643586833144156e-05,
213
+ "loss": 0.1174,
214
+ "step": 1400
215
+ },
216
+ {
217
+ "epoch": 1.65,
218
+ "grad_norm": 0.06224975362420082,
219
+ "learning_rate": 1.3416572077185016e-05,
220
+ "loss": 0.1303,
221
+ "step": 1450
222
+ },
223
+ {
224
+ "epoch": 1.7,
225
+ "grad_norm": 11.104813575744629,
226
+ "learning_rate": 1.318955732122588e-05,
227
+ "loss": 0.1536,
228
+ "step": 1500
229
+ },
230
+ {
231
+ "epoch": 1.76,
232
+ "grad_norm": 9.409710884094238,
233
+ "learning_rate": 1.2962542565266743e-05,
234
+ "loss": 0.1594,
235
+ "step": 1550
236
+ },
237
+ {
238
+ "epoch": 1.82,
239
+ "grad_norm": 1.2388224601745605,
240
+ "learning_rate": 1.2735527809307607e-05,
241
+ "loss": 0.0914,
242
+ "step": 1600
243
+ },
244
+ {
245
+ "epoch": 1.87,
246
+ "grad_norm": 12.70537281036377,
247
+ "learning_rate": 1.2508513053348467e-05,
248
+ "loss": 0.1253,
249
+ "step": 1650
250
+ },
251
+ {
252
+ "epoch": 1.93,
253
+ "grad_norm": 19.78186798095703,
254
+ "learning_rate": 1.2281498297389331e-05,
255
+ "loss": 0.12,
256
+ "step": 1700
257
+ },
258
+ {
259
+ "epoch": 1.99,
260
+ "grad_norm": 1.3154646158218384,
261
+ "learning_rate": 1.2054483541430193e-05,
262
+ "loss": 0.1244,
263
+ "step": 1750
264
+ },
265
+ {
266
+ "epoch": 2.0,
267
+ "eval_accuracy": 0.9693251533742331,
268
+ "eval_loss": 0.11500896513462067,
269
+ "eval_runtime": 73.5646,
270
+ "eval_samples_per_second": 13.294,
271
+ "eval_steps_per_second": 3.33,
272
+ "step": 1762
273
+ },
274
+ {
275
+ "epoch": 2.04,
276
+ "grad_norm": 0.14741405844688416,
277
+ "learning_rate": 1.1827468785471057e-05,
278
+ "loss": 0.0995,
279
+ "step": 1800
280
+ },
281
+ {
282
+ "epoch": 2.1,
283
+ "grad_norm": 0.8733153939247131,
284
+ "learning_rate": 1.1600454029511918e-05,
285
+ "loss": 0.041,
286
+ "step": 1850
287
+ },
288
+ {
289
+ "epoch": 2.16,
290
+ "grad_norm": 0.20158135890960693,
291
+ "learning_rate": 1.1373439273552782e-05,
292
+ "loss": 0.025,
293
+ "step": 1900
294
+ },
295
+ {
296
+ "epoch": 2.21,
297
+ "grad_norm": 0.3511994779109955,
298
+ "learning_rate": 1.1146424517593644e-05,
299
+ "loss": 0.1116,
300
+ "step": 1950
301
+ },
302
+ {
303
+ "epoch": 2.27,
304
+ "grad_norm": 0.01977057382464409,
305
+ "learning_rate": 1.0919409761634507e-05,
306
+ "loss": 0.0746,
307
+ "step": 2000
308
+ },
309
+ {
310
+ "epoch": 2.33,
311
+ "grad_norm": 0.0061535644344985485,
312
+ "learning_rate": 1.0692395005675369e-05,
313
+ "loss": 0.0586,
314
+ "step": 2050
315
+ },
316
+ {
317
+ "epoch": 2.38,
318
+ "grad_norm": 0.009720620699226856,
319
+ "learning_rate": 1.0465380249716233e-05,
320
+ "loss": 0.0499,
321
+ "step": 2100
322
+ },
323
+ {
324
+ "epoch": 2.44,
325
+ "grad_norm": 0.07518770545721054,
326
+ "learning_rate": 1.0238365493757093e-05,
327
+ "loss": 0.0497,
328
+ "step": 2150
329
+ },
330
+ {
331
+ "epoch": 2.5,
332
+ "grad_norm": 0.17679017782211304,
333
+ "learning_rate": 1.0011350737797957e-05,
334
+ "loss": 0.1088,
335
+ "step": 2200
336
+ },
337
+ {
338
+ "epoch": 2.55,
339
+ "grad_norm": 0.010072373785078526,
340
+ "learning_rate": 9.784335981838821e-06,
341
+ "loss": 0.0475,
342
+ "step": 2250
343
+ },
344
+ {
345
+ "epoch": 2.61,
346
+ "grad_norm": 13.180130004882812,
347
+ "learning_rate": 9.557321225879684e-06,
348
+ "loss": 0.0702,
349
+ "step": 2300
350
+ },
351
+ {
352
+ "epoch": 2.67,
353
+ "grad_norm": 10.65009880065918,
354
+ "learning_rate": 9.330306469920546e-06,
355
+ "loss": 0.0956,
356
+ "step": 2350
357
+ },
358
+ {
359
+ "epoch": 2.72,
360
+ "grad_norm": 0.06876125931739807,
361
+ "learning_rate": 9.103291713961408e-06,
362
+ "loss": 0.1123,
363
+ "step": 2400
364
+ },
365
+ {
366
+ "epoch": 2.78,
367
+ "grad_norm": 0.03692874684929848,
368
+ "learning_rate": 8.87627695800227e-06,
369
+ "loss": 0.0345,
370
+ "step": 2450
371
+ },
372
+ {
373
+ "epoch": 2.84,
374
+ "grad_norm": 30.104206085205078,
375
+ "learning_rate": 8.649262202043135e-06,
376
+ "loss": 0.0515,
377
+ "step": 2500
378
+ },
379
+ {
380
+ "epoch": 2.89,
381
+ "grad_norm": 3.871462345123291,
382
+ "learning_rate": 8.422247446083997e-06,
383
+ "loss": 0.0661,
384
+ "step": 2550
385
+ },
386
+ {
387
+ "epoch": 2.95,
388
+ "grad_norm": 0.03757273405790329,
389
+ "learning_rate": 8.195232690124859e-06,
390
+ "loss": 0.0692,
391
+ "step": 2600
392
+ },
393
+ {
394
+ "epoch": 3.0,
395
+ "eval_accuracy": 0.9785276073619632,
396
+ "eval_loss": 0.11215907335281372,
397
+ "eval_runtime": 73.8279,
398
+ "eval_samples_per_second": 13.247,
399
+ "eval_steps_per_second": 3.319,
400
+ "step": 2643
401
+ },
402
+ {
403
+ "epoch": 3.01,
404
+ "grad_norm": 0.36702996492385864,
405
+ "learning_rate": 7.968217934165721e-06,
406
+ "loss": 0.0561,
407
+ "step": 2650
408
+ },
409
+ {
410
+ "epoch": 3.06,
411
+ "grad_norm": 0.0126617681235075,
412
+ "learning_rate": 7.741203178206584e-06,
413
+ "loss": 0.039,
414
+ "step": 2700
415
+ },
416
+ {
417
+ "epoch": 3.12,
418
+ "grad_norm": 0.009421919472515583,
419
+ "learning_rate": 7.514188422247447e-06,
420
+ "loss": 0.0243,
421
+ "step": 2750
422
+ },
423
+ {
424
+ "epoch": 3.18,
425
+ "grad_norm": 0.09766176342964172,
426
+ "learning_rate": 7.28717366628831e-06,
427
+ "loss": 0.0299,
428
+ "step": 2800
429
+ },
430
+ {
431
+ "epoch": 3.23,
432
+ "grad_norm": 0.0054335640743374825,
433
+ "learning_rate": 7.060158910329172e-06,
434
+ "loss": 0.0066,
435
+ "step": 2850
436
+ },
437
+ {
438
+ "epoch": 3.29,
439
+ "grad_norm": 0.046133432537317276,
440
+ "learning_rate": 6.833144154370035e-06,
441
+ "loss": 0.0275,
442
+ "step": 2900
443
+ },
444
+ {
445
+ "epoch": 3.35,
446
+ "grad_norm": 0.003417497966438532,
447
+ "learning_rate": 6.606129398410898e-06,
448
+ "loss": 0.0139,
449
+ "step": 2950
450
+ },
451
+ {
452
+ "epoch": 3.41,
453
+ "grad_norm": 0.009167753159999847,
454
+ "learning_rate": 6.37911464245176e-06,
455
+ "loss": 0.0553,
456
+ "step": 3000
457
+ },
458
+ {
459
+ "epoch": 3.46,
460
+ "grad_norm": 0.03221438080072403,
461
+ "learning_rate": 6.152099886492623e-06,
462
+ "loss": 0.0601,
463
+ "step": 3050
464
+ },
465
+ {
466
+ "epoch": 3.52,
467
+ "grad_norm": 15.037586212158203,
468
+ "learning_rate": 5.925085130533485e-06,
469
+ "loss": 0.0343,
470
+ "step": 3100
471
+ },
472
+ {
473
+ "epoch": 3.58,
474
+ "grad_norm": 0.18505249917507172,
475
+ "learning_rate": 5.6980703745743485e-06,
476
+ "loss": 0.0539,
477
+ "step": 3150
478
+ },
479
+ {
480
+ "epoch": 3.63,
481
+ "grad_norm": 24.898054122924805,
482
+ "learning_rate": 5.471055618615211e-06,
483
+ "loss": 0.0455,
484
+ "step": 3200
485
+ },
486
+ {
487
+ "epoch": 3.69,
488
+ "grad_norm": 7.430636882781982,
489
+ "learning_rate": 5.244040862656073e-06,
490
+ "loss": 0.0502,
491
+ "step": 3250
492
+ },
493
+ {
494
+ "epoch": 3.75,
495
+ "grad_norm": 0.005094760097563267,
496
+ "learning_rate": 5.017026106696936e-06,
497
+ "loss": 0.0114,
498
+ "step": 3300
499
+ },
500
+ {
501
+ "epoch": 3.8,
502
+ "grad_norm": 0.03962313383817673,
503
+ "learning_rate": 4.7900113507377985e-06,
504
+ "loss": 0.0173,
505
+ "step": 3350
506
+ },
507
+ {
508
+ "epoch": 3.86,
509
+ "grad_norm": 0.7573426961898804,
510
+ "learning_rate": 4.562996594778661e-06,
511
+ "loss": 0.0185,
512
+ "step": 3400
513
+ },
514
+ {
515
+ "epoch": 3.92,
516
+ "grad_norm": 0.03129027783870697,
517
+ "learning_rate": 4.335981838819524e-06,
518
+ "loss": 0.0262,
519
+ "step": 3450
520
+ },
521
+ {
522
+ "epoch": 3.97,
523
+ "grad_norm": 0.18763615190982819,
524
+ "learning_rate": 4.108967082860386e-06,
525
+ "loss": 0.0326,
526
+ "step": 3500
527
+ },
528
+ {
529
+ "epoch": 4.0,
530
+ "eval_accuracy": 0.9775051124744376,
531
+ "eval_loss": 0.09212471544742584,
532
+ "eval_runtime": 73.5389,
533
+ "eval_samples_per_second": 13.299,
534
+ "eval_steps_per_second": 3.332,
535
+ "step": 3524
536
+ },
537
+ {
538
+ "epoch": 4.03,
539
+ "grad_norm": 0.052001625299453735,
540
+ "learning_rate": 3.8819523269012485e-06,
541
+ "loss": 0.0174,
542
+ "step": 3550
543
+ },
544
+ {
545
+ "epoch": 4.09,
546
+ "grad_norm": 0.017523808404803276,
547
+ "learning_rate": 3.6549375709421116e-06,
548
+ "loss": 0.0081,
549
+ "step": 3600
550
+ },
551
+ {
552
+ "epoch": 4.14,
553
+ "grad_norm": 0.007012099493294954,
554
+ "learning_rate": 3.427922814982974e-06,
555
+ "loss": 0.0152,
556
+ "step": 3650
557
+ },
558
+ {
559
+ "epoch": 4.2,
560
+ "grad_norm": 0.01468530111014843,
561
+ "learning_rate": 3.2009080590238366e-06,
562
+ "loss": 0.0098,
563
+ "step": 3700
564
+ },
565
+ {
566
+ "epoch": 4.26,
567
+ "grad_norm": 0.160551518201828,
568
+ "learning_rate": 2.9738933030646993e-06,
569
+ "loss": 0.01,
570
+ "step": 3750
571
+ },
572
+ {
573
+ "epoch": 4.31,
574
+ "grad_norm": 0.006342815700918436,
575
+ "learning_rate": 2.746878547105562e-06,
576
+ "loss": 0.001,
577
+ "step": 3800
578
+ },
579
+ {
580
+ "epoch": 4.37,
581
+ "grad_norm": 0.011349351145327091,
582
+ "learning_rate": 2.5198637911464247e-06,
583
+ "loss": 0.0135,
584
+ "step": 3850
585
+ },
586
+ {
587
+ "epoch": 4.43,
588
+ "grad_norm": 0.004315485712140799,
589
+ "learning_rate": 2.2928490351872874e-06,
590
+ "loss": 0.0061,
591
+ "step": 3900
592
+ },
593
+ {
594
+ "epoch": 4.48,
595
+ "grad_norm": 0.25492075085639954,
596
+ "learning_rate": 2.06583427922815e-06,
597
+ "loss": 0.0122,
598
+ "step": 3950
599
+ },
600
+ {
601
+ "epoch": 4.54,
602
+ "grad_norm": 0.00799469742923975,
603
+ "learning_rate": 1.8388195232690126e-06,
604
+ "loss": 0.0036,
605
+ "step": 4000
606
+ },
607
+ {
608
+ "epoch": 4.6,
609
+ "grad_norm": 0.12055113166570663,
610
+ "learning_rate": 1.6118047673098751e-06,
611
+ "loss": 0.004,
612
+ "step": 4050
613
+ },
614
+ {
615
+ "epoch": 4.65,
616
+ "grad_norm": 0.014251478016376495,
617
+ "learning_rate": 1.3847900113507379e-06,
618
+ "loss": 0.0191,
619
+ "step": 4100
620
+ },
621
+ {
622
+ "epoch": 4.71,
623
+ "grad_norm": 0.007032675202935934,
624
+ "learning_rate": 1.1577752553916006e-06,
625
+ "loss": 0.0106,
626
+ "step": 4150
627
+ },
628
+ {
629
+ "epoch": 4.77,
630
+ "grad_norm": 0.007455560844391584,
631
+ "learning_rate": 9.307604994324632e-07,
632
+ "loss": 0.0081,
633
+ "step": 4200
634
+ },
635
+ {
636
+ "epoch": 4.82,
637
+ "grad_norm": 0.06628051400184631,
638
+ "learning_rate": 7.037457434733258e-07,
639
+ "loss": 0.0038,
640
+ "step": 4250
641
+ },
642
+ {
643
+ "epoch": 4.88,
644
+ "grad_norm": 0.004159100819379091,
645
+ "learning_rate": 4.7673098751418843e-07,
646
+ "loss": 0.0134,
647
+ "step": 4300
648
+ },
649
+ {
650
+ "epoch": 4.94,
651
+ "grad_norm": 0.0036380901001393795,
652
+ "learning_rate": 2.497162315550511e-07,
653
+ "loss": 0.0211,
654
+ "step": 4350
655
+ },
656
+ {
657
+ "epoch": 4.99,
658
+ "grad_norm": 0.0037857021670788527,
659
+ "learning_rate": 2.2701475595913736e-08,
660
+ "loss": 0.0235,
661
+ "step": 4400
662
+ },
663
+ {
664
+ "epoch": 5.0,
665
+ "eval_accuracy": 0.983640081799591,
666
+ "eval_loss": 0.09139783680438995,
667
+ "eval_runtime": 73.8525,
668
+ "eval_samples_per_second": 13.243,
669
+ "eval_steps_per_second": 3.317,
670
+ "step": 4405
671
+ }
672
+ ],
673
+ "logging_steps": 50,
674
+ "max_steps": 4405,
675
+ "num_input_tokens_seen": 0,
676
+ "num_train_epochs": 5,
677
+ "save_steps": 500,
678
+ "total_flos": 3.4106364287028634e+18,
679
+ "train_batch_size": 10,
680
+ "trial_name": null,
681
+ "trial_params": null
682
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b03b8afadef574f31977970ff75a155cb4dd85cd7dea9d2a032c204d80d5c2d9
3
+ size 4856