franfj commited on
Commit
3e95f6a
·
verified ·
1 Parent(s): b957777

Upload folder using huggingface_hub

Browse files
checkpoint-3993/config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "distilbert/distilbert-base-multilingual-cased",
3
+ "activation": "gelu",
4
+ "architectures": [
5
+ "DistilBertForSequenceClassification"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "dim": 768,
9
+ "dropout": 0.1,
10
+ "hidden_dim": 3072,
11
+ "initializer_range": 0.02,
12
+ "max_position_embeddings": 512,
13
+ "model_type": "distilbert",
14
+ "n_heads": 12,
15
+ "n_layers": 6,
16
+ "output_past": true,
17
+ "pad_token_id": 0,
18
+ "problem_type": "single_label_classification",
19
+ "qa_dropout": 0.1,
20
+ "seq_classif_dropout": 0.2,
21
+ "sinusoidal_pos_embds": false,
22
+ "tie_weights_": true,
23
+ "torch_dtype": "float32",
24
+ "transformers_version": "4.48.3",
25
+ "vocab_size": 119547
26
+ }
checkpoint-3993/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ea9b5e7183bbf5cf8fa18a035400a551437fae62101ab6a9f9b5deb3b2f5b8c
3
+ size 541317368
checkpoint-3993/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:790a47a1e084ea24a5141b94cd9e7642464dcde31589e665983af95a17982f9d
3
+ size 1082696890
checkpoint-3993/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f77155ef51b39b582fee6b7532a11655f8bbce22f1a1541c851f69568820fae
3
+ size 14244
checkpoint-3993/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c9a100e2f2b7b8fc6eaa82083fc485d0bcbe8913f96de28b5d46bbcaee0f201
3
+ size 1064
checkpoint-3993/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
checkpoint-3993/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-3993/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": false,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "strip_accents": null,
53
+ "tokenize_chinese_chars": true,
54
+ "tokenizer_class": "DistilBertTokenizer",
55
+ "unk_token": "[UNK]"
56
+ }
checkpoint-3993/trainer_state.json ADDED
@@ -0,0 +1,616 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "eval_steps": 500,
6
+ "global_step": 3993,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.037565740045078885,
13
+ "grad_norm": 1.0086880922317505,
14
+ "learning_rate": 4.937390433258202e-05,
15
+ "loss": 0.6372,
16
+ "step": 50
17
+ },
18
+ {
19
+ "epoch": 0.07513148009015777,
20
+ "grad_norm": 2.3579623699188232,
21
+ "learning_rate": 4.874780866516404e-05,
22
+ "loss": 0.5979,
23
+ "step": 100
24
+ },
25
+ {
26
+ "epoch": 0.11269722013523667,
27
+ "grad_norm": 1.6555202007293701,
28
+ "learning_rate": 4.812171299774606e-05,
29
+ "loss": 0.5668,
30
+ "step": 150
31
+ },
32
+ {
33
+ "epoch": 0.15026296018031554,
34
+ "grad_norm": 2.134246587753296,
35
+ "learning_rate": 4.749561733032808e-05,
36
+ "loss": 0.5677,
37
+ "step": 200
38
+ },
39
+ {
40
+ "epoch": 0.18782870022539444,
41
+ "grad_norm": 1.2070592641830444,
42
+ "learning_rate": 4.6869521662910095e-05,
43
+ "loss": 0.5519,
44
+ "step": 250
45
+ },
46
+ {
47
+ "epoch": 0.22539444027047334,
48
+ "grad_norm": 1.7169790267944336,
49
+ "learning_rate": 4.6243425995492114e-05,
50
+ "loss": 0.5403,
51
+ "step": 300
52
+ },
53
+ {
54
+ "epoch": 0.26296018031555224,
55
+ "grad_norm": 2.367072105407715,
56
+ "learning_rate": 4.561733032807413e-05,
57
+ "loss": 0.5375,
58
+ "step": 350
59
+ },
60
+ {
61
+ "epoch": 0.3005259203606311,
62
+ "grad_norm": 1.7623053789138794,
63
+ "learning_rate": 4.499123466065615e-05,
64
+ "loss": 0.5425,
65
+ "step": 400
66
+ },
67
+ {
68
+ "epoch": 0.33809166040571,
69
+ "grad_norm": 1.7148823738098145,
70
+ "learning_rate": 4.436513899323817e-05,
71
+ "loss": 0.5454,
72
+ "step": 450
73
+ },
74
+ {
75
+ "epoch": 0.3756574004507889,
76
+ "grad_norm": 1.318366527557373,
77
+ "learning_rate": 4.373904332582019e-05,
78
+ "loss": 0.5353,
79
+ "step": 500
80
+ },
81
+ {
82
+ "epoch": 0.4132231404958678,
83
+ "grad_norm": 2.0299246311187744,
84
+ "learning_rate": 4.311294765840221e-05,
85
+ "loss": 0.5296,
86
+ "step": 550
87
+ },
88
+ {
89
+ "epoch": 0.4507888805409467,
90
+ "grad_norm": 1.5541517734527588,
91
+ "learning_rate": 4.2486851990984225e-05,
92
+ "loss": 0.5433,
93
+ "step": 600
94
+ },
95
+ {
96
+ "epoch": 0.4883546205860255,
97
+ "grad_norm": 1.6975432634353638,
98
+ "learning_rate": 4.1860756323566244e-05,
99
+ "loss": 0.5142,
100
+ "step": 650
101
+ },
102
+ {
103
+ "epoch": 0.5259203606311045,
104
+ "grad_norm": 1.6241447925567627,
105
+ "learning_rate": 4.123466065614826e-05,
106
+ "loss": 0.5139,
107
+ "step": 700
108
+ },
109
+ {
110
+ "epoch": 0.5634861006761833,
111
+ "grad_norm": 1.4247183799743652,
112
+ "learning_rate": 4.060856498873028e-05,
113
+ "loss": 0.5102,
114
+ "step": 750
115
+ },
116
+ {
117
+ "epoch": 0.6010518407212622,
118
+ "grad_norm": 2.266134023666382,
119
+ "learning_rate": 3.99824693213123e-05,
120
+ "loss": 0.5036,
121
+ "step": 800
122
+ },
123
+ {
124
+ "epoch": 0.6386175807663411,
125
+ "grad_norm": 1.5822668075561523,
126
+ "learning_rate": 3.935637365389432e-05,
127
+ "loss": 0.5203,
128
+ "step": 850
129
+ },
130
+ {
131
+ "epoch": 0.67618332081142,
132
+ "grad_norm": 2.1509227752685547,
133
+ "learning_rate": 3.873027798647634e-05,
134
+ "loss": 0.5116,
135
+ "step": 900
136
+ },
137
+ {
138
+ "epoch": 0.7137490608564989,
139
+ "grad_norm": 1.511104941368103,
140
+ "learning_rate": 3.810418231905835e-05,
141
+ "loss": 0.5048,
142
+ "step": 950
143
+ },
144
+ {
145
+ "epoch": 0.7513148009015778,
146
+ "grad_norm": 2.872206211090088,
147
+ "learning_rate": 3.747808665164037e-05,
148
+ "loss": 0.5137,
149
+ "step": 1000
150
+ },
151
+ {
152
+ "epoch": 0.7888805409466566,
153
+ "grad_norm": 3.4886534214019775,
154
+ "learning_rate": 3.685199098422239e-05,
155
+ "loss": 0.4881,
156
+ "step": 1050
157
+ },
158
+ {
159
+ "epoch": 0.8264462809917356,
160
+ "grad_norm": 2.8081490993499756,
161
+ "learning_rate": 3.622589531680441e-05,
162
+ "loss": 0.4764,
163
+ "step": 1100
164
+ },
165
+ {
166
+ "epoch": 0.8640120210368144,
167
+ "grad_norm": 1.6759647130966187,
168
+ "learning_rate": 3.559979964938643e-05,
169
+ "loss": 0.5082,
170
+ "step": 1150
171
+ },
172
+ {
173
+ "epoch": 0.9015777610818934,
174
+ "grad_norm": 2.0700764656066895,
175
+ "learning_rate": 3.497370398196845e-05,
176
+ "loss": 0.5035,
177
+ "step": 1200
178
+ },
179
+ {
180
+ "epoch": 0.9391435011269722,
181
+ "grad_norm": 1.5199862718582153,
182
+ "learning_rate": 3.434760831455047e-05,
183
+ "loss": 0.5007,
184
+ "step": 1250
185
+ },
186
+ {
187
+ "epoch": 0.976709241172051,
188
+ "grad_norm": 2.5575571060180664,
189
+ "learning_rate": 3.3721512647132486e-05,
190
+ "loss": 0.5015,
191
+ "step": 1300
192
+ },
193
+ {
194
+ "epoch": 1.0,
195
+ "eval_accuracy": 0.7457595263825588,
196
+ "eval_f1": 0.7615774399647499,
197
+ "eval_loss": 0.47971779108047485,
198
+ "eval_runtime": 154.8323,
199
+ "eval_samples_per_second": 137.458,
200
+ "eval_steps_per_second": 2.151,
201
+ "step": 1331
202
+ },
203
+ {
204
+ "epoch": 1.01427498121713,
205
+ "grad_norm": 1.657974362373352,
206
+ "learning_rate": 3.30954169797145e-05,
207
+ "loss": 0.4846,
208
+ "step": 1350
209
+ },
210
+ {
211
+ "epoch": 1.051840721262209,
212
+ "grad_norm": 2.831270217895508,
213
+ "learning_rate": 3.2469321312296516e-05,
214
+ "loss": 0.4359,
215
+ "step": 1400
216
+ },
217
+ {
218
+ "epoch": 1.0894064613072878,
219
+ "grad_norm": 2.0967586040496826,
220
+ "learning_rate": 3.184322564487854e-05,
221
+ "loss": 0.4352,
222
+ "step": 1450
223
+ },
224
+ {
225
+ "epoch": 1.1269722013523666,
226
+ "grad_norm": 2.5382165908813477,
227
+ "learning_rate": 3.121712997746056e-05,
228
+ "loss": 0.423,
229
+ "step": 1500
230
+ },
231
+ {
232
+ "epoch": 1.1645379413974455,
233
+ "grad_norm": 1.702189326286316,
234
+ "learning_rate": 3.059103431004258e-05,
235
+ "loss": 0.4178,
236
+ "step": 1550
237
+ },
238
+ {
239
+ "epoch": 1.2021036814425243,
240
+ "grad_norm": 2.0063211917877197,
241
+ "learning_rate": 2.9964938642624597e-05,
242
+ "loss": 0.4235,
243
+ "step": 1600
244
+ },
245
+ {
246
+ "epoch": 1.2396694214876034,
247
+ "grad_norm": 2.133720874786377,
248
+ "learning_rate": 2.9338842975206616e-05,
249
+ "loss": 0.4381,
250
+ "step": 1650
251
+ },
252
+ {
253
+ "epoch": 1.2772351615326822,
254
+ "grad_norm": 1.841479778289795,
255
+ "learning_rate": 2.8712747307788628e-05,
256
+ "loss": 0.4531,
257
+ "step": 1700
258
+ },
259
+ {
260
+ "epoch": 1.314800901577761,
261
+ "grad_norm": 2.0691092014312744,
262
+ "learning_rate": 2.808665164037065e-05,
263
+ "loss": 0.4247,
264
+ "step": 1750
265
+ },
266
+ {
267
+ "epoch": 1.35236664162284,
268
+ "grad_norm": 4.776940822601318,
269
+ "learning_rate": 2.7460555972952668e-05,
270
+ "loss": 0.4265,
271
+ "step": 1800
272
+ },
273
+ {
274
+ "epoch": 1.389932381667919,
275
+ "grad_norm": 1.9305018186569214,
276
+ "learning_rate": 2.6834460305534687e-05,
277
+ "loss": 0.4306,
278
+ "step": 1850
279
+ },
280
+ {
281
+ "epoch": 1.4274981217129978,
282
+ "grad_norm": 2.0644538402557373,
283
+ "learning_rate": 2.6208364638116705e-05,
284
+ "loss": 0.4255,
285
+ "step": 1900
286
+ },
287
+ {
288
+ "epoch": 1.4650638617580767,
289
+ "grad_norm": 3.263160228729248,
290
+ "learning_rate": 2.5582268970698724e-05,
291
+ "loss": 0.4427,
292
+ "step": 1950
293
+ },
294
+ {
295
+ "epoch": 1.5026296018031555,
296
+ "grad_norm": 3.1235225200653076,
297
+ "learning_rate": 2.4956173303280743e-05,
298
+ "loss": 0.4433,
299
+ "step": 2000
300
+ },
301
+ {
302
+ "epoch": 1.5401953418482344,
303
+ "grad_norm": 1.4947174787521362,
304
+ "learning_rate": 2.433007763586276e-05,
305
+ "loss": 0.4359,
306
+ "step": 2050
307
+ },
308
+ {
309
+ "epoch": 1.5777610818933132,
310
+ "grad_norm": 1.4775031805038452,
311
+ "learning_rate": 2.370398196844478e-05,
312
+ "loss": 0.4397,
313
+ "step": 2100
314
+ },
315
+ {
316
+ "epoch": 1.615326821938392,
317
+ "grad_norm": 3.387707233428955,
318
+ "learning_rate": 2.30778863010268e-05,
319
+ "loss": 0.4202,
320
+ "step": 2150
321
+ },
322
+ {
323
+ "epoch": 1.6528925619834711,
324
+ "grad_norm": 3.0223426818847656,
325
+ "learning_rate": 2.2451790633608817e-05,
326
+ "loss": 0.4275,
327
+ "step": 2200
328
+ },
329
+ {
330
+ "epoch": 1.69045830202855,
331
+ "grad_norm": 2.5161309242248535,
332
+ "learning_rate": 2.1825694966190836e-05,
333
+ "loss": 0.406,
334
+ "step": 2250
335
+ },
336
+ {
337
+ "epoch": 1.7280240420736288,
338
+ "grad_norm": 2.2194652557373047,
339
+ "learning_rate": 2.1199599298772854e-05,
340
+ "loss": 0.416,
341
+ "step": 2300
342
+ },
343
+ {
344
+ "epoch": 1.7655897821187079,
345
+ "grad_norm": 2.4205820560455322,
346
+ "learning_rate": 2.057350363135487e-05,
347
+ "loss": 0.4315,
348
+ "step": 2350
349
+ },
350
+ {
351
+ "epoch": 1.8031555221637867,
352
+ "grad_norm": 1.2840783596038818,
353
+ "learning_rate": 1.994740796393689e-05,
354
+ "loss": 0.4155,
355
+ "step": 2400
356
+ },
357
+ {
358
+ "epoch": 1.8407212622088656,
359
+ "grad_norm": 1.456228256225586,
360
+ "learning_rate": 1.932131229651891e-05,
361
+ "loss": 0.4247,
362
+ "step": 2450
363
+ },
364
+ {
365
+ "epoch": 1.8782870022539444,
366
+ "grad_norm": 2.8294334411621094,
367
+ "learning_rate": 1.869521662910093e-05,
368
+ "loss": 0.4305,
369
+ "step": 2500
370
+ },
371
+ {
372
+ "epoch": 1.9158527422990232,
373
+ "grad_norm": 2.2248728275299072,
374
+ "learning_rate": 1.8069120961682944e-05,
375
+ "loss": 0.4349,
376
+ "step": 2550
377
+ },
378
+ {
379
+ "epoch": 1.953418482344102,
380
+ "grad_norm": 1.7025116682052612,
381
+ "learning_rate": 1.7443025294264966e-05,
382
+ "loss": 0.423,
383
+ "step": 2600
384
+ },
385
+ {
386
+ "epoch": 1.990984222389181,
387
+ "grad_norm": 3.388554811477661,
388
+ "learning_rate": 1.6816929626846984e-05,
389
+ "loss": 0.4265,
390
+ "step": 2650
391
+ },
392
+ {
393
+ "epoch": 2.0,
394
+ "eval_accuracy": 0.7559084715500635,
395
+ "eval_f1": 0.764024528730411,
396
+ "eval_loss": 0.47860267758369446,
397
+ "eval_runtime": 149.5526,
398
+ "eval_samples_per_second": 142.311,
399
+ "eval_steps_per_second": 2.227,
400
+ "step": 2662
401
+ },
402
+ {
403
+ "epoch": 2.02854996243426,
404
+ "grad_norm": 6.253864765167236,
405
+ "learning_rate": 1.6190833959429003e-05,
406
+ "loss": 0.3602,
407
+ "step": 2700
408
+ },
409
+ {
410
+ "epoch": 2.0661157024793386,
411
+ "grad_norm": 2.9103243350982666,
412
+ "learning_rate": 1.5564738292011018e-05,
413
+ "loss": 0.3674,
414
+ "step": 2750
415
+ },
416
+ {
417
+ "epoch": 2.103681442524418,
418
+ "grad_norm": 2.6558964252471924,
419
+ "learning_rate": 1.4938642624593038e-05,
420
+ "loss": 0.3632,
421
+ "step": 2800
422
+ },
423
+ {
424
+ "epoch": 2.1412471825694968,
425
+ "grad_norm": 2.243708610534668,
426
+ "learning_rate": 1.4312546957175057e-05,
427
+ "loss": 0.3729,
428
+ "step": 2850
429
+ },
430
+ {
431
+ "epoch": 2.1788129226145756,
432
+ "grad_norm": 2.718883752822876,
433
+ "learning_rate": 1.3686451289757077e-05,
434
+ "loss": 0.3564,
435
+ "step": 2900
436
+ },
437
+ {
438
+ "epoch": 2.2163786626596544,
439
+ "grad_norm": 5.29821252822876,
440
+ "learning_rate": 1.3060355622339094e-05,
441
+ "loss": 0.3617,
442
+ "step": 2950
443
+ },
444
+ {
445
+ "epoch": 2.2539444027047333,
446
+ "grad_norm": 4.189558982849121,
447
+ "learning_rate": 1.2434259954921113e-05,
448
+ "loss": 0.3653,
449
+ "step": 3000
450
+ },
451
+ {
452
+ "epoch": 2.291510142749812,
453
+ "grad_norm": 1.6549113988876343,
454
+ "learning_rate": 1.180816428750313e-05,
455
+ "loss": 0.3727,
456
+ "step": 3050
457
+ },
458
+ {
459
+ "epoch": 2.329075882794891,
460
+ "grad_norm": 1.1492334604263306,
461
+ "learning_rate": 1.118206862008515e-05,
462
+ "loss": 0.3649,
463
+ "step": 3100
464
+ },
465
+ {
466
+ "epoch": 2.36664162283997,
467
+ "grad_norm": 2.9691059589385986,
468
+ "learning_rate": 1.0555972952667167e-05,
469
+ "loss": 0.3558,
470
+ "step": 3150
471
+ },
472
+ {
473
+ "epoch": 2.4042073628850487,
474
+ "grad_norm": 2.8758184909820557,
475
+ "learning_rate": 9.929877285249187e-06,
476
+ "loss": 0.3605,
477
+ "step": 3200
478
+ },
479
+ {
480
+ "epoch": 2.441773102930128,
481
+ "grad_norm": 2.866455554962158,
482
+ "learning_rate": 9.303781617831204e-06,
483
+ "loss": 0.3459,
484
+ "step": 3250
485
+ },
486
+ {
487
+ "epoch": 2.479338842975207,
488
+ "grad_norm": 2.672839879989624,
489
+ "learning_rate": 8.677685950413224e-06,
490
+ "loss": 0.3553,
491
+ "step": 3300
492
+ },
493
+ {
494
+ "epoch": 2.5169045830202856,
495
+ "grad_norm": 3.4048142433166504,
496
+ "learning_rate": 8.051590282995241e-06,
497
+ "loss": 0.3502,
498
+ "step": 3350
499
+ },
500
+ {
501
+ "epoch": 2.5544703230653645,
502
+ "grad_norm": 2.534475564956665,
503
+ "learning_rate": 7.4254946155772605e-06,
504
+ "loss": 0.3519,
505
+ "step": 3400
506
+ },
507
+ {
508
+ "epoch": 2.5920360631104433,
509
+ "grad_norm": 5.125302791595459,
510
+ "learning_rate": 6.799398948159279e-06,
511
+ "loss": 0.3561,
512
+ "step": 3450
513
+ },
514
+ {
515
+ "epoch": 2.629601803155522,
516
+ "grad_norm": 3.038414478302002,
517
+ "learning_rate": 6.173303280741298e-06,
518
+ "loss": 0.3708,
519
+ "step": 3500
520
+ },
521
+ {
522
+ "epoch": 2.667167543200601,
523
+ "grad_norm": 3.264307975769043,
524
+ "learning_rate": 5.5472076133233154e-06,
525
+ "loss": 0.3688,
526
+ "step": 3550
527
+ },
528
+ {
529
+ "epoch": 2.70473328324568,
530
+ "grad_norm": 3.3217625617980957,
531
+ "learning_rate": 4.921111945905334e-06,
532
+ "loss": 0.3538,
533
+ "step": 3600
534
+ },
535
+ {
536
+ "epoch": 2.7422990232907587,
537
+ "grad_norm": 1.3763796091079712,
538
+ "learning_rate": 4.295016278487353e-06,
539
+ "loss": 0.3463,
540
+ "step": 3650
541
+ },
542
+ {
543
+ "epoch": 2.779864763335838,
544
+ "grad_norm": 4.890012741088867,
545
+ "learning_rate": 3.6689206110693716e-06,
546
+ "loss": 0.3514,
547
+ "step": 3700
548
+ },
549
+ {
550
+ "epoch": 2.8174305033809164,
551
+ "grad_norm": 2.0561699867248535,
552
+ "learning_rate": 3.0428249436513902e-06,
553
+ "loss": 0.3619,
554
+ "step": 3750
555
+ },
556
+ {
557
+ "epoch": 2.8549962434259957,
558
+ "grad_norm": 3.4119489192962646,
559
+ "learning_rate": 2.4167292762334084e-06,
560
+ "loss": 0.3557,
561
+ "step": 3800
562
+ },
563
+ {
564
+ "epoch": 2.8925619834710745,
565
+ "grad_norm": 4.181868553161621,
566
+ "learning_rate": 1.7906336088154272e-06,
567
+ "loss": 0.3526,
568
+ "step": 3850
569
+ },
570
+ {
571
+ "epoch": 2.9301277235161534,
572
+ "grad_norm": 2.8571691513061523,
573
+ "learning_rate": 1.1645379413974456e-06,
574
+ "loss": 0.37,
575
+ "step": 3900
576
+ },
577
+ {
578
+ "epoch": 2.967693463561232,
579
+ "grad_norm": 7.221222877502441,
580
+ "learning_rate": 5.38442273979464e-07,
581
+ "loss": 0.3626,
582
+ "step": 3950
583
+ },
584
+ {
585
+ "epoch": 3.0,
586
+ "eval_accuracy": 0.7501292111074567,
587
+ "eval_f1": 0.7617169997311587,
588
+ "eval_loss": 0.5218836665153503,
589
+ "eval_runtime": 144.5779,
590
+ "eval_samples_per_second": 147.208,
591
+ "eval_steps_per_second": 2.303,
592
+ "step": 3993
593
+ }
594
+ ],
595
+ "logging_steps": 50,
596
+ "max_steps": 3993,
597
+ "num_input_tokens_seen": 0,
598
+ "num_train_epochs": 3,
599
+ "save_steps": 500,
600
+ "stateful_callbacks": {
601
+ "TrainerControl": {
602
+ "args": {
603
+ "should_epoch_stop": false,
604
+ "should_evaluate": false,
605
+ "should_log": false,
606
+ "should_save": true,
607
+ "should_training_stop": true
608
+ },
609
+ "attributes": {}
610
+ }
611
+ },
612
+ "total_flos": 3.3829656736167936e+16,
613
+ "train_batch_size": 64,
614
+ "trial_name": null,
615
+ "trial_params": null
616
+ }
checkpoint-3993/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:497b1fb162099a485fef5d85839946a6214e40e81d40b13c3ef06199d07ece80
3
+ size 5304
checkpoint-3993/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "distilbert/distilbert-base-multilingual-cased",
3
+ "activation": "gelu",
4
+ "architectures": [
5
+ "DistilBertForSequenceClassification"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "dim": 768,
9
+ "dropout": 0.1,
10
+ "hidden_dim": 3072,
11
+ "initializer_range": 0.02,
12
+ "max_position_embeddings": 512,
13
+ "model_type": "distilbert",
14
+ "n_heads": 12,
15
+ "n_layers": 6,
16
+ "output_past": true,
17
+ "pad_token_id": 0,
18
+ "problem_type": "single_label_classification",
19
+ "qa_dropout": 0.1,
20
+ "seq_classif_dropout": 0.2,
21
+ "sinusoidal_pos_embds": false,
22
+ "tie_weights_": true,
23
+ "torch_dtype": "float32",
24
+ "transformers_version": "4.48.3",
25
+ "vocab_size": 119547
26
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ea9b5e7183bbf5cf8fa18a035400a551437fae62101ab6a9f9b5deb3b2f5b8c
3
+ size 541317368
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": false,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "strip_accents": null,
53
+ "tokenize_chinese_chars": true,
54
+ "tokenizer_class": "DistilBertTokenizer",
55
+ "unk_token": "[UNK]"
56
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff