franfj committed
Commit 49dca78 · verified · 1 Parent(s): 3e95f6a

Upload folder using huggingface_hub

checkpoint-2661/config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "_name_or_path": "distilbert/distilbert-base-multilingual-cased",
+   "activation": "gelu",
+   "architectures": [
+     "DistilBertForSequenceClassification"
+   ],
+   "attention_dropout": 0.1,
+   "dim": 768,
+   "dropout": 0.1,
+   "hidden_dim": 3072,
+   "initializer_range": 0.02,
+   "max_position_embeddings": 512,
+   "model_type": "distilbert",
+   "n_heads": 12,
+   "n_layers": 6,
+   "output_past": true,
+   "pad_token_id": 0,
+   "problem_type": "single_label_classification",
+   "qa_dropout": 0.1,
+   "seq_classif_dropout": 0.2,
+   "sinusoidal_pos_embds": false,
+   "tie_weights_": true,
+   "torch_dtype": "float32",
+   "transformers_version": "4.48.3",
+   "vocab_size": 119547
+ }
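
The config above describes a DistilBERT sequence classifier fine-tuned from distilbert/distilbert-base-multilingual-cased. As a minimal sketch (not part of this commit), the checkpoint can be loaded locally like this, assuming the checkpoint-2661/ folder has been downloaded; the path is illustrative:

# A minimal sketch, not part of this commit: load the checkpoint locally.
# Assumes checkpoint-2661/ has been downloaded (e.g. with
# huggingface_hub.snapshot_download); the path is an assumption.
from transformers import AutoConfig, AutoModelForSequenceClassification

checkpoint_dir = "checkpoint-2661"
config = AutoConfig.from_pretrained(checkpoint_dir)   # reads the config.json shown above
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)
print(config.model_type, config.num_labels)           # num_labels falls back to 2 when no id2label is stored
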
checkpoint-2661/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:591c68f23e62a2e2c2186c8f31dda5212136e630664963b224e4719c47dcd4ea
+ size 541317368
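
The diff shows a Git LFS pointer, not the weights themselves: the real model.safetensors lives in LFS storage and is identified by the SHA-256 and byte size above. A minimal sketch of checking a downloaded copy against that pointer; the local path is an assumption for illustration:

# A minimal sketch, not part of this commit: verify a downloaded
# model.safetensors against the Git LFS pointer above.
import hashlib
from pathlib import Path

path = Path("checkpoint-2661/model.safetensors")   # assumed local path
expected_oid = "591c68f23e62a2e2c2186c8f31dda5212136e630664963b224e4719c47dcd4ea"
expected_size = 541317368

h, size = hashlib.sha256(), 0
with path.open("rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # stream in 1 MiB chunks
        h.update(chunk)
        size += len(chunk)

assert size == expected_size, "size does not match the LFS pointer"
assert h.hexdigest() == expected_oid, "sha256 does not match the LFS pointer"
print("model.safetensors matches its LFS pointer")
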
checkpoint-2661/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6bbf46c170717591038485a90221f42c04da067bcad1172f0dfe5e9cf4786086
+ size 1082696890
checkpoint-2661/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:51d81d9f2c53331206efe2ce78301f609f46b42e7344d02dafa3749e588b726c
+ size 14244
checkpoint-2661/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:32b93fb50a5142fbdd43ea128ac5e1d93e0dbdc81a227fedc2724f289978cae5
+ size 1064
checkpoint-2661/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
checkpoint-2661/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2661/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "101": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "102": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "103": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": false,
+   "cls_token": "[CLS]",
+   "do_lower_case": false,
+   "extra_special_tokens": {},
+   "mask_token": "[MASK]",
+   "model_max_length": 512,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "DistilBertTokenizer",
+   "unk_token": "[UNK]"
+ }
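
A minimal sketch (not part of this commit) of the tokenizer settings above in use, again assuming the checkpoint folder is available locally; the sample sentence is purely illustrative:

# A minimal sketch, not part of this commit: load the tokenizer these files
# describe. Assumes checkpoint-2661/ is available locally.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("checkpoint-2661")
enc = tokenizer("Ein kurzer Beispielsatz.", truncation=True)  # truncation honours model_max_length = 512
print(tokenizer.convert_ids_to_tokens(enc["input_ids"]))      # [CLS] ... [SEP], no lower-casing
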
checkpoint-2661/trainer_state.json ADDED
@@ -0,0 +1,434 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 3.0,
+   "eval_steps": 500,
+   "global_step": 2661,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.05636978579481398,
+       "grad_norm": 1.0439274311065674,
+       "learning_rate": 4.9060503570086435e-05,
+       "loss": 0.6368,
+       "step": 50
+     },
+     {
+       "epoch": 0.11273957158962795,
+       "grad_norm": 2.2991294860839844,
+       "learning_rate": 4.812100714017287e-05,
+       "loss": 0.5758,
+       "step": 100
+     },
+     {
+       "epoch": 0.16910935738444194,
+       "grad_norm": 1.6218173503875732,
+       "learning_rate": 4.71815107102593e-05,
+       "loss": 0.5639,
+       "step": 150
+     },
+     {
+       "epoch": 0.2254791431792559,
+       "grad_norm": 2.0030906200408936,
+       "learning_rate": 4.6242014280345734e-05,
+       "loss": 0.5421,
+       "step": 200
+     },
+     {
+       "epoch": 0.2818489289740699,
+       "grad_norm": 2.060424566268921,
+       "learning_rate": 4.530251785043217e-05,
+       "loss": 0.5377,
+       "step": 250
+     },
+     {
+       "epoch": 0.3382187147688839,
+       "grad_norm": 1.5049147605895996,
+       "learning_rate": 4.43630214205186e-05,
+       "loss": 0.5457,
+       "step": 300
+     },
+     {
+       "epoch": 0.3945885005636979,
+       "grad_norm": 2.017237663269043,
+       "learning_rate": 4.342352499060503e-05,
+       "loss": 0.5329,
+       "step": 350
+     },
+     {
+       "epoch": 0.4509582863585118,
+       "grad_norm": 1.5960201025009155,
+       "learning_rate": 4.2484028560691466e-05,
+       "loss": 0.538,
+       "step": 400
+     },
+     {
+       "epoch": 0.5073280721533259,
+       "grad_norm": 1.7139301300048828,
+       "learning_rate": 4.1544532130777905e-05,
+       "loss": 0.5141,
+       "step": 450
+     },
+     {
+       "epoch": 0.5636978579481398,
+       "grad_norm": 1.4965617656707764,
+       "learning_rate": 4.060503570086434e-05,
+       "loss": 0.5196,
+       "step": 500
+     },
+     {
+       "epoch": 0.6200676437429538,
+       "grad_norm": 1.2821619510650635,
+       "learning_rate": 3.966553927095078e-05,
+       "loss": 0.5114,
+       "step": 550
+     },
+     {
+       "epoch": 0.6764374295377678,
+       "grad_norm": 1.5799860954284668,
+       "learning_rate": 3.872604284103721e-05,
+       "loss": 0.52,
+       "step": 600
+     },
+     {
+       "epoch": 0.7328072153325818,
+       "grad_norm": 1.6334625482559204,
+       "learning_rate": 3.7786546411123644e-05,
+       "loss": 0.5093,
+       "step": 650
+     },
+     {
+       "epoch": 0.7891770011273957,
+       "grad_norm": 2.19488525390625,
+       "learning_rate": 3.6847049981210077e-05,
+       "loss": 0.503,
+       "step": 700
+     },
+     {
+       "epoch": 0.8455467869222097,
+       "grad_norm": 1.5821040868759155,
+       "learning_rate": 3.590755355129651e-05,
+       "loss": 0.4988,
+       "step": 750
+     },
+     {
+       "epoch": 0.9019165727170236,
+       "grad_norm": 2.2586898803710938,
+       "learning_rate": 3.496805712138294e-05,
+       "loss": 0.5102,
+       "step": 800
+     },
+     {
+       "epoch": 0.9582863585118376,
+       "grad_norm": 1.5365625619888306,
+       "learning_rate": 3.4028560691469375e-05,
+       "loss": 0.5047,
+       "step": 850
+     },
+     {
+       "epoch": 1.0,
+       "eval_accuracy": 0.744162007235822,
+       "eval_f1": 0.7656552614590059,
+       "eval_loss": 0.4819556772708893,
+       "eval_runtime": 56.9558,
+       "eval_samples_per_second": 373.676,
+       "eval_steps_per_second": 3.898,
+       "step": 887
+     },
+     {
+       "epoch": 1.0146561443066517,
+       "grad_norm": 1.1745824813842773,
+       "learning_rate": 3.308906426155581e-05,
+       "loss": 0.4858,
+       "step": 900
+     },
+     {
+       "epoch": 1.0710259301014655,
+       "grad_norm": 3.0086119174957275,
+       "learning_rate": 3.214956783164224e-05,
+       "loss": 0.4368,
+       "step": 950
+     },
+     {
+       "epoch": 1.1273957158962795,
+       "grad_norm": 2.154918909072876,
+       "learning_rate": 3.1210071401728674e-05,
+       "loss": 0.4373,
+       "step": 1000
+     },
+     {
+       "epoch": 1.1837655016910935,
+       "grad_norm": 2.587249994277954,
+       "learning_rate": 3.0270574971815107e-05,
+       "loss": 0.4275,
+       "step": 1050
+     },
+     {
+       "epoch": 1.2401352874859075,
+       "grad_norm": 2.3393185138702393,
+       "learning_rate": 2.933107854190154e-05,
+       "loss": 0.4351,
+       "step": 1100
+     },
+     {
+       "epoch": 1.2965050732807215,
+       "grad_norm": 1.727163553237915,
+       "learning_rate": 2.8391582111987973e-05,
+       "loss": 0.4457,
+       "step": 1150
+     },
+     {
+       "epoch": 1.3528748590755355,
+       "grad_norm": 2.3959648609161377,
+       "learning_rate": 2.745208568207441e-05,
+       "loss": 0.4291,
+       "step": 1200
+     },
+     {
+       "epoch": 1.4092446448703495,
+       "grad_norm": 2.1558401584625244,
+       "learning_rate": 2.6512589252160842e-05,
+       "loss": 0.4375,
+       "step": 1250
+     },
+     {
+       "epoch": 1.4656144306651635,
+       "grad_norm": 2.3182780742645264,
+       "learning_rate": 2.5573092822247275e-05,
+       "loss": 0.4297,
+       "step": 1300
+     },
+     {
+       "epoch": 1.5219842164599775,
+       "grad_norm": 2.0232203006744385,
+       "learning_rate": 2.463359639233371e-05,
+       "loss": 0.4383,
+       "step": 1350
+     },
+     {
+       "epoch": 1.5783540022547915,
+       "grad_norm": 1.779449224472046,
+       "learning_rate": 2.3694099962420144e-05,
+       "loss": 0.4454,
+       "step": 1400
+     },
+     {
+       "epoch": 1.6347237880496053,
+       "grad_norm": 2.562135696411133,
+       "learning_rate": 2.2754603532506577e-05,
+       "loss": 0.4251,
+       "step": 1450
+     },
+     {
+       "epoch": 1.6910935738444195,
+       "grad_norm": 2.216463327407837,
+       "learning_rate": 2.181510710259301e-05,
+       "loss": 0.4212,
+       "step": 1500
+     },
+     {
+       "epoch": 1.7474633596392333,
+       "grad_norm": 1.364642858505249,
+       "learning_rate": 2.0875610672679443e-05,
+       "loss": 0.4242,
+       "step": 1550
+     },
+     {
+       "epoch": 1.8038331454340475,
+       "grad_norm": 1.5100990533828735,
+       "learning_rate": 1.993611424276588e-05,
+       "loss": 0.4264,
+       "step": 1600
+     },
+     {
+       "epoch": 1.8602029312288613,
+       "grad_norm": 2.195282220840454,
+       "learning_rate": 1.8996617812852312e-05,
+       "loss": 0.4317,
+       "step": 1650
+     },
+     {
+       "epoch": 1.9165727170236753,
+       "grad_norm": 2.2001285552978516,
+       "learning_rate": 1.8057121382938748e-05,
+       "loss": 0.4392,
+       "step": 1700
+     },
+     {
+       "epoch": 1.9729425028184893,
+       "grad_norm": 2.5997512340545654,
+       "learning_rate": 1.711762495302518e-05,
+       "loss": 0.4199,
+       "step": 1750
+     },
+     {
+       "epoch": 2.0,
+       "eval_accuracy": 0.7526194615420758,
+       "eval_f1": 0.768051456011278,
+       "eval_loss": 0.4766261875629425,
+       "eval_runtime": 56.2574,
+       "eval_samples_per_second": 378.314,
+       "eval_steps_per_second": 3.946,
+       "step": 1774
+     },
+     {
+       "epoch": 2.0293122886133035,
+       "grad_norm": 2.3610382080078125,
+       "learning_rate": 1.6178128523111614e-05,
+       "loss": 0.3941,
+       "step": 1800
+     },
+     {
+       "epoch": 2.0856820744081173,
+       "grad_norm": 3.2270758152008057,
+       "learning_rate": 1.5238632093198047e-05,
+       "loss": 0.3712,
+       "step": 1850
+     },
+     {
+       "epoch": 2.142051860202931,
+       "grad_norm": 2.4744791984558105,
+       "learning_rate": 1.429913566328448e-05,
+       "loss": 0.3755,
+       "step": 1900
+     },
+     {
+       "epoch": 2.1984216459977453,
+       "grad_norm": 3.3841307163238525,
+       "learning_rate": 1.3359639233370913e-05,
+       "loss": 0.3645,
+       "step": 1950
+     },
+     {
+       "epoch": 2.254791431792559,
+       "grad_norm": 2.185528516769409,
+       "learning_rate": 1.2420142803457347e-05,
+       "loss": 0.3716,
+       "step": 2000
+     },
+     {
+       "epoch": 2.3111612175873733,
+       "grad_norm": 2.18782639503479,
+       "learning_rate": 1.1480646373543782e-05,
+       "loss": 0.378,
+       "step": 2050
+     },
+     {
+       "epoch": 2.367531003382187,
+       "grad_norm": 2.1629090309143066,
+       "learning_rate": 1.0541149943630215e-05,
+       "loss": 0.3569,
+       "step": 2100
+     },
+     {
+       "epoch": 2.4239007891770012,
+       "grad_norm": 3.4792726039886475,
+       "learning_rate": 9.601653513716648e-06,
+       "loss": 0.3624,
+       "step": 2150
+     },
+     {
+       "epoch": 2.480270574971815,
+       "grad_norm": 2.1638121604919434,
+       "learning_rate": 8.662157083803082e-06,
+       "loss": 0.3647,
+       "step": 2200
+     },
+     {
+       "epoch": 2.5366403607666292,
+       "grad_norm": 1.679718255996704,
+       "learning_rate": 7.722660653889515e-06,
+       "loss": 0.3636,
+       "step": 2250
+     },
+     {
+       "epoch": 2.593010146561443,
+       "grad_norm": 3.7099499702453613,
+       "learning_rate": 6.783164223975949e-06,
+       "loss": 0.3619,
+       "step": 2300
+     },
+     {
+       "epoch": 2.649379932356257,
+       "grad_norm": 3.1615912914276123,
+       "learning_rate": 5.8436677940623835e-06,
+       "loss": 0.3667,
+       "step": 2350
+     },
+     {
+       "epoch": 2.705749718151071,
+       "grad_norm": 2.1708598136901855,
+       "learning_rate": 4.904171364148816e-06,
+       "loss": 0.3707,
+       "step": 2400
+     },
+     {
+       "epoch": 2.7621195039458852,
+       "grad_norm": 1.8847932815551758,
+       "learning_rate": 3.96467493423525e-06,
+       "loss": 0.3558,
+       "step": 2450
+     },
+     {
+       "epoch": 2.818489289740699,
+       "grad_norm": 2.6283624172210693,
+       "learning_rate": 3.025178504321684e-06,
+       "loss": 0.3575,
+       "step": 2500
+     },
+     {
+       "epoch": 2.874859075535513,
+       "grad_norm": 1.7425137758255005,
+       "learning_rate": 2.0856820744081176e-06,
+       "loss": 0.3685,
+       "step": 2550
+     },
+     {
+       "epoch": 2.931228861330327,
+       "grad_norm": 3.9649059772491455,
+       "learning_rate": 1.146185644494551e-06,
+       "loss": 0.374,
+       "step": 2600
+     },
+     {
+       "epoch": 2.987598647125141,
+       "grad_norm": 3.081310749053955,
+       "learning_rate": 2.066892145809846e-07,
+       "loss": 0.363,
+       "step": 2650
+     },
+     {
+       "epoch": 3.0,
+       "eval_accuracy": 0.7502701686792276,
+       "eval_f1": 0.7636622348703811,
+       "eval_loss": 0.5071986317634583,
+       "eval_runtime": 57.0998,
+       "eval_samples_per_second": 372.733,
+       "eval_steps_per_second": 3.888,
+       "step": 2661
+     }
+   ],
+   "logging_steps": 50,
+   "max_steps": 2661,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 3,
+   "save_steps": 500,
+   "stateful_callbacks": {
+     "TrainerControl": {
+       "args": {
+         "should_epoch_stop": false,
+         "should_evaluate": false,
+         "should_log": false,
+         "should_save": true,
+         "should_training_stop": true
+       },
+       "attributes": {}
+     }
+   },
+   "total_flos": 3.3829656736167936e+16,
+   "train_batch_size": 96,
+   "trial_name": null,
+   "trial_params": null
+ }
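
The trainer state records the full loss curve and the end-of-epoch evaluations: accuracy peaks at roughly 0.753 after epoch 2, and eval loss rises again by epoch 3. A minimal sketch (not part of this commit) of pulling those eval rows back out of the file, assuming the checkpoint folder is local:

# A minimal sketch, not part of this commit: read the per-epoch eval
# metrics back out of trainer_state.json. The local path is an assumption.
import json

with open("checkpoint-2661/trainer_state.json") as f:
    state = json.load(f)

for entry in state["log_history"]:
    if "eval_accuracy" in entry:          # keep only the end-of-epoch eval rows
        print(f"epoch {entry['epoch']:.0f}: "
              f"accuracy={entry['eval_accuracy']:.4f} "
              f"f1={entry['eval_f1']:.4f} "
              f"eval_loss={entry['eval_loss']:.4f}")
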
checkpoint-2661/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b76a7922b684c1c4953ec27b2cd8cab89ecad5c9324f5600caab8aaa80f8c94a
+ size 5304
checkpoint-2661/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:5ea9b5e7183bbf5cf8fa18a035400a551437fae62101ab6a9f9b5deb3b2f5b8c
+ oid sha256:591c68f23e62a2e2c2186c8f31dda5212136e630664963b224e4719c47dcd4ea
  size 541317368