CrabInHoney committed on
Commit
babca08
·
verified ·
1 Parent(s): 76c6969

Upload 6 files

Browse files
Files changed (6) hide show
  1. config.json +34 -0
  2. model.safetensors +3 -0
  3. special_tokens_map.json +37 -0
  4. tokenizer.json +636 -0
  5. tokenizer_config.json +62 -0
  6. vocab.txt +472 -0
config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "bert-tiny-finetuned",
3
+ "architectures": [
4
+ "BertForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 192,
11
+ "id2label": {
12
+ "0": "good",
13
+ "1": "phish"
14
+ },
15
+ "initializer_range": 0.02,
16
+ "intermediate_size": 768,
17
+ "label2id": {
18
+ "good": 0,
19
+ "phish": 1
20
+ },
21
+ "layer_norm_eps": 1e-12,
22
+ "max_position_embeddings": 64,
23
+ "model_type": "bert",
24
+ "num_attention_heads": 8,
25
+ "num_hidden_layers": 8,
26
+ "pad_token_id": 0,
27
+ "position_embedding_type": "absolute",
28
+ "problem_type": "single_label_classification",
29
+ "torch_dtype": "float32",
30
+ "transformers_version": "4.44.2",
31
+ "type_vocab_size": 2,
32
+ "use_cache": true,
33
+ "vocab_size": 472
34
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad9130a5b6f8b2ace28bf0e61826fec7b6cb0cf500a081ff6891f78fd408f5ab
3
+ size 14815608
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "[UNK]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
tokenizer.json ADDED
@@ -0,0 +1,636 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": {
4
+ "direction": "Right",
5
+ "max_length": 64,
6
+ "strategy": "LongestFirst",
7
+ "stride": 0
8
+ },
9
+ "padding": {
10
+ "strategy": {
11
+ "Fixed": 64
12
+ },
13
+ "direction": "Right",
14
+ "pad_to_multiple_of": null,
15
+ "pad_id": 0,
16
+ "pad_type_id": 0,
17
+ "pad_token": "[PAD]"
18
+ },
19
+ "added_tokens": [
20
+ {
21
+ "id": 0,
22
+ "content": "[PAD]",
23
+ "single_word": false,
24
+ "lstrip": false,
25
+ "rstrip": false,
26
+ "normalized": false,
27
+ "special": true
28
+ },
29
+ {
30
+ "id": 1,
31
+ "content": "[UNK]",
32
+ "single_word": false,
33
+ "lstrip": false,
34
+ "rstrip": false,
35
+ "normalized": false,
36
+ "special": true
37
+ },
38
+ {
39
+ "id": 2,
40
+ "content": "[CLS]",
41
+ "single_word": false,
42
+ "lstrip": false,
43
+ "rstrip": false,
44
+ "normalized": false,
45
+ "special": true
46
+ },
47
+ {
48
+ "id": 3,
49
+ "content": "[SEP]",
50
+ "single_word": false,
51
+ "lstrip": false,
52
+ "rstrip": false,
53
+ "normalized": false,
54
+ "special": true
55
+ },
56
+ {
57
+ "id": 4,
58
+ "content": "[MASK]",
59
+ "single_word": false,
60
+ "lstrip": false,
61
+ "rstrip": false,
62
+ "normalized": false,
63
+ "special": true
64
+ }
65
+ ],
66
+ "normalizer": {
67
+ "type": "BertNormalizer",
68
+ "clean_text": true,
69
+ "handle_chinese_chars": true,
70
+ "strip_accents": null,
71
+ "lowercase": true
72
+ },
73
+ "pre_tokenizer": {
74
+ "type": "BertPreTokenizer"
75
+ },
76
+ "post_processor": {
77
+ "type": "TemplateProcessing",
78
+ "single": [
79
+ {
80
+ "SpecialToken": {
81
+ "id": "[CLS]",
82
+ "type_id": 0
83
+ }
84
+ },
85
+ {
86
+ "Sequence": {
87
+ "id": "A",
88
+ "type_id": 0
89
+ }
90
+ },
91
+ {
92
+ "SpecialToken": {
93
+ "id": "[SEP]",
94
+ "type_id": 0
95
+ }
96
+ }
97
+ ],
98
+ "pair": [
99
+ {
100
+ "SpecialToken": {
101
+ "id": "[CLS]",
102
+ "type_id": 0
103
+ }
104
+ },
105
+ {
106
+ "Sequence": {
107
+ "id": "A",
108
+ "type_id": 0
109
+ }
110
+ },
111
+ {
112
+ "SpecialToken": {
113
+ "id": "[SEP]",
114
+ "type_id": 0
115
+ }
116
+ },
117
+ {
118
+ "Sequence": {
119
+ "id": "B",
120
+ "type_id": 1
121
+ }
122
+ },
123
+ {
124
+ "SpecialToken": {
125
+ "id": "[SEP]",
126
+ "type_id": 1
127
+ }
128
+ }
129
+ ],
130
+ "special_tokens": {
131
+ "[CLS]": {
132
+ "id": "[CLS]",
133
+ "ids": [
134
+ 2
135
+ ],
136
+ "tokens": [
137
+ "[CLS]"
138
+ ]
139
+ },
140
+ "[SEP]": {
141
+ "id": "[SEP]",
142
+ "ids": [
143
+ 3
144
+ ],
145
+ "tokens": [
146
+ "[SEP]"
147
+ ]
148
+ }
149
+ }
150
+ },
151
+ "decoder": {
152
+ "type": "WordPiece",
153
+ "prefix": "##",
154
+ "cleanup": true
155
+ },
156
+ "model": {
157
+ "type": "WordPiece",
158
+ "unk_token": "[UNK]",
159
+ "continuing_subword_prefix": "##",
160
+ "max_input_chars_per_word": 100,
161
+ "vocab": {
162
+ "[PAD]": 0,
163
+ "[UNK]": 1,
164
+ "[CLS]": 2,
165
+ "[SEP]": 3,
166
+ "[MASK]": 4,
167
+ "&": 5,
168
+ "'": 6,
169
+ "*": 7,
170
+ ",": 8,
171
+ "-": 9,
172
+ ".": 10,
173
+ "/": 11,
174
+ "?": 12,
175
+ "0": 13,
176
+ "1": 14,
177
+ "2": 15,
178
+ "3": 16,
179
+ "4": 17,
180
+ "5": 18,
181
+ "6": 19,
182
+ "7": 20,
183
+ "8": 21,
184
+ "9": 22,
185
+ ":": 23,
186
+ ";": 24,
187
+ "\\": 25,
188
+ "_": 26,
189
+ "a": 27,
190
+ "b": 28,
191
+ "c": 29,
192
+ "d": 30,
193
+ "e": 31,
194
+ "f": 32,
195
+ "g": 33,
196
+ "h": 34,
197
+ "i": 35,
198
+ "j": 36,
199
+ "k": 37,
200
+ "l": 38,
201
+ "m": 39,
202
+ "n": 40,
203
+ "o": 41,
204
+ "p": 42,
205
+ "q": 43,
206
+ "r": 44,
207
+ "s": 45,
208
+ "t": 46,
209
+ "u": 47,
210
+ "v": 48,
211
+ "w": 49,
212
+ "x": 50,
213
+ "y": 51,
214
+ "z": 52,
215
+ "//": 53,
216
+ "//:": 54,
217
+ "http": 55,
218
+ "https": 56,
219
+ "ftp": 57,
220
+ "sftp": 58,
221
+ "mailto": 59,
222
+ "tel": 60,
223
+ "file": 61,
224
+ "ws": 62,
225
+ "wss": 63,
226
+ "rtmp": 64,
227
+ "ssh": 65,
228
+ "ldap": 66,
229
+ "ldaps": 67,
230
+ "nntp": 68,
231
+ "gopher": 69,
232
+ "telnet": 70,
233
+ "view": 71,
234
+ "source": 72,
235
+ "about": 73,
236
+ "chrome": 74,
237
+ "data": 75,
238
+ "irc": 76,
239
+ "ircs": 77,
240
+ "magnet": 78,
241
+ "mms": 79,
242
+ "redis": 80,
243
+ "rsync": 81,
244
+ "rtsp": 82,
245
+ "svn": 83,
246
+ "vnc": 84,
247
+ "webcal": 85,
248
+ "xmpp": 86,
249
+ "dns": 87,
250
+ "ntp": 88,
251
+ "ip": 89,
252
+ "com": 90,
253
+ "de": 91,
254
+ "net": 92,
255
+ "uk": 93,
256
+ "cn": 94,
257
+ "org": 95,
258
+ "info": 96,
259
+ "nl": 97,
260
+ "eu": 98,
261
+ "ru": 99,
262
+ "su": 100,
263
+ "href": 101,
264
+ "br": 102,
265
+ "htm": 103,
266
+ "php": 104,
267
+ "co": 105,
268
+ "ly": 106,
269
+ "bit": 107,
270
+ "log": 108,
271
+ "index": 109,
272
+ "bank": 110,
273
+ "za": 111,
274
+ "direct": 112,
275
+ "xml": 113,
276
+ "mail": 114,
277
+ "it": 115,
278
+ "www": 116,
279
+ "run": 117,
280
+ "security": 118,
281
+ "code": 119,
282
+ "promo": 120,
283
+ "jpg": 121,
284
+ "img": 122,
285
+ "pay": 123,
286
+ "form": 124,
287
+ "docs": 125,
288
+ "host": 126,
289
+ "ec": 127,
290
+ "cx": 128,
291
+ "free": 129,
292
+ "true": 130,
293
+ "false": 131,
294
+ "amp": 132,
295
+ "blog": 133,
296
+ "key": 134,
297
+ "pal": 135,
298
+ "contact": 136,
299
+ "online": 137,
300
+ "abc": 138,
301
+ "media": 139,
302
+ "admin": 140,
303
+ "etc": 141,
304
+ "login": 142,
305
+ "cmd": 143,
306
+ "bin": 144,
307
+ "web": 145,
308
+ "verif": 146,
309
+ "the": 147,
310
+ "in": 148,
311
+ "##s": 149,
312
+ "of": 150,
313
+ "la": 151,
314
+ "en": 152,
315
+ "and": 153,
316
+ "##e": 154,
317
+ "##a": 155,
318
+ "to": 156,
319
+ "##n": 157,
320
+ "##i": 158,
321
+ "der": 159,
322
+ "un": 160,
323
+ "di": 161,
324
+ "que": 162,
325
+ "##t": 163,
326
+ "is": 164,
327
+ "el": 165,
328
+ "se": 166,
329
+ "del": 167,
330
+ "die": 168,
331
+ "##r": 169,
332
+ "und": 170,
333
+ "et": 171,
334
+ "na": 172,
335
+ "##o": 173,
336
+ "was": 174,
337
+ "on": 175,
338
+ "##en": 176,
339
+ "##u": 177,
340
+ "des": 178,
341
+ "den": 179,
342
+ "le": 180,
343
+ "for": 181,
344
+ "da": 182,
345
+ "je": 183,
346
+ "van": 184,
347
+ "as": 185,
348
+ "##m": 186,
349
+ "sa": 187,
350
+ "do": 188,
351
+ "10": 189,
352
+ "an": 190,
353
+ "les": 191,
354
+ "una": 192,
355
+ "il": 193,
356
+ "by": 194,
357
+ "og": 195,
358
+ "##y": 196,
359
+ "at": 197,
360
+ "##l": 198,
361
+ "##d": 199,
362
+ "er": 200,
363
+ "al": 201,
364
+ "##er": 202,
365
+ "von": 203,
366
+ "du": 204,
367
+ "av": 205,
368
+ "##es": 206,
369
+ "med": 207,
370
+ "con": 208,
371
+ "##k": 209,
372
+ "est": 210,
373
+ "per": 211,
374
+ "som": 212,
375
+ "los": 213,
376
+ "por": 214,
377
+ "from": 215,
378
+ "that": 216,
379
+ "no": 217,
380
+ "11": 218,
381
+ "es": 219,
382
+ "ja": 220,
383
+ "km": 221,
384
+ "##е": 222,
385
+ "##an": 223,
386
+ "om": 224,
387
+ "im": 225,
388
+ "##ta": 226,
389
+ "dan": 227,
390
+ "##te": 228,
391
+ "##na": 229,
392
+ "para": 230,
393
+ "mit": 231,
394
+ "El": 232,
395
+ "his": 233,
396
+ "##у": 234,
397
+ "ha": 235,
398
+ "##da": 236,
399
+ "##ing": 237,
400
+ "une": 238,
401
+ "##h": 239,
402
+ "##ne": 240,
403
+ "##g": 241,
404
+ "das": 242,
405
+ "##in": 243,
406
+ "##re": 244,
407
+ "par": 245,
408
+ "##us": 246,
409
+ "##de": 247,
410
+ "au": 248,
411
+ "dans": 249,
412
+ "he": 250,
413
+ "che": 251,
414
+ "em": 252,
415
+ "dem": 253,
416
+ "19": 254,
417
+ "til": 255,
418
+ "се": 256,
419
+ "han": 257,
420
+ "##ia": 258,
421
+ "##le": 259,
422
+ "las": 260,
423
+ "della": 261,
424
+ "new": 262,
425
+ "##ra": 263,
426
+ "##is": 264,
427
+ "um": 265,
428
+ "si": 266,
429
+ "var": 267,
430
+ "are": 268,
431
+ "op": 269,
432
+ "zu": 270,
433
+ "##et": 271,
434
+ "were": 272,
435
+ "##os": 273,
436
+ "od": 274,
437
+ "son": 275,
438
+ "##о": 276,
439
+ "##do": 277,
440
+ "which": 278,
441
+ "##ja": 279,
442
+ "va": 280,
443
+ "pour": 281,
444
+ "ve": 282,
445
+ "##ti": 283,
446
+ "sur": 284,
447
+ "##la": 285,
448
+ "##ed": 286,
449
+ "war": 287,
450
+ "##to": 288,
451
+ "##se": 289,
452
+ "##ni": 290,
453
+ "##no": 291,
454
+ "be": 292,
455
+ "det": 293,
456
+ "##і": 294,
457
+ "##х": 295,
458
+ "gov": 296,
459
+ "##ar": 297,
460
+ "qui": 298,
461
+ "az": 299,
462
+ "te": 300,
463
+ "##va": 301,
464
+ "##nt": 302,
465
+ "##ma": 303,
466
+ "##ka": 304,
467
+ "had": 305,
468
+ "##ng": 306,
469
+ "also": 307,
470
+ "so": 308,
471
+ "##je": 309,
472
+ "##li": 310,
473
+ "am": 311,
474
+ "has": 312,
475
+ "dos": 313,
476
+ "ur": 314,
477
+ "##ie": 315,
478
+ "##ri": 316,
479
+ "entre": 317,
480
+ "##as": 318,
481
+ "lo": 319,
482
+ "era": 320,
483
+ "ni": 321,
484
+ "##al": 322,
485
+ "##j": 323,
486
+ "##ce": 324,
487
+ "first": 325,
488
+ "##ca": 326,
489
+ "##ment": 327,
490
+ "os": 328,
491
+ "met": 329,
492
+ "ou": 330,
493
+ "all": 331,
494
+ "##ko": 332,
495
+ "aus": 333,
496
+ "non": 334,
497
+ "##si": 335,
498
+ "##em": 336,
499
+ "##ly": 337,
500
+ "##b": 338,
501
+ "film": 339,
502
+ "##um": 340,
503
+ "##sa": 341,
504
+ "##v": 342,
505
+ "##ga": 343,
506
+ "##it": 344,
507
+ "##mi": 345,
508
+ "##ki": 346,
509
+ "po": 347,
510
+ "##ge": 348,
511
+ "##at": 349,
512
+ "##ba": 350,
513
+ "##ur": 351,
514
+ "##ke": 352,
515
+ "##st": 353,
516
+ "##ro": 354,
517
+ "##el": 355,
518
+ "##f": 356,
519
+ "##man": 357,
520
+ "##ci": 358,
521
+ "##ul": 359,
522
+ "##ndo": 360,
523
+ "##mente": 361,
524
+ "##ve": 362,
525
+ "##me": 363,
526
+ "##den": 364,
527
+ "##za": 365,
528
+ "##io": 366,
529
+ "##or": 367,
530
+ "##nya": 368,
531
+ "##ya": 369,
532
+ "##ten": 370,
533
+ "##x": 371,
534
+ "##om": 372,
535
+ "##di": 373,
536
+ "##kan": 374,
537
+ "##ek": 375,
538
+ "into": 376,
539
+ "##ak": 377,
540
+ "till": 378,
541
+ "##lo": 379,
542
+ "ble": 380,
543
+ "ka": 381,
544
+ "##S": 382,
545
+ "mai": 383,
546
+ "up": 384,
547
+ "ng": 385,
548
+ "##то": 386,
549
+ "aux": 387,
550
+ "##ny": 388,
551
+ "##ho": 389,
552
+ "##1": 390,
553
+ "##ju": 391,
554
+ "##lar": 392,
555
+ "##ji": 393,
556
+ "##go": 394,
557
+ "##ts": 395,
558
+ "##co": 396,
559
+ "##ler": 397,
560
+ "##tion": 398,
561
+ "##ir": 399,
562
+ "ad": 400,
563
+ "##ku": 401,
564
+ "##ze": 402,
565
+ "##w": 403,
566
+ "ki": 404,
567
+ "##ns": 405,
568
+ "##ik": 406,
569
+ "##ers": 407,
570
+ "##ry": 408,
571
+ "me": 409,
572
+ "##sen": 410,
573
+ "##des": 411,
574
+ "##ha": 412,
575
+ "##ban": 413,
576
+ "during": 414,
577
+ "where": 415,
578
+ "ze": 416,
579
+ "##rs": 417,
580
+ "can": 418,
581
+ "out": 419,
582
+ "wie": 420,
583
+ "со": 421,
584
+ "##ben": 422,
585
+ "##ren": 423,
586
+ "##sta": 424,
587
+ "##rt": 425,
588
+ "##tu": 426,
589
+ "fu": 427,
590
+ "##am": 428,
591
+ "##ou": 429,
592
+ "##ria": 430,
593
+ "##ov": 431,
594
+ "##il": 432,
595
+ "##mo": 433,
596
+ "vom": 434,
597
+ "##wa": 435,
598
+ "##jo": 436,
599
+ "##ica": 437,
600
+ "##be": 438,
601
+ "##ion": 439,
602
+ "##ken": 440,
603
+ "##ina": 441,
604
+ "##land": 442,
605
+ "##lla": 443,
606
+ "nu": 444,
607
+ "##ine": 445,
608
+ "##un": 446,
609
+ "club": 447,
610
+ "##che": 448,
611
+ "team": 449,
612
+ "##ner": 450,
613
+ "##ic": 451,
614
+ "ca": 452,
615
+ "##ok": 453,
616
+ "##ig": 454,
617
+ "##th": 455,
618
+ "##nu": 456,
619
+ "##ada": 457,
620
+ "##ste": 458,
621
+ "##ut": 459,
622
+ "pe": 460,
623
+ "ke": 461,
624
+ "area": 462,
625
+ "any": 463,
626
+ "##dos": 464,
627
+ "##ton": 465,
628
+ "##que": 466,
629
+ "vor": 467,
630
+ "##ty": 468,
631
+ "##tes": 469,
632
+ "##ble": 470,
633
+ "##das": 471
634
+ }
635
+ }
636
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": true,
48
+ "lowercase": true,
49
+ "mask_token": "[MASK]",
50
+ "max_length": 64,
51
+ "model_max_length": 1000000000000000019884624838656,
52
+ "never_split": null,
53
+ "pad_token": "[PAD]",
54
+ "sep_token": "[SEP]",
55
+ "stride": 0,
56
+ "strip_accents": null,
57
+ "tokenize_chinese_chars": true,
58
+ "tokenizer_class": "BertTokenizer",
59
+ "truncation_side": "right",
60
+ "truncation_strategy": "longest_first",
61
+ "unk_token": "[UNK]"
62
+ }
vocab.txt ADDED
@@ -0,0 +1,472 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [PAD]
2
+ [UNK]
3
+ [CLS]
4
+ [SEP]
5
+ [MASK]
6
+ &
7
+ '
8
+ *
9
+ ,
10
+ -
11
+ .
12
+ /
13
+ ?
14
+ 0
15
+ 1
16
+ 2
17
+ 3
18
+ 4
19
+ 5
20
+ 6
21
+ 7
22
+ 8
23
+ 9
24
+ :
25
+ ;
26
+ \
27
+ _
28
+ a
29
+ b
30
+ c
31
+ d
32
+ e
33
+ f
34
+ g
35
+ h
36
+ i
37
+ j
38
+ k
39
+ l
40
+ m
41
+ n
42
+ o
43
+ p
44
+ q
45
+ r
46
+ s
47
+ t
48
+ u
49
+ v
50
+ w
51
+ x
52
+ y
53
+ z
54
+ //
55
+ //:
56
+ http
57
+ https
58
+ ftp
59
+ sftp
60
+ mailto
61
+ tel
62
+ file
63
+ ws
64
+ wss
65
+ rtmp
66
+ ssh
67
+ ldap
68
+ ldaps
69
+ nntp
70
+ gopher
71
+ telnet
72
+ view
73
+ source
74
+ about
75
+ chrome
76
+ data
77
+ irc
78
+ ircs
79
+ magnet
80
+ mms
81
+ redis
82
+ rsync
83
+ rtsp
84
+ svn
85
+ vnc
86
+ webcal
87
+ xmpp
88
+ dns
89
+ ntp
90
+ ip
91
+ com
92
+ de
93
+ net
94
+ uk
95
+ cn
96
+ org
97
+ info
98
+ nl
99
+ eu
100
+ ru
101
+ su
102
+ href
103
+ br
104
+ htm
105
+ php
106
+ co
107
+ ly
108
+ bit
109
+ log
110
+ index
111
+ bank
112
+ za
113
+ direct
114
+ xml
115
+ mail
116
+ it
117
+ www
118
+ run
119
+ security
120
+ code
121
+ promo
122
+ jpg
123
+ img
124
+ pay
125
+ form
126
+ docs
127
+ host
128
+ ec
129
+ cx
130
+ free
131
+ true
132
+ false
133
+ amp
134
+ blog
135
+ key
136
+ pal
137
+ contact
138
+ online
139
+ abc
140
+ media
141
+ admin
142
+ etc
143
+ login
144
+ cmd
145
+ bin
146
+ web
147
+ verif
148
+ the
149
+ in
150
+ ##s
151
+ of
152
+ la
153
+ en
154
+ and
155
+ ##e
156
+ ##a
157
+ to
158
+ ##n
159
+ ##i
160
+ der
161
+ un
162
+ di
163
+ que
164
+ ##t
165
+ is
166
+ el
167
+ se
168
+ del
169
+ die
170
+ ##r
171
+ und
172
+ et
173
+ na
174
+ ##o
175
+ was
176
+ on
177
+ ##en
178
+ ##u
179
+ des
180
+ den
181
+ le
182
+ for
183
+ da
184
+ je
185
+ van
186
+ as
187
+ ##m
188
+ sa
189
+ do
190
+ 10
191
+ an
192
+ les
193
+ una
194
+ il
195
+ by
196
+ og
197
+ ##y
198
+ at
199
+ ##l
200
+ ##d
201
+ er
202
+ al
203
+ ##er
204
+ von
205
+ du
206
+ av
207
+ ##es
208
+ med
209
+ con
210
+ ##k
211
+ est
212
+ per
213
+ som
214
+ los
215
+ por
216
+ from
217
+ that
218
+ no
219
+ 11
220
+ es
221
+ ja
222
+ km
223
+ ##е
224
+ ##an
225
+ om
226
+ im
227
+ ##ta
228
+ dan
229
+ ##te
230
+ ##na
231
+ para
232
+ mit
233
+ El
234
+ his
235
+ ##у
236
+ ha
237
+ ##da
238
+ ##ing
239
+ une
240
+ ##h
241
+ ##ne
242
+ ##g
243
+ das
244
+ ##in
245
+ ##re
246
+ par
247
+ ##us
248
+ ##de
249
+ au
250
+ dans
251
+ he
252
+ che
253
+ em
254
+ dem
255
+ 19
256
+ til
257
+ се
258
+ han
259
+ ##ia
260
+ ##le
261
+ las
262
+ della
263
+ new
264
+ ##ra
265
+ ##is
266
+ um
267
+ si
268
+ var
269
+ are
270
+ op
271
+ zu
272
+ ##et
273
+ were
274
+ ##os
275
+ od
276
+ son
277
+ ##о
278
+ ##do
279
+ which
280
+ ##ja
281
+ va
282
+ pour
283
+ ve
284
+ ##ti
285
+ sur
286
+ ##la
287
+ ##ed
288
+ war
289
+ ##to
290
+ ##se
291
+ ##ni
292
+ ##no
293
+ be
294
+ det
295
+ ##і
296
+ ##х
297
+ gov
298
+ ##ar
299
+ qui
300
+ az
301
+ te
302
+ ##va
303
+ ##nt
304
+ ##ma
305
+ ##ka
306
+ had
307
+ ##ng
308
+ also
309
+ so
310
+ ##je
311
+ ##li
312
+ am
313
+ has
314
+ dos
315
+ ur
316
+ ##ie
317
+ ##ri
318
+ entre
319
+ ##as
320
+ lo
321
+ era
322
+ ni
323
+ ##al
324
+ ##j
325
+ ##ce
326
+ first
327
+ ##ca
328
+ ##ment
329
+ os
330
+ met
331
+ ou
332
+ all
333
+ ##ko
334
+ aus
335
+ non
336
+ ##si
337
+ ##em
338
+ ##ly
339
+ ##b
340
+ film
341
+ ##um
342
+ ##sa
343
+ ##v
344
+ ##ga
345
+ ##it
346
+ ##mi
347
+ ##ki
348
+ po
349
+ ##ge
350
+ ##at
351
+ ##ba
352
+ ##ur
353
+ ##ke
354
+ ##st
355
+ ##ro
356
+ ##el
357
+ ##f
358
+ ##man
359
+ ##ci
360
+ ##ul
361
+ ##ndo
362
+ ##mente
363
+ ##ve
364
+ ##me
365
+ ##den
366
+ ##za
367
+ ##io
368
+ ##or
369
+ ##nya
370
+ ##ya
371
+ ##ten
372
+ ##x
373
+ ##om
374
+ ##di
375
+ ##kan
376
+ ##ek
377
+ into
378
+ ##ak
379
+ till
380
+ ##lo
381
+ ble
382
+ ka
383
+ ##S
384
+ mai
385
+ up
386
+ ng
387
+ ##то
388
+ aux
389
+ ##ny
390
+ ##ho
391
+ ##1
392
+ ##ju
393
+ ##lar
394
+ ##ji
395
+ ##go
396
+ ##ts
397
+ ##co
398
+ ##ler
399
+ ##tion
400
+ ##ir
401
+ ad
402
+ ##ku
403
+ ##ze
404
+ ##w
405
+ ki
406
+ ##ns
407
+ ##ik
408
+ ##ers
409
+ ##ry
410
+ me
411
+ ##sen
412
+ ##des
413
+ ##ha
414
+ ##ban
415
+ during
416
+ where
417
+ ze
418
+ ##rs
419
+ can
420
+ out
421
+ wie
422
+ со
423
+ ##ben
424
+ ##ren
425
+ ##sta
426
+ ##rt
427
+ ##tu
428
+ fu
429
+ ##am
430
+ ##ou
431
+ ##ria
432
+ ##ov
433
+ ##il
434
+ ##mo
435
+ vom
436
+ ##wa
437
+ ##jo
438
+ ##ica
439
+ ##be
440
+ ##ion
441
+ ##ken
442
+ ##ina
443
+ ##land
444
+ ##lla
445
+ nu
446
+ ##ine
447
+ ##un
448
+ club
449
+ ##che
450
+ team
451
+ ##ner
452
+ ##ic
453
+ ca
454
+ ##ok
455
+ ##ig
456
+ ##th
457
+ ##nu
458
+ ##ada
459
+ ##ste
460
+ ##ut
461
+ pe
462
+ ke
463
+ area
464
+ any
465
+ ##dos
466
+ ##ton
467
+ ##que
468
+ vor
469
+ ##ty
470
+ ##tes
471
+ ##ble
472
+ ##das