Kaspar Beelen
committed on
Commit
•
cd57bf1
1
Parent(s):
ad792cc
add tokenizer
Browse files
- added_tokens.json +84 -0
- special_tokens_map.json +13 -0
- tokenizer.json +0 -0
- tokenizer_config.json +14 -0
- vocab.txt +0 -0
added_tokens.json
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"[1801]": 30579,
|
3 |
+
"[1802]": 30591,
|
4 |
+
"[1803]": 30577,
|
5 |
+
"[1804]": 30589,
|
6 |
+
"[1805]": 30596,
|
7 |
+
"[1806]": 30547,
|
8 |
+
"[1807]": 30532,
|
9 |
+
"[1808]": 30555,
|
10 |
+
"[1809]": 30530,
|
11 |
+
"[1810]": 30593,
|
12 |
+
"[1811]": 30584,
|
13 |
+
"[1812]": 30581,
|
14 |
+
"[1813]": 30553,
|
15 |
+
"[1814]": 30573,
|
16 |
+
"[1815]": 30536,
|
17 |
+
"[1816]": 30568,
|
18 |
+
"[1817]": 30587,
|
19 |
+
"[1818]": 30570,
|
20 |
+
"[1819]": 30586,
|
21 |
+
"[1820]": 30578,
|
22 |
+
"[1821]": 30597,
|
23 |
+
"[1822]": 30557,
|
24 |
+
"[1823]": 30561,
|
25 |
+
"[1824]": 30566,
|
26 |
+
"[1825]": 30569,
|
27 |
+
"[1826]": 30595,
|
28 |
+
"[1827]": 30580,
|
29 |
+
"[1828]": 30594,
|
30 |
+
"[1829]": 30582,
|
31 |
+
"[1830]": 30583,
|
32 |
+
"[1831]": 30534,
|
33 |
+
"[1832]": 30588,
|
34 |
+
"[1833]": 30590,
|
35 |
+
"[1834]": 30539,
|
36 |
+
"[1835]": 30565,
|
37 |
+
"[1836]": 30567,
|
38 |
+
"[1837]": 30549,
|
39 |
+
"[1838]": 30585,
|
40 |
+
"[1839]": 30592,
|
41 |
+
"[1840]": 30562,
|
42 |
+
"[1841]": 30541,
|
43 |
+
"[1842]": 30575,
|
44 |
+
"[1843]": 30598,
|
45 |
+
"[1844]": 30552,
|
46 |
+
"[1845]": 30554,
|
47 |
+
"[1846]": 30544,
|
48 |
+
"[1847]": 30558,
|
49 |
+
"[1848]": 30533,
|
50 |
+
"[1849]": 30531,
|
51 |
+
"[1850]": 30543,
|
52 |
+
"[1851]": 30559,
|
53 |
+
"[1852]": 30550,
|
54 |
+
"[1853]": 30551,
|
55 |
+
"[1854]": 30556,
|
56 |
+
"[1855]": 30542,
|
57 |
+
"[1856]": 30548,
|
58 |
+
"[1857]": 30563,
|
59 |
+
"[1858]": 30571,
|
60 |
+
"[1859]": 30529,
|
61 |
+
"[1860]": 30564,
|
62 |
+
"[1861]": 30538,
|
63 |
+
"[1862]": 30537,
|
64 |
+
"[1863]": 30546,
|
65 |
+
"[1864]": 30572,
|
66 |
+
"[1865]": 30535,
|
67 |
+
"[1866]": 30545,
|
68 |
+
"[1867]": 30560,
|
69 |
+
"[1868]": 30540,
|
70 |
+
"[1869]": 30576,
|
71 |
+
"[1870]": 30574,
|
72 |
+
"[1871]": 30599,
|
73 |
+
"[LOC]": 30603,
|
74 |
+
"[MET]": 30600,
|
75 |
+
"[POL]": 30602,
|
76 |
+
"[YEAR]": 30601,
|
77 |
+
"[con]": 30523,
|
78 |
+
"[lib]": 30522,
|
79 |
+
"[liverpool]": 30528,
|
80 |
+
"[london]": 30527,
|
81 |
+
"[neutr]": 30526,
|
82 |
+
"[none]": 30524,
|
83 |
+
"[rad]": 30525
|
84 |
+
}
|
special_tokens_map.json
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"additional_special_tokens": [
|
3 |
+
"[MET]",
|
4 |
+
"[YEAR]",
|
5 |
+
"[POL]",
|
6 |
+
"[LOC]"
|
7 |
+
],
|
8 |
+
"cls_token": "[CLS]",
|
9 |
+
"mask_token": "[MASK]",
|
10 |
+
"pad_token": "[PAD]",
|
11 |
+
"sep_token": "[SEP]",
|
12 |
+
"unk_token": "[UNK]"
|
13 |
+
}
|
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer_config.json
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cls_token": "[CLS]",
|
3 |
+
"do_lower_case": true,
|
4 |
+
"mask_token": "[MASK]",
|
5 |
+
"model_max_length": 512,
|
6 |
+
"name_or_path": "erwt-year-st",
|
7 |
+
"pad_token": "[PAD]",
|
8 |
+
"sep_token": "[SEP]",
|
9 |
+
"special_tokens_map_file": null,
|
10 |
+
"strip_accents": null,
|
11 |
+
"tokenize_chinese_chars": true,
|
12 |
+
"tokenizer_class": "DistilBertTokenizer",
|
13 |
+
"unk_token": "[UNK]"
|
14 |
+
}
|
vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|