Transformers
Hack90 committed on
Commit
f6472a3
·
verified ·
1 Parent(s): dfa4ed2

Upload tokenizer

Browse files
Files changed (2) hide show
  1. special_tokens_map.json +3 -21
  2. tokenizer.json +124 -15
special_tokens_map.json CHANGED
@@ -1,23 +1,5 @@
1
  {
2
- "bos_token": {
3
- "content": "<|endoftext|>",
4
- "lstrip": false,
5
- "normalized": false,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "eos_token": {
10
- "content": "<|endoftext|>",
11
- "lstrip": false,
12
- "normalized": false,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "unk_token": {
17
- "content": "<|endoftext|>",
18
- "lstrip": false,
19
- "normalized": false,
20
- "rstrip": false,
21
- "single_word": false
22
- }
23
  }
 
1
  {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "unk_token": "<|endoftext|>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  }
tokenizer.json CHANGED
@@ -15,30 +15,139 @@
15
  ],
16
  "normalizer": null,
17
  "pre_tokenizer": {
18
- "type": "Sequence",
19
- "pretokenizers": [
20
- {
21
- "type": "Split",
22
- "pattern": {
23
- "String": ""
24
- },
25
- "behavior": "Isolated",
26
- "invert": false
27
- }
28
- ]
29
  },
30
  "post_processor": null,
31
- "decoder": null,
 
 
 
 
 
32
  "model": {
33
- "type": "WordLevel",
 
 
 
 
 
 
 
34
  "vocab": {
35
  "<|endoftext|>": 0,
36
  "a": 1,
37
  "t": 2,
38
  "c": 3,
39
  "g": 4,
40
- "n": 5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  },
42
- "unk_token": "<|endoftext|>"
43
  }
44
  }
 
15
  ],
16
  "normalizer": null,
17
  "pre_tokenizer": {
18
+ "type": "ByteLevel",
19
+ "add_prefix_space": false,
20
+ "trim_offsets": true,
21
+ "use_regex": true
 
 
 
 
 
 
 
22
  },
23
  "post_processor": null,
24
+ "decoder": {
25
+ "type": "ByteLevel",
26
+ "add_prefix_space": true,
27
+ "trim_offsets": true,
28
+ "use_regex": true
29
+ },
30
  "model": {
31
+ "type": "BPE",
32
+ "dropout": null,
33
+ "unk_token": "<|endoftext|>",
34
+ "continuing_subword_prefix": null,
35
+ "end_of_word_suffix": null,
36
+ "fuse_unk": false,
37
+ "byte_fallback": false,
38
+ "ignore_merges": false,
39
  "vocab": {
40
  "<|endoftext|>": 0,
41
  "a": 1,
42
  "t": 2,
43
  "c": 3,
44
  "g": 4,
45
+ "n": 5,
46
+ "k": 6,
47
+ "z": 7,
48
+ "m": 8,
49
+ "l": 9,
50
+ "1": 10,
51
+ "2": 11,
52
+ "3": 12,
53
+ "4": 13,
54
+ "5": 14,
55
+ "6": 15,
56
+ "7": 16,
57
+ "8": 17,
58
+ "9": 18,
59
+ "10": 19,
60
+ "11": 20,
61
+ "12": 21,
62
+ "13": 22,
63
+ "14": 23,
64
+ "15": 24,
65
+ "16": 25,
66
+ "17": 26,
67
+ "18": 27,
68
+ "19": 28,
69
+ "20": 29,
70
+ "21": 30,
71
+ "22": 31,
72
+ "23": 32,
73
+ "24": 33,
74
+ "25": 34,
75
+ "26": 35,
76
+ "27": 36,
77
+ "28": 37,
78
+ "29": 38,
79
+ "30": 39,
80
+ "31": 40,
81
+ "32": 41,
82
+ "33": 42,
83
+ "34": 43,
84
+ "35": 44,
85
+ "36": 45,
86
+ "37": 46,
87
+ "38": 47,
88
+ "39": 48,
89
+ "40": 49,
90
+ "41": 50,
91
+ "42": 51,
92
+ "43": 52,
93
+ "44": 53,
94
+ "45": 54,
95
+ "46": 55,
96
+ "47": 56,
97
+ "48": 57,
98
+ "49": 58,
99
+ "50": 59,
100
+ "51": 60,
101
+ "52": 61,
102
+ "53": 62,
103
+ "54": 63,
104
+ "55": 64,
105
+ "56": 65,
106
+ "57": 66,
107
+ "58": 67,
108
+ "59": 68,
109
+ "60": 69,
110
+ "61": 70,
111
+ "62": 71,
112
+ "63": 72,
113
+ "64": 73,
114
+ "65": 74,
115
+ "66": 75,
116
+ "67": 76,
117
+ "68": 77,
118
+ "69": 78,
119
+ "70": 79,
120
+ "71": 80,
121
+ "72": 81,
122
+ "73": 82,
123
+ "74": 83,
124
+ "75": 84,
125
+ "76": 85,
126
+ "77": 86,
127
+ "78": 87,
128
+ "79": 88,
129
+ "80": 89,
130
+ "81": 90,
131
+ "82": 91,
132
+ "83": 92,
133
+ "84": 93,
134
+ "85": 94,
135
+ "86": 95,
136
+ "87": 96,
137
+ "88": 97,
138
+ "89": 98,
139
+ "90": 99,
140
+ "91": 100,
141
+ "92": 101,
142
+ "93": 102,
143
+ "94": 103,
144
+ "95": 104,
145
+ "96": 105,
146
+ "97": 106,
147
+ "98": 107,
148
+ "99": 108,
149
+ "100": 109
150
  },
151
+ "merges": []
152
  }
153
  }