nioushasadjadi committed on
Commit
1bbc46c
·
1 Parent(s): 477c08b

First version of BarcodeBERT kmer tokenizer

Browse files
Files changed (2) hide show
  1. tokenizer.json +286 -0
  2. tokenizer_config.json +27 -0
tokenizer.json ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "added_tokens": [
4
+ {
5
+ "id": 0,
6
+ "content": "[MASK]",
7
+ "special": true
8
+ },
9
+ {
10
+ "id": 1,
11
+ "content": "[UNK]",
12
+ "special": true
13
+ }
14
+ ],
15
+ "pre_tokenizer": {
16
+ "type": "KmerSplitter",
17
+ "k": 4,
18
+ "stride": 4
19
+ },
20
+ "model": {
21
+ "type": "k-mer",
22
+ "k": 4,
23
+ "stride": 4,
24
+ "unk_token": "[UNK]",
25
+ "vocab": {
26
+ "[MASK]": 0,
27
+ "[UNK]": 1,
28
+ "AAAA": 2,
29
+ "AAAC": 3,
30
+ "AAAG": 4,
31
+ "AAAT": 5,
32
+ "AACA": 6,
33
+ "AACC": 7,
34
+ "AACG": 8,
35
+ "AACT": 9,
36
+ "AAGA": 10,
37
+ "AAGC": 11,
38
+ "AAGG": 12,
39
+ "AAGT": 13,
40
+ "AATA": 14,
41
+ "AATC": 15,
42
+ "AATG": 16,
43
+ "AATT": 17,
44
+ "ACAA": 18,
45
+ "ACAC": 19,
46
+ "ACAG": 20,
47
+ "ACAT": 21,
48
+ "ACCA": 22,
49
+ "ACCC": 23,
50
+ "ACCG": 24,
51
+ "ACCT": 25,
52
+ "ACGA": 26,
53
+ "ACGC": 27,
54
+ "ACGG": 28,
55
+ "ACGT": 29,
56
+ "ACTA": 30,
57
+ "ACTC": 31,
58
+ "ACTG": 32,
59
+ "ACTT": 33,
60
+ "AGAA": 34,
61
+ "AGAC": 35,
62
+ "AGAG": 36,
63
+ "AGAT": 37,
64
+ "AGCA": 38,
65
+ "AGCC": 39,
66
+ "AGCG": 40,
67
+ "AGCT": 41,
68
+ "AGGA": 42,
69
+ "AGGC": 43,
70
+ "AGGG": 44,
71
+ "AGGT": 45,
72
+ "AGTA": 46,
73
+ "AGTC": 47,
74
+ "AGTG": 48,
75
+ "AGTT": 49,
76
+ "ATAA": 50,
77
+ "ATAC": 51,
78
+ "ATAG": 52,
79
+ "ATAT": 53,
80
+ "ATCA": 54,
81
+ "ATCC": 55,
82
+ "ATCG": 56,
83
+ "ATCT": 57,
84
+ "ATGA": 58,
85
+ "ATGC": 59,
86
+ "ATGG": 60,
87
+ "ATGT": 61,
88
+ "ATTA": 62,
89
+ "ATTC": 63,
90
+ "ATTG": 64,
91
+ "ATTT": 65,
92
+ "CAAA": 66,
93
+ "CAAC": 67,
94
+ "CAAG": 68,
95
+ "CAAT": 69,
96
+ "CACA": 70,
97
+ "CACC": 71,
98
+ "CACG": 72,
99
+ "CACT": 73,
100
+ "CAGA": 74,
101
+ "CAGC": 75,
102
+ "CAGG": 76,
103
+ "CAGT": 77,
104
+ "CATA": 78,
105
+ "CATC": 79,
106
+ "CATG": 80,
107
+ "CATT": 81,
108
+ "CCAA": 82,
109
+ "CCAC": 83,
110
+ "CCAG": 84,
111
+ "CCAT": 85,
112
+ "CCCA": 86,
113
+ "CCCC": 87,
114
+ "CCCG": 88,
115
+ "CCCT": 89,
116
+ "CCGA": 90,
117
+ "CCGC": 91,
118
+ "CCGG": 92,
119
+ "CCGT": 93,
120
+ "CCTA": 94,
121
+ "CCTC": 95,
122
+ "CCTG": 96,
123
+ "CCTT": 97,
124
+ "CGAA": 98,
125
+ "CGAC": 99,
126
+ "CGAG": 100,
127
+ "CGAT": 101,
128
+ "CGCA": 102,
129
+ "CGCC": 103,
130
+ "CGCG": 104,
131
+ "CGCT": 105,
132
+ "CGGA": 106,
133
+ "CGGC": 107,
134
+ "CGGG": 108,
135
+ "CGGT": 109,
136
+ "CGTA": 110,
137
+ "CGTC": 111,
138
+ "CGTG": 112,
139
+ "CGTT": 113,
140
+ "CTAA": 114,
141
+ "CTAC": 115,
142
+ "CTAG": 116,
143
+ "CTAT": 117,
144
+ "CTCA": 118,
145
+ "CTCC": 119,
146
+ "CTCG": 120,
147
+ "CTCT": 121,
148
+ "CTGA": 122,
149
+ "CTGC": 123,
150
+ "CTGG": 124,
151
+ "CTGT": 125,
152
+ "CTTA": 126,
153
+ "CTTC": 127,
154
+ "CTTG": 128,
155
+ "CTTT": 129,
156
+ "GAAA": 130,
157
+ "GAAC": 131,
158
+ "GAAG": 132,
159
+ "GAAT": 133,
160
+ "GACA": 134,
161
+ "GACC": 135,
162
+ "GACG": 136,
163
+ "GACT": 137,
164
+ "GAGA": 138,
165
+ "GAGC": 139,
166
+ "GAGG": 140,
167
+ "GAGT": 141,
168
+ "GATA": 142,
169
+ "GATC": 143,
170
+ "GATG": 144,
171
+ "GATT": 145,
172
+ "GCAA": 146,
173
+ "GCAC": 147,
174
+ "GCAG": 148,
175
+ "GCAT": 149,
176
+ "GCCA": 150,
177
+ "GCCC": 151,
178
+ "GCCG": 152,
179
+ "GCCT": 153,
180
+ "GCGA": 154,
181
+ "GCGC": 155,
182
+ "GCGG": 156,
183
+ "GCGT": 157,
184
+ "GCTA": 158,
185
+ "GCTC": 159,
186
+ "GCTG": 160,
187
+ "GCTT": 161,
188
+ "GGAA": 162,
189
+ "GGAC": 163,
190
+ "GGAG": 164,
191
+ "GGAT": 165,
192
+ "GGCA": 166,
193
+ "GGCC": 167,
194
+ "GGCG": 168,
195
+ "GGCT": 169,
196
+ "GGGA": 170,
197
+ "GGGC": 171,
198
+ "GGGG": 172,
199
+ "GGGT": 173,
200
+ "GGTA": 174,
201
+ "GGTC": 175,
202
+ "GGTG": 176,
203
+ "GGTT": 177,
204
+ "GTAA": 178,
205
+ "GTAC": 179,
206
+ "GTAG": 180,
207
+ "GTAT": 181,
208
+ "GTCA": 182,
209
+ "GTCC": 183,
210
+ "GTCG": 184,
211
+ "GTCT": 185,
212
+ "GTGA": 186,
213
+ "GTGC": 187,
214
+ "GTGG": 188,
215
+ "GTGT": 189,
216
+ "GTTA": 190,
217
+ "GTTC": 191,
218
+ "GTTG": 192,
219
+ "GTTT": 193,
220
+ "TAAA": 194,
221
+ "TAAC": 195,
222
+ "TAAG": 196,
223
+ "TAAT": 197,
224
+ "TACA": 198,
225
+ "TACC": 199,
226
+ "TACG": 200,
227
+ "TACT": 201,
228
+ "TAGA": 202,
229
+ "TAGC": 203,
230
+ "TAGG": 204,
231
+ "TAGT": 205,
232
+ "TATA": 206,
233
+ "TATC": 207,
234
+ "TATG": 208,
235
+ "TATT": 209,
236
+ "TCAA": 210,
237
+ "TCAC": 211,
238
+ "TCAG": 212,
239
+ "TCAT": 213,
240
+ "TCCA": 214,
241
+ "TCCC": 215,
242
+ "TCCG": 216,
243
+ "TCCT": 217,
244
+ "TCGA": 218,
245
+ "TCGC": 219,
246
+ "TCGG": 220,
247
+ "TCGT": 221,
248
+ "TCTA": 222,
249
+ "TCTC": 223,
250
+ "TCTG": 224,
251
+ "TCTT": 225,
252
+ "TGAA": 226,
253
+ "TGAC": 227,
254
+ "TGAG": 228,
255
+ "TGAT": 229,
256
+ "TGCA": 230,
257
+ "TGCC": 231,
258
+ "TGCG": 232,
259
+ "TGCT": 233,
260
+ "TGGA": 234,
261
+ "TGGC": 235,
262
+ "TGGG": 236,
263
+ "TGGT": 237,
264
+ "TGTA": 238,
265
+ "TGTC": 239,
266
+ "TGTG": 240,
267
+ "TGTT": 241,
268
+ "TTAA": 242,
269
+ "TTAC": 243,
270
+ "TTAG": 244,
271
+ "TTAT": 245,
272
+ "TTCA": 246,
273
+ "TTCC": 247,
274
+ "TTCG": 248,
275
+ "TTCT": 249,
276
+ "TTGA": 250,
277
+ "TTGC": 251,
278
+ "TTGG": 252,
279
+ "TTGT": 253,
280
+ "TTTA": 254,
281
+ "TTTC": 255,
282
+ "TTTG": 256,
283
+ "TTTT": 257
284
+ }
285
+ }
286
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[MASK]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ }
19
+ },
20
+ "clean_up_tokenization_spaces": true,
21
+ "mask_token": "[MASK]",
22
+ "model_max_length": 1000000000000.0,
23
+ "tokenizer_class": "KmerTokenizer",
24
+ "unk_token": "[UNK]",
25
+ "k": 4,
26
+ "stride": 4
27
+ }