nioushasadjadi
committed on
Commit
·
bcd9e56
1
Parent(s):
82681b6
Fixing tokenizer return type
Browse files
- tokenizer.py +2 -2
tokenizer.py
CHANGED
@@ -26,14 +26,14 @@ class KmerTokenizer(PreTrainedTokenizer):
|
|
26 |
self.unk_token = "[UNK]"
|
27 |
# self.pad_token = "[PAD]"
|
28 |
|
29 |
-
def
|
30 |
splits = [text[i:i + self.k] for i in range(0, len(text) - self.k + 1, self.stride)]
|
31 |
if kwargs.get('return_tensors') == 'pt':
|
32 |
return torch.tensor(splits)
|
33 |
return splits
|
34 |
|
35 |
def _encode(self, text, **kwargs):
|
36 |
-
tokens = self.
|
37 |
token_ids = self.convert_tokens_to_ids(tokens)
|
38 |
if kwargs.get('return_tensors') == 'pt':
|
39 |
return torch.tensor(token_ids)
|
|
|
26 |
self.unk_token = "[UNK]"
|
27 |
# self.pad_token = "[PAD]"
|
28 |
|
29 |
+
def tokenize(self, text, **kwargs):
|
30 |
splits = [text[i:i + self.k] for i in range(0, len(text) - self.k + 1, self.stride)]
|
31 |
if kwargs.get('return_tensors') == 'pt':
|
32 |
return torch.tensor(splits)
|
33 |
return splits
|
34 |
|
35 |
def _encode(self, text, **kwargs):
|
36 |
+
tokens = self.tokenize(text, **kwargs)
|
37 |
token_ids = self.convert_tokens_to_ids(tokens)
|
38 |
if kwargs.get('return_tensors') == 'pt':
|
39 |
return torch.tensor(token_ids)
|