nioushasadjadi committed on
Commit
bcd9e56
·
1 Parent(s): 82681b6

Fixing tokenizer return type

Browse files
Files changed (1) hide show
  1. tokenizer.py +2 -2
tokenizer.py CHANGED
@@ -26,14 +26,14 @@ class KmerTokenizer(PreTrainedTokenizer):
26
  self.unk_token = "[UNK]"
27
  # self.pad_token = "[PAD]"
28
 
29
- def _tokenize(self, text, **kwargs):
30
  splits = [text[i:i + self.k] for i in range(0, len(text) - self.k + 1, self.stride)]
31
  if kwargs.get('return_tensors') == 'pt':
32
  return torch.tensor(splits)
33
  return splits
34
 
35
  def _encode(self, text, **kwargs):
36
- tokens = self._tokenize(text, **kwargs)
37
  token_ids = self.convert_tokens_to_ids(tokens)
38
  if kwargs.get('return_tensors') == 'pt':
39
  return torch.tensor(token_ids)
 
26
  self.unk_token = "[UNK]"
27
  # self.pad_token = "[PAD]"
28
 
29
+ def tokenize(self, text, **kwargs):
30
  splits = [text[i:i + self.k] for i in range(0, len(text) - self.k + 1, self.stride)]
31
  if kwargs.get('return_tensors') == 'pt':
32
  return torch.tensor(splits)
33
  return splits
34
 
35
  def _encode(self, text, **kwargs):
36
+ tokens = self.tokenize(text, **kwargs)
37
  token_ids = self.convert_tokens_to_ids(tokens)
38
  if kwargs.get('return_tensors') == 'pt':
39
  return torch.tensor(token_ids)