nioushasadjadi committed
Commit 82681b6 · Parent: 92d46e2

Fixing encoder and tokenize functions.

Files changed (1):
  tokenizer.py  +13 -18
tokenizer.py CHANGED

@@ -1,5 +1,6 @@
 from transformers import PreTrainedTokenizer
 from huggingface_hub import hf_hub_download
+import torch
 import json
 import os
 from itertools import product
@@ -25,15 +26,24 @@ class KmerTokenizer(PreTrainedTokenizer):
         self.unk_token = "[UNK]"
         # self.pad_token = "[PAD]"
 
-    def _tokenize(self, text):
+    def _tokenize(self, text, **kwargs):
         splits = [text[i:i + self.k] for i in range(0, len(text) - self.k + 1, self.stride)]
-        return self.convert_tokens_to_ids(splits)
+        if kwargs.get('return_tensors') == 'pt':
+            return torch.tensor(splits)
+        return splits
+
+    def _encode(self, text, **kwargs):
+        tokens = self._tokenize(text, **kwargs)
+        token_ids = self.convert_tokens_to_ids(tokens)
+        if kwargs.get('return_tensors') == 'pt':
+            return torch.tensor(token_ids)
+        return token_ids
 
     def convert_tokens_to_ids(self, tokens):
         unk_id = self.vocab_dict.get(self.unk_token)
         return [self.vocab_dict[token] if token in self.vocab_dict else unk_id for token in tokens]
 
-    def convert_ids_to_tokens(self, ids):
+    def convert_ids_to_tokens(self, ids, **kwargs):
         id_to_token = {idx: token for token, idx in self.vocab_dict.items()}
         return [id_to_token.get(id_, self.unk_token) for id_ in ids]
 
@@ -58,21 +68,6 @@ class KmerTokenizer(PreTrainedTokenizer):
                 "k": self.k,
                 "stride": self.stride
             },
-            # "post_processor": {
-            #     "type": "TemplateProcessing",
-            #     "single": [
-            #         {"SpecialToken": {"id": self.cls_token, "type_id": 0}},
-            #         {"Sequence": {"id": "A", "type_id": 0}},
-            #         {"SpecialToken": {"id": self.sep_token, "type_id": 0}}
-            #     ],
-            #     "pair": [
-            #         {"SpecialToken": {"id": self.cls_token, "type_id": 0}},
-            #         {"Sequence": {"id": "A", "type_id": 0}},
-            #         {"SpecialToken": {"id": self.sep_token, "type_id": 0}},
-            #         {"Sequence": {"id": "B", "type_id": 1}},
-            #         {"SpecialToken": {"id": self.sep_token, "type_id": 1}}
-            #     ]
-            # }
             "model": {
                 "type": "k-mer",
                 "k": self.k,