nioushasadjadi committed on
Commit 156a2ea · Parent: 2fa1eca

Adding max_length and padding to tokenizer and encoder.

Files changed (3):
  1. tokenizer.json +3 -4
  2. tokenizer.py +23 -18
  3. tokenizer_config.json +1 -3
tokenizer.json CHANGED
@@ -15,12 +15,11 @@
   "pre_tokenizer": {
     "type": "KmerSplitter",
     "k": 4,
-    "stride": 4
+    "stride": 4,
+    "max_length": 660
   },
   "model": {
-    "type": "k-mer",
-    "k": 4,
-    "stride": 4,
+    "type": "KmerTokenizer",
     "unk_token": "[UNK]",
     "vocab": {
       "[MASK]": 0,
tokenizer.py CHANGED
@@ -7,9 +7,10 @@ from itertools import product


 class KmerTokenizer(PreTrainedTokenizer):
-    def __init__(self, vocab_dict=None, k=4, stride=4, **kwargs):
+    def __init__(self, vocab_dict=None, k=4, stride=4, max_len=660, **kwargs):
         self.k = k
         self.stride = stride
+        self.max_len = max_len
         self.special_tokens = ["[MASK]", "[UNK]"]

         if vocab_dict is None:
@@ -27,6 +28,11 @@ class KmerTokenizer(PreTrainedTokenizer):
         # self.pad_token = "[PAD]"

     def tokenize(self, text, **kwargs):
+        if len(text) > self.max_len:
+            text = text[:self.max_len]
+        if kwargs.get('padding'):
+            if len(text) < self.max_len:
+                text = text + 'N' * (self.max_len - len(text))
         splits = [text[i:i + self.k] for i in range(0, len(text) - self.k + 1, self.stride)]
         return splits

@@ -64,12 +70,11 @@ class KmerTokenizer(PreTrainedTokenizer):
             "pre_tokenizer": {
                 "type": "KmerSplitter",
                 "k": self.k,
-                "stride": self.stride
+                "stride": self.stride,
+                "max_length": self.max_len
             },
             "model": {
-                "type": "k-mer",
-                "k": self.k,
-                "stride": self.stride,
+                "type": "KmerTokenizer",
                 "unk_token": self.unk_token,
                 "vocab": self.vocab_dict
             },
@@ -96,9 +101,7 @@ class KmerTokenizer(PreTrainedTokenizer):
             "mask_token": "[MASK]",
             "model_max_length": 1e12,  # Set a high number, or adjust as needed
             "tokenizer_class": "KmerTokenizer",  # Set your tokenizer class name
-            "unk_token": "[UNK]",
-            "k": self.k,
-            "stride": self.stride
+            "unk_token": "[UNK]"
         }
         tokenizer_config_file = os.path.join(save_directory, "tokenizer_config.json")
         with open(tokenizer_config_file, "w", encoding="utf-8") as f:
@@ -109,24 +112,26 @@ class KmerTokenizer(PreTrainedTokenizer):
     @classmethod
     def from_pretrained(cls, pretrained_dir, **kwargs):
         # Load vocabulary
-        vocab_file = hf_hub_download(repo_id=pretrained_dir, filename="tokenizer.json")
         # vocab_file = os.path.join(pretrained_dir, "tokenizer.json")
-        with open(vocab_file, "r", encoding="utf-8") as f:
-            vocab_content = json.load(f)
-        vocab = vocab_content["model"]["vocab"]
-        # k = vocab_content["model"]["k"]
-        # stride = vocab_content["model"]["stride"]
+        vocab_file = hf_hub_download(repo_id=pretrained_dir, filename="tokenizer.json")
+        if os.path.exists(vocab_file):
+            with open(vocab_file, "r", encoding="utf-8") as f:
+                vocab_content = json.load(f)
+            vocab = vocab_content["model"]["vocab"]
+            k = vocab_content["pre_tokenizer"]["k"]
+            stride = vocab_content["pre_tokenizer"]["stride"]
+            max_len = vocab_content["pre_tokenizer"]["max_length"]
+        else:
+            raise ValueError(f"Vocabulary file not found at {vocab_file}")

-        # Load k and stride from tokenizer_config.json
+        # Check for the existence of tokenizer_config.json
         # tokenizer_config_file = os.path.join(pretrained_dir, "tokenizer_config.json")
         tokenizer_config_file = hf_hub_download(repo_id=pretrained_dir, filename="tokenizer_config.json")
         if os.path.exists(tokenizer_config_file):
             with open(tokenizer_config_file, "r", encoding="utf-8") as f:
                 tokenizer_config = json.load(f)
-            k = tokenizer_config.get("k", 4)  # Default to 4 if not specified
-            stride = tokenizer_config.get("stride", k)  # Default to k if not specified
         else:
             raise ValueError(f"Tokenizer config file not found at {tokenizer_config_file}")

         # Instantiate the tokenizer with loaded values
-        return cls(vocab=vocab, k=k, stride=stride, **kwargs)
+        return cls(vocab=vocab, k=k, stride=stride, max_len=max_len, **kwargs)
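
A minimal usage sketch of the new truncation and padding behavior, assuming KmerTokenizer can be imported from tokenizer.py and builds its default 4-mer vocabulary when no vocab_dict is given (as the __init__ in this diff suggests):

from tokenizer import KmerTokenizer  # local module from this repo

tok = KmerTokenizer(k=4, stride=4, max_len=660)

seq = "ACGTACGTAC"          # 10 bases; the trailing "AC" is shorter than k and dropped
print(tok.tokenize(seq))    # ['ACGT', 'ACGT']

# With padding=True the text is right-padded with 'N' up to max_len=660,
# yielding (660 - 4) // 4 + 1 = 165 k-mers; 'N'-containing k-mers are not in
# the A/C/G/T vocabulary, so they should encode as [UNK].
print(len(tok.tokenize(seq, padding=True)))  # 165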
tokenizer_config.json CHANGED
@@ -27,7 +27,5 @@
   "mask_token": "[MASK]",
   "model_max_length": 1000000000000.0,
   "tokenizer_class": "KmerTokenizer",
-  "unk_token": "[UNK]",
-  "k": 4,
-  "stride": 4
+  "unk_token": "[UNK]"
 }