nioushasadjadi committed
Commit 1c6d85d
1 Parent(s): 1bbc46c

Add tokenizer.py

Files changed (1)
  1. tokenizer.py +130 -0
tokenizer.py ADDED
@@ -0,0 +1,130 @@
+ from transformers import PreTrainedTokenizer
+ import json
+ import os
+ from itertools import product
+
+
+ class KmerTokenizer(PreTrainedTokenizer):
+     def __init__(self, vocab_dict=None, k=4, stride=4, **kwargs):
+         self.k = k
+         self.stride = stride
+         self.special_tokens = ["[MASK]", "[UNK]"]
+
+         if vocab_dict is None:
+             # Default vocabulary: special tokens followed by all k-mers over A/C/G/T.
+             kmers = ["".join(kmer) for kmer in product("ACGT", repeat=self.k)]
+             self.vocab = self.special_tokens + kmers
+             self.vocab_dict = {word: idx for idx, word in enumerate(self.vocab)}
+         else:
+             self.vocab = list(vocab_dict.keys())
+             self.vocab_dict = vocab_dict
+
+         super().__init__(**kwargs)
+
+         self.mask_token = "[MASK]"
+         self.unk_token = "[UNK]"
+         # self.pad_token = "[PAD]"
+
+     @property
+     def vocab_size(self):
+         return len(self.vocab_dict)
+
+     def _tokenize(self, text):
+         # Split the sequence into k-mers of length k, advancing by `stride` characters.
+         # Return the string tokens; id conversion happens in convert_tokens_to_ids.
+         return [text[i:i + self.k] for i in range(0, len(text) - self.k + 1, self.stride)]
+
+     def convert_tokens_to_ids(self, tokens):
+         unk_id = self.vocab_dict.get(self.unk_token)
+         return [self.vocab_dict.get(token, unk_id) for token in tokens]
+
+     def convert_ids_to_tokens(self, ids):
+         id_to_token = {idx: token for token, idx in self.vocab_dict.items()}
+         return [id_to_token.get(id_, self.unk_token) for id_ in ids]
+
+     # def build_inputs_with_special_tokens(self, token_ids):
+     #     return [self.vocab_dict.get(self.cls_token)] + token_ids + [self.vocab_dict.get(self.sep_token)]
+
+     def get_vocab(self):
+         return self.vocab_dict
+
+     def save_vocabulary(self, save_directory, **kwargs):
+         # Write the vocabulary and k-mer settings in a tokenizers-style tokenizer.json layout.
+         vocab_file = os.path.join(save_directory, "tokenizer.json")
+         with open(vocab_file, "w", encoding="utf-8") as f:
+             vocab_content = {
+                 "version": "1.0",
+                 "added_tokens": [
+                     {"id": self.vocab_dict[self.mask_token], "content": self.mask_token, "special": True},
+                     {"id": self.vocab_dict[self.unk_token], "content": self.unk_token, "special": True}
+                 ],
+                 "pre_tokenizer": {
+                     "type": "KmerSplitter",
+                     "k": self.k,
+                     "stride": self.stride
+                 },
+                 # "post_processor": {
+                 #     "type": "TemplateProcessing",
+                 #     "single": [
+                 #         {"SpecialToken": {"id": self.cls_token, "type_id": 0}},
+                 #         {"Sequence": {"id": "A", "type_id": 0}},
+                 #         {"SpecialToken": {"id": self.sep_token, "type_id": 0}}
+                 #     ],
+                 #     "pair": [
+                 #         {"SpecialToken": {"id": self.cls_token, "type_id": 0}},
+                 #         {"Sequence": {"id": "A", "type_id": 0}},
+                 #         {"SpecialToken": {"id": self.sep_token, "type_id": 0}},
+                 #         {"Sequence": {"id": "B", "type_id": 1}},
+                 #         {"SpecialToken": {"id": self.sep_token, "type_id": 1}}
+                 #     ]
+                 # },
+                 "model": {
+                     "type": "k-mer",
+                     "k": self.k,
+                     "stride": self.stride,
+                     "unk_token": self.unk_token,
+                     "vocab": self.vocab_dict
+                 },
+             }
+             json.dump(vocab_content, f, ensure_ascii=False, indent=2)
+
+         tokenizer_config = {
+             "added_tokens_decoder": {
+                 "0": {"content": "[MASK]", "lstrip": False, "normalized": False, "rstrip": False,
+                       "single_word": False, "special": True},
+                 "1": {"content": "[UNK]", "lstrip": False, "normalized": False, "rstrip": False,
+                       "single_word": False, "special": True}
+             },
+             "clean_up_tokenization_spaces": True,
+             "mask_token": "[MASK]",
+             "model_max_length": 1e12,  # Effectively unlimited; adjust as needed
+             "tokenizer_class": "KmerTokenizer",
+             "unk_token": "[UNK]",
+             "k": self.k,
+             "stride": self.stride
+         }
+         tokenizer_config_file = os.path.join(save_directory, "tokenizer_config.json")
+         with open(tokenizer_config_file, "w", encoding="utf-8") as f:
+             json.dump(tokenizer_config, f, ensure_ascii=False, indent=2)
+
+         return vocab_file, tokenizer_config_file
+
+     @classmethod
+     def from_pretrained(cls, pretrained_dir, **kwargs):
+         # Load the vocabulary from tokenizer.json
+         vocab_file = os.path.join(pretrained_dir, "tokenizer.json")
+         with open(vocab_file, "r", encoding="utf-8") as f:
+             vocab_content = json.load(f)
+         vocab = vocab_content["model"]["vocab"]
+         # k = vocab_content["model"]["k"]
+         # stride = vocab_content["model"]["stride"]
+
+         # Load k and stride from tokenizer_config.json
+         tokenizer_config_file = os.path.join(pretrained_dir, "tokenizer_config.json")
+         if os.path.exists(tokenizer_config_file):
+             with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                 tokenizer_config = json.load(f)
+             k = tokenizer_config.get("k", 4)  # Default to 4 if not specified
+             stride = tokenizer_config.get("stride", k)  # Default to k if not specified
+         else:
+             raise ValueError(f"Tokenizer config file not found at {tokenizer_config_file}")
+
+         # Instantiate the tokenizer with the loaded vocabulary and settings
+         return cls(vocab_dict=vocab, k=k, stride=stride, **kwargs)
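
For context, a minimal usage sketch of the class added in this commit (the sequence, the `kmer_tok` directory name, and the round-trip check are illustrative, not part of the commit; it assumes `tokenizer.py` is importable from the working directory):

import os
from tokenizer import KmerTokenizer  # hypothetical import path, assuming tokenizer.py is on sys.path

tok = KmerTokenizer(k=4, stride=4)
tokens = tok._tokenize("ACGTACGTTTTT")   # ['ACGT', 'ACGT', 'TTTT']
ids = tok.convert_tokens_to_ids(tokens)

os.makedirs("kmer_tok", exist_ok=True)   # save_vocabulary expects the directory to exist
tok.save_vocabulary("kmer_tok")          # writes tokenizer.json and tokenizer_config.json
reloaded = KmerTokenizer.from_pretrained("kmer_tok")
assert reloaded.convert_tokens_to_ids(tokens) == ids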