File size: 6,230 Bytes
1c6d85d
eb1e311
1c6d85d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92d46e2
 
 
 
 
 
1c6d85d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eb1e311
 
1c6d85d
 
 
 
 
 
 
92d46e2
 
1c6d85d
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
from transformers import PreTrainedTokenizer
from huggingface_hub import hf_hub_download
import json
import os
from itertools import product


class KmerTokenizer(PreTrainedTokenizer):
    """Tokenizer that cuts a nucleotide string into fixed-length k-mers.

    The text is scanned with a window of ``k`` characters that advances by
    ``stride`` characters. Every window that appears in the vocabulary is
    mapped to its id; every other window is mapped to the ``[UNK]`` id.

    Args:
        vocab_dict: Optional mapping ``token -> id``. When omitted, the
            vocabulary is ``["[MASK]", "[UNK]"]`` followed by every k-mer
            over the alphabet ``ACGT`` in ``itertools.product`` order.
        k: Length of each k-mer (must be >= 1).
        stride: Step between the starts of consecutive k-mers (must be >= 1).
        **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.

    Raises:
        ValueError: If ``k`` or ``stride`` is smaller than 1.
    """

    def __init__(self, vocab_dict=None, k=4, stride=4, **kwargs):
        if k < 1 or stride < 1:
            raise ValueError(
                f"k and stride must be positive integers (got k={k}, stride={stride})"
            )

        self.k = k
        self.stride = stride
        self.special_tokens = ["[MASK]", "[UNK]"]

        if vocab_dict is None:
            kmers = ["".join(kmer) for kmer in product('ACGT', repeat=self.k)]
            self.vocab = self.special_tokens + kmers
            self.vocab_dict = {word: idx for idx, word in enumerate(self.vocab)}
        else:
            self.vocab = list(vocab_dict.keys())
            self.vocab_dict = vocab_dict

        # The vocabulary is set up before the base initializer runs, in case
        # the base class queries it during construction.
        super().__init__(**kwargs)

        self.mask_token = "[MASK]"
        self.unk_token = "[UNK]"

    def _reverse_vocab(self):
        """Return the ``id -> token`` mapping, rebuilding it only when needed.

        The mapping is cached together with the identity and size of
        ``self.vocab_dict`` so that replacing or growing the vocabulary
        invalidates the cache.
        """
        cached = getattr(self, "_reverse_cache", None)
        if (
            cached is None
            or cached[0] is not self.vocab_dict
            or cached[1] != len(self.vocab_dict)
        ):
            mapping = {idx: token for token, idx in self.vocab_dict.items()}
            cached = (self.vocab_dict, len(self.vocab_dict), mapping)
            self._reverse_cache = cached
        return cached[2]

    def _lookup_id(self, token, unk_id):
        """Map one token to its id; ints are treated as already-converted ids."""
        if isinstance(token, int):
            return token
        return self.vocab_dict.get(token, unk_id)

    def _tokenize(self, text):
        """Split ``text`` into k-mers and return their ids.

        Note that this returns integer ids, not string tokens. Trailing
        characters that do not fill a complete k-mer are dropped, and a text
        shorter than ``k`` yields an empty list.
        """
        last_start = len(text) - self.k + 1
        splits = [text[i:i + self.k] for i in range(0, last_start, self.stride)]
        return self.convert_tokens_to_ids(splits)

    def convert_tokens_to_ids(self, tokens):
        """Convert a token or a sequence of tokens to vocabulary ids.

        Args:
            tokens: A single ``str``, a single ``int``, an iterable of
                them, or ``None``.

        Returns:
            ``None`` for ``None``; a single id for a single token; otherwise a
            list of ids. Unknown tokens map to the ``[UNK]`` id. Integers are
            passed through unchanged, so ids produced by ``_tokenize`` survive
            a second conversion instead of collapsing to ``[UNK]``.
        """
        if tokens is None:
            return None

        unk_id = self.vocab_dict.get(self.unk_token)
        # A bare string must be looked up as one token, not iterated
        # character by character.
        if isinstance(tokens, (str, int)):
            return self._lookup_id(tokens, unk_id)
        return [self._lookup_id(token, unk_id) for token in tokens]

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        """Convert an id or a sequence of ids back to tokens.

        Args:
            ids: A single ``int`` or an iterable of ids.
            skip_special_tokens: When true and ``ids`` is a sequence, ids that
                belong to ``self.special_tokens`` are omitted from the result.

        Returns:
            A single token for a single ``int``; otherwise a list of tokens.
            Ids missing from the vocabulary map to ``self.unk_token``.
        """
        id_to_token = self._reverse_vocab()
        if isinstance(ids, int):
            return id_to_token.get(ids, self.unk_token)

        if skip_special_tokens:
            special_ids = {self.vocab_dict.get(tok) for tok in self.special_tokens}
            ids = [id_ for id_ in ids if id_ not in special_ids]
        return [id_to_token.get(id_, self.unk_token) for id_ in ids]

    def get_vocab(self):
        """Return the ``token -> id`` mapping (the live dict, not a copy)."""
        return self.vocab_dict

    def save_vocabulary(self, save_directory, **kwargs):
        """Write ``tokenizer.json`` and ``tokenizer_config.json``.

        Args:
            save_directory: Target directory; created if it does not exist.
            **kwargs: Accepted for call compatibility and ignored.

        Returns:
            Tuple ``(vocab_file, tokenizer_config_file)`` of the written paths.

        Raises:
            KeyError: If the mask or unknown token is missing from the
                vocabulary.
        """
        os.makedirs(save_directory, exist_ok=True)

        # Read the special-token ids from the vocabulary so both files agree
        # even when a custom vocabulary places them at other positions.
        mask_id = self.vocab_dict[self.mask_token]
        unk_id = self.vocab_dict[self.unk_token]

        vocab_file = os.path.join(save_directory, "tokenizer.json")
        vocab_content = {
            "version": "1.0",
            "added_tokens": [
                {"id": mask_id, "content": self.mask_token, "special": True},
                {"id": unk_id, "content": self.unk_token, "special": True}
            ],
            "pre_tokenizer": {
                "type": "KmerSplitter",
                "k": self.k,
                "stride": self.stride
            },
            "model": {
                "type": "k-mer",
                "k": self.k,
                "stride": self.stride,
                "unk_token": self.unk_token,
                "vocab": self.vocab_dict
            },
        }
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(vocab_content, f, ensure_ascii=False, indent=2)

        def special_entry(content):
            return {"content": content, "lstrip": False, "normalized": False, "rstrip": False,
                    "single_word": False, "special": True}

        tokenizer_config = {
            "added_tokens_decoder": {
                str(mask_id): special_entry(self.mask_token),
                str(unk_id): special_entry(self.unk_token)
            },
            "auto_map": {
                "AutoTokenizer": [
                    "tokenizer.KmerTokenizer",
                    None
                ]
            },
            "clean_up_tokenization_spaces": True,
            "mask_token": self.mask_token,
            "model_max_length": 1e12,  # Set a high number, or adjust as needed
            "tokenizer_class": "KmerTokenizer",  # Set your tokenizer class name
            "unk_token": self.unk_token,
            "k": self.k,
            "stride": self.stride
        }
        tokenizer_config_file = os.path.join(save_directory, "tokenizer_config.json")
        with open(tokenizer_config_file, "w", encoding="utf-8") as f:
            json.dump(tokenizer_config, f, ensure_ascii=False, indent=2)

        return vocab_file, tokenizer_config_file

    @classmethod
    def from_pretrained(cls, pretrained_dir, **kwargs):
        """Load a tokenizer from a local directory or a Hugging Face Hub repo.

        Args:
            pretrained_dir: Path of an existing directory containing
                ``tokenizer.json`` and ``tokenizer_config.json``; anything
                else is treated as a Hub repo id and the two files are
                fetched with ``hf_hub_download``.
            **kwargs: Forwarded to the constructor.

        Returns:
            A tokenizer built from the stored vocabulary, ``k`` and ``stride``.

        Raises:
            ValueError: If ``tokenizer_config.json`` cannot be found.
        """
        if os.path.isdir(pretrained_dir):
            vocab_file = os.path.join(pretrained_dir, "tokenizer.json")
            tokenizer_config_file = os.path.join(pretrained_dir, "tokenizer_config.json")
        else:
            vocab_file = hf_hub_download(repo_id=pretrained_dir, filename="tokenizer.json")
            tokenizer_config_file = hf_hub_download(repo_id=pretrained_dir, filename="tokenizer_config.json")

        with open(vocab_file, "r", encoding="utf-8") as f:
            vocab = json.load(f)["model"]["vocab"]

        if not os.path.exists(tokenizer_config_file):
            raise ValueError(f"Tokenizer config file not found at {tokenizer_config_file}")
        with open(tokenizer_config_file, "r", encoding="utf-8") as f:
            tokenizer_config = json.load(f)
        k = tokenizer_config.get("k", 4)  # Default to 4 if not specified
        stride = tokenizer_config.get("stride", k)  # Default to k if not specified

        # The constructor parameter is `vocab_dict`; passing `vocab=` would
        # fall into **kwargs and the loaded vocabulary would be ignored.
        return cls(vocab_dict=vocab, k=k, stride=stride, **kwargs)