File size: 6,230 Bytes
1c6d85d eb1e311 1c6d85d 92d46e2 1c6d85d eb1e311 1c6d85d 92d46e2 1c6d85d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 |
from transformers import PreTrainedTokenizer
from huggingface_hub import hf_hub_download
import json
import os
from itertools import product
class KmerTokenizer(PreTrainedTokenizer):
    """Tokenizer that splits DNA sequences into k-mers.

    A sequence is cut into windows of length ``k`` advanced by ``stride``
    characters; each window is looked up in a fixed vocabulary consisting of
    the special tokens ``[MASK]`` and ``[UNK]`` followed by all ``4**k``
    k-mers over the alphabet ``ACGT``. Out-of-vocabulary windows (e.g.
    containing ``N``) map to ``[UNK]``.
    """

    def __init__(self, vocab_dict=None, k=4, stride=4, **kwargs):
        """Build the tokenizer.

        Args:
            vocab_dict: Optional mapping ``token -> id``. When ``None``, the
                default vocabulary (special tokens + all ACGT k-mers) is
                generated.
            k: k-mer window length.
            stride: Step between consecutive windows (``stride == k`` gives
                non-overlapping k-mers).
            **kwargs: Forwarded to ``PreTrainedTokenizer.__init__``.
        """
        self.k = k
        self.stride = stride
        self.special_tokens = ["[MASK]", "[UNK]"]
        if vocab_dict is None:
            kmers = ["".join(kmer) for kmer in product("ACGT", repeat=self.k)]
            self.vocab = self.special_tokens + kmers
            self.vocab_dict = {word: idx for idx, word in enumerate(self.vocab)}
        else:
            self.vocab = list(vocab_dict.keys())
            self.vocab_dict = vocab_dict
        # Reverse lookup built once here instead of on every
        # convert_ids_to_tokens call (the original rebuilt it per call).
        self._id_to_token = {idx: token for token, idx in self.vocab_dict.items()}
        # vocab_dict must be populated before super().__init__, which may
        # query get_vocab()/convert_tokens_to_ids while registering tokens.
        super().__init__(**kwargs)
        self.mask_token = "[MASK]"
        self.unk_token = "[UNK]"

    @property
    def vocab_size(self):
        """Number of entries in the vocabulary (required by the base class,
        which otherwise raises NotImplementedError via ``len(tokenizer)``)."""
        return len(self.vocab_dict)

    def _tokenize(self, text):
        """Split ``text`` into k-mer token *strings*.

        BUG FIX: the original returned ids here. ``PreTrainedTokenizer``
        expects ``_tokenize`` to return strings and then feeds them through
        ``convert_tokens_to_ids`` itself, so returning ids made the inherited
        ``encode``/``__call__`` path re-convert and map everything to [UNK].
        Trailing characters that do not fill a full window are dropped.
        """
        return [text[i:i + self.k] for i in range(0, len(text) - self.k + 1, self.stride)]

    def convert_tokens_to_ids(self, tokens):
        """Map a token string, or a list of token strings, to id(s).

        Unknown tokens map to the [UNK] id. Accepting a bare string matches
        the ``PreTrainedTokenizer`` convention (backward-compatible: list
        input behaves exactly as before).
        """
        unk_id = self.vocab_dict.get(self.unk_token)
        if isinstance(tokens, str):
            return self.vocab_dict.get(tokens, unk_id)
        return [self.vocab_dict.get(token, unk_id) for token in tokens]

    def convert_ids_to_tokens(self, ids):
        """Map an id, or a list of ids, back to token string(s).

        Ids absent from the vocabulary map to the [UNK] token string.
        """
        if isinstance(ids, int):
            return self._id_to_token.get(ids, self.unk_token)
        return [self._id_to_token.get(id_, self.unk_token) for id_ in ids]

    def get_vocab(self):
        """Return the ``token -> id`` mapping."""
        return self.vocab_dict

    def save_vocabulary(self, save_directory, **kwargs):
        """Write ``tokenizer.json`` and ``tokenizer_config.json`` to
        ``save_directory`` and return both file paths.

        The tokenizer.json layout mirrors the HF tokenizers format closely
        enough for ``from_pretrained`` below to read it back; ``k`` and
        ``stride`` are duplicated into tokenizer_config.json, which is where
        ``from_pretrained`` actually reads them from.
        """
        # Robustness: create the target directory instead of failing on open().
        os.makedirs(save_directory, exist_ok=True)
        vocab_file = os.path.join(save_directory, "tokenizer.json")
        with open(vocab_file, "w", encoding="utf-8") as f:
            vocab_content = {
                "version": "1.0",
                "added_tokens": [
                    {"id": self.vocab_dict[self.mask_token], "content": self.mask_token, "special": True},
                    {"id": self.vocab_dict[self.unk_token], "content": self.unk_token, "special": True}
                ],
                "pre_tokenizer": {
                    "type": "KmerSplitter",
                    "k": self.k,
                    "stride": self.stride
                },
                "model": {
                    "type": "k-mer",
                    "k": self.k,
                    "stride": self.stride,
                    "unk_token": self.unk_token,
                    "vocab": self.vocab_dict
                },
            }
            json.dump(vocab_content, f, ensure_ascii=False, indent=2)

        tokenizer_config = {
            "added_tokens_decoder": {
                "0": {"content": "[MASK]", "lstrip": False, "normalized": False, "rstrip": False,
                      "single_word": False, "special": True},
                "1": {"content": "[UNK]", "lstrip": False, "normalized": False, "rstrip": False,
                      "single_word": False, "special": True}
            },
            "auto_map": {
                "AutoTokenizer": [
                    "tokenizer.KmerTokenizer",
                    None
                ]
            },
            "clean_up_tokenization_spaces": True,
            "mask_token": "[MASK]",
            "model_max_length": 1e12,  # effectively unbounded; adjust as needed
            "tokenizer_class": "KmerTokenizer",
            "unk_token": "[UNK]",
            "k": self.k,
            "stride": self.stride
        }
        tokenizer_config_file = os.path.join(save_directory, "tokenizer_config.json")
        with open(tokenizer_config_file, "w", encoding="utf-8") as f:
            json.dump(tokenizer_config, f, ensure_ascii=False, indent=2)
        return vocab_file, tokenizer_config_file

    @classmethod
    def from_pretrained(cls, pretrained_dir, **kwargs):
        """Load the tokenizer from a local directory or a Hub repo id.

        Generalization: the original always went through ``hf_hub_download``,
        so a path produced by ``save_vocabulary`` could not be loaded back;
        local directories are now used directly.

        Raises:
            ValueError: if tokenizer_config.json cannot be found.
        """
        if os.path.isdir(pretrained_dir):
            vocab_file = os.path.join(pretrained_dir, "tokenizer.json")
            tokenizer_config_file = os.path.join(pretrained_dir, "tokenizer_config.json")
        else:
            vocab_file = hf_hub_download(repo_id=pretrained_dir, filename="tokenizer.json")
            tokenizer_config_file = hf_hub_download(repo_id=pretrained_dir, filename="tokenizer_config.json")

        with open(vocab_file, "r", encoding="utf-8") as f:
            vocab_content = json.load(f)
        vocab = vocab_content["model"]["vocab"]

        if not os.path.exists(tokenizer_config_file):
            raise ValueError(f"Tokenizer config file not found at {tokenizer_config_file}")
        with open(tokenizer_config_file, "r", encoding="utf-8") as f:
            tokenizer_config = json.load(f)
        k = tokenizer_config.get("k", 4)          # default to 4 if not specified
        stride = tokenizer_config.get("stride", k)  # default to k if not specified

        # BUG FIX: the original passed ``vocab=vocab``; __init__ has no such
        # parameter, so the saved vocabulary fell into **kwargs and the
        # default vocabulary was silently rebuilt instead.
        return cls(vocab_dict=vocab, k=k, stride=stride, **kwargs)
|