from transformers import PreTrainedTokenizer
from huggingface_hub import hf_hub_download
import torch
import json
import os
from itertools import product


class KmerTokenizer(PreTrainedTokenizer):
    """k-mer tokenizer for DNA sequences over the A/C/G/T alphabet,
    with [MASK] and [UNK] special tokens."""

    def __init__(self, vocab_dict=None, k=4, stride=4, max_len=660, **kwargs):
        self.k = k
        self.stride = stride
        self.max_len = max_len
        self.special_tokens = ["[MASK]", "[UNK]"]
        if vocab_dict is None:
            # Default vocabulary: the special tokens followed by all 4**k k-mers over ACGT
            kmers = ["".join(kmer) for kmer in product('ACGT', repeat=self.k)]
            self.vocab = self.special_tokens + kmers
            self.vocab_dict = {word: idx for idx, word in enumerate(self.vocab)}
        else:
            self.vocab = list(vocab_dict.keys())
            self.vocab_dict = vocab_dict
        super().__init__(**kwargs)
        self.mask_token = "[MASK]"
        self.unk_token = "[UNK]"
        # self.pad_token = "[PAD]"

    def tokenize(self, text, **kwargs):
        # Truncate to max_len characters, optionally pad with 'N' (whose k-mers map to [UNK]),
        # then slide a window of length k with the configured stride
        if len(text) > self.max_len:
            text = text[:self.max_len]
        if kwargs.get('padding'):
            if len(text) < self.max_len:
                text = text + 'N' * (self.max_len - len(text))
        splits = [text[i:i + self.k] for i in range(0, len(text) - self.k + 1, self.stride)]
        return splits
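
    # Illustrative example (not part of the original file): with the defaults
    # k=4 and stride=4, "ACGTACGTAC" splits into the non-overlapping k-mers
    # ["ACGT", "ACGT"]; the trailing "AC" is shorter than k and is dropped.
    # With stride=1 the windows would overlap, e.g. "ACGTAC" -> ["ACGT", "CGTA", "GTAC"].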

    def encode(self, text, **kwargs):
        tokens = self.tokenize(text, **kwargs)
        token_ids = self.convert_tokens_to_ids(tokens)
        if kwargs.get('return_tensors') == 'pt':
            return torch.tensor(token_ids)
        return token_ids

    def convert_tokens_to_ids(self, tokens):
        unk_id = self.vocab_dict.get(self.unk_token)
        return [self.vocab_dict[token] if token in self.vocab_dict else unk_id for token in tokens]

    def convert_ids_to_tokens(self, ids, **kwargs):
        id_to_token = {idx: token for token, idx in self.vocab_dict.items()}
        return [id_to_token.get(id_, self.unk_token) for id_ in ids]

    # def build_inputs_with_special_tokens(self, token_ids):
    #     return [self.vocab_dict.get(self.cls_token)] + token_ids + [self.vocab_dict.get(self.sep_token)]

    def get_vocab(self):
        return self.vocab_dict

    def save_vocabulary(self, save_directory, **kwargs):
        vocab_file = os.path.join(save_directory, "tokenizer.json")
        with open(vocab_file, "w", encoding="utf-8") as f:
            # tokenizer.json layout: special tokens, k-mer pre-tokenizer settings, and the vocabulary
            vocab_content = {
                "version": "1.0",
                "added_tokens": [
                    {"id": self.vocab_dict[self.mask_token], "content": self.mask_token, "special": True},
                    {"id": self.vocab_dict[self.unk_token], "content": self.unk_token, "special": True}
                ],
                "pre_tokenizer": {
                    "type": "KmerSplitter",
                    "k": self.k,
                    "stride": self.stride,
                    "max_length": self.max_len
                },
                "model": {
                    "type": "KmerTokenizer",
                    "unk_token": self.unk_token,
                    "vocab": self.vocab_dict
                },
            }
            json.dump(vocab_content, f, ensure_ascii=False, indent=2)

        tokenizer_config = {
            "added_tokens_decoder": {
                str(self.vocab_dict[self.mask_token]): {"content": self.mask_token, "lstrip": False,
                                                        "normalized": False, "rstrip": False,
                                                        "single_word": False, "special": True},
                str(self.vocab_dict[self.unk_token]): {"content": self.unk_token, "lstrip": False,
                                                       "normalized": False, "rstrip": False,
                                                       "single_word": False, "special": True}
            },
            "auto_map": {
                "AutoTokenizer": [
                    "tokenizer.KmerTokenizer",
                    None
                ]
            },
            "clean_up_tokenization_spaces": True,
            "mask_token": "[MASK]",
            "model_max_length": 1e12,  # effectively unbounded; adjust as needed
            "tokenizer_class": "KmerTokenizer",
            "unk_token": "[UNK]"
        }
        tokenizer_config_file = os.path.join(save_directory, "tokenizer_config.json")
        with open(tokenizer_config_file, "w", encoding="utf-8") as f:
            json.dump(tokenizer_config, f, ensure_ascii=False, indent=2)
        return vocab_file, tokenizer_config_file

    @classmethod
    def from_pretrained(cls, pretrained_dir, **kwargs):
        # Load vocabulary
        # vocab_file = os.path.join(pretrained_dir, "tokenizer.json")
        vocab_file = hf_hub_download(repo_id=pretrained_dir, filename="tokenizer.json")
        if os.path.exists(vocab_file):
            with open(vocab_file, "r", encoding="utf-8") as f:
                vocab_content = json.load(f)
            vocab = vocab_content["model"]["vocab"]
            k = vocab_content["pre_tokenizer"]["k"]
            stride = vocab_content["pre_tokenizer"]["stride"]
            max_len = vocab_content["pre_tokenizer"]["max_length"]
        else:
            raise ValueError(f"Vocabulary file not found at {vocab_file}")

        # Check for the existence of tokenizer_config.json
        # tokenizer_config_file = os.path.join(pretrained_dir, "tokenizer_config.json")
        tokenizer_config_file = hf_hub_download(repo_id=pretrained_dir, filename="tokenizer_config.json")
        if os.path.exists(tokenizer_config_file):
            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
                tokenizer_config = json.load(f)
        else:
            raise ValueError(f"Tokenizer config file not found at {tokenizer_config_file}")

        # Instantiate the tokenizer with the loaded values
        # (pass vocab_dict explicitly so the saved vocabulary is actually used)
        return cls(vocab_dict=vocab, k=k, stride=stride, max_len=max_len, **kwargs)

    def __call__(self, text, padding=False, **kwargs):
        token_ids = self.encode(text, padding=padding, **kwargs)
        # Positions mapped to [UNK] (e.g. k-mers produced by 'N' padding) are masked out
        unk_token_id = self.vocab_dict.get("[UNK]")
        attention_mask = [1 if id_ != unk_token_id else 0 for id_ in token_ids]
        token_type_ids = [0] * len(token_ids)
        # Convert to the requested tensor format
        if kwargs.get('return_tensors') == 'pt':
            attention_mask = torch.tensor(attention_mask)
            token_type_ids = torch.tensor(token_type_ids)
        # Return the output dictionary
        return {
            "input_ids": token_ids,
            "token_type_ids": token_type_ids,
            "attention_mask": attention_mask
        }
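

# --- Usage sketch (illustrative; not part of the original file) ---
# A minimal example of exercising the tokenizer with its default 4-mer vocabulary;
# the DNA sequence below is made up, and no Hub download is involved because
# from_pretrained() is not called here.
if __name__ == "__main__":
    tokenizer = KmerTokenizer(k=4, stride=4, max_len=660)

    seq = "ACGTACGTACGTNACGT"  # hypothetical DNA barcode fragment
    print(tokenizer.tokenize(seq))   # k-mers of length 4, taken every 4 characters
    print(tokenizer.encode(seq))     # ids; k-mers containing 'N' fall back to [UNK]

    # Full __call__ output: input_ids, token_type_ids, and an attention mask that
    # zeroes out [UNK] positions (including those created by 'N' padding).
    batch = tokenizer(seq, padding=True, return_tensors='pt')
    print(batch["input_ids"].shape, int(batch["attention_mask"].sum()))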