mgelard committed
Commit f959c61 · verified · 1 Parent(s): 7ee63cd

Upload tokenizer

Files changed (4)
  1. special_tokens_map.json +8 -0
  2. tokenizer.py +151 -0
  3. tokenizer_config.json +67 -0
  4. vocab.json +1 -0
special_tokens_map.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "bos_token": "<bos>",
+   "cls_token": "<cls>",
+   "eos_token": "<eos>",
+   "mask_token": "<mask>",
+   "pad_token": "<pad>",
+   "unk_token": "<unk>"
+ }
tokenizer.py ADDED
@@ -0,0 +1,151 @@
+ import json
+ import os
+ from typing import List, Optional, Union
+
+ import numpy as np
+ import torch
+ from transformers import PreTrainedTokenizer
+
+
+ class BinnedOmicTokenizer(PreTrainedTokenizer):
+     def __init__(
+         self,
+         n_expressions_bins: int = 64,
+         min_omic_value: float = 0.0,
+         max_omic_value: float = 1.0,
+         use_max_normalization: bool = True,
+         normalization_factor: float = 1.0,
+         prepend_cls_token: bool = False,
+         fixed_sequence_length: Optional[int] = None,
+         unpadded_length: Optional[int] = None,
+         **kwargs,
+     ):
+         bin_tokens = [str(i) for i in range(n_expressions_bins)]
+         special_tokens = ["<pad>", "<mask>", "<unk>", "<bos>", "<eos>", "<cls>"]
+
+         vocab = {tok: i for i, tok in enumerate(bin_tokens)}
+         offset = len(vocab)
+         for i, tok in enumerate(special_tokens):
+             vocab[tok] = offset + i
+
+         ids_to_tokens = {i: tok for tok, i in vocab.items()}
+
+         # Save vocab attributes before superclass init
+         self.vocab = vocab
+         self.ids_to_tokens = ids_to_tokens
+
+         self.n_expressions_bins = n_expressions_bins
+         self.min_omic_value = min_omic_value
+         self.max_omic_value = max_omic_value
+         self.use_max_normalization = use_max_normalization
+         self.normalization_factor = normalization_factor
+         self.prepend_cls_token = prepend_cls_token
+         self.fixed_sequence_length = fixed_sequence_length
+         self.unpadded_length = unpadded_length
+
+         self.bin_edges = np.linspace(min_omic_value, max_omic_value, n_expressions_bins)
+
+         self.pad_token = "<pad>"
+         self.mask_token = "<mask>"
+         self.unk_token = "<unk>"
+         self.bos_token = "<bos>"
+         self.eos_token = "<eos>"
+         self.cls_token = "<cls>"
+
+         super().__init__(
+             pad_token=self.pad_token,
+             mask_token=self.mask_token,
+             unk_token=self.unk_token,
+             bos_token=self.bos_token,
+             eos_token=self.eos_token,
+             cls_token=self.cls_token,
+             **kwargs,
+         )
+
+     def _convert_token_to_id(self, token: str) -> int:
+         return self.vocab.get(token, self.vocab[self.unk_token])
+
+     def _convert_id_to_token(self, index: int) -> str:
+         return self.ids_to_tokens.get(index, self.unk_token)
+
+     def get_vocab(self) -> dict:
+         return self.vocab
+
+     def _tokenize(self, text, **kwargs):
+         raise NotImplementedError("Use `encode` or `batch_encode_plus` methods.")
+
+     def encode(
+         self,
+         gene_expr: Union[np.ndarray, List[float]],
+         pad_to_fixed_length: bool = False,
+         max_length: Optional[int] = None,
+         return_tensors: Optional[str] = None,
+         **kwargs,
+     ) -> Union[List[int], torch.Tensor]:
+         gene_expr = np.array(gene_expr)
+
+         if self.use_max_normalization:
+             gene_expr = gene_expr / self.normalization_factor
+
+         token_ids = np.digitize(gene_expr, self.bin_edges).astype(int)
+         token_ids[gene_expr == 0.0] = 0
+
+         if self.prepend_cls_token:
+             token_ids = np.concatenate([[self.cls_token_id], token_ids])
+
+         if pad_to_fixed_length:
+             current_max_length = self.fixed_sequence_length or max_length
+             if current_max_length is None:
+                 raise ValueError("fixed_sequence_length or max_length must be set.")
+             pad_len = current_max_length - len(token_ids)
+             if pad_len > 0:
+                 token_ids = np.concatenate([token_ids, [self.pad_token_id] * pad_len])
+             else:
+                 token_ids = token_ids[:current_max_length]
+
+         if return_tensors == "pt":
+             return torch.tensor(token_ids).unsqueeze(0)
+         return token_ids.tolist()  # type: ignore
+
+     def batch_encode_plus(
+         self,
+         batch_gene_expr: Union[np.ndarray, List[np.ndarray]],
+         pad_to_fixed_length: bool = False,
+         max_length: Optional[int] = None,
+         return_tensors: Optional[str] = None,
+         **kwargs,
+     ):
+         if isinstance(batch_gene_expr, list):
+             batch_gene_expr = np.array(batch_gene_expr)
+
+         encoded = [
+             self.encode(
+                 gene_expr,
+                 pad_to_fixed_length=pad_to_fixed_length,
+                 max_length=max_length,
+                 return_tensors=None,
+                 **kwargs,
+             )
+             for gene_expr in batch_gene_expr
+         ]
+
+         encoded = np.array(encoded, dtype=np.int64)
+
+         if return_tensors == "pt":
+             return {"input_ids": torch.tensor(encoded)}
+         return {"input_ids": encoded}
+
+     @property
+     def vocab_size(self) -> int:
+         return len(self.vocab)
+
+     def save_vocabulary(
+         self, save_directory: str, filename_prefix: Optional[str] = None
+     ):
+         vocab_file = os.path.join(
+             save_directory,
+             (filename_prefix + "-" if filename_prefix else "") + "vocab.json",
+         )
+         with open(vocab_file, "w") as f:
+             json.dump(self.vocab, f)
+         return (vocab_file,)
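
Note: a minimal usage sketch of the tokenizer added above. The parameter values and the expression vector are illustrative assumptions, not part of this commit; each value is divided by normalization_factor, binned with np.digitize over n_expressions_bins edges, exact zeros are mapped to bin 0, and an optional <cls> id is prepended before padding to fixed_sequence_length.

# Illustrative only: all parameter values below are assumptions, not taken from this commit.
import numpy as np
from tokenizer import BinnedOmicTokenizer  # the file added above

tok = BinnedOmicTokenizer(
    n_expressions_bins=64,
    use_max_normalization=True,
    normalization_factor=10.0,  # hypothetical scaling constant
    prepend_cls_token=True,
    fixed_sequence_length=8,
)

expr = [0.0, 1.2, 3.4, 9.9]  # toy gene-expression values
ids = tok.encode(expr, pad_to_fixed_length=True)  # [<cls> id, bin ids..., <pad> ids...] as a list of ints
batch = tok.batch_encode_plus(
    np.array([expr, expr]), pad_to_fixed_length=True, return_tensors="pt"
)["input_ids"]  # torch.LongTensor of shape (2, 8)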
tokenizer_config.json ADDED
@@ -0,0 +1,67 @@
+ {
+   "added_tokens_decoder": {
+     "64": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "65": {
+       "content": "<mask>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "66": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "67": {
+       "content": "<bos>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "68": {
+       "content": "<eos>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "69": {
+       "content": "<cls>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "auto_map": {
+     "AutoTokenizer": [
+       "tokenizer.BinnedOmicTokenizer",
+       null
+     ]
+   },
+   "bos_token": "<bos>",
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "<cls>",
+   "eos_token": "<eos>",
+   "mask_token": "<mask>",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<pad>",
+   "tokenizer_class": "BinnedOmicTokenizer",
+   "unk_token": "<unk>"
+ }
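
Note: the auto_map entry points AutoTokenizer at tokenizer.BinnedOmicTokenizer, so the tokenizer loads as custom code. A sketch, assuming a placeholder repository id:

from transformers import AutoTokenizer

# "<namespace>/<repo-id>" is a placeholder, not the actual repository name.
tok = AutoTokenizer.from_pretrained(
    "<namespace>/<repo-id>",
    trust_remote_code=True,  # needed so auto_map can resolve the custom BinnedOmicTokenizer class
)
print(type(tok).__name__)  # BinnedOmicTokenizer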
vocab.json ADDED
@@ -0,0 +1 @@
+ {"0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9, "10": 10, "11": 11, "12": 12, "13": 13, "14": 14, "15": 15, "16": 16, "17": 17, "18": 18, "19": 19, "20": 20, "21": 21, "22": 22, "23": 23, "24": 24, "25": 25, "26": 26, "27": 27, "28": 28, "29": 29, "30": 30, "31": 31, "32": 32, "33": 33, "34": 34, "35": 35, "36": 36, "37": 37, "38": 38, "39": 39, "40": 40, "41": 41, "42": 42, "43": 43, "44": 44, "45": 45, "46": 46, "47": 47, "48": 48, "49": 49, "50": 50, "51": 51, "52": 52, "53": 53, "54": 54, "55": 55, "56": 56, "57": 57, "58": 58, "59": 59, "60": 60, "61": 61, "62": 62, "63": 63, "<pad>": 64, "<mask>": 65, "<unk>": 66, "<bos>": 67, "<eos>": 68, "<cls>": 69}