mgelard committed
Commit 965ba4a · verified · 1 Parent(s): 1051fcd

Upload tokenizer

Files changed (2):
  1. tokenizer.py +4 -16
  2. tokenizer_config.json +6 -11
tokenizer.py CHANGED

@@ -4,22 +4,7 @@ from typing import List, Optional, Union
 
 import numpy as np
 import torch
-from transformers import PretrainedConfig, PreTrainedTokenizer
-
-
-class BinnedOmicTokenizerConfig(PretrainedConfig):
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-        self.n_expressions_bins = kwargs.get("n_expressions_bins", 64)
-        self.min_omic_value = kwargs.get("min_omic_value", 0.0)
-        self.max_omic_value = kwargs.get("max_omic_value", 1.0)
-        self.use_max_normalization = kwargs.get("use_max_normalization", True)
-        self.normalization_factor = kwargs.get(
-            "normalization_factor", 5.547176906585117
-        )
-        self.prepend_cls_token = kwargs.get("prepend_cls_token", False)
-        self.fixed_sequence_length = kwargs.get("fixed_sequence_length", None)
-        self.unpadded_length = kwargs.get("unpadded_length", None)
+from transformers import PreTrainedTokenizer
 
 
 class BinnedOmicTokenizer(PreTrainedTokenizer):
@@ -77,6 +62,9 @@ class BinnedOmicTokenizer(PreTrainedTokenizer):
     def _tokenize(self, text, **kwargs):
         raise NotImplementedError("Use `encode` or `batch_encode_plus` methods.")
 
+    def decode(self, token_ids, **kwargs):
+        return [self._convert_id_to_token(i) for i in token_ids]
+
     def encode(
         self,
         gene_expr: Union[np.ndarray, List[float]],
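This commit removes the in-file `BinnedOmicTokenizerConfig` class (the matching parameter entries also disappear from tokenizer_config.json below) and adds a `decode` override. Note that this `decode` departs from the usual `PreTrainedTokenizer.decode` contract: it returns a list of bin-token strings, one per id, rather than a single joined string.

The removed defaults outline the binning scheme that `encode` (whose body is not part of this diff) presumably applies. The sketch below is one plausible reading — divide by `normalization_factor` when `use_max_normalization` is set, clip to `[min_omic_value, max_omic_value]`, then digitize into `n_expressions_bins` equal-width bins. It illustrates what the parameters mean; it is not the repository's actual implementation:

```python
import numpy as np


def bin_expressions(
    gene_expr: np.ndarray,
    n_expressions_bins: int = 64,
    min_omic_value: float = 0.0,
    max_omic_value: float = 1.0,
    use_max_normalization: bool = True,
    normalization_factor: float = 5.547176906585117,
) -> np.ndarray:
    """Map continuous expression values to bin ids in [0, n_expressions_bins - 1].

    Illustrative only: mirrors the removed config defaults, not the actual
    BinnedOmicTokenizer.encode, whose body is not shown in this commit.
    """
    if use_max_normalization:
        gene_expr = gene_expr / normalization_factor
    gene_expr = np.clip(gene_expr, min_omic_value, max_omic_value)
    # n_expressions_bins - 1 interior edges over [min, max];
    # np.digitize then returns ids 0..n_expressions_bins - 1.
    edges = np.linspace(min_omic_value, max_omic_value, n_expressions_bins + 1)[1:-1]
    return np.digitize(gene_expr, edges)
```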
tokenizer_config.json CHANGED

@@ -1,17 +1,12 @@
 {
-  "tokenizer_class": "BinnedOmicTokenizer",
-  "n_expressions_bins": 64,
-  "min_omic_value": 0.0,
-  "max_omic_value": 1.0,
-  "use_max_normalization": true,
-  "normalization_factor": 5.547176906585117,
-  "prepend_cls_token": false,
-  "fixed_sequence_length": null,
-  "unpadded_length": null,
+  "added_tokens_decoder": {},
   "auto_map": {
     "AutoTokenizer": [
       "tokenizer.BinnedOmicTokenizer",
       null
     ]
-  }
-}
+  },
+  "clean_up_tokenization_spaces": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "tokenizer_class": "BinnedOmicTokenizer"
+}
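With the parameter block gone, loading goes through the `auto_map` entry: `AutoTokenizer` imports `BinnedOmicTokenizer` from the repository's tokenizer.py, which requires `trust_remote_code=True`. The `null` in the second slot means no fast-tokenizer class is registered. A minimal loading sketch; the repo id below is a placeholder, not something taken from this commit:

```python
from transformers import AutoTokenizer

# "user/repo" is a placeholder Hub id for the repository this commit belongs to.
tok = AutoTokenizer.from_pretrained("user/repo", trust_remote_code=True)

# The custom decode returns a list of bin-token strings (one per id),
# not a joined string; the ids here are arbitrary examples.
print(tok.decode([3, 17, 42]))
```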