mgelard committed · verified
Commit 25d94dc · 1 Parent(s): 12e8428

Upload tokenizer

Files changed (2):
  1. tokenizer.py +16 -1
  2. tokenizer_config.json +4 -9
tokenizer.py CHANGED
@@ -4,7 +4,22 @@ from typing import List, Optional, Union
 
 import numpy as np
 import torch
-from transformers import PreTrainedTokenizer
+from transformers import PretrainedConfig, PreTrainedTokenizer
+
+
+class BinnedOmicTokenizerConfig(PretrainedConfig):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.n_expressions_bins = kwargs.get("n_expressions_bins", 64)
+        self.min_omic_value = kwargs.get("min_omic_value", 0.0)
+        self.max_omic_value = kwargs.get("max_omic_value", 1.0)
+        self.use_max_normalization = kwargs.get("use_max_normalization", True)
+        self.normalization_factor = kwargs.get(
+            "normalization_factor", 5.547176906585117
+        )
+        self.prepend_cls_token = kwargs.get("prepend_cls_token", False)
+        self.fixed_sequence_length = kwargs.get("fixed_sequence_length", None)
+        self.unpadded_length = kwargs.get("unpadded_length", None)
 
 
 class BinnedOmicTokenizer(PreTrainedTokenizer):
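
The new BinnedOmicTokenizerConfig only stores hyperparameters; how they drive tokenization lives in BinnedOmicTokenizer itself. A minimal usage sketch, assuming the class above is importable from tokenizer.py and that expression values are discretized into equal-width bins after max normalization (an illustrative assumption, not the repository's verified binning logic):

import numpy as np
from tokenizer import BinnedOmicTokenizerConfig  # class added in this commit

config = BinnedOmicTokenizerConfig()             # defaults shown in the diff above

values = np.array([0.1, 2.3, 5.0])               # made-up raw expression values
if config.use_max_normalization:
    values = values / config.normalization_factor   # scale into roughly [0, 1]

# Equal-width bin edges between min_omic_value and max_omic_value (assumed scheme)
edges = np.linspace(
    config.min_omic_value, config.max_omic_value, config.n_expressions_bins + 1
)
bin_ids = np.digitize(values, edges[1:-1])       # indices in 0 .. n_expressions_bins - 1
print(bin_ids)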
tokenizer_config.json CHANGED
@@ -1,17 +1,12 @@
 {
+  "added_tokens_decoder": {},
   "auto_map": {
     "AutoTokenizer": [
       "tokenizer.BinnedOmicTokenizer",
       null
     ]
   },
-  "tokenizer_class": "BinnedOmicTokenizer",
-  "n_expressions_bins": 64,
-  "min_omic_value": 0.0,
-  "max_omic_value": 1.0,
-  "use_max_normalization": true,
-  "normalization_factor": 5.547176906585117,
-  "prepend_cls_token": false,
-  "fixed_sequence_length": null,
-  "unpadded_length": null
+  "clean_up_tokenization_spaces": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "tokenizer_class": "BinnedOmicTokenizer"
 }
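
Because auto_map points AutoTokenizer at tokenizer.BinnedOmicTokenizer, the tokenizer can be loaded from the Hub with custom code enabled. A minimal sketch; the repository id below is a placeholder:

from transformers import AutoTokenizer

# Placeholder repo id; trust_remote_code=True lets transformers import the
# custom BinnedOmicTokenizer class referenced in auto_map.
tokenizer = AutoTokenizer.from_pretrained("<user>/<repo>", trust_remote_code=True)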