mgelard committed · verified
Commit 25d94dc · 1 Parent(s): 12e8428

Upload tokenizer

Files changed (2):
  1. tokenizer.py +16 -1
  2. tokenizer_config.json +4 -9
tokenizer.py CHANGED
@@ -4,7 +4,22 @@ from typing import List, Optional, Union
 
 import numpy as np
 import torch
-from transformers import PreTrainedTokenizer
+from transformers import PretrainedConfig, PreTrainedTokenizer
+
+
+class BinnedOmicTokenizerConfig(PretrainedConfig):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.n_expressions_bins = kwargs.get("n_expressions_bins", 64)
+        self.min_omic_value = kwargs.get("min_omic_value", 0.0)
+        self.max_omic_value = kwargs.get("max_omic_value", 1.0)
+        self.use_max_normalization = kwargs.get("use_max_normalization", True)
+        self.normalization_factor = kwargs.get(
+            "normalization_factor", 5.547176906585117
+        )
+        self.prepend_cls_token = kwargs.get("prepend_cls_token", False)
+        self.fixed_sequence_length = kwargs.get("fixed_sequence_length", None)
+        self.unpadded_length = kwargs.get("unpadded_length", None)
 
 
 class BinnedOmicTokenizer(PreTrainedTokenizer):
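
The new BinnedOmicTokenizerConfig only stores hyperparameters; how they drive tokenization lives in BinnedOmicTokenizer itself. A minimal usage sketch, assuming the class above is importable from tokenizer.py and that expression values are discretized into equal-width bins after max normalization (an illustrative assumption, not the repository's verified binning logic):

import numpy as np
from tokenizer import BinnedOmicTokenizerConfig  # class added in this commit

config = BinnedOmicTokenizerConfig()             # defaults shown in the diff above

values = np.array([0.1, 2.3, 5.0])               # made-up raw expression values
if config.use_max_normalization:
    values = values / config.normalization_factor   # scale into roughly [0, 1]

# Equal-width bin edges between min_omic_value and max_omic_value (assumed scheme)
edges = np.linspace(
    config.min_omic_value, config.max_omic_value, config.n_expressions_bins + 1
)
bin_ids = np.digitize(values, edges[1:-1])       # indices in 0 .. n_expressions_bins - 1
print(bin_ids)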
tokenizer_config.json CHANGED
@@ -1,17 +1,12 @@
 {
+  "added_tokens_decoder": {},
   "auto_map": {
     "AutoTokenizer": [
       "tokenizer.BinnedOmicTokenizer",
       null
     ]
   },
-  "tokenizer_class": "BinnedOmicTokenizer",
-  "n_expressions_bins": 64,
-  "min_omic_value": 0.0,
-  "max_omic_value": 1.0,
-  "use_max_normalization": true,
-  "normalization_factor": 5.547176906585117,
-  "prepend_cls_token": false,
-  "fixed_sequence_length": null,
-  "unpadded_length": null
+  "clean_up_tokenization_spaces": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "tokenizer_class": "BinnedOmicTokenizer"
 }
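
Because auto_map points AutoTokenizer at tokenizer.BinnedOmicTokenizer, the tokenizer can be loaded from the Hub with custom code enabled. A minimal sketch; the repository id below is a placeholder:

from transformers import AutoTokenizer

# Placeholder repo id; trust_remote_code=True lets transformers import the
# custom BinnedOmicTokenizer class referenced in auto_map.
tokenizer = AutoTokenizer.from_pretrained("<user>/<repo>", trust_remote_code=True)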