nioushasadjadi committed on
Commit
92d46e2
·
1 Parent(s): eb1e311

Adding auto_map to the tokenizer config

Browse files
Files changed (2) hide show
  1. tokenizer.py +8 -1
  2. tokenizer_config.json +6 -0
tokenizer.py CHANGED
@@ -93,6 +93,12 @@ class KmerTokenizer(PreTrainedTokenizer):
93
  "1": {"content": "[UNK]", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False,
94
  "special": True}
95
  },
 
 
 
 
 
 
96
  "clean_up_tokenization_spaces": True,
97
  "mask_token": "[MASK]",
98
  "model_max_length": 1e12, # Set a high number, or adjust as needed
@@ -119,7 +125,8 @@ class KmerTokenizer(PreTrainedTokenizer):
119
  # stride = vocab_content["model"]["stride"]
120
 
121
  # Load k and stride from tokenizer_config.json
122
- tokenizer_config_file = os.path.join(pretrained_dir, "tokenizer_config.json")
 
123
  if os.path.exists(tokenizer_config_file):
124
  with open(tokenizer_config_file, "r", encoding="utf-8") as f:
125
  tokenizer_config = json.load(f)
 
93
  "1": {"content": "[UNK]", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False,
94
  "special": True}
95
  },
96
+ "auto_map": {
97
+ "AutoTokenizer": [
98
+ "tokenizer.KmerTokenizer",
99
+ None
100
+ ]
101
+ },
102
  "clean_up_tokenization_spaces": True,
103
  "mask_token": "[MASK]",
104
  "model_max_length": 1e12, # Set a high number, or adjust as needed
 
125
  # stride = vocab_content["model"]["stride"]
126
 
127
  # Load k and stride from tokenizer_config.json
128
+ # tokenizer_config_file = os.path.join(pretrained_dir, "tokenizer_config.json")
129
+ tokenizer_config_file = hf_hub_download(repo_id=pretrained_dir, filename="tokenizer_config.json")
130
  if os.path.exists(tokenizer_config_file):
131
  with open(tokenizer_config_file, "r", encoding="utf-8") as f:
132
  tokenizer_config = json.load(f)
tokenizer_config.json CHANGED
@@ -17,6 +17,12 @@
17
  "special": true
18
  }
19
  },
 
 
 
 
 
 
20
  "clean_up_tokenization_spaces": true,
21
  "mask_token": "[MASK]",
22
  "model_max_length": 1000000000000.0,
 
17
  "special": true
18
  }
19
  },
20
+ "auto_map": {
21
+ "AutoTokenizer": [
22
+ "tokenizer.KmerTokenizer",
23
+ null
24
+ ]
25
+ },
26
  "clean_up_tokenization_spaces": true,
27
  "mask_token": "[MASK]",
28
  "model_max_length": 1000000000000.0,