nioushasadjadi
committed on
Commit
·
92d46e2
1
Parent(s):
eb1e311
Adding automap to the tokenizer config
Browse files- tokenizer.py +8 -1
- tokenizer_config.json +6 -0
tokenizer.py
CHANGED
@@ -93,6 +93,12 @@ class KmerTokenizer(PreTrainedTokenizer):
|
|
93 |
"1": {"content": "[UNK]", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False,
|
94 |
"special": True}
|
95 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
"clean_up_tokenization_spaces": True,
|
97 |
"mask_token": "[MASK]",
|
98 |
"model_max_length": 1e12, # Set a high number, or adjust as needed
|
@@ -119,7 +125,8 @@ class KmerTokenizer(PreTrainedTokenizer):
|
|
119 |
# stride = vocab_content["model"]["stride"]
|
120 |
|
121 |
# Load k and stride from tokenizer_config.json
|
122 |
-
tokenizer_config_file = os.path.join(pretrained_dir, "tokenizer_config.json")
|
|
|
123 |
if os.path.exists(tokenizer_config_file):
|
124 |
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
125 |
tokenizer_config = json.load(f)
|
|
|
93 |
"1": {"content": "[UNK]", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False,
|
94 |
"special": True}
|
95 |
},
|
96 |
+
"auto_map": {
|
97 |
+
"AutoTokenizer": [
|
98 |
+
"tokenizer.KmerTokenizer",
|
99 |
+
None
|
100 |
+
]
|
101 |
+
},
|
102 |
"clean_up_tokenization_spaces": True,
|
103 |
"mask_token": "[MASK]",
|
104 |
"model_max_length": 1e12, # Set a high number, or adjust as needed
|
|
|
125 |
# stride = vocab_content["model"]["stride"]
|
126 |
|
127 |
# Load k and stride from tokenizer_config.json
|
128 |
+
# tokenizer_config_file = os.path.join(pretrained_dir, "tokenizer_config.json")
|
129 |
+
tokenizer_config_file = hf_hub_download(repo_id=pretrained_dir, filename="tokenizer_config.json")
|
130 |
if os.path.exists(tokenizer_config_file):
|
131 |
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
132 |
tokenizer_config = json.load(f)
|
tokenizer_config.json
CHANGED
@@ -17,6 +17,12 @@
|
|
17 |
"special": true
|
18 |
}
|
19 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
"clean_up_tokenization_spaces": true,
|
21 |
"mask_token": "[MASK]",
|
22 |
"model_max_length": 1000000000000.0,
|
|
|
17 |
"special": true
|
18 |
}
|
19 |
},
|
20 |
+
"auto_map": {
|
21 |
+
"AutoTokenizer": [
|
22 |
+
"tokenizer.KmerTokenizer",
|
23 |
+
null
|
24 |
+
]
|
25 |
+
},
|
26 |
"clean_up_tokenization_spaces": true,
|
27 |
"mask_token": "[MASK]",
|
28 |
"model_max_length": 1000000000000.0,
|