nioushasadjadi committed on
Commit 156a2ea · Parent: 2fa1eca

Adding max_length and padding to tokenizer and encoder.

Files changed (3):
  1. tokenizer.json +3 -4
  2. tokenizer.py +23 -18
  3. tokenizer_config.json +1 -3
tokenizer.json CHANGED
@@ -15,12 +15,11 @@
   "pre_tokenizer": {
     "type": "KmerSplitter",
     "k": 4,
-    "stride": 4
+    "stride": 4,
+    "max_length": 660
   },
   "model": {
-    "type": "k-mer",
-    "k": 4,
-    "stride": 4,
+    "type": "KmerTokenizer",
     "unk_token": "[UNK]",
     "vocab": {
       "[MASK]": 0,
tokenizer.py CHANGED
@@ -7,9 +7,10 @@ from itertools import product


 class KmerTokenizer(PreTrainedTokenizer):
-    def __init__(self, vocab_dict=None, k=4, stride=4, **kwargs):
+    def __init__(self, vocab_dict=None, k=4, stride=4, max_len=660, **kwargs):
         self.k = k
         self.stride = stride
+        self.max_len = max_len
         self.special_tokens = ["[MASK]", "[UNK]"]

         if vocab_dict is None:
@@ -27,6 +28,11 @@ class KmerTokenizer(PreTrainedTokenizer):
         # self.pad_token = "[PAD]"

     def tokenize(self, text, **kwargs):
+        if len(text) > self.max_len:
+            text = text[:self.max_len]
+        if kwargs.get('padding'):
+            if len(text) < self.max_len:
+                text = text + 'N' * (self.max_len - len(text))
         splits = [text[i:i + self.k] for i in range(0, len(text) - self.k + 1, self.stride)]
         return splits

@@ -64,12 +70,11 @@ class KmerTokenizer(PreTrainedTokenizer):
             "pre_tokenizer": {
                 "type": "KmerSplitter",
                 "k": self.k,
-                "stride": self.stride
+                "stride": self.stride,
+                "max_length": self.max_len
             },
             "model": {
-                "type": "k-mer",
-                "k": self.k,
-                "stride": self.stride,
+                "type": "KmerTokenizer",
                 "unk_token": self.unk_token,
                 "vocab": self.vocab_dict
             },
@@ -96,9 +101,7 @@ class KmerTokenizer(PreTrainedTokenizer):
             "mask_token": "[MASK]",
             "model_max_length": 1e12,  # Set a high number, or adjust as needed
             "tokenizer_class": "KmerTokenizer",  # Set your tokenizer class name
-            "unk_token": "[UNK]",
-            "k": self.k,
-            "stride": self.stride
+            "unk_token": "[UNK]"
         }
         tokenizer_config_file = os.path.join(save_directory, "tokenizer_config.json")
         with open(tokenizer_config_file, "w", encoding="utf-8") as f:
@@ -109,24 +112,26 @@ class KmerTokenizer(PreTrainedTokenizer):
     @classmethod
     def from_pretrained(cls, pretrained_dir, **kwargs):
         # Load vocabulary
-        vocab_file = hf_hub_download(repo_id=pretrained_dir, filename="tokenizer.json")
         # vocab_file = os.path.join(pretrained_dir, "tokenizer.json")
-        with open(vocab_file, "r", encoding="utf-8") as f:
-            vocab_content = json.load(f)
-        vocab = vocab_content["model"]["vocab"]
-        # k = vocab_content["model"]["k"]
-        # stride = vocab_content["model"]["stride"]
+        vocab_file = hf_hub_download(repo_id=pretrained_dir, filename="tokenizer.json")
+        if os.path.exists(vocab_file):
+            with open(vocab_file, "r", encoding="utf-8") as f:
+                vocab_content = json.load(f)
+            vocab = vocab_content["model"]["vocab"]
+            k = vocab_content["pre_tokenizer"]["k"]
+            stride = vocab_content["pre_tokenizer"]["stride"]
+            max_len = vocab_content["pre_tokenizer"]["max_length"]
+        else:
+            raise ValueError(f"Vocabulary file not found at {vocab_file}")

-        # Load k and stride from tokenizer_config.json
+        # Check for the existence of tokenizer_config.json
         # tokenizer_config_file = os.path.join(pretrained_dir, "tokenizer_config.json")
         tokenizer_config_file = hf_hub_download(repo_id=pretrained_dir, filename="tokenizer_config.json")
         if os.path.exists(tokenizer_config_file):
             with open(tokenizer_config_file, "r", encoding="utf-8") as f:
                 tokenizer_config = json.load(f)
-            k = tokenizer_config.get("k", 4)  # Default to 4 if not specified
-            stride = tokenizer_config.get("stride", k)  # Default to k if not specified
         else:
             raise ValueError(f"Tokenizer config file not found at {tokenizer_config_file}")

         # Instantiate the tokenizer with loaded values
-        return cls(vocab=vocab, k=k, stride=stride, **kwargs)
+        return cls(vocab=vocab, k=k, stride=stride, max_len=max_len, **kwargs)
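
A minimal usage sketch of the new truncation and padding behavior, assuming KmerTokenizer can be imported from tokenizer.py and builds its default 4-mer vocabulary when no vocab_dict is given (as the __init__ in this diff suggests):

from tokenizer import KmerTokenizer  # local module from this repo

tok = KmerTokenizer(k=4, stride=4, max_len=660)

seq = "ACGTACGTAC"          # 10 bases; the trailing "AC" is shorter than k and dropped
print(tok.tokenize(seq))    # ['ACGT', 'ACGT']

# With padding=True the text is right-padded with 'N' up to max_len=660,
# yielding (660 - 4) // 4 + 1 = 165 k-mers; 'N'-containing k-mers are not in
# the A/C/G/T vocabulary, so they should encode as [UNK].
print(len(tok.tokenize(seq, padding=True)))  # 165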
tokenizer_config.json CHANGED
@@ -27,7 +27,5 @@
   "mask_token": "[MASK]",
   "model_max_length": 1000000000000.0,
   "tokenizer_class": "KmerTokenizer",
-  "unk_token": "[UNK]",
-  "k": 4,
-  "stride": 4
+  "unk_token": "[UNK]"
 }