bioscan-ml
/

BarcodeBERT

Feature Extraction

token-classification

text-embeddings-inference

Model card Files Files and versions Community

nioushasadjadi committed on Dec 5, 2024

Commit

4e98ce2

·

1 Parent(s): 4a303bd

Changing the call function.

Files changed (1) hide show

tokenizer.py +20 -0

tokenizer.py CHANGED Viewed

@@ -135,3 +135,23 @@ class KmerTokenizer(PreTrainedTokenizer):
         # Instantiate the tokenizer with loaded values
         return cls(vocab=vocab, k=k, stride=stride, max_len=max_len, **kwargs)

         # Instantiate the tokenizer with loaded values
         return cls(vocab=vocab, k=k, stride=stride, max_len=max_len, **kwargs)
+    def __call__(self, text, padding=False, **kwargs):  # Encode `text` and return a dict with input_ids, token_type_ids and attention_mask.
+        token_ids = self.encode(text, padding=padding, **kwargs)  # all extra kwargs (including return_tensors) are forwarded to encode
+        unk_token_id = self.vocab_dict.get("[UNK]")  # id looked up under the "[UNK]" key of vocab_dict
+        attention_mask = [1 if id_ != unk_token_id else 0 for id_ in token_ids]  # 0 where the id equals the [UNK] id, 1 elsewhere. NOTE(review): masks [UNK], not a pad token -- presumably padding is encoded as [UNK]; confirm
+        token_type_ids = [0] * len(token_ids)  # one 0 per element of token_ids
+        # When return_tensors == 'pt', convert the mask and the type ids to torch tensors; token_ids is not converted here
+        if kwargs.get('return_tensors') == 'pt':
+            attention_mask = torch.tensor(attention_mask)
+            token_type_ids = torch.tensor(token_type_ids)
+        # Return input_ids, token_type_ids and attention_mask together in one dictionary
+        return {
+            "input_ids": token_ids,  # passed through exactly as returned by self.encode
+            "token_type_ids": token_type_ids,
+            "attention_mask": attention_mask
+        }