Update tokenizer_script.py
tokenizer_script.py (+5 -5)
```diff
@@ -76,19 +76,19 @@ class CharacterTokenizer(PreTrainedTokenizer):
 
         return (vocab_file,)
 
-    def batch_encode(
-        encoded_texts = [
+    def batch_encode(self, texts, add_special_tokens=False, padding=False, truncation=True, max_length=None):
+        encoded_texts = [self.encode(text) for text in texts]
         # Handle max_length (truncation)
         if max_length is not None:
             encoded_texts = [ids[:max_length] for ids in encoded_texts]
         if add_special_tokens:
-            bos_token_id =
-            eos_token_id =
+            bos_token_id = self.convert_tokens_to_ids(tokenizer.bos_token)
+            eos_token_id = self.convert_tokens_to_ids(tokenizer.eos_token)
             encoded_texts = [[bos_token_id] + ids + [eos_token_id] for ids in encoded_texts]
         # Handle padding
         if padding:
             # properly handle padding side
-            pad_id =
+            pad_id = self.vocab.get(tokenizer.pad_token, 0)
             max_len = max(len(ids) for ids in encoded_texts) if max_length is None else max_length
             if tokenizer.padding_side == "right":
                 encoded_texts = [ids + [pad_id] * (max_len - len(ids)) for ids in encoded_texts]
```
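Note that the patched lines still reference a module-level `tokenizer` (`tokenizer.bos_token`, `tokenizer.pad_token`, `tokenizer.padding_side`) from inside the class, which raises a `NameError` unless a global instance of that name happens to exist; inside `batch_encode` these presumably should be `self`. Below is a minimal, self-contained sketch of the completed logic with that substitution applied. The simplified constructor, `encode`, and `convert_tokens_to_ids` are assumptions made for the sake of a runnable example, not the file's actual implementation (the real class subclasses `PreTrainedTokenizer`), and the left-padding branch is a guess at the code that follows the end of the hunk.

```python
# Minimal standalone sketch of the completed batch_encode, with the stray
# module-level `tokenizer` references replaced by `self`. The constructor,
# encode, and convert_tokens_to_ids below are simplified assumptions, not
# the repository's actual implementation.
class CharacterTokenizer:
    def __init__(self, corpus, bos_token="[BOS]", eos_token="[EOS]",
                 pad_token="[PAD]", padding_side="right"):
        self.bos_token = bos_token
        self.eos_token = eos_token
        self.pad_token = pad_token
        self.padding_side = padding_side
        # Special tokens first so their ids stay stable across corpora.
        chars = sorted(set("".join(corpus)))
        self.vocab = {tok: i for i, tok in
                      enumerate([pad_token, bos_token, eos_token] + chars)}

    def encode(self, text):
        # Unknown characters fall back to the pad id (an assumption; a real
        # tokenizer would more likely use a dedicated [UNK] token).
        return [self.vocab.get(ch, 0) for ch in text]

    def convert_tokens_to_ids(self, token):
        return self.vocab[token]

    def batch_encode(self, texts, add_special_tokens=False, padding=False,
                     truncation=True, max_length=None):
        encoded_texts = [self.encode(text) for text in texts]
        # Handle max_length (truncation); the patch accepts `truncation` but
        # never checks it, so it is honored here.
        if truncation and max_length is not None:
            encoded_texts = [ids[:max_length] for ids in encoded_texts]
        if add_special_tokens:
            bos_token_id = self.convert_tokens_to_ids(self.bos_token)
            eos_token_id = self.convert_tokens_to_ids(self.eos_token)
            encoded_texts = [[bos_token_id] + ids + [eos_token_id]
                             for ids in encoded_texts]
        # Handle padding, respecting padding_side. Padding to the batch max
        # (rather than to max_length, as the patch does) keeps rows the same
        # length even after the two special tokens push sequences past
        # max_length.
        if padding:
            pad_id = self.vocab.get(self.pad_token, 0)
            max_len = max(len(ids) for ids in encoded_texts)
            if self.padding_side == "right":
                encoded_texts = [ids + [pad_id] * (max_len - len(ids))
                                 for ids in encoded_texts]
            else:
                # Left padding, e.g. for decoder-only generation; the hunk
                # ends at the right-padding branch, so this is a guess at
                # what follows it.
                encoded_texts = [[pad_id] * (max_len - len(ids)) + ids
                                 for ids in encoded_texts]
        return encoded_texts


tok = CharacterTokenizer(["hello world"])
print(tok.batch_encode(["hi", "hello"], add_special_tokens=True, padding=True))
# -> [[1, 6, 0, 2, 0, 0, 0], [1, 6, 5, 7, 7, 8, 2]]
```

The usage lines at the bottom print two equal-length rows, which is the one behavioral difference from the patch: padding to the batch maximum avoids the negative pad counts that arise when the two added special tokens push sequences past `max_length`.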