File size: 429 Bytes
4bb9d41
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
from transformers import PreTrainedTokenizerFast
from typing import List, Dict

class NigerianLanguageTokenizer:
    """Thin wrapper that batch-encodes text through a supplied tokenizer.

    The wrapper performs no text processing of its own; it stores the
    tokenizer it is given and forwards batches to it with a fixed set of
    encoding options.
    """

    def __init__(self, base_tokenizer: PreTrainedTokenizerFast):
        """Store the tokenizer that all later encoding calls delegate to.

        Args:
            base_tokenizer: Tokenizer object invoked by ``tokenize_batch``.
        """
        self.tokenizer = base_tokenizer

    def tokenize_batch(self, texts: List[str]) -> Dict:
        """Encode ``texts`` in a single call to the wrapped tokenizer.

        The tokenizer is called with padding and truncation switched on and
        with ``return_tensors="pt"``.

        Args:
            texts: The strings to encode together as one batch.

        Returns:
            Whatever the wrapped tokenizer returns for this call, unmodified.
        """
        encode = self.tokenizer
        # Options are fixed here; callers cannot override them per call.
        return encode(texts, return_tensors="pt", truncation=True, padding=True)