class CustomTokenizer:
    """Wrap a pretrained tokenizer and normalise text before tokenizing.

    URLs, e-mail addresses and numbers are matched with regular expressions
    and substituted by the entries of ``replacement_symbols`` before the text
    is handed to the wrapped tokenizer. Any attribute this class does not
    define itself is looked up on the wrapped tokenizer.
    """

    def __init__(self, pretrained_tokenizer_path, cache_dir):
        """Load the wrapped tokenizer and set up the preprocessing rules.

        Args:
            pretrained_tokenizer_path: Forwarded as the first argument of
                ``AutoTokenizer.from_pretrained``.
            cache_dir: Forwarded as ``cache_dir`` to the same call.
        """
        # NOTE(review): trust_remote_code=True allows the loaded repository to
        # execute its own Python code; only use this with trusted sources.
        self.tokenizer = AutoTokenizer.from_pretrained(
            pretrained_tokenizer_path,
            cache_dir=cache_dir,
            trust_remote_code=True,
        )
        # NOTE(review): the three added tokens (and the replacement symbols
        # below) are empty strings in this file, so matches are removed rather
        # than replaced by a marker. Confirm this is intended.
        new_tokens = ['', '', '']
        self.tokenizer.add_tokens(new_tokens)
        self._init_preprocessing()

    def _init_preprocessing(self):
        """Compile the number/URL/e-mail patterns and set their replacements."""
        self.number_pattern = re.compile(r'\b\d+\.?\d*\b')
        self.url_pattern = re.compile(
            r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}'
            r'|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}'
            r'|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}'
            r'|www\.[a-zA-Z0-9]+\.[^\s]{2,})'
        )
        self.mail_pattern = re.compile(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}')
        self.replacement_symbols = {
            "url": '',  # 'Ħ'
            "num": '',  # 'IJ'
            "mail": '',  # 'Ĵ'
        }

    def preprocess(self, text):
        """Substitute URLs, e-mail addresses and numbers in ``text``.

        Args:
            text: A string, or a list/tuple of strings (handled element-wise,
                recursively).

        Returns:
            The substituted string, or a list of substituted items when a
            list/tuple was given.
        """
        if isinstance(text, (list, tuple)):
            return [self.preprocess(item) for item in text]

        symbols = self.replacement_symbols
        # Numbers go last: substituting them first rewrites the digits inside
        # URLs and e-mail addresses (e.g. "http://192.168.1.1/x" turns into
        # "http://./x"), after which those patterns no longer match.
        text = self.url_pattern.sub(symbols["url"], text)
        text = self.mail_pattern.sub(symbols["mail"], text)
        return self.number_pattern.sub(symbols["num"], text)

    def __getattr__(self, attr):
        """Delegate attributes not found on this object to the wrapped tokenizer.

        Raises:
            AttributeError: If the wrapped tokenizer has not been set yet, or
                if it does not provide ``attr`` either.
        """
        # This hook only runs when normal lookup fails. Read ``tokenizer``
        # straight from the instance dict: going through ``self.tokenizer``
        # would re-enter this method and recurse without end whenever the
        # attribute is missing (e.g. on an instance created without __init__).
        try:
            wrapped = self.__dict__["tokenizer"]
        except KeyError:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {attr!r}"
            ) from None
        return getattr(wrapped, attr)

    def __call__(self, text, **kwargs):
        """Preprocess ``text`` and tokenize it with the wrapped tokenizer.

        Only ``text`` is preprocessed; keyword arguments are forwarded to the
        wrapped tokenizer unchanged.
        """
        return self.tokenizer(self.preprocess(text), **kwargs)