|
class CustomTokenizer:
    """Wrap a pretrained tokenizer and mask numbers, URLs and e-mail addresses.

    Before text is handed to the wrapped tokenizer, every number, URL and
    e-mail address is replaced by the placeholder ``<num>``, ``<url>`` or
    ``<mail>``.  The placeholders are registered with the wrapped tokenizer
    through ``add_tokens`` at construction time.  Any attribute not defined
    on this class is looked up on the wrapped tokenizer.

    NOTE(review): ``add_tokens`` may grow the vocabulary; presumably a model
    used with this tokenizer needs its embedding matrix resized accordingly
    -- confirm at the call site.
    """

    # Compiled once at class creation. Compiled patterns are immutable, so
    # sharing them between instances is safe; they remain reachable as
    # ``self.number_pattern`` / ``self.url_pattern`` / ``self.mail_pattern``.
    number_pattern = re.compile(r'\b\d+\.?\d*\b')
    url_pattern = re.compile(
        r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}'
        r'|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}'
        r'|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}'
        r'|www\.[a-zA-Z0-9]+\.[^\s]{2,})'
    )
    mail_pattern = re.compile(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}')

    def __init__(self, pretrained_tokenizer_path, cache_dir):
        """Load the pretrained tokenizer and register the placeholder tokens.

        Args:
            pretrained_tokenizer_path: Identifier or path forwarded to
                ``AutoTokenizer.from_pretrained``.
            cache_dir: Cache directory forwarded to
                ``AutoTokenizer.from_pretrained``.
        """
        # NOTE(review): trust_remote_code=True lets the loaded repository run
        # its own Python code; only point this at sources you trust.
        self.tokenizer = AutoTokenizer.from_pretrained(
            pretrained_tokenizer_path,
            cache_dir=cache_dir,
            trust_remote_code=True,
        )

        # Registration order is kept stable on purpose: ids for added tokens
        # are presumably assigned in the order given -- do not reorder.
        new_tokens = ['<num>', '<url>', '<mail>']
        self.tokenizer.add_tokens(new_tokens)

        # Per-instance dict so one instance can be customised without
        # affecting the others.
        self.replacement_symbols = {
            "url": '<url>',
            "num": '<num>',
            "mail": '<mail>',
        }

    def preprocess(self, text):
        """Return ``text`` with URLs, e-mail addresses and numbers masked.

        Args:
            text: The string to normalise.

        Returns:
            A new string in which each match has been replaced by the
            corresponding entry of ``self.replacement_symbols``.
        """
        symbols = self.replacement_symbols
        # URLs and e-mail addresses are masked BEFORE numbers. Masking numbers
        # first rewrites digit groups inside them (e.g. "http://192.168.1.1/x"
        # or "123@example.com"), after which the URL / mail patterns no longer
        # match and the address leaks through half-masked.
        text = self.url_pattern.sub(symbols["url"], text)
        text = self.mail_pattern.sub(symbols["mail"], text)
        text = self.number_pattern.sub(symbols["num"], text)
        return text

    def _preprocess_input(self, text):
        """Apply ``preprocess`` to a string or, recursively, to a list/tuple."""
        if isinstance(text, (list, tuple)):
            return [self._preprocess_input(item) for item in text]
        # Anything else goes straight to ``preprocess`` so that unsupported
        # types still fail with the same ``TypeError`` from ``re`` as before.
        return self.preprocess(text)

    def __getattr__(self, attr):
        """Delegate attributes not found on this object to the wrapped tokenizer.

        Raises:
            AttributeError: If the wrapped tokenizer has not been set yet, or
                if it does not provide ``attr`` either.
        """
        # ``__getattr__`` only runs after normal lookup failed. Reading
        # ``self.tokenizer`` here while it is missing (object created through
        # ``__new__``, copy or unpickling) would re-enter this method forever,
        # so the instance dict is consulted directly instead.
        try:
            wrapped = self.__dict__["tokenizer"]
        except KeyError:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {attr!r}"
            ) from None
        return getattr(wrapped, attr)

    def __call__(self, text, **kwargs):
        """Mask ``text`` and tokenize it with the wrapped tokenizer.

        Args:
            text: A single string, or a list/tuple of strings (nested
                lists/tuples are processed recursively and passed on as
                lists).
            **kwargs: Forwarded unchanged to the wrapped tokenizer.

        Returns:
            Whatever the wrapped tokenizer returns for the masked input.
        """
        preprocessed_text = self._preprocess_input(text)
        return self.tokenizer(preprocessed_text, **kwargs)