MEIRa / pytorch_utils /transformer_utils.py
KawshikManikantan's picture
upload_trial
98e2ea5
raw
history blame
528 Bytes
from transformers import LongformerTokenizerFast, AutoTokenizer, PreTrainedTokenizerFast
def get_tokenizer(model_str: str) -> PreTrainedTokenizerFast:
if "longformer" in model_str:
tokenizer = LongformerTokenizerFast.from_pretrained(
model_str,
add_prefix_space=True,
clean_up_tokenization_spaces=True,
)
else:
tokenizer = AutoTokenizer.from_pretrained(
model_str, use_fast=True, clean_up_tokenization_spaces=True
)
return tokenizer