"""Train a BPE tokenizer on a local corpus file and save it as JSON.

The script builds a ``Tokenizer`` around a ``BPE`` model, splits input with
the ``Whitespace`` pre-tokenizer, trains on ``tokenizer_corpus.txt`` (path
relative to the current working directory) and writes the result to
``tokenizer.json`` under the IndicTransToolkit ``tokenizer_training``
directory.
"""

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

# NOTE(review): the unknown token and all four special tokens below are empty
# strings. This looks like placeholder text that lost its content (possibly
# angle-bracket tokens) -- confirm the intended values before training.
tokenizer = Tokenizer(BPE(unk_token=""))

# Initialize trainer
trainer = BpeTrainer(
    special_tokens=["", "", "", ""],
)

# Train tokenizer on your corpus files
# The pre-tokenizer is attached before training so it is applied to the
# corpus when the tokenizer is trained.
tokenizer.pre_tokenizer = Whitespace()
tokenizer.train(files=["tokenizer_corpus.txt"], trainer=trainer)

# Save tokenizer
# NOTE(review): the destination is a relative path; presumably its parent
# directory must already exist when this runs -- verify.
tokenizer.save("IndicTrans2/huggingface_interface/IndicTransToolkit/tokenizer_training/tokenizer.json")