File size: 572 Bytes
d44849f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
"""Train a byte-pair-encoding (BPE) tokenizer on a plain-text corpus.

Reads the whitespace-pre-tokenized corpus from CORPUS_FILES, learns a BPE
vocabulary with the listed special tokens, and writes the resulting
tokenizer definition to OUTPUT_PATH as JSON.
"""
from pathlib import Path

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# Input corpus file(s) and where the trained tokenizer JSON is written.
CORPUS_FILES = ["tokenizer_corpus.txt"]
OUTPUT_PATH = Path(
    "IndicTrans2/huggingface_interface/IndicTransToolkit/tokenizer_training/tokenizer.json"
)

# BPE model with an explicit unknown-token symbol so out-of-vocabulary
# input maps to "<unk>" instead of failing.
tokenizer = Tokenizer(BPE(unk_token="<unk>"))

# Initialize trainer; special tokens are reserved so they are never split
# and receive fixed ids at the front of the vocabulary.
trainer = BpeTrainer(
    special_tokens=["<unk>", "<s>", "</s>", "<pad>"],
)

# Split raw text on whitespace before BPE merges are learned/applied.
tokenizer.pre_tokenizer = Whitespace()

# Train tokenizer on the corpus files.
tokenizer.train(files=CORPUS_FILES, trainer=trainer)

# Tokenizer.save does not create missing parent directories — ensure the
# output directory exists before writing, then save.
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
tokenizer.save(str(OUTPUT_PATH))