from tokenizers import ByteLevelBPETokenizer
import os

# Locate the training corpus: prefer ../data/input.txt, fall back to data/input.txt.
input_path = os.path.join("..", "data", "input.txt")
if not os.path.exists(input_path):
    input_path = os.path.join("data", "input.txt")

# Train a byte-level BPE tokenizer on the corpus.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=[input_path], vocab_size=1000, min_frequency=2)

# Save vocab.json and merges.txt next to this script. os.path.abspath guards
# against os.path.dirname(__file__) returning "" when the script is run from
# its own directory.
save_dir = os.path.dirname(os.path.abspath(__file__))
tokenizer.save_model(save_dir)
print("Tokenizer trained and saved.")