from tokenizers import ByteLevelBPETokenizer
import os

# Locate the training corpus; fall back to a path relative to the
# current working directory if the script is run from the repo root.
input_path = os.path.join("..", "data", "input.txt")
if not os.path.exists(input_path):
    input_path = os.path.join("data", "input.txt")

# Train a byte-level BPE tokenizer on the corpus.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=input_path, vocab_size=1000, min_frequency=2)

# Save vocab.json and merges.txt next to this script
# (abspath guards against dirname returning an empty string).
tokenizer.save_model(os.path.dirname(os.path.abspath(__file__)))
print("Tokenizer trained and saved.")