from tokenizers import ByteLevelBPETokenizer
import os

# Locate the training corpus: prefer ../data/input.txt, fall back to data/input.txt.
input_path = os.path.join("..", "data", "input.txt")
if not os.path.exists(input_path):
    input_path = os.path.join("data", "input.txt")

# Train a byte-level BPE tokenizer on the corpus.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=input_path, vocab_size=1000, min_frequency=2)

# Save vocab.json and merges.txt next to this script. abspath() guards against
# os.path.dirname(__file__) returning "" when the script is run from its own directory.
tokenizer.save_model(os.path.dirname(os.path.abspath(__file__)))
print("Tokenizer trained and saved.")