"""Train a byte-level BPE tokenizer on the project's text corpus and save it.

Looks for the training text at ``../data/input.txt`` (relative to the
current working directory), falling back to ``data/input.txt``, then
writes the model files (vocab.json / merges.txt) next to this script.
"""
import os

from tokenizers import ByteLevelBPETokenizer


def main() -> None:
    """Train the tokenizer on the corpus and save its model files.

    Raises:
        FileNotFoundError: if the training corpus is not found at either
            candidate location.
    """
    # Prefer the path used when running from a scripts subdirectory;
    # fall back to running from the project root.
    input_path = os.path.join("..", "data", "input.txt")
    if not os.path.exists(input_path):
        input_path = os.path.join("data", "input.txt")
    if not os.path.exists(input_path):
        # Fail early with a clear message instead of an opaque trainer error.
        raise FileNotFoundError(
            "Training corpus not found at ../data/input.txt or data/input.txt"
        )

    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=input_path, vocab_size=1000, min_frequency=2)

    # abspath guards against os.path.dirname(__file__) == "" when the
    # script is invoked from its own directory (e.g. `python thisfile.py`),
    # which would otherwise pass an empty save directory to save_model.
    save_dir = os.path.dirname(os.path.abspath(__file__))
    tokenizer.save_model(save_dir)
    print("Tokenizer trained and saved.")


if __name__ == "__main__":
    main()