# NeuraVerse/tokenizer/train_tokenizer.py
from tokenizers import ByteLevelBPETokenizer
import os

# Locate the training corpus: "../data/input.txt" when run from the
# tokenizer/ directory, falling back to "data/input.txt" from the repo root.
input_path = os.path.join("..", "data", "input.txt")
if not os.path.exists(input_path):
    input_path = os.path.join("data", "input.txt")

# Train a byte-level BPE tokenizer on the corpus.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=input_path, vocab_size=1000, min_frequency=2)

# Save vocab.json and merges.txt alongside this script; abspath() guards
# against dirname(__file__) being "" when run from this directory.
tokenizer.save_model(os.path.dirname(os.path.abspath(__file__)))
print("Tokenizer trained and saved.")