potterGPT-v0 / build_tokenizer.py
nullHawk's picture
add: v0
9fe7c42 verified
raw
history blame contribute delete
796 Bytes
from pathlib import Path
from tokenizers import Tokenizer
from tokenizers.models import BPE
import tokenizers.pre_tokenizers as pre_tokenizers
import tokenizers.processors as processors
import tokenizers.decoders as decoders
from tokenizers.trainers import BpeTrainer
if __name__ == '__main__':
tokenizer_path = Path('tokenizer/')
tokenizer_path.mkdir(exist_ok=True)
tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
tokenizer.decoder = decoders.ByteLevel()
trainer = BpeTrainer(special_tokens=['<|endoftext|>'], min_frequency=2)
tokenizer.train(['data/harry_potter_data'],trainer)
tokenizer.save(str(tokenizer_path / 'potter.json'))