# finewebedu_32000/train.py
import os
import datasets
import tokenizers
import transformers

# Yield successive batches of raw text from the dataset for the trainer
def dataset_iterator(dataset, batch_size):
    for i in range(0, len(dataset), batch_size):
        yield dataset[i:i+batch_size]["text"]
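
# Directory containing this script (root_path is not referenced again below)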
script_path = os.path.abspath(__file__)
root_path = os.path.dirname(script_path)
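
# Download the CC-MAIN-2024-10 subset of FineWeb-Edu (train split)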
dataset = datasets.load_dataset("HuggingFaceFW/fineweb-edu", name="CC-MAIN-2024-10", split="train", trust_remote_code=True)
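
# Start from an empty BPE model whose only initial vocabulary entry is the [UNK] token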
model = tokenizers.models.BPE(vocab={"[UNK]":0}, merges=[], unk_token="[UNK]")
tokenizer = tokenizers.Tokenizer(model)

# Normalizer: NFKD Unicode normalization
tokenizer.normalizer = tokenizers.normalizers.Sequence([tokenizers.normalizers.NFKD()])

# Pre-tokenizer: split on spaces, tabs and newlines (kept as isolated tokens),
# then on individual digits and on punctuation
tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence([tokenizers.pre_tokenizers.Split(" ", "isolated"),
                                                              tokenizers.pre_tokenizers.Split("\t", "isolated"),
                                                              tokenizers.pre_tokenizers.Split("\n", "isolated"),
                                                              tokenizers.pre_tokenizers.Digits(individual_digits=True),
                                                              tokenizers.pre_tokenizers.Punctuation()])

# Postprocessor (none set)

# Decoder
tokenizer.decoder = tokenizers.decoders.BPEDecoder()
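
# Train a 32k-token BPE vocabulary, reserving the special tokens below and
# capping the initial character alphabet at 512 symbols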
trainer = tokenizers.trainers.BpeTrainer(vocab_size=32_000, special_tokens=["[CLS]","[BOS]","[UNK]","[MASK]","[EOS]","[SEP]","[EOT]","[PAD]"], limit_alphabet=512)
tokenizer.train_from_iterator(dataset_iterator(dataset, 1024), trainer, len(dataset))
transformers.PreTrainedTokenizerFast(tokenizer_object=tokenizer).save_pretrained(".")
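
# Sanity-check sketch: reload the saved files through transformers' AutoTokenizer
# and round-trip an arbitrary sample string (this assumes the save above wrote to
# the current working directory)
reloaded = transformers.AutoTokenizer.from_pretrained(".")
sample_ids = reloaded("FineWeb-Edu sample text, 2024.").input_ids
print(reloaded.decode(sample_ids))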