|
import os

import datasets
import tokenizers
import transformers


def dataset_iterator(dataset, batch_size):
    # Stream the corpus as batches of raw text so training never has to
    # hold the whole dataset in memory at once.
    for i in range(0, len(dataset), batch_size):
        yield dataset[i:i + batch_size]["text"]


# Resolve the directory containing this script so the trained tokenizer is
# saved next to it, regardless of the current working directory.
script_path = os.path.abspath(__file__)
root_path = os.path.dirname(script_path)

# One Common Crawl snapshot of FineWeb-Edu; the "text" column supplies the
# raw training corpus. The dataset is plain parquet, so no remote code runs.
dataset = datasets.load_dataset(
    "HuggingFaceFW/fineweb-edu",
    name="CC-MAIN-2024-10",
    split="train",
)

# Start from a near-empty BPE model seeded only with the unknown token; the
# vocabulary and merge rules are learned by the trainer below.
model = tokenizers.models.BPE(vocab={"[UNK]": 0}, merges=[], unk_token="[UNK]")
tokenizer = tokenizers.Tokenizer(model)

# NFKD unicode normalization decomposes compatibility characters (ligatures,
# full-width forms, combining accents) before any splitting happens.
tokenizer.normalizer = tokenizers.normalizers.NFKD()

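# Illustration (a sketch, safe to remove): normalize_str shows the effect,
# e.g. the "ﬁ" ligature decomposes to plain "fi".
print(tokenizer.normalizer.normalize_str("ﬁne-tuning"))
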
# Pre-tokenization: whitespace characters are kept as isolated pieces,
# numbers are split into individual digits, and punctuation is split off,
# so learned merges never cross those boundaries.
tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence([
    tokenizers.pre_tokenizers.Split(" ", "isolated"),
    tokenizers.pre_tokenizers.Split("\t", "isolated"),
    tokenizers.pre_tokenizers.Split("\n", "isolated"),
    tokenizers.pre_tokenizers.Digits(individual_digits=True),
    tokenizers.pre_tokenizers.Punctuation(),
])

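# Illustration (a sketch, safe to remove): pre_tokenize_str shows the pieces
# the trainer will see; digits and punctuation come out one character each.
print(tokenizer.pre_tokenizer.pre_tokenize_str("In 2024, costs fell!"))
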
# BPEDecoder joins tokens back into text; because whitespace survives as
# its own tokens here, decoding amounts to concatenating the pieces.
tokenizer.decoder = tokenizers.decoders.BPEDecoder()

# Learn a 32k-entry vocabulary. limit_alphabet caps how many distinct base
# characters enter the initial alphabet; rarer characters map to "[UNK]".
trainer = tokenizers.trainers.BpeTrainer(
    vocab_size=32_000,
    special_tokens=["[CLS]", "[BOS]", "[UNK]", "[MASK]", "[EOS]", "[SEP]", "[EOT]", "[PAD]"],
    limit_alphabet=512,
)
tokenizer.train_from_iterator(dataset_iterator(dataset, 1024), trainer, length=len(dataset))

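# Sanity check (a sketch, safe to remove): the learned vocabulary should be
# at or near the requested 32k entries once training finishes.
print("vocab size:", tokenizer.get_vocab_size())
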
# Wrap the trained tokenizer for use with transformers, registering the
# special tokens so the wrapper knows about them, and save next to the
# script (this is what root_path was computed for).
transformers.PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
    bos_token="[BOS]",
    eos_token="[EOS]",
    additional_special_tokens=["[EOT]"],
).save_pretrained(root_path)

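# Usage sketch (assumes the files above were written successfully): reload
# the saved tokenizer and round-trip a sample sentence.
reloaded = transformers.PreTrainedTokenizerFast.from_pretrained(root_path)
ids = reloaded.encode("Trained on FineWeb-Edu with a 32000-entry vocabulary.")
print(ids)
print(reloaded.decode(ids, skip_special_tokens=True))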