import os

import datasets
import tokenizers
import transformers


def dataset_iterator(dataset, batch_size):
    """Yield batches of raw text for tokenizer training."""
    for i in range(0, len(dataset), batch_size):
        yield dataset[i:i + batch_size]["text"]


script_path = os.path.abspath(__file__)
root_path = os.path.dirname(script_path)

# Training corpus: one Common Crawl snapshot of FineWeb-Edu.
dataset = datasets.load_dataset(
    "HuggingFaceFW/fineweb-edu",
    name="CC-MAIN-2024-10",
    split="train",
    trust_remote_code=True,
)

# Start from an empty BPE model; the vocabulary and merges are learned during training.
model = tokenizers.models.BPE(vocab={"[UNK]": 0}, merges=[], unk_token="[UNK]")
tokenizer = tokenizers.Tokenizer(model)

# Normalizer: Unicode compatibility decomposition.
tokenizer.normalizer = tokenizers.normalizers.Sequence([tokenizers.normalizers.NFKD()])

# Pre-tokenizer: keep space, tab, and newline as isolated tokens, split digits
# individually, and split on punctuation.
tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence([
    tokenizers.pre_tokenizers.Split(" ", "isolated"),
    tokenizers.pre_tokenizers.Split("\t", "isolated"),
    tokenizers.pre_tokenizers.Split("\n", "isolated"),
    tokenizers.pre_tokenizers.Digits(individual_digits=True),
    tokenizers.pre_tokenizers.Punctuation(),
])

# Post-processor: none (library default).

# Decoder
tokenizer.decoder = tokenizers.decoders.BPEDecoder()

trainer = tokenizers.trainers.BpeTrainer(
    vocab_size=32_000,
    special_tokens=["[CLS]", "[BOS]", "[UNK]", "[MASK]", "[EOS]", "[SEP]", "[EOT]", "[PAD]"],
    limit_alphabet=512,
)

tokenizer.train_from_iterator(dataset_iterator(dataset, 1024), trainer, len(dataset))

# Wrap the trained tokenizer in a transformers fast tokenizer and save it
# next to this script.
transformers.PreTrainedTokenizerFast(tokenizer_object=tokenizer).save_pretrained(root_path)
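
# Quick sanity check (a minimal sketch, not part of the training run; the
# sample sentence below is illustrative only): reload the saved files through
# transformers and round-trip one string to confirm the tokenizer works.
reloaded = transformers.PreTrainedTokenizerFast.from_pretrained(root_path)
ids = reloaded.encode("The 3 quick brown foxes jumped over 12 lazy dogs.")
print(reloaded.convert_ids_to_tokens(ids))
print(reloaded.decode(ids))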