from datasets import load_dataset
from transformers import AutoTokenizer

# Start from the pretrained GPT-2 tokenizer and retrain it on multilingual data.
old_tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Wiki40B splits: English, Japanese, Korean, simplified and traditional Chinese.
langs = ["en", "ja", "ko", "zh-cn", "zh-tw"]
raw_datasets = [
    load_dataset("wiki40b", lang, beam_runner="DirectRunner")
    for lang in langs
]

# Total number of training examples, passed as `length` so training can report progress.
total_line = sum(len(dataset["train"]) for dataset in raw_datasets)

def training_dataset_iterator():
    # Yield the raw text of every example in every language's training split.
    for training_dataset in raw_datasets:
        for line in training_dataset["train"]:
            yield line["text"]

# Train a new tokenizer that reuses GPT-2's pipeline but builds a 102,000-entry vocabulary.
tokenizer = old_tokenizer.train_new_from_iterator(
    training_dataset_iterator(), vocab_size=102000, length=total_line
)
tokenizer.save_pretrained("tokenizer-shami")
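
# Usage sketch: a minimal check, assuming the training above has finished and
# "tokenizer-shami" is the output directory saved by the script. It compares the
# retrained tokenizer against the original GPT-2 tokenizer on a non-English sample;
# the sample string is illustrative, not from the original script.
from transformers import AutoTokenizer

new_tokenizer = AutoTokenizer.from_pretrained("tokenizer-shami")
gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")

sample = "自然言語処理は面白い"  # Japanese: "natural language processing is interesting"
print(gpt2_tokenizer.tokenize(sample))  # GPT-2 falls back to many byte-level fragments
print(new_tokenizer.tokenize(sample))   # the retrained vocabulary should yield fewer, larger tokens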