from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Load the quotes corpus and hold out 10% of it for evaluation.
data = load_dataset("json", data_files="./authors_all_CUT.json")
data = data["train"].train_test_split(test_size=0.10)

tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
# GPT-2 tokenizers have no pad token; reuse EOS so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token


def tokenize_datasets(data_set):
    # Padding is left to the data collator; truncation caps examples at the
    # model's maximum context length.
    return tokenizer(data_set["text"], padding=False, truncation=True)


BATCH_SIZE = 8
data = data.map(tokenize_datasets, batched=True, batch_size=BATCH_SIZE)
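
# Optional sanity check, a minimal sketch: confirm the map step added
# input_ids alongside the original "text" field (existing dataset columns
# are kept by default).
sample = data["train"][0]
print(sample["text"][:80])
print(sample["input_ids"][:10])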

FOLDER_NAME = "./distilgpt2_quotes.TRANS"
# AutoModelWithLMHead is deprecated; AutoModelForCausalLM is the current
# class for GPT-2-style causal language models.
model = AutoModelForCausalLM.from_pretrained("distilgpt2")

# mlm=False selects the causal LM objective: labels are the inputs shifted
# by one token rather than randomly masked.
collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

EPOCHS = 5
training_args = TrainingArguments(
    FOLDER_NAME,
    overwrite_output_dir=True,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    evaluation_strategy="steps",  # required for eval_steps to take effect
    eval_steps=400,
    save_steps=800,
)

trainer = Trainer(
    model,
    args=training_args,
    data_collator=collator,
    train_dataset=data["train"],
    eval_dataset=data["test"],
    tokenizer=tokenizer,  # saved alongside the model by save_model()
)

trainer.train()
trainer.save_model()
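
# Minimal generation sketch (not part of the training run): reload the
# fine-tuned weights from FOLDER_NAME and sample a continuation. The prompt
# below is only an illustration.
from transformers import pipeline

generator = pipeline("text-generation", model=FOLDER_NAME, tokenizer=tokenizer)
print(generator("Life is", max_new_tokens=30, do_sample=True)[0]["generated_text"])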