from transformers import (
    DataCollatorForLanguageModeling,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    TextDataset,
    Trainer,
    TrainingArguments,
)

# Name of the pretrained checkpoint that both the tokenizer and the model
# are loaded from, so the two always match.
model_name = "gpt2"

# Tokenizer and language-model head are loaded from the same checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)


def load_dataset(file_path: str, tokenizer, block_size: int = 128) -> "TextDataset":
    """Build a ``TextDataset`` from a plain-text file.

    Args:
        file_path: Path of the text file to read the training corpus from.
        tokenizer: Tokenizer handed to ``TextDataset`` to encode the file.
        block_size: Value forwarded as ``TextDataset``'s ``block_size``
            (defaults to 128).

    Returns:
        The constructed ``TextDataset`` instance.
    """
    # NOTE(review): TextDataset is flagged as deprecated in recent
    # transformers releases in favour of the `datasets` library -- confirm
    # against the installed version before upgrading.
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )


# Training corpus is read from a local file, using load_dataset's default
# block size.
train_dataset = load_dataset("train.txt", tokenizer)

# Hyper-parameters and checkpointing policy for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./thought_model",    # where checkpoints are written
    overwrite_output_dir=True,       # reuse the directory if it already exists
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=500,                  # checkpoint interval, in optimizer steps
    save_total_limit=2,              # cap on the number of checkpoints kept
)

# Batches examples for training. mlm=False turns off masked-language-model
# masking, i.e. the causal (next-token) objective that GPT-2 is trained with.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Wire the model, run configuration, collator and dataset together.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Run the fine-tuning loop configured above.
trainer.train()

# Persist the fine-tuned weights and the tokenizer side by side so the
# directory can be reloaded later with from_pretrained(). The path is taken
# from training_args rather than repeated as a literal, so it cannot drift
# from the directory the checkpoints were written to.
trainer.save_model(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)