|
import torch
|
|
from datasets import load_dataset
|
|
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
|
|
|
|
|
|
# Training corpus: a JSON-lines file whose records carry "prompt"/"response".
train_files = {"train": "qa_data.jsonl"}
dataset = load_dataset("json", data_files=train_files)
|
# Base checkpoint to fine-tune; tokenizer and weights come from the same hub id.
model_name = "gpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# GPT-2 ships without a dedicated padding token, so reuse EOS for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
|
def preprocess(example):
    """Tokenize one prompt/response pair into fixed-length LM inputs.

    The prompt and response are joined with a single space, tokenized,
    and padded/truncated to 128 tokens.  Labels start as a copy of the
    input ids, but padding positions are replaced with -100 so the
    cross-entropy loss ignores them — with a plain ``labels = input_ids``
    copy, the model would be trained to predict pad tokens.

    Args:
        example: dict with string fields "prompt" and "response".

    Returns:
        dict with "input_ids", "attention_mask", and "labels" lists.
    """
    prompt = example["prompt"]
    response = example["response"]
    text = prompt + " " + response
    tokens = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    # Mask out padded positions (-100 is ignored by the HF LM loss).
    tokens["labels"] = [
        token_id if mask == 1 else -100
        for token_id, mask in zip(tokens["input_ids"], tokens["attention_mask"])
    ]
    return tokens
|
|
# Tokenize the whole training split up front, one example at a time.
train_split = dataset["train"]
tokenized = train_split.map(preprocess)
|
# Hyper-parameters for the fine-tuning run.
args = TrainingArguments(
    output_dir="gpt2-finetuned-qa",
    per_device_train_batch_size=2,
    num_train_epochs=5,
    logging_steps=10,
    save_steps=50,
    # Mixed precision only works on CUDA devices; the boolean is the
    # availability check itself (no redundant `True if ... else False`).
    fp16=torch.cuda.is_available(),
    report_to="none",
)
|
# Wire the model, hyper-parameters, and tokenized data into a Trainer.
trainer = Trainer(
    train_dataset=tokenized,
    args=args,
    model=model,
)
|
|
# Run fine-tuning, then persist weights and tokenizer files together so the
# output directory can be reloaded later with from_pretrained().
trainer.train()

output_dir = "gpt2-finetuned-qa"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)