# NeuraVerse/finetune.py: fine-tune GPT-2 on a Q&A dataset with the Hugging Face Trainer
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
# Load your data
dataset = load_dataset("json", data_files={"train": "qa_data.jsonl"})
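# qa_data.jsonl is expected to hold one JSON object per line with the
# "prompt" and "response" keys that preprocess() below reads. The values
# here are purely illustrative:
#   {"prompt": "What is the capital of France?", "response": "Paris."}
#   {"prompt": "Who wrote Hamlet?", "response": "William Shakespeare."}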
# Choose a model (GPT-2 small is easy to start)
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
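# Optional sanity check: GPT-2 small should report roughly 124M parameters
num_params = sum(p.numel() for p in model.parameters())
print(f"Loaded {model_name} with {num_params:,} parameters")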
# Add pad token if missing (GPT-2 doesn't have one by default)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Tokenize: join prompt and response into one training sequence, ending
# with EOS so the model learns where an answer stops
def preprocess(example):
    prompt = example["prompt"]
    response = example["response"]
    text = prompt + " " + response + tokenizer.eos_token
    tokens = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    # Copy input ids as labels, masking padding positions with -100 so the
    # loss ignores them (pad == eos here, so padding would otherwise skew training)
    tokens["labels"] = [
        tok if mask == 1 else -100
        for tok, mask in zip(tokens["input_ids"], tokens["attention_mask"])
    ]
    return tokens
# Drop the raw text columns so only model inputs remain
tokenized = dataset["train"].map(preprocess, remove_columns=["prompt", "response"])
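# Quick sanity check (optional): decode one tokenized example to confirm the
# prompt/response concatenation looks right before training
sample = tokenized[0]
print(tokenizer.decode([t for t in sample["input_ids"] if t != tokenizer.pad_token_id]))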
# Training arguments
args = TrainingArguments(
    output_dir="gpt2-finetuned-qa",
    per_device_train_batch_size=2,
    num_train_epochs=5,
    logging_steps=10,
    save_steps=50,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    report_to="none",
)
# Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized,
)
trainer.train()
# Save the fine-tuned model and tokenizer for later use
model.save_pretrained("gpt2-finetuned-qa")
tokenizer.save_pretrained("gpt2-finetuned-qa")
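# Usage sketch: reload the saved checkpoint and generate an answer. This
# assumes the "gpt2-finetuned-qa" directory produced above; the prompt
# string is purely illustrative.
finetuned_tokenizer = AutoTokenizer.from_pretrained("gpt2-finetuned-qa")
finetuned_model = AutoModelForCausalLM.from_pretrained("gpt2-finetuned-qa")
inputs = finetuned_tokenizer("What is the capital of France?", return_tensors="pt")
outputs = finetuned_model.generate(
    **inputs,
    max_new_tokens=50,
    pad_token_id=finetuned_tokenizer.pad_token_id,
)
print(finetuned_tokenizer.decode(outputs[0], skip_special_tokens=True))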