import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Load your data. Each line of qa_data.jsonl should be a JSON object with
# "prompt" and "response" fields, e.g.
# {"prompt": "What is X?", "response": "X is ..."}
dataset = load_dataset("json", data_files={"train": "qa_data.jsonl"})

# Choose a model (GPT-2 small is easy to start with)
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Add a pad token if one is missing (GPT-2 doesn't have one by default)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize: concatenate prompt and response into a single causal-LM training
# sequence, appending an EOS token so the model learns where to stop.
def preprocess(example):
    text = example["prompt"] + " " + example["response"] + tokenizer.eos_token
    tokens = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    # Copy input_ids as labels, but mask padding positions with -100 so they
    # are ignored by the loss (otherwise the model is trained to emit the
    # pad/EOS token at every padded position).
    tokens["labels"] = [
        token_id if mask == 1 else -100
        for token_id, mask in zip(tokens["input_ids"], tokens["attention_mask"])
    ]
    return tokens

# Drop the raw text columns so only model inputs remain after mapping
tokenized = dataset["train"].map(
    preprocess, remove_columns=dataset["train"].column_names
)

# Training arguments
args = TrainingArguments(
    output_dir="gpt2-finetuned-qa",
    per_device_train_batch_size=2,
    num_train_epochs=5,
    logging_steps=10,
    save_steps=50,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    report_to="none",
)

# Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized,
)

trainer.train()

# Save the fine-tuned model and tokenizer together so they can be reloaded
model.save_pretrained("gpt2-finetuned-qa")
tokenizer.save_pretrained("gpt2-finetuned-qa")
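
# Optional: a minimal sanity-check sketch after training. It reloads the
# saved checkpoint with the standard transformers text-generation pipeline
# and generates an answer for a single prompt. The prompt string below is
# hypothetical; substitute one drawn from your own qa_data.jsonl.
from transformers import pipeline

generator = pipeline(
    "text-generation",
    model="gpt2-finetuned-qa",
    tokenizer="gpt2-finetuned-qa",
)
output = generator(
    "What is fine-tuning?",  # hypothetical example prompt
    max_new_tokens=64,
    do_sample=False,  # greedy decoding for a reproducible check
)
print(output[0]["generated_text"])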