# Fine-tune a pre-trained seq2seq model on prompt/completion pairs stored in a
# JSON Lines file.
from datasets import load_dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    Trainer,
    TrainingArguments,
)

# Step 1: Load pre-trained model and tokenizer
model_name = "google/mt5-base"  # Example: T5 model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Step 2: Load dataset
# The file is JSON Lines, so it must be parsed with the "json" builder; the
# "csv" builder would misread every record.
dataset = load_dataset("json", data_files="accounting_data.jsonl")
# A single data file only yields a "train" split, so hold out part of it for
# evaluation. The fixed seed keeps the split reproducible between runs.
dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)


# Step 3: Preprocess the dataset
def preprocess_function(examples):
    """Tokenize a batch of prompt/completion records for seq2seq training.

    Args:
        examples: A batch (dict of columns) that contains a "prompt" column
            and a "completion" column.

    Returns:
        The tokenizer output for the prefixed prompts, with an added
        "labels" entry holding the token ids of the completions.
    """
    inputs = [f"Generate report: {ledger}" for ledger in examples["prompt"]]
    targets = examples["completion"]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)
    labels = tokenizer(targets, max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Examples are tokenized without padding, so they must be padded per batch.
# This collator also pads the labels with -100 so padding is ignored by the
# loss.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Step 4: Define training arguments
# `predict_with_generate` only exists on Seq2SeqTrainingArguments; passing it
# to the plain TrainingArguments raises a TypeError.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    save_total_limit=3,
    predict_with_generate=True,
)

# Step 5: Initialize Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
)

# Step 6: Train the model
trainer.train()