### aifixcode_trainer.py

```python
"""
This script sets up a simple Hugging Face-based training + inference pipeline
for a bug-fixing AI using a CodeT5 model, and supports continual training.
You can upload this script to a Hugging Face Space or Hub repo.
"""

from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
)
from datasets import load_dataset, DatasetDict

# ========== CONFIG ==========
MODEL_NAME = "Salesforce/codet5p-220m"
MODEL_OUT_DIR = "./aifixcode-model"
TRAIN_DATASET_PATH = "./data/train.json"
VAL_DATASET_PATH = "./data/val.json"

# ========== LOAD MODEL + TOKENIZER ==========
print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# ========== LOAD DATASET ==========
print("Loading dataset...")

def load_json_dataset(train_path, val_path):
    # Each JSON file loads as its own "train" split; combine them into a
    # single DatasetDict with explicit train/validation splits.
    dataset = DatasetDict({
        "train": load_dataset("json", data_files=train_path)["train"],
        "validation": load_dataset("json", data_files=val_path)["train"],
    })
    return dataset

dataset = load_json_dataset(TRAIN_DATASET_PATH, VAL_DATASET_PATH)

# ========== PREPROCESS ==========
print("Tokenizing dataset...")

def preprocess(example):
    # Runs in batched mode, so "input" and "output" are lists of strings.
    input_code = example["input"]
    target_code = example["output"]
    model_inputs = tokenizer(
        input_code, truncation=True, padding="max_length", max_length=512
    )
    labels = tokenizer(
        target_code, truncation=True, padding="max_length", max_length=512
    )
    # Mask padding in the labels with -100 so it is ignored by the loss.
    model_inputs["labels"] = [
        [(tok if tok != tokenizer.pad_token_id else -100) for tok in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs

encoded_dataset = dataset.map(preprocess, batched=True)

# ========== TRAINING SETUP ==========
print("Setting up trainer...")
training_args = TrainingArguments(
    output_dir=MODEL_OUT_DIR,
    evaluation_strategy="epoch",  # named eval_strategy in newer transformers releases
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_strategy="epoch",
    push_to_hub=True,  # requires a Hub login (huggingface-cli login or HF_TOKEN)
    hub_model_id="khulnasoft/aifixcode-model",
    hub_strategy="every_save",
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# ========== TRAIN ==========
print("Starting training...")
trainer.train()

# ========== SAVE FINAL MODEL ==========
print("Saving model...")
trainer.save_model(MODEL_OUT_DIR)
tokenizer.save_pretrained(MODEL_OUT_DIR)

print("Training complete and model saved!")
```
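
The `preprocess()` step expects each JSON record to have an `input` field (buggy code) and an `output` field (fixed code). A minimal sketch of what `./data/train.json` could look like, written out from Python; the two records are illustrative toy examples, not from the original dataset:

```python
import json

# Illustrative schema only: each record pairs buggy code ("input") with its fix ("output").
sample_records = [
    {
        "input": "def add(a, b):\n    return a - b",
        "output": "def add(a, b):\n    return a + b",
    },
    {
        "input": "for i in range(10)\n    print(i)",
        "output": "for i in range(10):\n    print(i)",
    },
]

with open("./data/train.json", "w") as f:
    json.dump(sample_records, f, indent=2)
```

The `datasets` JSON loader accepts either a top-level list of objects like this or JSON Lines, so the same schema works for `val.json` as well.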
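
The docstring also mentions inference. A minimal sketch of how the saved checkpoint could be used to propose a fix; the buggy snippet and the beam-search settings are assumptions for illustration, not part of the original script:

```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the fine-tuned checkpoint saved by the trainer (or the pushed Hub repo id).
tokenizer = AutoTokenizer.from_pretrained("./aifixcode-model")
model = AutoModelForSeq2SeqLM.from_pretrained("./aifixcode-model")

buggy_code = "def add(a, b):\n    return a - b"  # illustrative input
inputs = tokenizer(buggy_code, return_tensors="pt", truncation=True, max_length=512)

# Generate a candidate fix; num_beams=4 is an assumed setting, tune as needed.
output_ids = model.generate(**inputs, max_length=512, num_beams=4)
fixed_code = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(fixed_code)
```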
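
Since every save is pushed to `khulnasoft/aifixcode-model`, continual training can presumably be achieved by pointing `MODEL_NAME` at the Hub repo on later runs so each run starts from the previously pushed weights; this is an assumption about the intended workflow, not something the script does automatically:

```python
# Assumed continual-training setup: on subsequent runs, start from the
# previously pushed checkpoint instead of the base CodeT5 model.
MODEL_NAME = "khulnasoft/aifixcode-model"
```

To resume an interrupted run within the same `output_dir`, `trainer.train(resume_from_checkpoint=True)` can be used instead.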