import os
import torch
import json
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    TrainerCallback,
)
from datasets import Dataset
import matplotlib.pyplot as plt

# Set Hugging Face token (replace with your actual token)
os.environ["HF_TOKEN"] = "hf_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"  # Replace with your HF_TOKEN

# Download model and tokenizer
model_name = "Salesforce/codegen-350M-multi"
local_model_path = "./codegen_model"
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=local_model_path)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, cache_dir=local_model_path)

# Set padding token
tokenizer.pad_token = tokenizer.eos_token

# Move model to CPU
device = torch.device("cpu")
model.to(device)

# Load custom dataset from JSONL
dataset_path = "./custom_dataset.jsonl"
data = []
with open(dataset_path, "r", encoding="utf-8") as f:
    for line in f:
        data.append(json.loads(line.strip()))

dataset = Dataset.from_list(data)

# Tokenize dataset
def tokenize_function(examples):
    inputs = [f"{prompt}\n{code}" for prompt, code in zip(examples["prompt"], examples["code"])]
    return tokenizer(inputs, truncation=True, padding="max_length", max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["prompt", "code"])

# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./finetuned_codegen",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    save_steps=500,
    save_total_limit=2,
    logging_steps=10,  # Reduced logging steps for more frequent loss recording
    learning_rate=5e-5,
    fp16=False,
    no_cuda=True,
    dataloader_pin_memory=False,
)

# Custom callback to store training loss
class LossCallback(TrainerCallback):
    def __init__(self):
        self.losses = []
        self.steps = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs and "loss" in logs:
            self.losses.append(logs["loss"])
            self.steps.append(state.global_step)

loss_callback = LossCallback()

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    callbacks=[loss_callback],
)

# Start fine-tuning
print("Starting fine-tuning...")
trainer.train()

# Save fine-tuned model
model.save_pretrained("./finetuned_codegen")
tokenizer.save_pretrained("./finetuned_codegen")

# Plot training loss
plt.plot(loss_callback.steps, loss_callback.losses, label="Training Loss")
plt.xlabel("Steps")
plt.ylabel("Loss")
plt.title("Fine-Tuning Loss Curve")
plt.legend()
plt.savefig("./finetuned_codegen/loss_plot.png")
plt.show()

print("Fine-tuning completed. Model saved to ./finetuned_codegen. Loss plot saved to ./finetuned_codegen/loss_plot.png")

# Test fine-tuned model
print("\nTesting fine-tuned model...")
prompts = [
    "Write a Python program to print 'Hello, guys how are you!'"
]
for prompt in prompts:
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    outputs = model.generate(
        **inputs,
        max_length=200,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
    generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Prompt: {prompt}\nGenerated Code:\n{generated_code}\n{'-' * 50}")
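
# ------------------------------------------------------------------------------
# Notes (illustrative only, not part of the training run above):
#
# custom_dataset.jsonl is expected to contain one JSON object per line with the
# "prompt" and "code" fields that tokenize_function reads. An example record,
# with placeholder values shown only to document the shape:
#
#   {"prompt": "Write a Python function to add two numbers", "code": "def add(a, b):\n    return a + b"}
#
# To reload the fine-tuned model in a separate session, point from_pretrained
# at the output directory used by the save calls above:
#
#   tokenizer = AutoTokenizer.from_pretrained("./finetuned_codegen")
#   model = AutoModelForCausalLM.from_pretrained("./finetuned_codegen")
# ------------------------------------------------------------------------------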