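"""Fine-tune Salesforce/codegen-350M-multi on a small custom prompt/code
dataset, entirely on CPU, then smoke-test the result on a handful of prompts."""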
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, TrainerCallback
from datasets import load_from_disk
import matplotlib.pyplot as plt

# Set Hugging Face token (replace the placeholder with your actual token)
os.environ["HF_TOKEN"] = "hf_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"

# Download model and tokenizer
model_name = "Salesforce/codegen-350M-multi"
local_model_path = "./codegen_model"
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=local_model_path)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, cache_dir=local_model_path)  # float32: half-precision training is not supported on CPU

# CodeGen has no dedicated padding token, so reuse the EOS token
tokenizer.pad_token = tokenizer.eos_token

# Move model to CPU
device = torch.device("cpu")
model.to(device)

# Load custom dataset
dataset_path = "./custom_dataset"
dataset = load_from_disk(dataset_path)
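
# The script assumes ./custom_dataset was written with Dataset.save_to_disk and
# exposes "prompt" and "code" string columns. A minimal sketch for creating one
# (the example row is hypothetical, not part of this repo):
#
#     from datasets import Dataset
#     Dataset.from_dict({
#         "prompt": ["Write a Python function to add two numbers."],
#         "code": ["def add(a, b):\n    return a + b"],
#     }).save_to_disk("./custom_dataset")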

# Tokenize dataset
def tokenize_function(examples):
    inputs = [f"{prompt}\n{code}" for prompt, code in zip(examples["prompt"], examples["code"])]
    return tokenizer(inputs, truncation=True, padding="max_length", max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["prompt", "code"])

# Data collator for causal LM (mlm=False): labels are a copy of input_ids with
# pad positions set to -100 so they are ignored by the loss
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./finetuned_codegen",
    overwrite_output_dir=True,
    num_train_epochs=5,  # several passes help on a small dataset
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    learning_rate=5e-5,
    fp16=False,  # mixed precision is not available on CPU
    no_cuda=True,  # force CPU training (newer transformers versions prefer use_cpu=True)
    dataloader_pin_memory=False,  # pinned memory only helps for CPU-to-GPU copies
)
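# One optimizer step consumes per_device_train_batch_size *
# gradient_accumulation_steps = 4 examples, so 5 epochs over N examples yield
# roughly 5 * N / 4 optimizer steps in total.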

# Custom callback to store training loss
class LossCallback(TrainerCallback):
    def __init__(self):
        self.losses = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs and "loss" in logs:
            self.losses.append(logs["loss"])

loss_callback = LossCallback()

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    callbacks=[loss_callback],
)

# Start fine-tuning
print("Starting fine-tuning...")
trainer.train()

# Save fine-tuned model
model.save_pretrained("./finetuned_codegen")
tokenizer.save_pretrained("./finetuned_codegen")
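
# The saved checkpoint can later be reloaded with the same from_pretrained API,
# e.g. (sketch):
#
#     tokenizer = AutoTokenizer.from_pretrained("./finetuned_codegen")
#     model = AutoModelForCausalLM.from_pretrained("./finetuned_codegen")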

# Plot training loss (one point per logging event, i.e. every `logging_steps` steps)
steps = [(i + 1) * training_args.logging_steps for i in range(len(loss_callback.losses))]
plt.plot(steps, loss_callback.losses, label="Training Loss")
plt.xlabel("Step")
plt.ylabel("Loss")
plt.title("Fine-Tuning Loss Curve")
plt.legend()
plt.savefig("./finetuned_codegen/loss_plot.png")
plt.show()

print("Fine-tuning completed. Model saved to ./finetuned_codegen. Loss plot saved to ./finetuned_codegen/loss_plot.png")

# Test fine-tuned model
print("\nTesting fine-tuned model...")
prompts = [
    "Write a Python program to print 'Hello, World!'",
    "Write a Python function to add two numbers.",
    "Write a Python function to subtract two numbers.",
    "Write a Python function to calculate factorial of a number",
    "Write a Python function to check if a number is prime",
    "Write a Python function to reverse a string"
]

# Put the model in eval mode and disable gradient tracking for generation
model.eval()
for prompt in prompts:
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,  # bound the generated length independently of the prompt length
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )
    generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Prompt: {prompt}\nGenerated Code:\n{generated_code}\n{'-'*50}")