import json
import os

import matplotlib.pyplot as plt
import torch
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainerCallback,
    TrainingArguments,
)

# Hugging Face access token (placeholder shown here; use a real token or set it outside the script).
os.environ["HF_TOKEN"] = "hf_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"

# Download (or load from the local cache) the base CodeGen model and tokenizer.
model_name = "Salesforce/codegen-350M-multi"
local_model_path = "./codegen_model"
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=local_model_path)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, cache_dir=local_model_path)

# CodeGen's tokenizer has no pad token by default, so reuse the EOS token for padding.
tokenizer.pad_token = tokenizer.eos_token

# Keep everything on CPU (matches no_cuda=True in the training arguments below).
device = torch.device("cpu")
model.to(device)
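
# A CUDA GPU could be used instead by swapping the device (and dropping no_cuda=True
# and fp16=False below); CPU is kept here to match the original setup. Sketch:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")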

# Load the training examples from a JSON Lines file into a Hugging Face Dataset.
dataset_path = "./custom_dataset.jsonl"
data = []
with open(dataset_path, 'r', encoding='utf-8') as f:
    for line in f:
        data.append(json.loads(line.strip()))
dataset = Dataset.from_list(data)
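
# Each JSONL record is expected to carry a "prompt" and a "code" field (these are the
# column names used by tokenize_function below). Hypothetical example record:
# {"prompt": "Write a function that adds two numbers.", "code": "def add(a, b):\n    return a + b"}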

# Join each prompt with its target code and tokenize to fixed-length sequences.
def tokenize_function(examples):
    inputs = [f"{prompt}\n{code}" for prompt, code in zip(examples["prompt"], examples["code"])]
    return tokenizer(inputs, truncation=True, padding="max_length", max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["prompt", "code"])
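
# With padding="max_length", every example becomes exactly 128 tokens; prompt+code pairs
# longer than that are truncated, so max_length may need to be raised for longer samples.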

# mlm=False selects causal language modeling: the collator copies input_ids into labels
# and masks padded positions so they do not contribute to the loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# CPU-friendly training configuration: a per-device batch of 1 with 4 gradient
# accumulation steps gives an effective batch size of 4.
training_args = TrainingArguments(
    output_dir="./finetuned_codegen",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    learning_rate=5e-5,
    fp16=False,
    no_cuda=True,
    dataloader_pin_memory=False,
)
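
# Checkpoints are written to ./finetuned_codegen/checkpoint-<step> every 500 optimizer
# steps, and save_total_limit=2 keeps only the two most recent ones.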

# Record every loss value the Trainer logs (once per logging_steps) so it can be plotted later.
class LossCallback(TrainerCallback):
    def __init__(self):
        self.losses = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs and "loss" in logs:
            self.losses.append(logs["loss"])

loss_callback = LossCallback()

# Wire the model, tokenized data, collator, and loss-logging callback into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    callbacks=[loss_callback],
)

print("Starting fine-tuning...")
trainer.train()

# Save the fine-tuned weights and tokenizer side by side so they can be reloaded together.
model.save_pretrained("./finetuned_codegen")
tokenizer.save_pretrained("./finetuned_codegen")
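
# The saved model can later be reused without retraining (a minimal sketch, reading from
# the same output directory as above):
# reloaded_tokenizer = AutoTokenizer.from_pretrained("./finetuned_codegen")
# reloaded_model = AutoModelForCausalLM.from_pretrained("./finetuned_codegen")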

# Plot the logged training loss; each point corresponds to one logging event
# (every logging_steps training steps).
plt.plot(loss_callback.losses, label="Training Loss")
plt.xlabel("Steps")
plt.ylabel("Loss")
plt.title("Fine-Tuning Loss Curve")
plt.legend()
plt.savefig("./finetuned_codegen/loss_plot.png")
plt.show()

print("Fine-tuning completed. Model saved to ./finetuned_codegen. Loss plot saved to ./finetuned_codegen/loss_plot.png")

# Quick smoke test: generate code for a sample prompt with the fine-tuned model.
print("\nTesting fine-tuned model...")
prompts = [
    "Write a Python program to print 'Hello, World!'"
]

model.eval()  # switch out of training mode before generation
for prompt in prompts:
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    outputs = model.generate(
        **inputs,
        max_length=200,  # counts prompt tokens plus generated tokens
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
    generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Prompt: {prompt}\nGenerated Code:\n{generated_code}\n{'-' * 50}")