"""DPO fine-tuning of a local checkpoint with Unsloth and TRL, using Japanese
chatbot-arena preference data."""

from unsloth import FastLanguageModel  # import Unsloth before TRL so its patches apply first
from datasets import Dataset
from trl import DPOTrainer, DPOConfig

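# load_model: restores the 4-bit quantized checkpoint with Unsloth, then
# attaches LoRA adapters so DPO only updates the adapter weights.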
def load_model():
    print("Initializing model loading...")
    model_name = "outputs_sample_code/checkpoint-200"
    max_seq_length = 512
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name,
        max_seq_length=max_seq_length,
        dtype=None,  # let Unsloth pick float16/bfloat16 based on the GPU
        load_in_4bit=True,
    )
    print("Model and tokenizer loaded successfully.")
    print(f"Model type: {type(model)}, Tokenizer type: {type(tokenizer)}")

    if hasattr(model, "config"):
        print("Setting max_seq_length in model.config")
        model.config.max_seq_length = max_seq_length
    else:
        print("Error: model.config does not exist!")

    model = FastLanguageModel.get_peft_model(
        model,
        r=32,  # LoRA rank
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"],
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        use_gradient_checkpointing="unsloth",  # Unsloth's memory-efficient checkpointing
        random_state=3407,
        use_rslora=False,
        loftq_config=None,
        max_seq_length=max_seq_length,
    )
    print("PEFT model configured.")
    return model, tokenizer

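# load_dataset: converts the cyberagent chatbot-arena preference pairs into
# the prompt / chosen / rejected columns that DPOTrainer expects.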
def load_dataset():
    print("Loading dataset...")
    dataset_name = "cyberagent/chatbot-arena-ja-calm2-7b-chat-experimental"

    # Local import so the Hugging Face loader is not shadowed by this
    # function's own name at module scope.
    from datasets import load_dataset
    dataset = load_dataset(dataset_name)

    # .get() with a default guards against records missing a column.
    formatted_data = []
    for item in dataset["train"]:
        formatted_data.append({
            "prompt": item.get("prompt", ""),
            "chosen": item.get("response_winner", ""),
            "rejected": item.get("response_loser", ""),
        })

    print(f"Formatted data: {len(formatted_data)} items")
    return Dataset.from_dict({
        "prompt": [item["prompt"] for item in formatted_data],
        "chosen": [item["chosen"] for item in formatted_data],
        "rejected": [item["rejected"] for item in formatted_data],
    })

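# train_dpo: runs Direct Preference Optimization over the preference pairs.
# Note the effective batch size is per_device_train_batch_size (8) x
# gradient_accumulation_steps (128) = 1024 samples per optimizer step.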
def train_dpo(model, tokenizer, dataset):
    print("Configuring training arguments...")

    training_args = DPOConfig(
        output_dir="./dpo_trained_model_1216",
        overwrite_output_dir=True,
        per_device_train_batch_size=8,
        gradient_accumulation_steps=128,
        per_device_eval_batch_size=8,
        learning_rate=1e-5,
        weight_decay=0.01,
        num_train_epochs=1,
        lr_scheduler_type="constant_with_warmup",
        warmup_steps=10,
        fp16=True,
        eval_strategy="steps",
        save_strategy="steps",
        save_steps=32,
        logging_steps=8,
        eval_steps=8,
        load_best_model_at_end=True,
        save_safetensors=False,
        save_only_model=True,
        remove_unused_columns=False,  # keep the prompt/chosen/rejected columns for DPO
    )
    print("Training arguments configured.")

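    # Note: passing beta, max_length, loss_type, etc. directly to DPOTrainer
    # matches older TRL releases; in recent TRL these arguments live on
    # DPOConfig and `tokenizer` was renamed to `processing_class`.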
print("Initializing DPOTrainer...") |
|
dpo_trainer = DPOTrainer( |
|
model=model, |
|
args=training_args, |
|
beta=0.3, |
|
train_dataset=dataset, |
|
eval_dataset=dataset, |
|
tokenizer=tokenizer, |
|
max_prompt_length=162, |
|
max_length=512, |
|
loss_type="sigmoid", |
|
label_smoothing=0.0, |
|
) |
|
print("DPOTrainer initialized.") |
|
|
|
print("Starting training...") |
|
|
|
original_forward = model.forward |
|
|
|
def new_forward(*args, **kwargs): |
|
if "input_ids" in kwargs: |
|
kwargs["input_ids"] = kwargs["input_ids"].long() |
|
return original_forward(*args, **kwargs) |
|
|
|
model.forward = new_forward |
|
|
|
dpo_trainer.train() |
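    # Assumption: persist the final weights explicitly so the closing log
    # message is accurate. Trainer.save_model() writes the adapter and its
    # config to the given directory; the "final" subfolder is illustrative.
    dpo_trainer.save_model("./dpo_trained_model_1216/final")
    tokenizer.save_pretrained("./dpo_trained_model_1216/final")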
    print("Training completed.")

if __name__ == "__main__":
    print("Loading model...")
    model, tokenizer = load_model()

    print("Loading dataset...")
    dataset = load_dataset()

    print("Starting DPO training...")
    train_dpo(model, tokenizer, dataset)

    print("Training complete. Model saved.")
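# Usage sketch (assumption: the final adapter is saved under
# ./dpo_trained_model_1216/final as above). The trained adapter can be
# reloaded for inference with Unsloth, e.g.:
#
#   model, tokenizer = FastLanguageModel.from_pretrained(
#       "./dpo_trained_model_1216/final", max_seq_length=512, load_in_4bit=True
#   )
#   FastLanguageModel.for_inference(model)  # enable Unsloth's fast generation path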