from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from peft import get_peft_model, LoraConfig, TaskType
import evaluate
import numpy as np

# Load the dataset
file_path = 'train_en.csv'
dataset = pd.read_csv(file_path)

# Map the string labels to integer class ids (both "It doesn't matter" and "Unimportant" map to class 2)
label_mapping = {
    "Yes": 0,
    "No": 1,
    "It doesn't matter": 2,
    "Unimportant": 2,
    "Incorrect questioning": 3,
    "Correct answers": 4
}

# Apply label mapping
dataset['label'] = dataset['label'].map(label_mapping)

# Drop rows whose label did not map (NaN) or whose text fields are missing
dataset = dataset.dropna(subset=['label', 'puzzle', 'truth', 'text'])

# Ensure labels are integers
dataset['label'] = dataset['label'].astype(int)
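
# Optional sanity check (illustrative addition): confirm how many rows survived
# the label mapping and how the five classes are distributed
print(f"Rows after label mapping: {len(dataset)}")
print(dataset['label'].value_counts().sort_index())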

# Combine puzzle, truth, and text into a single input string per row
dataset['combined_text'] = (
    "==========================================\n"
    "puzzle: " + dataset['puzzle'] + "\n"
    "==========================================\n"
    "truth: " + dataset['truth'] + "\n"
    "==========================================\n"
    "text: " + dataset['text']
)

# Split the dataset into training and validation sets
train_df, val_df = train_test_split(dataset, test_size=0.2, random_state=42)

# Convert the dataframes to datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Load the tokenizer and model
model_name = "meta-llama/Meta-Llama-3-8B"  # Gated checkpoint: requires an accepted license and Hugging Face login; replace with the actual model name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)

# LLaMA tokenizers ship without a padding token; reuse the EOS token for padding
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Keep the model configuration in sync with the tokenizer's padding token
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples['combined_text'], truncation=True, padding='max_length', max_length=128)

train_dataset = train_dataset.map(tokenize_function, batched=True, num_proc=4)  # Use multiprocessing
val_dataset = val_dataset.map(tokenize_function, batched=True, num_proc=4)

# Set the format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Define LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none"
)

# Let gradients flow through the (frozen) input embeddings; some transformers/PEFT
# version combinations need this for gradient checkpointing to work with LoRA
model.enable_input_require_grads()

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=1e-4,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    max_grad_norm=0.3,
    per_device_train_batch_size=8,  # Increase batch size if memory allows
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.001,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="wandb",
    fp16=True,
    gradient_checkpointing=True,
    gradient_accumulation_steps=2,  # Adjust based on memory constraints
    dataloader_num_workers=4,
    logging_steps=100,
    save_total_limit=2,
)

# Load the evaluation metrics once so they are not reloaded at every evaluation step
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    precision = precision_metric.compute(predictions=predictions, references=labels, average="weighted")["precision"]
    recall = recall_metric.compute(predictions=predictions, references=labels, average="weighted")["recall"]
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")["f1"]
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]

    return {"precision": precision, "recall": recall, "f1-score": f1, "accuracy": accuracy}

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Train the model (the Trainer displays its own progress bar)
trainer.train()

# Save the LoRA adapter weights and the tokenizer (save_pretrained on a PEFT model writes only the adapter)
model.save_pretrained('trained_llama_model')
tokenizer.save_pretrained('trained_llama_model')
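
# Reload sketch (illustrative, assuming inference happens in a separate run later):
# attach the saved adapter to a freshly loaded base model. Defined as a helper so
# the 8B base model is not loaded a second time during this training run.
def load_finetuned_model(adapter_dir='trained_llama_model', base_name=model_name):
    from peft import PeftModel
    tok = AutoTokenizer.from_pretrained(adapter_dir)
    base = AutoModelForSequenceClassification.from_pretrained(base_name, num_labels=5)
    base.config.pad_token_id = tok.pad_token_id
    return tok, PeftModel.from_pretrained(base, adapter_dir)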

# Evaluate the best checkpoint (reloaded via load_best_model_at_end) on the validation set
eval_results = trainer.evaluate()
print(eval_results)
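
# ------------------------------------------------------------------
# Quick prediction sketch (an illustrative example, not part of training):
# classify one held-out row with the fine-tuned in-memory model. id_to_label
# simply inverts label_mapping above; class 2 covers both "It doesn't matter"
# and "Unimportant".
# ------------------------------------------------------------------
import torch

id_to_label = {0: "Yes", 1: "No", 2: "Unimportant", 3: "Incorrect questioning", 4: "Correct answers"}

sample_text = val_df.iloc[0]['combined_text']
device = next(model.parameters()).device
inputs = tokenizer(sample_text, truncation=True, padding='max_length', max_length=128, return_tensors='pt').to(device)
model.eval()
with torch.no_grad():
    logits = model(**inputs).logits
print("Predicted label:", id_to_label[int(logits.argmax(dim=-1).item())])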