File size: 4,231 Bytes
9b7e8b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from peft import get_peft_model, LoraConfig, TaskType
import evaluate
import numpy as np

# Load the dataset
file_path = 'train_en.csv'
dataset = pd.read_csv(file_path)

# Map labels to expected responses
label_mapping = {
    "Yes": 0,
    "No": 1,
    "It doesn't matter": 2,
    "Unimportant": 2,  # Assuming "unimportant" is synonymous with "It doesn't matter"
    "Incorrect questioning": 3,
    "Correct answers": 4
}

# Number of distinct class ids, derived from the mapping so the classifier
# head cannot drift out of sync with it ("Unimportant" shares id 2).
num_labels = len(set(label_mapping.values()))

# Apply label mapping; any label that is not a key of the mapping becomes NaN
dataset['label'] = dataset['label'].map(label_mapping)

# Drop rows that cannot be used for training: an unmapped/missing label, or a
# missing puzzle/truth/text field (string + NaN would make combined_text NaN
# and the tokenizer would then receive a non-string value).
required_columns = ['label', 'puzzle', 'truth', 'text']
rows_before = len(dataset)
# .copy() so the column assignments below act on an independent frame.
dataset = dataset.dropna(subset=required_columns).copy()
rows_dropped = rows_before - len(dataset)
if rows_dropped:
    # Report instead of discarding silently, so label typos are noticed.
    print(f"Dropped {rows_dropped} of {rows_before} rows with an unmapped label or a missing field")

# Ensure labels are integers (map() yields floats once NaN has been present)
dataset['label'] = dataset['label'].astype(int)

# Format puzzle, truth, text into one delimited string per row
dataset['combined_text'] = (
    "==========================================\n"
    "puzzle: " + dataset['puzzle'] + "\n"
    "==========================================\n"
    "truth: " + dataset['truth'] + "\n"
    "==========================================\n"
    "text: " + dataset['text'] 
)

# Split the dataset into training and validation sets (80/20, fixed seed)
train_df, val_df = train_test_split(dataset, test_size=0.2, random_state=42)

# Convert the dataframes to datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Load the tokenizer and model
model_name = "google/gemma-2-9b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Tokenize the data
def tokenize_function(examples):
    """Tokenize the ``combined_text`` field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens. Returns
    whatever the module-level ``tokenizer`` returns for the batch.
    """
    # NOTE(review): truncation at 128 tokens presumably cuts from the end,
    # which is where the "text:" section sits — confirm the inputs fit.
    texts = examples['combined_text']
    encoded = tokenizer(
        texts,
        max_length=128,
        padding='max_length',
        truncation=True,
    )
    return encoded

# Tokenize both splits in batches
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Set the format for PyTorch, exposing only the columns used for training
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Define LoRA configuration: rank-16 adapters on the attention query/value
# projections, for a sequence-classification task
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none"
)

# gradient_checkpointing=True is requested below while the base weights are
# frozen by LoRA. Make the embedding output require grad so the checkpointed
# segments still propagate gradients to the adapter weights.
model.enable_input_require_grads()

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=1e-4,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    max_grad_norm=0.3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.001,
    # `evaluation_strategy` is the deprecated spelling; `eval_strategy` is the
    # supported name in the transformers releases that can load Gemma 2.
    eval_strategy="epoch",
    # Must match the evaluation strategy for load_best_model_at_end to work.
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="wandb",
    # NOTE(review): Gemma 2 is commonly reported to be numerically unstable in
    # fp16; consider bf16=True on hardware that supports it — confirm.
    fp16=True,
    gradient_checkpointing=True,
    # Effective per-device batch size is 4 * 4 = 16 sequences per optimizer step.
    gradient_accumulation_steps=4,
    dataloader_num_workers=4,
    logging_steps=100,
    save_total_limit=2,
)

# Loaded `evaluate` metric objects keyed by metric name, so each metric is
# loaded once instead of on every evaluation pass.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called *name*, loading it on first use."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute macro precision/recall/F1 and accuracy for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``. ``logits``
            may also be a tuple of arrays, in which case its first element is
            taken as the class scores.

    Returns:
        A dict with the keys ``"precision"``, ``"recall"``, ``"f1-score"``
        and ``"accuracy"``.
    """
    logits, labels = eval_pred
    # Some models return extra outputs alongside the logits; keep the scores.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)

    # Precision, recall and F1 are macro-averaged over the classes.
    macro_scores = {
        name: _get_metric(name).compute(
            predictions=predictions, references=labels, average='macro'
        )[name]
        for name in ("precision", "recall", "f1")
    }
    accuracy = _get_metric("accuracy").compute(
        predictions=predictions, references=labels
    )["accuracy"]

    return {
        "precision": macro_scores["precision"],
        "recall": macro_scores["recall"],
        "f1-score": macro_scores["f1"],
        'accuracy': accuracy,
    }

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Save the model and tokenizer side by side; the directory is named once so
# the two calls cannot drift apart.
final_model_dir = 'trained_gemma_model'
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

# Evaluate the model. In a script the return value of a bare expression is
# discarded, so keep the metrics and print them explicitly.
eval_metrics = trainer.evaluate()
print(eval_metrics)