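"""Fine-tune google/gemma-2-9b with a LoRA adapter for 5-way sequence
classification on the train_en.csv puzzle dataset."""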
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from peft import get_peft_model, LoraConfig, TaskType
import evaluate
import numpy as np
# Load the dataset
file_path = 'train_en.csv'
dataset = pd.read_csv(file_path)
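# Optional peek at the raw schema (illustrative): the steps below assume the
# CSV provides 'puzzle', 'truth', 'text', and 'label' columns.
print(dataset.columns.tolist())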
# Map the raw string labels to integer class ids
label_mapping = {
    "Yes": 0,
    "No": 1,
    "It doesn't matter": 2,
    "Unimportant": 2,  # Treated as synonymous with "It doesn't matter"
    "Incorrect questioning": 3,
    "Correct answers": 4
}
# Apply label mapping
dataset['label'] = dataset['label'].map(label_mapping)
# Handle NaN values: Drop rows where label is NaN
dataset = dataset.dropna(subset=['label'])
# Ensure labels are integers
dataset['label'] = dataset['label'].astype(int)
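# Defensive step (an assumption, not in the original pipeline): drop rows with
# a missing puzzle, truth, or text so the string concatenation below cannot
# produce NaN inputs for the tokenizer.
dataset = dataset.dropna(subset=['puzzle', 'truth', 'text'])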
# Combine puzzle, truth, and text into a single input string per row
dataset['combined_text'] = (
    "==========================================\n"
    "puzzle: " + dataset['puzzle'] + "\n"
    "==========================================\n"
    "truth: " + dataset['truth'] + "\n"
    "==========================================\n"
    "text: " + dataset['text']
)
# Split the dataset into training and validation sets
train_df, val_df = train_test_split(dataset, test_size=0.2, random_state=42)
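# Note (assumption): if the five classes are imbalanced, passing
# stratify=dataset['label'] to train_test_split keeps the class mix
# consistent across the two splits.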
# Convert the dataframes to datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
# Load the tokenizer and model
model_name = "google/gemma-2-9b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)
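# Defensive step (an assumption; google/gemma-2-9b normally ships a pad token):
# sequence-classification heads on decoder-only models locate the last
# non-padding token via config.pad_token_id, so make it explicit.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id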
# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples['combined_text'], truncation=True, padding='max_length', max_length=128)
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
# Set the format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
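# Quick sanity check (illustrative, not part of the original script): every
# example should now carry a fixed-length 128-token input.
assert train_dataset[0]['input_ids'].shape[-1] == 128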
# Define LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none"
)
# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
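# With r=16 adapters on q_proj/v_proj only, the printout above should report
# well under 1% of the ~9B weights as trainable.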
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=1e-4,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    max_grad_norm=0.3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.001,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="wandb",
    fp16=True,
    gradient_checkpointing=True,
    gradient_accumulation_steps=4,
    dataloader_num_workers=4,
    logging_steps=100,
    save_total_limit=2,
)
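# Notes (assumptions, not from the original script):
# - Gemma-2 was released in bfloat16; on GPUs that support it, bf16=True is
#   usually more numerically stable for this model family than fp16=True.
# - When combining gradient checkpointing with a PEFT adapter, recent
#   transformers versions may also need
#   gradient_checkpointing_kwargs={"use_reentrant": False}.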
# Load the evaluation metrics once rather than on every evaluation call
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision = precision_metric.compute(predictions=predictions, references=labels, average='macro')["precision"]
    recall = recall_metric.compute(predictions=predictions, references=labels, average='macro')["recall"]
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='macro')["f1"]
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    return {"precision": precision, "recall": recall, "f1": f1, "accuracy": accuracy}
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)
# Train the model
trainer.train()
# Save the model
model.save_pretrained('trained_gemma_model')
tokenizer.save_pretrained('trained_gemma_model')
# Evaluate the model
trainer.evaluate()
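# Illustrative inference sketch (an assumption, not part of the original
# script): reload the saved LoRA adapter on a fresh base model and classify
# a single placeholder example.
import torch
from peft import PeftModel

base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)
base_model.config.pad_token_id = tokenizer.pad_token_id
clf_model = PeftModel.from_pretrained(base_model, 'trained_gemma_model')
clf_model.eval()

example = "puzzle: ...\ntruth: ...\ntext: ..."  # placeholder input
inputs = tokenizer(example, return_tensors='pt', truncation=True, max_length=128)
with torch.no_grad():
    predicted_class = clf_model(**inputs).logits.argmax(dim=-1).item()
print(f"Predicted class id: {predicted_class}")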