Spaces:
Build error
Build error
File size: 4,848 Bytes
9b7e8b1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from peft import get_peft_model, LoraConfig, TaskType
import evaluate
import numpy as np
from tqdm import tqdm
# Load the raw examples. The CSV is expected to provide the columns used
# below: 'label', 'puzzle', 'truth' and 'text'.
file_path = 'train_en.csv'
dataset = pd.read_csv(file_path)

# Map the textual answer labels to integer class ids. "It doesn't matter" and
# "Unimportant" both map to class 2.
label_mapping = {
    "Yes": 0,
    "No": 1,
    "It doesn't matter": 2,
    "Unimportant": 2,
    "Incorrect questioning": 3,
    "Correct answers": 4,
}

# Labels that are not in the mapping become NaN and their rows are dropped.
# The explicit copy() makes the filtered frame independent of the original, so
# the column assignment below does not raise pandas' SettingWithCopyWarning.
dataset['label'] = dataset['label'].map(label_mapping)
dataset = dataset.dropna(subset=['label']).copy()
dataset['label'] = dataset['label'].astype(int)

# A missing puzzle/truth/text would turn the whole concatenation into NaN,
# which cannot be tokenized, so treat missing fields as empty strings.
text_columns = ['puzzle', 'truth', 'text']
dataset[text_columns] = dataset[text_columns].fillna('').astype(str)

# Format puzzle, truth and text into one prompt string per row.
separator = "==========================================\n"
dataset['combined_text'] = (
    separator + "puzzle: " + dataset['puzzle'] + "\n"
    + separator + "truth: " + dataset['truth'] + "\n"
    + separator + "text: " + dataset['text']
)

# Split the dataset into training (80%) and validation (20%) sets.
train_df, val_df = train_test_split(dataset, test_size=0.2, random_state=42)

# Convert the dataframes to datasets. preserve_index=False keeps the shuffled
# pandas index from being stored as an extra '__index_level_0__' column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
# Load the tokenizer and model
model_name = "meta-llama/Meta-Llama-3-8B"  # Replace with the actual model name
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Size the classification head from the label mapping so the two cannot drift
# apart (class ids are 0..max, hence max + 1 classes).
num_labels = max(label_mapping.values()) + 1
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Add a padding token only if the tokenizer does not already define one; an
# existing pad token is left untouched.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    # Normally a no-op because EOS is already in the vocabulary; kept so the
    # embedding matrix is guaranteed to match the tokenizer size.
    model.resize_token_embeddings(len(tokenizer))

# Ensure the padding token is set correctly in the model configuration
model.config.pad_token_id = tokenizer.pad_token_id
# Tokenize the data
def tokenize_function(examples, max_length=128):
    """Tokenize the ``combined_text`` field of a batch of examples.

    Args:
        examples: Mapping with a ``combined_text`` entry holding the prompt
            string(s) to encode.
        max_length: Length every sequence is truncated and padded to.
            Defaults to 128, the value this script has always used.

    Returns:
        The output of the module-level ``tokenizer`` for the given text.
    """
    return tokenizer(
        examples['combined_text'],
        truncation=True,
        padding='max_length',
        max_length=max_length,
    )
# Both splits are tokenized with the same settings: batched calls spread over
# four worker processes.
map_options = {"batched": True, "num_proc": 4}
train_dataset = train_dataset.map(tokenize_function, **map_options)
val_dataset = val_dataset.map(tokenize_function, **map_options)

# Expose only the listed columns, as PyTorch tensors.
model_columns = ('input_ids', 'attention_mask', 'label')
train_dataset.set_format(type='torch', columns=list(model_columns))
val_dataset.set_format(type='torch', columns=list(model_columns))
# LoRA setup for sequence classification: rank-16 adapters on the "q_proj" and
# "v_proj" modules, with 5% adapter dropout and no bias training.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    target_modules=["q_proj", "v_proj"],
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the adapters and report how many parameters train.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Output and checkpointing: one checkpoint per epoch, at most two kept.
    output_dir="./results",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # Evaluation runs once per epoch.
    # NOTE(review): newer transformers releases name this `eval_strategy`;
    # confirm against the installed version.
    evaluation_strategy="epoch",
    per_device_eval_batch_size=8,
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=1e-4,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    weight_decay=0.001,
    max_grad_norm=0.3,
    # Batch size and memory; raise or lower these to fit the available memory.
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    fp16=True,
    # Data loading and logging.
    dataloader_num_workers=4,
    logging_steps=100,
    report_to="wandb",
)
# Metrics already loaded through `evaluate`, keyed by metric name, so that
# repeated evaluation passes do not load them again.
_LOADED_METRICS = {}


def _get_metric(name):
    """Return the ``evaluate`` metric called *name*, loading it only once."""
    if name not in _LOADED_METRICS:
        _LOADED_METRICS[name] = evaluate.load(name)
    return _LOADED_METRICS[name]


def compute_metrics(eval_pred):
    """Compute weighted precision/recall/F1 and accuracy for an evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class is the
            argmax of the logits over the last axis.

    Returns:
        A dict with the keys ``"precision"``, ``"recall"``, ``"f1-score"``
        and ``"accuracy"``.
    """
    logits, labels = eval_pred
    # Some models hand back a tuple of outputs; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)

    def weighted(name):
        # Precision, recall and F1 are averaged over classes weighted by support.
        result = _get_metric(name).compute(
            predictions=predictions, references=labels, average="weighted"
        )
        return result[name]

    precision = weighted("precision")
    recall = weighted("recall")
    f1 = weighted("f1")
    accuracy = _get_metric("accuracy").compute(
        predictions=predictions, references=labels
    )["accuracy"]
    return {"precision": precision, "recall": recall, "f1-score": f1, 'accuracy': accuracy}
# Wire the model, arguments, data splits and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Run fine-tuning.
trainer.train()

# Save the model first, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained('trained_llama_model')

# Score the validation split and print the resulting metrics.
eval_results = trainer.evaluate()
print(eval_results)
|