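"""Fine-tune a custom mDeBERTa-v3 sequence classifier.

The model (DebertaV2ForSequenceClassification from the local `classifier`
module) is assumed to output both a regression score and binary logits;
training reports Pearson correlation on the scores and accuracy, precision,
and recall on labels binarized at a score threshold of 3.
"""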
import numpy as np
import pandas as pd
from classifier import DebertaV2ForSequenceClassification
from datasets import Dataset
from scipy.stats import pearsonr
from sklearn.metrics import accuracy_score, precision_score, recall_score
from transformers import (AutoTokenizer, DataCollatorWithPadding, Trainer,
TrainingArguments)
tokenizer = AutoTokenizer.from_pretrained("microsoft/mdeberta-v3-base")
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
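# The custom model is expected to return two prediction arrays per example:
# a regression score and a logit for the auxiliary binary head. Gold labels
# are binarized at a threshold of 3 for the classification metrics.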
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    scores, binary_logits = predictions
    scores = scores.squeeze()
    probs = sigmoid(binary_logits.squeeze())
    predicted_labels = (probs >= 0.5).astype(int)
    binary_labels = (labels >= 3).astype(int)
    return {
        'pearson': pearsonr(scores, labels)[0],
        'accuracy': accuracy_score(binary_labels, predicted_labels),
        'precision': precision_score(binary_labels, predicted_labels),
        'recall': recall_score(binary_labels, predicted_labels),
    }
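# Tokenize with truncation only; padding is deferred to the data collator,
# which pads dynamically per batch.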
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)
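# NOTE: PATH_TO_TRAINSET and PATH_TO_TESTSET are not defined in this file;
# they are assumed to point to CSV files with a "text" column plus the
# numeric label column expected by the Trainer.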
def train_classifier():
    train_csv = pd.read_csv(PATH_TO_TRAINSET)
    train_dataset = Dataset.from_pandas(train_csv)
    test_csv = pd.read_csv(PATH_TO_TESTSET).sample(n=10_000, random_state=42)
    test_dataset = Dataset.from_pandas(test_csv)
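    # Tokenize both splits in batched mode and expose them as torch tensors.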
    train_dataset = train_dataset.map(tokenize_function, batched=True)
    test_dataset = test_dataset.map(tokenize_function, batched=True)
    train_dataset = train_dataset.with_format("torch")
    test_dataset = test_dataset.with_format("torch")
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
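    # Training hyperparameters: 3 epochs, lr 2e-5, batch size 16 per device,
    # with evaluation and checkpointing at the end of every epoch.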
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
    )
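    # Load the custom classification head on top of mDeBERTa-v3 and freeze the
    # embedding layer; freeze_embeddings() is provided by the custom class.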
    model = DebertaV2ForSequenceClassification.from_pretrained("microsoft/mdeberta-v3-base")
    print("Freezing model embeddings!")
    model.freeze_embeddings()
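    # Passing the tokenizer to the Trainer lets it be saved alongside each
    # checkpoint and keeps padding consistent with the collator.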
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    # Evaluate the model
    trainer.evaluate()
    #trainer.push_to_hub(private=True, model_name="mFine-Edu-classifier")
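    # Optionally save the final model locally instead of pushing to the Hub
    # (path below is illustrative):
    # trainer.save_model("./results/final")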
if __name__ == "__main__":
    train_classifier()