EuroFilter-v1 / train.py
RicardoRei's picture
multilingual fine-edu classifier
44e6fb6
raw
history blame
2.69 kB
import numpy as np
import pandas as pd
from classifier import DebertaV2ForSequenceClassification
from datasets import Dataset
from scipy.stats import pearsonr
from sklearn.metrics import accuracy_score, precision_score, recall_score
from transformers import (AutoTokenizer, DataCollatorWithPadding, Trainer,
TrainingArguments)
# Module-level tokenizer for the "microsoft/mdeberta-v3-base" checkpoint.
# It is loaded once at import time and shared by tokenize_function and
# train_classifier (data collator and Trainer) below.
tokenizer = AutoTokenizer.from_pretrained("microsoft/mdeberta-v3-base")
def sigmoid(x):
    """Return the element-wise logistic function ``1 / (1 + exp(-x))``.

    The computation is numerically stable: it only ever exponentiates
    ``-|x|``, which lies in ``(0, 1]`` after ``exp``, so large-magnitude
    logits cannot overflow (the naive formula overflows in ``exp(-x)`` for
    very negative ``x`` and emits a RuntimeWarning).

    Args:
        x: A real scalar or array-like of logits.

    Returns:
        A NumPy scalar for scalar input, otherwise an ndarray with the same
        shape as ``x``; every value lies in ``[0, 1]``.
    """
    x = np.asarray(x)
    # exp(-|x|) never exceeds 1, so this exponentiation cannot overflow.
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x));  x < 0: the equivalent exp(x) / (1 + exp(x)).
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Empty-tuple indexing unwraps a 0-d result to a scalar and is a no-op
    # (returns the array itself) for inputs with one or more dimensions.
    return result[()]
def compute_metrics(eval_pred):
    """Compute evaluation metrics from one evaluation pass.

    Args:
        eval_pred: An iterable that unpacks into ``(predictions, labels)``,
            where ``predictions`` in turn unpacks into
            ``(scores, binary_logits)``. The arrays are expected to support
            ``.squeeze()`` / ``.astype()`` (presumably NumPy arrays handed
            over by the Trainer -- confirm against the model's outputs).

    Returns:
        A dict with the keys ``pearson`` (correlation between the squeezed
        scores and the raw labels) and ``accuracy`` / ``precision`` /
        ``recall`` (binary metrics, where a label counts as positive when it
        is >= 3 and a prediction counts as positive when the sigmoid of its
        logit is >= 0.5).
    """
    (raw_scores, raw_logits), gold = eval_pred

    # Drop singleton dimensions so both heads line up with the label vector.
    regression_scores = raw_scores.squeeze()
    positive_prob = sigmoid(raw_logits.squeeze())

    # Binarize: gold labels at the score threshold 3, predictions at p = 0.5.
    gold_binary = (gold >= 3).astype(int)
    predicted_binary = (positive_prob >= 0.5).astype(int)

    metrics = {}
    metrics['pearson'] = pearsonr(regression_scores, gold)[0]
    binary_scorers = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
    for metric_name, scorer in binary_scorers:
        metrics[metric_name] = scorer(gold_binary, predicted_binary)
    return metrics
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the module tokenizer.

    Args:
        examples: A mapping with a ``"text"`` key (used below as the batched
            ``Dataset.map`` callback, so this is presumably a list of
            strings -- confirm against the CSV schema).

    Returns:
        Whatever the tokenizer returns for those texts, truncated to at most
        512 tokens per example.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, max_length=512)
    return encoded
def train_classifier():
    """Fine-tune the classifier on the training CSV and evaluate it.

    Steps: load the train/test CSV files, tokenize their ``"text"`` column,
    build a ``Trainer`` around ``DebertaV2ForSequenceClassification``
    (imported from the local ``classifier`` module) initialised from
    "microsoft/mdeberta-v3-base", train for 3 epochs and run a final
    evaluation. Checkpoints go to ``./results`` and logs to ``./logs``.

    NOTE(review): ``PATH_TO_TRAINSET`` and ``PATH_TO_TESTSET`` are not
    defined anywhere in the visible file; they must exist as module-level
    names before this function is called, otherwise it raises NameError.

    NOTE(review): the label column name expected by the model/Trainer is not
    visible here -- confirm the CSV schema against ``classifier.py``.

    Returns:
        None. Metrics are computed by ``compute_metrics`` during evaluation.
    """
    train_csv = pd.read_csv(PATH_TO_TRAINSET)
    train_dataset = Dataset.from_pandas(train_csv)

    test_csv = pd.read_csv(PATH_TO_TESTSET)
    # Evaluate on a fixed (seeded) random subset of at most 10,000 rows.
    # Capping at len(test_csv) avoids the ValueError that DataFrame.sample
    # raises when asked for more rows than the frame contains.
    eval_rows = min(10_000, len(test_csv))
    test_csv = test_csv.sample(n=eval_rows, random_state=42)
    test_dataset = Dataset.from_pandas(test_csv)

    # Tokenize in batches, then expose the columns as torch tensors.
    train_dataset = train_dataset.map(tokenize_function, batched=True)
    test_dataset = test_dataset.map(tokenize_function, batched=True)
    train_dataset = train_dataset.with_format("torch")
    test_dataset = test_dataset.with_format("torch")

    # Examples are truncated but not padded above, so padding happens per
    # batch in the collator.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
    )

    model = DebertaV2ForSequenceClassification.from_pretrained("microsoft/mdeberta-v3-base")
    # freeze_embeddings() is defined on the project's model class; presumably
    # it disables gradient updates for the embedding layer -- see classifier.py.
    print("Freezing model embeddings!")
    model.freeze_embeddings()

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    # Evaluate the model
    trainer.evaluate()

    # Optional: publish the trained model to the Hub.
    # trainer.push_to_hub(private=True, model_name="mFine-Edu-classifier")
# Script entry point: start training only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    train_classifier()