import numpy as np
import pandas as pd
from classifier import DebertaV2ForSequenceClassification
from datasets import Dataset
from scipy.stats import pearsonr
from sklearn.metrics import accuracy_score, precision_score, recall_score
from transformers import (AutoTokenizer, DataCollatorWithPadding, Trainer,
                          TrainingArguments)

tokenizer = AutoTokenizer.from_pretrained("microsoft/mdeberta-v3-base")

# Dataset locations (placeholder values; point these at your own train/test CSVs).
PATH_TO_TRAINSET = "train.csv"
PATH_TO_TESTSET = "test.csv"

def sigmoid(x):
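    # Logistic function: maps raw logits to probabilities in (0, 1).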
    return 1 / (1 + np.exp(-x))

def compute_metrics(eval_pred):
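    """Pearson correlation for the regression head, plus accuracy/precision/recall
    for the binary head at a 0.5 probability threshold."""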
    predictions, labels = eval_pred
    scores, binary_logits = predictions
    scores = scores.squeeze()
    probs = sigmoid(binary_logits.squeeze())
    predicted_labels = (probs >= 0.5).astype(int)
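    # Binarize the gold labels at 3, assuming the usual 0-5 quality scale.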
    binary_labels = (labels >= 3).astype(int)
    return {
        'pearson': pearsonr(scores, labels)[0],
        'accuracy': accuracy_score(binary_labels, predicted_labels),
        'precision': precision_score(binary_labels, predicted_labels),
        'recall': recall_score(binary_labels, predicted_labels),
    }

def tokenize_function(examples):
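    # Truncate to 512 tokens; padding is deferred to the DataCollatorWithPadding.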
    return tokenizer(examples["text"], truncation=True, max_length=512)

def train_classifier():
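    """Fine-tune the two-headed mDeBERTa classifier and evaluate after each epoch."""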
    train_csv = pd.read_csv(PATH_TO_TRAINSET)
    train_dataset = Dataset.from_pandas(train_csv)
    
    # Sample 10k rows for a cheaper per-epoch eval; reset the index so
    # Dataset.from_pandas does not carry over a stray index column.
    test_csv = pd.read_csv(PATH_TO_TESTSET).sample(n=10_000, random_state=42).reset_index(drop=True)
    test_dataset = Dataset.from_pandas(test_csv)
    
    train_dataset = train_dataset.map(tokenize_function, batched=True)
    test_dataset = test_dataset.map(tokenize_function, batched=True)
    train_dataset = train_dataset.with_format("torch")
    test_dataset = test_dataset.with_format("torch")
    
    # Pad dynamically to the longest sequence in each batch.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
    )
    # Custom DebertaV2 variant from classifier.py; its outputs are unpacked
    # as (scores, binary_logits) in compute_metrics above.
    model = DebertaV2ForSequenceClassification.from_pretrained("microsoft/mdeberta-v3-base")
    print("Freezing model embeddings!")
    model.freeze_embeddings()
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )
    trainer.train()
    # Evaluate the model
    trainer.evaluate()
    # trainer.push_to_hub(private=True, model_name="mFine-Edu-classifier")

if __name__ == "__main__":
    train_classifier()