|
from datasets import load_dataset |
|
import pandas as pd |
|
import numpy as np |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from sklearn.preprocessing import MultiLabelBinarizer |
|
from sklearn.metrics import hamming_loss, f1_score, classification_report |
|
import xgboost as xgb |
|
from joblib import dump, load |
|
|
|
|
|
# Load the train/validation CSV files of the commit-classification dataset
# through `datasets.load_dataset`.
split_files = {"train": "training.csv", "validation": "validation.csv"}
dataset = load_dataset(
    "meriemm6/commit-classification-dataset",
    data_files=split_files,
)

# Convert each split to a pandas DataFrame (train first, then validation).
train_data, validation_data = (
    dataset[split_name].to_pandas() for split_name in ("train", "validation")
)
|
|
|
|
|
|
|
def _split_labels(raw_labels):
    """Split a ', '-separated label string into a list of label names."""
    return raw_labels.split(', ')


# Apply the same cleaning to both splits:
#   * missing commit messages become the placeholder "unknown";
#   * missing labels default to "maintenance/other";
#   * each label string is turned into a list so it can be multi-label encoded.
for _frame in (train_data, validation_data):
    _frame['Message'] = _frame['Message'].fillna("unknown")
    _frame['Ground truth'] = (
        _frame['Ground truth']
        .fillna("maintenance/other")
        .apply(_split_labels)
    )
|
|
|
|
|
# Encode the per-commit label lists as a binary indicator matrix with one
# column per label. The label set is learned from the training split only and
# the same fitted binarizer is reused for the validation split.
mlb = MultiLabelBinarizer()
y_train_encoded = mlb.fit_transform(train_data['Ground truth'])
y_val_encoded = mlb.transform(validation_data['Ground truth'])

# Persist the fitted binarizer alongside the vectorizer and the per-label
# models. The models are trained and saved by column index only, so inference
# code needs `mlb.classes_` (or `mlb.inverse_transform`) to translate a column
# index back into a label name.
dump(mlb, "multilabel_binarizer_xgboost.joblib")
|
|
|
|
|
# TF-IDF features over the commit messages: vocabulary capped at 10,000 terms,
# using the vectorizer's "english" stop-word setting.
train_messages = train_data['Message']
validation_messages = validation_data['Message']

tfidf_vectorizer = TfidfVectorizer(stop_words="english", max_features=10000)

# Fit on the training messages only; validation is transformed with the
# already-fitted vocabulary and IDF weights.
X_train_tfidf = tfidf_vectorizer.fit_transform(train_messages)
X_val_tfidf = tfidf_vectorizer.transform(validation_messages)

# Save the fitted vectorizer so the same feature mapping can be reloaded later.
dump(tfidf_vectorizer, "tfidf_vectorizer_xgboost.joblib")
|
|
|
|
|
# Per-label class-imbalance weights: (number of negative training rows) /
# (number of positive training rows), computed column-wise so that entry i
# belongs to label column i.
label_counts = y_train_encoded.sum(axis=0)
n_train_samples = y_train_encoded.shape[0]
negative_counts = n_train_samples - label_counts
scale_pos_weight = negative_counts / label_counts
|
|
|
|
|
def _fit_label_model(features, targets, pos_weight):
    """Fit one binary XGBoost classifier for a single label column.

    Args:
        features: Training feature matrix (the TF-IDF matrix in this script).
        targets: 0/1 indicator column for the label being modelled.
        pos_weight: Value passed to XGBoost as ``scale_pos_weight`` to
            re-weight the positive class for this label.

    Returns:
        The fitted ``xgb.XGBClassifier``.
    """
    classifier = xgb.XGBClassifier(
        objective="binary:logistic",
        # NOTE(review): `use_label_encoder` is reported as deprecated in
        # recent XGBoost releases -- confirm against the pinned version
        # before removing it.
        use_label_encoder=False,
        eval_metric="logloss",
        scale_pos_weight=pos_weight,
        max_depth=6,
        learning_rate=0.03,
        n_estimators=300,
        subsample=0.8,
        colsample_bytree=0.8,
        min_child_weight=1,
    )
    classifier.fit(features, targets)
    return classifier


# One independent binary classifier per label column; models[i] corresponds to
# column i of y_train_encoded and uses that column's imbalance weight.
models = [
    _fit_label_model(
        X_train_tfidf,
        y_train_encoded[:, label_idx],
        scale_pos_weight[label_idx],
    )
    for label_idx in range(y_train_encoded.shape[1])
]
|
|
|
|
|
# Write every per-label classifier to its own joblib file; the file name
# carries the model's position in `models` (i.e. its label column index).
for idx, model in enumerate(models):
    model_path = f"xgboost_model_label_{idx}.joblib"
    dump(model, model_path)
|
|