|
from datasets import load_dataset |
|
import pandas as pd |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from sklearn.preprocessing import MultiLabelBinarizer |
|
from sklearn.linear_model import LogisticRegression |
|
from sklearn.multiclass import OneVsRestClassifier |
|
from sklearn.metrics import classification_report, hamming_loss |
|
from sklearn.model_selection import GridSearchCV |
|
from joblib import dump |
|
|
|
|
|
# Fetch the two CSV splits of the commit-classification dataset and turn each
# one into a pandas DataFrame for the preprocessing steps that follow.
dataset = load_dataset(
    "meriemm6/commit-classification-dataset",
    data_files={"train": "training.csv", "validation": "validation.csv"},
)
train_data, validation_data = (
    dataset[split_name].to_pandas() for split_name in ("train", "validation")
)
|
|
|
|
|
|
|
# Placeholders substituted for missing cells in each column.
DEFAULT_MESSAGE = "unknown"
DEFAULT_LABEL = "maintenance/other"


def _split_labels(raw):
    """Return the list of labels contained in a comma-separated string.

    Whitespace around every label is stripped and empty entries are dropped,
    so "a, b", "a,b" and "a ,  b," all yield ["a", "b"]. Splitting on
    the exact separator ", " would instead produce merged or padded labels
    for the last two inputs, which the label binarizer would then treat as
    separate classes. A string that contains no usable label falls back to
    [DEFAULT_LABEL], matching the policy used for missing cells.
    """
    labels = [part.strip() for part in raw.split(",")]
    labels = [label for label in labels if label]
    return labels or [DEFAULT_LABEL]


# Apply the same cleaning to both splits: fill missing commit messages and
# labels, then expand each label string into a list of labels.
for frame in (train_data, validation_data):
    frame['Message'] = frame['Message'].fillna(DEFAULT_MESSAGE)
    frame['Ground truth'] = (
        frame['Ground truth'].fillna(DEFAULT_LABEL).apply(_split_labels)
    )
|
|
|
|
|
# Build both encoders up front; each is fitted on the training split only and
# then reused unchanged on the validation split.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),      # unigrams and bigrams
    stop_words='english',
    max_features=10000,
)
mlb = MultiLabelBinarizer()

# Training split: learn the vocabulary / label set and encode in one step.
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['Message'])
y_train_encoded = mlb.fit_transform(train_data['Ground truth'])

# Validation split: encode with the already-fitted transformers.
X_val_tfidf = tfidf_vectorizer.transform(validation_data['Message'])
y_val_encoded = mlb.transform(validation_data['Ground truth'])
|
|
|
|
|
# One class-weighted logistic regression is trained per label via
# one-vs-rest, which is how the multi-label target is handled.
log_reg = LogisticRegression(
    random_state=42,
    max_iter=5000,
    class_weight='balanced',
)
multi_log_reg = OneVsRestClassifier(log_reg)

# The `estimator__` prefix routes each parameter to the wrapped
# LogisticRegression inside the OneVsRestClassifier.
param_grid = {
    'estimator__solver': ['lbfgs', 'liblinear'],
    'estimator__C': [0.1, 1, 10],
}

# 3-fold grid search over every C/solver combination, scored by weighted F1
# and run on all available cores.
grid_search = GridSearchCV(
    multi_log_reg,
    param_grid,
    n_jobs=-1,
    verbose=2,
    cv=3,
    scoring='f1_weighted',
)

# fit() returns the search object itself, so the winning estimator can be
# read straight off the call.
best_model = grid_search.fit(X_train_tfidf, y_train_encoded).best_estimator_
|
|
|
|
|
# Score the selected model on the validation split.
y_val_pred = best_model.predict(X_val_tfidf)

print("Validation Metrics:")

# Per-label report; zero_division=0 is passed so undefined metrics are
# reported as 0.
report = classification_report(
    y_val_encoded,
    y_val_pred,
    target_names=mlb.classes_,
    zero_division=0,
)
print(f"F1 Score: {report}")

val_hamming = hamming_loss(y_val_encoded, y_val_pred)
print(f"Hamming Loss: {val_hamming:.4f}")
|
|
|
|
|
# Persist the classifier together with the fitted vectorizer and label
# binarizer, each to its own joblib file in the working directory.
for artifact, filename in (
    (best_model, "logistic_model.joblib"),
    (tfidf_vectorizer, "tfidf_vectorizer.joblib"),
    (mlb, "label_binarizer.joblib"),
):
    dump(artifact, filename)

print("Optimized model and preprocessing files saved successfully.")
|
|