Mastouri committed
Commit · 8d416da
1 Parent(s): 98f1ca3

Improved Logistic Regression with hyperparameter tuning and TF-IDF enhancements

logistic_reg.py (+38 −36)

logistic_reg.py CHANGED
@@ -1,11 +1,12 @@
 from datasets import load_dataset
 import pandas as pd
-import numpy as np
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.preprocessing import MultiLabelBinarizer
-from sklearn.
-
-from
+from sklearn.linear_model import LogisticRegression
+from sklearn.multiclass import OneVsRestClassifier
+from sklearn.metrics import classification_report, hamming_loss
+from sklearn.model_selection import GridSearchCV
+from joblib import dump
 
 # Step 1: Load the Dataset Repository
 dataset = load_dataset("meriemm6/commit-classification-dataset", data_files={"train": "training.csv", "validation": "validation.csv"})
@@ -27,43 +28,44 @@ validation_data['Ground truth'] = validation_data['Ground truth'].fillna("mainte
 train_data['Ground truth'] = train_data['Ground truth'].apply(lambda x: x.split(', '))
 validation_data['Ground truth'] = validation_data['Ground truth'].apply(lambda x: x.split(', '))
 
-#
-
-y_train_encoded = mlb.fit_transform(train_data['Ground truth'])
-y_val_encoded = mlb.transform(validation_data['Ground truth'])
-
-# Step 3: TF-IDF Vectorization (Increased Features)
-tfidf_vectorizer = TfidfVectorizer(max_features=10000, stop_words="english")
+# Step 3: TF-IDF Vectorization (Enhanced Features)
+tfidf_vectorizer = TfidfVectorizer(max_features=10000, stop_words='english', ngram_range=(1, 2))
 X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['Message'])
 X_val_tfidf = tfidf_vectorizer.transform(validation_data['Message'])
 
+# Step 4: MultiLabel Encoding
+mlb = MultiLabelBinarizer()
+y_train_encoded = mlb.fit_transform(train_data['Ground truth'])
+y_val_encoded = mlb.transform(validation_data['Ground truth'])
 
-# Step
-
-
+# Step 5: Hyperparameter Tuning for Logistic Regression
+log_reg = LogisticRegression(class_weight='balanced', max_iter=5000, random_state=42)
+multi_log_reg = OneVsRestClassifier(log_reg)
 
-# Step
-
-
-
-    objective="binary:logistic",
-    use_label_encoder=False,
-    eval_metric="logloss",
-    scale_pos_weight=scale_pos_weight[i],  # Class weights
-    max_depth=6,  # Reduced to prevent overfitting
-    learning_rate=0.03,  # Lower learning rate for better generalization
-    n_estimators=300,  # Increased estimators for better performance
-    subsample=0.8,
-    colsample_bytree=0.8,
-    min_child_weight=1  # Prevents overfitting on small datasets
-)
-    model.fit(X_train_tfidf, y_train_encoded[:, i])
-    models.append(model)
+param_grid = {
+    'estimator__C': [0.1, 1, 10],  # Regularization strength
+    'estimator__solver': ['lbfgs', 'liblinear'],  # Optimizers
+}
+grid_search = GridSearchCV(
+    estimator=multi_log_reg,
+    param_grid=param_grid,
+    scoring='f1_weighted',
+    cv=3,
+    verbose=2,
+    n_jobs=-1
+)
+grid_search.fit(X_train_tfidf, y_train_encoded)
+best_model = grid_search.best_estimator_
 
-
-for idx, model in enumerate(models):
-    dump(model, f"xgboost_model_label_{idx}.joblib")
+# Step 6: Validation Metrics
+y_val_pred = best_model.predict(X_val_tfidf)
+print("Validation Metrics:")
+print(f"F1 Score: {classification_report(y_val_encoded, y_val_pred, target_names=mlb.classes_, zero_division=0)}")
+print(f"Hamming Loss: {hamming_loss(y_val_encoded, y_val_pred):.4f}")
 
+# Step 7: Save the Model and Preprocessing Artifacts
+dump(best_model, "optimized_logistic_model.joblib")  # Save the optimized Logistic Regression model
+dump(tfidf_vectorizer, "tfidf_vectorizer.joblib")  # Save the TF-IDF vectorizer
+dump(mlb, "label_binarizer.joblib")  # Save the MultiLabelBinarizer
+
+print("Optimized model and preprocessing files saved successfully.")
|
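
Editor's note on Step 7 (not part of the commit): a minimal inference sketch, assuming the three joblib artifacts saved above are on disk; the sample commit message is hypothetical.

from joblib import load

# Reload the artifacts written in Step 7 of this commit.
best_model = load("optimized_logistic_model.joblib")  # tuned OneVsRest logistic regression
tfidf_vectorizer = load("tfidf_vectorizer.joblib")    # fitted TF-IDF vectorizer
mlb = load("label_binarizer.joblib")                  # fitted MultiLabelBinarizer

messages = ["fix: handle empty commit messages"]      # hypothetical input
X = tfidf_vectorizer.transform(messages)              # reuse the training vocabulary
y_pred = best_model.predict(X)                        # binary indicator matrix
print(mlb.inverse_transform(y_pred))                  # recovers a list of label tuples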