Mastouri committed on
Commit
8d416da
·
1 Parent(s): 98f1ca3

Improved Logistic Regression with hyperparameter tuning and TF-IDF enhancements

Browse files
Files changed (1) hide show
  1. logistic_reg.py +38 -36
logistic_reg.py CHANGED
@@ -1,11 +1,12 @@
1
  from datasets import load_dataset
2
  import pandas as pd
3
- import numpy as np
4
  from sklearn.feature_extraction.text import TfidfVectorizer
5
  from sklearn.preprocessing import MultiLabelBinarizer
6
- from sklearn.metrics import hamming_loss, f1_score, classification_report
7
- import xgboost as xgb
8
- from joblib import dump, load
 
 
9
 
10
  # Step 1: Load the Dataset Repository
11
  dataset = load_dataset("meriemm6/commit-classification-dataset", data_files={"train": "training.csv", "validation": "validation.csv"})
@@ -27,43 +28,44 @@ validation_data['Ground truth'] = validation_data['Ground truth'].fillna("mainte
27
  train_data['Ground truth'] = train_data['Ground truth'].apply(lambda x: x.split(', '))
28
  validation_data['Ground truth'] = validation_data['Ground truth'].apply(lambda x: x.split(', '))
29
 
30
- # Encode the labels
31
- mlb = MultiLabelBinarizer()
32
- y_train_encoded = mlb.fit_transform(train_data['Ground truth'])
33
- y_val_encoded = mlb.transform(validation_data['Ground truth'])
34
-
35
- # Step 3: TF-IDF Vectorization (Increased Features)
36
- tfidf_vectorizer = TfidfVectorizer(max_features=10000, stop_words="english")
37
  X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['Message'])
38
  X_val_tfidf = tfidf_vectorizer.transform(validation_data['Message'])
39
 
 
 
 
 
40
 
 
 
 
41
 
42
- # Save the TF-IDF vectorizer
43
- dump(tfidf_vectorizer, "tfidf_vectorizer_xgboost.joblib")
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
- # Step 4: Add Class Weighting
46
- label_counts = y_train_encoded.sum(axis=0)
47
- scale_pos_weight = (len(y_train_encoded) - label_counts) / label_counts
 
 
48
 
49
- # Step 5: Train XGBoost Models with Class Weighting and Dynamic Parameters
50
- models = []
51
- for i in range(y_train_encoded.shape[1]):
52
- model = xgb.XGBClassifier(
53
- objective="binary:logistic",
54
- use_label_encoder=False,
55
- eval_metric="logloss",
56
- scale_pos_weight=scale_pos_weight[i], # Class weights
57
- max_depth=6, # Reduced to prevent overfitting
58
- learning_rate=0.03, # Lower learning rate for better generalization
59
- n_estimators=300, # Increased estimators for better performance
60
- subsample=0.8,
61
- colsample_bytree=0.8,
62
- min_child_weight=1 # Prevents overfitting on small datasets
63
- )
64
- model.fit(X_train_tfidf, y_train_encoded[:, i])
65
- models.append(model)
66
 
67
- # Save the models
68
- for idx, model in enumerate(models):
69
- dump(model, f"xgboost_model_label_{idx}.joblib")
 
1
  from datasets import load_dataset
2
  import pandas as pd
 
3
  from sklearn.feature_extraction.text import TfidfVectorizer
4
  from sklearn.preprocessing import MultiLabelBinarizer
5
+ from sklearn.linear_model import LogisticRegression
6
+ from sklearn.multiclass import OneVsRestClassifier
7
+ from sklearn.metrics import classification_report, hamming_loss
8
+ from sklearn.model_selection import GridSearchCV
9
+ from joblib import dump
10
 
11
  # Step 1: Load the Dataset Repository
12
  dataset = load_dataset("meriemm6/commit-classification-dataset", data_files={"train": "training.csv", "validation": "validation.csv"})
 
28
# Step 2 (cont.): labels arrive as comma-separated strings -> lists of tags.
# NOTE(review): assumes every value is a non-null str (earlier, unseen lines
# fillna before this point) — confirm against the full file.
train_data['Ground truth'] = train_data['Ground truth'].apply(lambda x: x.split(', '))
validation_data['Ground truth'] = validation_data['Ground truth'].apply(lambda x: x.split(', '))

# Step 3: TF-IDF Vectorization (Enhanced Features)
# Unigrams + bigrams, capped at 10k features, English stop words removed.
tfidf_vectorizer = TfidfVectorizer(max_features=10000, stop_words='english', ngram_range=(1, 2))
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['Message'])
# transform() only (no fit) on validation to avoid vocabulary leakage.
X_val_tfidf = tfidf_vectorizer.transform(validation_data['Message'])

# Step 4: MultiLabel Encoding — one binary indicator column per label.
mlb = MultiLabelBinarizer()
y_train_encoded = mlb.fit_transform(train_data['Ground truth'])
y_val_encoded = mlb.transform(validation_data['Ground truth'])

# Step 5: Hyperparameter Tuning for Logistic Regression
# One-vs-rest trains a binary logistic regression per label;
# class_weight='balanced' compensates for label imbalance.
log_reg = LogisticRegression(class_weight='balanced', max_iter=5000, random_state=42)
multi_log_reg = OneVsRestClassifier(log_reg)

param_grid = {
    'estimator__C': [0.1, 1, 10],                 # inverse regularization strength
    'estimator__solver': ['lbfgs', 'liblinear'],  # optimizers
}
grid_search = GridSearchCV(
    estimator=multi_log_reg,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=3,
    verbose=2,
    n_jobs=-1,
)
grid_search.fit(X_train_tfidf, y_train_encoded)
best_model = grid_search.best_estimator_

# Step 6: Validation Metrics
y_val_pred = best_model.predict(X_val_tfidf)
print("Validation Metrics:")
# FIX: classification_report() returns the full per-label precision/recall/F1
# table, not a single F1 score — the old "F1 Score:" label was misleading.
print(f"Classification report:\n{classification_report(y_val_encoded, y_val_pred, target_names=mlb.classes_, zero_division=0)}")
print(f"Hamming Loss: {hamming_loss(y_val_encoded, y_val_pred):.4f}")

# Step 7: Save the Model and Preprocessing Artifacts
dump(best_model, "optimized_logistic_model.joblib")  # tuned OvR logistic regression
dump(tfidf_vectorizer, "tfidf_vectorizer.joblib")    # fitted TF-IDF vectorizer
dump(mlb, "label_binarizer.joblib")                  # fitted MultiLabelBinarizer

print("Optimized model and preprocessing files saved successfully.")