Spaces:

meriemm6
/

DockerRelatedGithubCommitsClassification

Sleeping

DockerRelatedGithubCommitsClassification / logistic_reg.py

Mastouri

Updated XGBoost model with TF-IDF vectorizer

f424ca3 7 months ago

2.91 kB

	from datasets import load_dataset
	import pandas as pd
	import numpy as np
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.preprocessing import MultiLabelBinarizer
	from sklearn.metrics import hamming_loss, f1_score, classification_report
	import xgboost as xgb
	from joblib import dump, load

	# Step 1: Load the dataset repository's training and validation CSV files
	dataset = load_dataset(
	    "meriemm6/commit-classification-dataset",
	    data_files={"train": "training.csv", "validation": "validation.csv"},
	)

	# Materialise both splits as pandas DataFrames (train first, then validation)
	train_data, validation_data = (
	    dataset[split_name].to_pandas() for split_name in ("train", "validation")
	)

	# Step 2: Clean and Process the Data
	def _split_labels(raw_labels):
	    """Turn a comma-separated label string into a list of label names.

	    Whitespace around each label is stripped and empty entries are dropped,
	    so "a,b", "a, b" and "a ,b, " all yield ["a", "b"]. Splitting on the
	    exact separator ", " would instead keep "a,b" as one bogus label or
	    produce an empty-string label. A string that contains no usable label
	    falls back to ["maintenance/other"], the same default applied to
	    missing values.
	    """
	    labels = [part.strip() for part in raw_labels.split(',')]
	    labels = [label for label in labels if label]
	    return labels or ["maintenance/other"]


	# The same cleaning is applied to both splits; the frames are updated in place
	for split_frame in (train_data, validation_data):
	    # Missing 'Message' values become the placeholder text "unknown"
	    split_frame['Message'] = split_frame['Message'].fillna("unknown")
	    # Missing 'Ground truth' values default to "maintenance/other", then every
	    # value is split into a list of labels
	    split_frame['Ground truth'] = (
	        split_frame['Ground truth'].fillna("maintenance/other").apply(_split_labels)
	    )

	# Learn the label set from the training split only, then turn each split's
	# label lists into a binary indicator matrix with that same label set
	mlb = MultiLabelBinarizer()
	mlb.fit(train_data['Ground truth'])
	y_train_encoded = mlb.transform(train_data['Ground truth'])
	y_val_encoded = mlb.transform(validation_data['Ground truth'])

	# Step 3: TF-IDF features, capped at 10000 terms with English stop words removed
	tfidf_vectorizer = TfidfVectorizer(
	    stop_words="english",
	    max_features=10000,
	)
	# The vocabulary is fitted on the training messages and reused for validation
	X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['Message'])
	X_val_tfidf = tfidf_vectorizer.transform(validation_data['Message'])

	# Persist the fitted vectorizer to disk
	dump(tfidf_vectorizer, "tfidf_vectorizer_xgboost.joblib")

	# Step 4: Per-label class weights
	# label_counts[j] is the column sum of label j over the training rows;
	# the weight is (rows - count) / count for each label
	label_counts = np.sum(y_train_encoded, axis=0)
	scale_pos_weight = np.true_divide(y_train_encoded.shape[0] - label_counts, label_counts)

	# Step 5: Train one binary XGBoost classifier per label column
	# The loop body must be indented under the `for`; without it the constructor
	# call, fit and append are not part of the loop and the script does not parse.
	models = []
	for i in range(y_train_encoded.shape[1]):
	    model = xgb.XGBClassifier(
	        objective="binary:logistic",
	        # NOTE(review): this flag is believed to be deprecated in newer XGBoost
	        # releases — confirm against the installed version before removing it.
	        use_label_encoder=False,
	        eval_metric="logloss",
	        scale_pos_weight=scale_pos_weight[i],  # class weight computed for label i
	        max_depth=6,
	        learning_rate=0.03,
	        n_estimators=300,
	        subsample=0.8,
	        colsample_bytree=0.8,
	        min_child_weight=1,
	    )
	    # Column i of the encoded label matrix is the target for this classifier
	    model.fit(X_train_tfidf, y_train_encoded[:, i])
	    models.append(model)

	# Save the models: one joblib file per classifier, named by its position in `models`
	# (the dump call must be indented so that it runs once per model)
	for idx, model in enumerate(models):
	    dump(model, f"xgboost_model_label_{idx}.joblib")