Spaces:

CChircop
/

StreamlitAppAssignment4

Sleeping

App Files Files Community

CChircop committed on Oct 1, 2024

Commit

e4b4b92

verified ·

1 Parent(s): 468c336

Upload 7 files

Browse files

Files changed (8) hide show

.gitattributes +1 -0
app4.py +194 -0
micro_world_139countries.csv +3 -0
model_xgb.joblib +3 -0
regionwb_encoder.joblib +3 -0
requirements.txt +12 -0
scaler.joblib +3 -0
xgb_clf.joblib +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+micro_world_139countries.csv filter=lfs diff=lfs merge=lfs -text

app4.py ADDED Viewed

	@@ -0,0 +1,194 @@

+import numpy as np
+import pandas as pd
+import seaborn as sns
+import statsmodels.api as sm
+import random
+import shap
+import joblib
+import matplotlib.pyplot as plt
+from sklearn.cluster import AgglomerativeClustering
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from sklearn.preprocessing import LabelEncoder
+from sklearn.metrics import confusion_matrix
+from sklearn.metrics import classification_report
+from mlxtend.plotting import plot_confusion_matrix
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import cross_val_score
+from xgboost import XGBClassifier
+from sklearn.model_selection import GridSearchCV
+from sklearn.metrics import make_scorer
+from sklearn.metrics import mean_squared_error
+from sklearn.metrics import r2_score
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
+# Function to load the dataset
+file_path = 'micro_world_139countries.csv'
+df = pd.read_csv(file_path, encoding='ISO-8859-1')
+sample_df = df[['remittances', 'educ', 'age', 'female', 'mobileowner','internetaccess', 'pay_utilities', 'receive_transfers','receive_pension', 'economy', 'regionwb','account']].sample(n=5000, random_state=42)
+sample_df = sample_df.dropna(subset=['account','remittances', 'educ', 'age', 'female', 'mobileowner','internetaccess', 'pay_utilities', 'receive_transfers','receive_pension', 'economy', 'regionwb'])
+print(sample_df['regionwb'].unique)
+le_country_economy = LabelEncoder()
+sample_df['economy'] = le_country_economy.fit_transform(sample_df['economy'])#Giving unique int values to economies
+le_region = LabelEncoder()
+sample_df['regionwb'] = le_region.fit_transform(sample_df['regionwb'])#Unique int values to regions
+X = sample_df.drop('account', axis=1)
+y = sample_df['account']
+labelencoder_y = LabelEncoder()
+y= labelencoder_y.fit_transform(y)
+scaler = StandardScaler()
+X = scaler.fit_transform(X)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 21)#Creating Test and Training samples, test sample = 20% of the dataset
+#Creating SML Model
+model = LogisticRegression()#multi_class="auto" could also work
+# Fit the model to your training data
+model.fit(X_train, y_train)
+model.score(X_train, y_train)
+true_accounts = labelencoder_y.inverse_transform(y_train)
+predicted_accounts = labelencoder_y.inverse_transform(model.predict(X_train))
+df = pd.DataFrame({'true_accounts': true_accounts, 'predicted_accounts': predicted_accounts})
+pd.crosstab(df.true_accounts, df.predicted_accounts)
+#print(classification_report(true_accounts,predicted_accounts, labels=labelencoder_y.classes_))
+#print(model.score(X_test, y_test))#Final Evaluation
+true_accounts = labelencoder_y.inverse_transform(y_test)
+predicted_accounts = labelencoder_y.inverse_transform(model.predict(X_test))
+#print(classification_report(true_accounts,predicted_accounts, labels=labelencoder_y.classes_))
+model = LogisticRegression()
+scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')  # 5-fold cross-validation
+#print("Cross-validation scores: ", scores)
+#print("Average cross-validation score: ", scores.mean())
+#Cross-Val Score: 0.775
+#Using XGBClassifier Model
+model = XGBClassifier()
+model.fit(X_train, y_train)
+true_accounts = labelencoder_y.inverse_transform(y_train)
+predicted_accounts = labelencoder_y.inverse_transform(model.predict(X_train))
+df = pd.DataFrame({'true_accounts': true_accounts, 'predicted_accounts': predicted_accounts})
+pd.crosstab(df.true_accounts, df.predicted_accounts)
+#print(classification_report(true_accounts,predicted_accounts, labels=labelencoder_y.classes_))
+#We see using training dataset XGBoost performs better with an accuracy of 97% compared to 78% of LogisticRegression.
+#print(model.score(X_test, y_test))#Final Evaluation
+true_accounts = labelencoder_y.inverse_transform(y_test)
+predicted_accounts = labelencoder_y.inverse_transform(model.predict(X_test))
+#print(classification_report(true_accounts,predicted_accounts, labels=labelencoder_y.classes_))
+model = XGBClassifier()
+scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')  # 5-fold cross-validation
+#print("Cross-validation scores: ", scores)
+#print("Average cross-validation score: ", scores.mean())
+#Cross Val Score = 0.824
+#Using Test dataset XBoost = 83% accuracy, LogisticRegression = 79%
+#Hyperparameter tuning
+model_xgb = XGBClassifier()
+model_xgb.fit(X_train, y_train)
+#print('Model LG' + ' ' + str(model_lg.score(X_test, y_test)))
+#print('Model XGB' + ' ' + str(model_xgb.score(X_test, y_test)))
+scorer = make_scorer(mean_squared_error)
+#Define the parameter
+parameters_xgb = {'n_estimators': [100, 200, 300],'max_depth': [3, 5, 7],'learning_rate': [0.01, 0.1, 0.3]}
+# Perform grid search on the classifier using 'scorer' as the scoring method.
+grid_obj = GridSearchCV(model_xgb, parameters_xgb, scoring=scorer)
+grid_fit = grid_obj.fit(X, y)
+# Get the estimator.
+best_reg = grid_fit.best_estimator_
+# Fit the new model.
+best_reg.fit(X_train, y_train)
+best_reg.score(X_test, y_test)
+#print(best_reg.score(X_test, y_test))
+#After Hyperameter tuning we find the XGBoost had a score of 0.786
+#Evaluating  Model
+# Generate predictions for the test set
+y_pred = best_reg.predict(X_test)
+# If this is a binary classification problem, you'll need the predicted probabilities for ROC-AUC
+y_pred_proba = best_reg.predict_proba(X_test)[:, 1]
+# Accuracy
+accuracy = accuracy_score(y_test, y_pred)
+# Precision
+precision = precision_score(y_test, y_pred)
+# Recall
+recall = recall_score(y_test, y_pred)
+# F1 Score
+f1 = f1_score(y_test, y_pred)
+# ROC-AUC Score (for binary classification)
+roc_auc = roc_auc_score(y_test, y_pred_proba)
+# Mean Squared Error (MSE)
+mse = mean_squared_error(y_test, y_pred)
+# Print the results
+#print(f"Accuracy: {accuracy:.4f}")
+#print(f"Precision: {precision:.4f}")
+#print(f"Recall: {recall:.4f}")
+#print(f"F1 Score: {f1:.4f}")
+#print(f"ROC-AUC Score: {roc_auc:.4f}")
+#print(f"Mean Squared Error: {mse:.4f}")
+#Plotting Confusion Matrix
+# Generate predictions
+y_pred = best_reg.predict(X_test)
+# Compute confusion matrix
+cm = confusion_matrix(y_test, y_pred)
+# Plot the confusion matrix
+plt.figure(figsize=(12, 10))
+sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, xticklabels=labelencoder_y.classes_, yticklabels=labelencoder_y.classes_, annot_kws={"size": 10})
+plt.xlabel('Predicted Labels')
+plt.ylabel('True Labels')
+plt.title('Confusion Matrix')
+plt.xticks(rotation=45, fontsize=12)  # Rotate x-axis labels
+plt.yticks(rotation=0, fontsize=12)  # Rotate y-axis labels
+plt.tight_layout()
+#plt.show()
+#Our model is 90% accurate at predicting when True label for account = true, but inaccurate when True Label for account = false.
+# Define the SHAP explainer
+explainer_shap = shap.Explainer(model_xgb)
+# Calculate SHAP values for test and train sets
+shap_values_test = explainer_shap(X_test)
+shap_values_train = explainer_shap(X_train)
+# Convert SHAP values to DataFrame
+df_shap_test = pd.DataFrame(shap_values_test.values, columns=sample_df.columns.drop('account'))
+df_shap_train = pd.DataFrame(shap_values_train.values, columns=sample_df.columns.drop('account'))
+# Display the first 10 rows of SHAP values for the test set
+#print(df_shap_test.head(10))
+# Identify categorical features based on the number of unique values
+categorical_features = np.argwhere(np.array([len(set(X_train[:, x])) for x in range(X_train.shape[1])]) <= 10).flatten()
+# Create a summary plot for SHAP values of the training set
+shap.summary_plot(shap_values_train.values, X_train, feature_names=sample_df.columns.drop('account'))
+joblib.dump(model_xgb, 'xgb_clf.joblib')
+joblib.dump(scaler, 'scaler.joblib')
+joblib.dump(labelencoder_y, 'encoder.joblib')
+joblib.dump(le_country_economy, 'country_encoder.joblib')
+joblib.dump(le_region, 'regionwb_encoder.joblib')

micro_world_139countries.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:98ee1367d02f92b04d0933584a4620516b90ed5f9c554f867fa5037f3f721f7a
+size 40174289

model_xgb.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:71fbdbf77eb5e9f46f647ea23e12e99ed14a8238756b1be79fbfa0ba9a5eeb74
+size 250917

regionwb_encoder.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:34fe2c5156d736829bff80daad85f151347d7e1eb073ce861e662c3778be7b6b
+size 817

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+numpy
+pandas
+seaborn
+streamlit
+scipy
+altair
+shap
+joblib
+matplotlib
+scikit-learn
+mlxtend
+xgboost

scaler.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f4c9a927f87387df686cbe8868bb67b4eb4f267501cca9410438f887b50671cb
+size 1343

xgb_clf.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:434915ea67e58cb51d6936a21ae4561695be5927fa0a7019957d86ed7ece908b
+size 247246