Spaces:
Sleeping
Sleeping
Upload 7 files
Browse files- .gitattributes +1 -0
- app4.py +194 -0
- micro_world_139countries.csv +3 -0
- model_xgb.joblib +3 -0
- regionwb_encoder.joblib +3 -0
- requirements.txt +12 -0
- scaler.joblib +3 -0
- xgb_clf.joblib +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
micro_world_139countries.csv filter=lfs diff=lfs merge=lfs -text
|
app4.py
ADDED
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import pandas as pd
|
3 |
+
import seaborn as sns
|
4 |
+
import statsmodels.api as sm
|
5 |
+
import random
|
6 |
+
import shap
|
7 |
+
import joblib
|
8 |
+
import matplotlib.pyplot as plt
|
9 |
+
from sklearn.cluster import AgglomerativeClustering
|
10 |
+
from sklearn.model_selection import train_test_split
|
11 |
+
from sklearn.preprocessing import StandardScaler
|
12 |
+
from sklearn.preprocessing import LabelEncoder
|
13 |
+
from sklearn.metrics import confusion_matrix
|
14 |
+
from sklearn.metrics import classification_report
|
15 |
+
from mlxtend.plotting import plot_confusion_matrix
|
16 |
+
from sklearn.linear_model import LogisticRegression
|
17 |
+
from sklearn.model_selection import cross_val_score
|
18 |
+
from xgboost import XGBClassifier
|
19 |
+
from sklearn.model_selection import GridSearchCV
|
20 |
+
from sklearn.metrics import make_scorer
|
21 |
+
from sklearn.metrics import mean_squared_error
|
22 |
+
from sklearn.metrics import r2_score
|
23 |
+
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
|
24 |
+
# Load the dataset (read the survey CSV into a DataFrame)
|
25 |
+
|
26 |
+
file_path = 'micro_world_139countries.csv'
|
27 |
+
df = pd.read_csv(file_path, encoding='ISO-8859-1')
|
28 |
+
|
29 |
+
sample_df = df[['remittances', 'educ', 'age', 'female', 'mobileowner','internetaccess', 'pay_utilities', 'receive_transfers','receive_pension', 'economy', 'regionwb','account']].sample(n=5000, random_state=42)
|
30 |
+
sample_df = sample_df.dropna(subset=['account','remittances', 'educ', 'age', 'female', 'mobileowner','internetaccess', 'pay_utilities', 'receive_transfers','receive_pension', 'economy', 'regionwb'])
|
31 |
+
# Print the distinct region labels present in the sample. `unique` is a
# method and has to be called; without the parentheses the bound-method
# repr is printed instead of the values.
print(sample_df['regionwb'].unique())
|
32 |
+
|
33 |
+
le_country_economy = LabelEncoder()
|
34 |
+
sample_df['economy'] = le_country_economy.fit_transform(sample_df['economy'])#Giving unique int values to economies
|
35 |
+
le_region = LabelEncoder()
|
36 |
+
sample_df['regionwb'] = le_region.fit_transform(sample_df['regionwb'])#Unique int values to regions
|
37 |
+
|
38 |
+
X = sample_df.drop('account', axis=1)
|
39 |
+
y = sample_df['account']
|
40 |
+
labelencoder_y = LabelEncoder()
|
41 |
+
y= labelencoder_y.fit_transform(y)
|
42 |
+
|
43 |
+
scaler = StandardScaler()
|
44 |
+
X = scaler.fit_transform(X)
|
45 |
+
|
46 |
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 21)#Creating Test and Training samples, test sample = 20% of the dataset
|
47 |
+
|
48 |
+
|
49 |
+
#Creating SML Model
|
50 |
+
model = LogisticRegression()#multi_class="auto" could also work
|
51 |
+
# Fit the model to your training data
|
52 |
+
model.fit(X_train, y_train)
|
53 |
+
model.score(X_train, y_train)
|
54 |
+
|
55 |
+
true_accounts = labelencoder_y.inverse_transform(y_train)
|
56 |
+
|
57 |
+
predicted_accounts = labelencoder_y.inverse_transform(model.predict(X_train))
|
58 |
+
|
59 |
+
df = pd.DataFrame({'true_accounts': true_accounts, 'predicted_accounts': predicted_accounts})
|
60 |
+
|
61 |
+
pd.crosstab(df.true_accounts, df.predicted_accounts)
|
62 |
+
#print(classification_report(true_accounts,predicted_accounts, labels=labelencoder_y.classes_))
|
63 |
+
|
64 |
+
#print(model.score(X_test, y_test))#Final Evaluation
|
65 |
+
true_accounts = labelencoder_y.inverse_transform(y_test)
|
66 |
+
predicted_accounts = labelencoder_y.inverse_transform(model.predict(X_test))
|
67 |
+
#print(classification_report(true_accounts,predicted_accounts, labels=labelencoder_y.classes_))
|
68 |
+
|
69 |
+
model = LogisticRegression()
|
70 |
+
scores = cross_val_score(model, X, y, cv=5, scoring='accuracy') # 5-fold cross-validation
|
71 |
+
#print("Cross-validation scores: ", scores)
|
72 |
+
#print("Average cross-validation score: ", scores.mean())
|
73 |
+
#Cross-Val Score: 0.775
|
74 |
+
|
75 |
+
#Using XGBClassifier Model
|
76 |
+
model = XGBClassifier()
|
77 |
+
model.fit(X_train, y_train)
|
78 |
+
true_accounts = labelencoder_y.inverse_transform(y_train)
|
79 |
+
|
80 |
+
predicted_accounts = labelencoder_y.inverse_transform(model.predict(X_train))
|
81 |
+
|
82 |
+
df = pd.DataFrame({'true_accounts': true_accounts, 'predicted_accounts': predicted_accounts})
|
83 |
+
|
84 |
+
pd.crosstab(df.true_accounts, df.predicted_accounts)
|
85 |
+
#print(classification_report(true_accounts,predicted_accounts, labels=labelencoder_y.classes_))
|
86 |
+
#We see using training dataset XGBoost performs better with an accuracy of 97% compared to 78% of LogisticRegression.
|
87 |
+
|
88 |
+
|
89 |
+
#print(model.score(X_test, y_test))#Final Evaluation
|
90 |
+
true_accounts = labelencoder_y.inverse_transform(y_test)
|
91 |
+
predicted_accounts = labelencoder_y.inverse_transform(model.predict(X_test))
|
92 |
+
#print(classification_report(true_accounts,predicted_accounts, labels=labelencoder_y.classes_))
|
93 |
+
|
94 |
+
model = XGBClassifier()
|
95 |
+
scores = cross_val_score(model, X, y, cv=5, scoring='accuracy') # 5-fold cross-validation
|
96 |
+
#print("Cross-validation scores: ", scores)
|
97 |
+
#print("Average cross-validation score: ", scores.mean())
|
98 |
+
#Cross Val Score = 0.824
|
99 |
+
#Using the test dataset: XGBoost = 83% accuracy, LogisticRegression = 79%
|
100 |
+
|
101 |
+
#Hyperparameter tuning
|
102 |
+
model_xgb = XGBClassifier()
|
103 |
+
model_xgb.fit(X_train, y_train)
|
104 |
+
#print('Model LG' + ' ' + str(model_lg.score(X_test, y_test)))
|
105 |
+
#print('Model XGB' + ' ' + str(model_xgb.score(X_test, y_test)))
|
106 |
+
# GridSearchCV always maximizes the scorer it is given. Mean squared error
# is a loss, so it must be wrapped with greater_is_better=False (which
# negates it); otherwise the search picks the parameter set with the
# highest error.
scorer = make_scorer(mean_squared_error, greater_is_better=False)
|
107 |
+
|
108 |
+
#Define the parameter
|
109 |
+
parameters_xgb = {'n_estimators': [100, 200, 300],'max_depth': [3, 5, 7],'learning_rate': [0.01, 0.1, 0.3]}
|
110 |
+
# Perform grid search on the classifier using 'scorer' as the scoring method.
|
111 |
+
grid_obj = GridSearchCV(model_xgb, parameters_xgb, scoring=scorer)
|
112 |
+
# Run the hyperparameter search on the training split only, so that X_test
# stays unseen and the later best_reg.score(X_test, y_test) is a genuine
# held-out evaluation.
grid_fit = grid_obj.fit(X_train, y_train)
|
113 |
+
# Get the estimator.
|
114 |
+
best_reg = grid_fit.best_estimator_
|
115 |
+
|
116 |
+
# Fit the new model.
|
117 |
+
best_reg.fit(X_train, y_train)
|
118 |
+
best_reg.score(X_test, y_test)
|
119 |
+
#print(best_reg.score(X_test, y_test))
|
120 |
+
#After hyperparameter tuning we find that XGBoost had a score of 0.786
|
121 |
+
|
122 |
+
#Evaluating Model
|
123 |
+
# Generate predictions for the test set
|
124 |
+
y_pred = best_reg.predict(X_test)
|
125 |
+
|
126 |
+
# If this is a binary classification problem, you'll need the predicted probabilities for ROC-AUC
|
127 |
+
y_pred_proba = best_reg.predict_proba(X_test)[:, 1]
|
128 |
+
|
129 |
+
# Accuracy
|
130 |
+
accuracy = accuracy_score(y_test, y_pred)
|
131 |
+
# Precision
|
132 |
+
precision = precision_score(y_test, y_pred)
|
133 |
+
# Recall
|
134 |
+
recall = recall_score(y_test, y_pred)
|
135 |
+
# F1 Score
|
136 |
+
f1 = f1_score(y_test, y_pred)
|
137 |
+
# ROC-AUC Score (for binary classification)
|
138 |
+
roc_auc = roc_auc_score(y_test, y_pred_proba)
|
139 |
+
# Mean Squared Error (MSE)
|
140 |
+
mse = mean_squared_error(y_test, y_pred)
|
141 |
+
# Print the results
|
142 |
+
#print(f"Accuracy: {accuracy:.4f}")
|
143 |
+
#print(f"Precision: {precision:.4f}")
|
144 |
+
#print(f"Recall: {recall:.4f}")
|
145 |
+
#print(f"F1 Score: {f1:.4f}")
|
146 |
+
#print(f"ROC-AUC Score: {roc_auc:.4f}")
|
147 |
+
#print(f"Mean Squared Error: {mse:.4f}")
|
148 |
+
|
149 |
+
#Plotting Confusion Matrix
|
150 |
+
# Generate predictions
|
151 |
+
y_pred = best_reg.predict(X_test)
|
152 |
+
|
153 |
+
# Compute confusion matrix
|
154 |
+
cm = confusion_matrix(y_test, y_pred)
|
155 |
+
|
156 |
+
# Plot the confusion matrix
|
157 |
+
plt.figure(figsize=(12, 10))
|
158 |
+
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, xticklabels=labelencoder_y.classes_, yticklabels=labelencoder_y.classes_, annot_kws={"size": 10})
|
159 |
+
plt.xlabel('Predicted Labels')
|
160 |
+
plt.ylabel('True Labels')
|
161 |
+
plt.title('Confusion Matrix')
|
162 |
+
plt.xticks(rotation=45, fontsize=12) # Rotate x-axis labels
|
163 |
+
plt.yticks(rotation=0, fontsize=12) # Rotate y-axis labels
|
164 |
+
plt.tight_layout()
|
165 |
+
#plt.show()
|
166 |
+
#Our model is 90% accurate at predicting when True label for account = true, but inaccurate when True Label for account = false.
|
167 |
+
|
168 |
+
|
169 |
+
# Define the SHAP explainer
|
170 |
+
explainer_shap = shap.Explainer(model_xgb)
|
171 |
+
|
172 |
+
# Calculate SHAP values for test and train sets
|
173 |
+
shap_values_test = explainer_shap(X_test)
|
174 |
+
shap_values_train = explainer_shap(X_train)
|
175 |
+
|
176 |
+
# Convert SHAP values to DataFrame
|
177 |
+
df_shap_test = pd.DataFrame(shap_values_test.values, columns=sample_df.columns.drop('account'))
|
178 |
+
df_shap_train = pd.DataFrame(shap_values_train.values, columns=sample_df.columns.drop('account'))
|
179 |
+
|
180 |
+
# Display the first 10 rows of SHAP values for the test set
|
181 |
+
#print(df_shap_test.head(10))
|
182 |
+
|
183 |
+
# Identify categorical features based on the number of unique values
|
184 |
+
categorical_features = np.argwhere(np.array([len(set(X_train[:, x])) for x in range(X_train.shape[1])]) <= 10).flatten()
|
185 |
+
|
186 |
+
# Create a summary plot for SHAP values of the training set
|
187 |
+
shap.summary_plot(shap_values_train.values, X_train, feature_names=sample_df.columns.drop('account'))
|
188 |
+
|
189 |
+
joblib.dump(model_xgb, 'xgb_clf.joblib')
|
190 |
+
joblib.dump(scaler, 'scaler.joblib')
|
191 |
+
joblib.dump(labelencoder_y, 'encoder.joblib')
|
192 |
+
joblib.dump(le_country_economy, 'country_encoder.joblib')
|
193 |
+
joblib.dump(le_region, 'regionwb_encoder.joblib')
|
194 |
+
|
micro_world_139countries.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:98ee1367d02f92b04d0933584a4620516b90ed5f9c554f867fa5037f3f721f7a
|
3 |
+
size 40174289
|
model_xgb.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:71fbdbf77eb5e9f46f647ea23e12e99ed14a8238756b1be79fbfa0ba9a5eeb74
|
3 |
+
size 250917
|
regionwb_encoder.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:34fe2c5156d736829bff80daad85f151347d7e1eb073ce861e662c3778be7b6b
|
3 |
+
size 817
|
requirements.txt
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
numpy
|
2 |
+
pandas
|
3 |
+
seaborn
|
4 |
+
streamlit
|
5 |
+
scipy
|
6 |
+
altair
|
7 |
+
shap
|
8 |
+
joblib
|
9 |
+
matplotlib
|
10 |
+
scikit-learn
|
11 |
+
mlxtend
|
12 |
+
xgboost
|
scaler.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f4c9a927f87387df686cbe8868bb67b4eb4f267501cca9410438f887b50671cb
|
3 |
+
size 1343
|
xgb_clf.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:434915ea67e58cb51d6936a21ae4561695be5927fa0a7019957d86ed7ece908b
|
3 |
+
size 247246
|