CChircop committed on
Commit
e4b4b92
·
verified ·
1 Parent(s): 468c336

Upload 7 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ micro_world_139countries.csv filter=lfs diff=lfs merge=lfs -text
app4.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ import seaborn as sns
4
+ import statsmodels.api as sm
5
+ import random
6
+ import shap
7
+ import joblib
8
+ import matplotlib.pyplot as plt
9
+ from sklearn.cluster import AgglomerativeClustering
10
+ from sklearn.model_selection import train_test_split
11
+ from sklearn.preprocessing import StandardScaler
12
+ from sklearn.preprocessing import LabelEncoder
13
+ from sklearn.metrics import confusion_matrix
14
+ from sklearn.metrics import classification_report
15
+ from mlxtend.plotting import plot_confusion_matrix
16
+ from sklearn.linear_model import LogisticRegression
17
+ from sklearn.model_selection import cross_val_score
18
+ from xgboost import XGBClassifier
19
+ from sklearn.model_selection import GridSearchCV
20
+ from sklearn.metrics import make_scorer
21
+ from sklearn.metrics import mean_squared_error
22
+ from sklearn.metrics import r2_score
23
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
24
+ # Load the dataset
25
+
26
+ file_path = 'micro_world_139countries.csv'
27
+ df = pd.read_csv(file_path, encoding='ISO-8859-1')
28
+
29
+ sample_df = df[['remittances', 'educ', 'age', 'female', 'mobileowner','internetaccess', 'pay_utilities', 'receive_transfers','receive_pension', 'economy', 'regionwb','account']].sample(n=5000, random_state=42)
30
+ sample_df = sample_df.dropna(subset=['account','remittances', 'educ', 'age', 'female', 'mobileowner','internetaccess', 'pay_utilities', 'receive_transfers','receive_pension', 'economy', 'regionwb'])
31
+ print(sample_df['regionwb'].unique())  # call unique(); the bare attribute only prints the bound-method repr
32
+
33
+ le_country_economy = LabelEncoder()
34
+ sample_df['economy'] = le_country_economy.fit_transform(sample_df['economy'])#Giving unique int values to economies
35
+ le_region = LabelEncoder()
36
+ sample_df['regionwb'] = le_region.fit_transform(sample_df['regionwb'])#Unique int values to regions
37
+
38
+ X = sample_df.drop('account', axis=1)
39
+ y = sample_df['account']
40
+ labelencoder_y = LabelEncoder()
41
+ y= labelencoder_y.fit_transform(y)
42
+
43
+ scaler = StandardScaler()
44
+ X = scaler.fit_transform(X)
45
+
46
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 21)#Creating Test and Training samples, test sample = 20% of the dataset
47
+
48
+
49
+ #Creating SML Model
50
+ model = LogisticRegression()#multi_class="auto" could also work
51
+ # Fit the model to your training data
52
+ model.fit(X_train, y_train)
53
+ model.score(X_train, y_train)
54
+
55
+ true_accounts = labelencoder_y.inverse_transform(y_train)
56
+
57
+ predicted_accounts = labelencoder_y.inverse_transform(model.predict(X_train))
58
+
59
+ df = pd.DataFrame({'true_accounts': true_accounts, 'predicted_accounts': predicted_accounts})
60
+
61
+ pd.crosstab(df.true_accounts, df.predicted_accounts)
62
+ #print(classification_report(true_accounts,predicted_accounts, labels=labelencoder_y.classes_))
63
+
64
+ #print(model.score(X_test, y_test))#Final Evaluation
65
+ true_accounts = labelencoder_y.inverse_transform(y_test)
66
+ predicted_accounts = labelencoder_y.inverse_transform(model.predict(X_test))
67
+ #print(classification_report(true_accounts,predicted_accounts, labels=labelencoder_y.classes_))
68
+
69
+ model = LogisticRegression()
70
+ scores = cross_val_score(model, X, y, cv=5, scoring='accuracy') # 5-fold cross-validation
71
+ #print("Cross-validation scores: ", scores)
72
+ #print("Average cross-validation score: ", scores.mean())
73
+ #Cross-Val Score: 0.775
74
+
75
+ #Using XGBClassifier Model
76
+ model = XGBClassifier()
77
+ model.fit(X_train, y_train)
78
+ true_accounts = labelencoder_y.inverse_transform(y_train)
79
+
80
+ predicted_accounts = labelencoder_y.inverse_transform(model.predict(X_train))
81
+
82
+ df = pd.DataFrame({'true_accounts': true_accounts, 'predicted_accounts': predicted_accounts})
83
+
84
+ pd.crosstab(df.true_accounts, df.predicted_accounts)
85
+ #print(classification_report(true_accounts,predicted_accounts, labels=labelencoder_y.classes_))
86
+ #We see using training dataset XGBoost performs better with an accuracy of 97% compared to 78% of LogisticRegression.
87
+
88
+
89
+ #print(model.score(X_test, y_test))#Final Evaluation
90
+ true_accounts = labelencoder_y.inverse_transform(y_test)
91
+ predicted_accounts = labelencoder_y.inverse_transform(model.predict(X_test))
92
+ #print(classification_report(true_accounts,predicted_accounts, labels=labelencoder_y.classes_))
93
+
94
+ model = XGBClassifier()
95
+ scores = cross_val_score(model, X, y, cv=5, scoring='accuracy') # 5-fold cross-validation
96
+ #print("Cross-validation scores: ", scores)
97
+ #print("Average cross-validation score: ", scores.mean())
98
+ #Cross Val Score = 0.824
99
+ #Using Test dataset XGBoost = 83% accuracy, LogisticRegression = 79%
100
+
101
+ #Hyperparameter tuning
102
+ model_xgb = XGBClassifier()
103
+ model_xgb.fit(X_train, y_train)
104
+ #print('Model LG' + ' ' + str(model_lg.score(X_test, y_test)))
105
+ #print('Model XGB' + ' ' + str(model_xgb.score(X_test, y_test)))
106
+ scorer = make_scorer(mean_squared_error)
107
+
108
+ #Define the parameter
109
+ parameters_xgb = {'n_estimators': [100, 200, 300],'max_depth': [3, 5, 7],'learning_rate': [0.01, 0.1, 0.3]}
110
+ # Perform grid search on the classifier using 'scorer' as the scoring method.
111
+ grid_obj = GridSearchCV(model_xgb, parameters_xgb, scoring=scorer)
112
+ grid_fit = grid_obj.fit(X, y)
113
+ # Get the estimator.
114
+ best_reg = grid_fit.best_estimator_
115
+
116
+ # Fit the new model.
117
+ best_reg.fit(X_train, y_train)
118
+ best_reg.score(X_test, y_test)
119
+ #print(best_reg.score(X_test, y_test))
120
+ #After hyperparameter tuning we find that XGBoost had a score of 0.786
121
+
122
+ #Evaluating Model
123
+ # Generate predictions for the test set
124
+ y_pred = best_reg.predict(X_test)
125
+
126
+ # If this is a binary classification problem, you'll need the predicted probabilities for ROC-AUC
127
+ y_pred_proba = best_reg.predict_proba(X_test)[:, 1]
128
+
129
+ # Accuracy
130
+ accuracy = accuracy_score(y_test, y_pred)
131
+ # Precision
132
+ precision = precision_score(y_test, y_pred)
133
+ # Recall
134
+ recall = recall_score(y_test, y_pred)
135
+ # F1 Score
136
+ f1 = f1_score(y_test, y_pred)
137
+ # ROC-AUC Score (for binary classification)
138
+ roc_auc = roc_auc_score(y_test, y_pred_proba)
139
+ # Mean Squared Error (MSE)
140
+ mse = mean_squared_error(y_test, y_pred)
141
+ # Print the results
142
+ #print(f"Accuracy: {accuracy:.4f}")
143
+ #print(f"Precision: {precision:.4f}")
144
+ #print(f"Recall: {recall:.4f}")
145
+ #print(f"F1 Score: {f1:.4f}")
146
+ #print(f"ROC-AUC Score: {roc_auc:.4f}")
147
+ #print(f"Mean Squared Error: {mse:.4f}")
148
+
149
+ #Plotting Confusion Matrix
150
+ # Generate predictions
151
+ y_pred = best_reg.predict(X_test)
152
+
153
+ # Compute confusion matrix
154
+ cm = confusion_matrix(y_test, y_pred)
155
+
156
+ # Plot the confusion matrix
157
+ plt.figure(figsize=(12, 10))
158
+ sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, xticklabels=labelencoder_y.classes_, yticklabels=labelencoder_y.classes_, annot_kws={"size": 10})
159
+ plt.xlabel('Predicted Labels')
160
+ plt.ylabel('True Labels')
161
+ plt.title('Confusion Matrix')
162
+ plt.xticks(rotation=45, fontsize=12) # Rotate x-axis labels
163
+ plt.yticks(rotation=0, fontsize=12) # Rotate y-axis labels
164
+ plt.tight_layout()
165
+ #plt.show()
166
+ #Our model is 90% accurate at predicting when True label for account = true, but inaccurate when True Label for account = false.
167
+
168
+
169
+ # Define the SHAP explainer
170
+ explainer_shap = shap.Explainer(model_xgb)
171
+
172
+ # Calculate SHAP values for test and train sets
173
+ shap_values_test = explainer_shap(X_test)
174
+ shap_values_train = explainer_shap(X_train)
175
+
176
+ # Convert SHAP values to DataFrame
177
+ df_shap_test = pd.DataFrame(shap_values_test.values, columns=sample_df.columns.drop('account'))
178
+ df_shap_train = pd.DataFrame(shap_values_train.values, columns=sample_df.columns.drop('account'))
179
+
180
+ # Display the first 10 rows of SHAP values for the test set
181
+ #print(df_shap_test.head(10))
182
+
183
+ # Identify categorical features based on the number of unique values
184
+ categorical_features = np.argwhere(np.array([len(set(X_train[:, x])) for x in range(X_train.shape[1])]) <= 10).flatten()
185
+
186
+ # Create a summary plot for SHAP values of the training set
187
+ shap.summary_plot(shap_values_train.values, X_train, feature_names=sample_df.columns.drop('account'))
188
+
189
+ joblib.dump(model_xgb, 'xgb_clf.joblib')
190
+ joblib.dump(scaler, 'scaler.joblib')
191
+ joblib.dump(labelencoder_y, 'encoder.joblib')
192
+ joblib.dump(le_country_economy, 'country_encoder.joblib')
193
+ joblib.dump(le_region, 'regionwb_encoder.joblib')
194
+
micro_world_139countries.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98ee1367d02f92b04d0933584a4620516b90ed5f9c554f867fa5037f3f721f7a
3
+ size 40174289
model_xgb.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71fbdbf77eb5e9f46f647ea23e12e99ed14a8238756b1be79fbfa0ba9a5eeb74
3
+ size 250917
regionwb_encoder.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34fe2c5156d736829bff80daad85f151347d7e1eb073ce861e662c3778be7b6b
3
+ size 817
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ numpy
2
+ pandas
3
+ seaborn
4
+ streamlit
5
+ scipy
6
+ altair
7
+ shap
8
+ joblib
9
+ matplotlib
10
+ scikit-learn
11
+ mlxtend
12
+ xgboost
scaler.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4c9a927f87387df686cbe8868bb67b4eb4f267501cca9410438f887b50671cb
3
+ size 1343
xgb_clf.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:434915ea67e58cb51d6936a21ae4561695be5927fa0a7019957d86ed7ece908b
3
+ size 247246