pkiage committed on
Commit
89654ef
·
2 Parent(s): 232e5e5 7f0977b

Merge branch 'refactor-structure'

Browse files
README.md CHANGED
@@ -21,6 +21,10 @@ An interactive tool demonstrating credit risk modelling.
21
 
22
  - Selecting optimal threshold using Youden's J statistic
23
 
 
 
 
 
24
  ## Political, Economic, Social, Technological, Legal and Environmental(PESTLE):
25
 
26
  [Europe fit for the Digital Age: Commission proposes new rules and actions for excellence and trust in Artificial Intelligence](https://ec.europa.eu/commission/presscorner/detail/en/ip_21_1682)
 
21
 
22
  - Selecting optimal threshold using Youden's J statistic
23
 
24
+ [Cookiecutter Data Science](https://drivendata.github.io/cookiecutter-data-science/)
25
+
26
+ - Project structure
27
+
28
  ## Political, Economic, Social, Technological, Legal and Environmental(PESTLE):
29
 
30
  [Europe fit for the Digital Age: Commission proposes new rules and actions for excellence and trust in Artificial Intelligence](https://ec.europa.eu/commission/presscorner/detail/en/ip_21_1682)
app.py CHANGED
@@ -1,15 +1,22 @@
1
- from typing import OrderedDict
2
  import streamlit as st
3
- from data_setup import initialise_data
4
- from views.decision_tree import decisiontree_view
5
- from views.logistic import logistic_view
6
- from views.model_comparison import model_comparison_view
7
- from views.strategy_table import strategy_table_view
 
 
 
 
 
 
8
 
9
 
10
  def main():
11
  currency_options = ["USD", "KES", "GBP"]
12
 
 
 
13
  currency = st.sidebar.selectbox(
14
  label="What currency will you be using?", options=currency_options
15
  )
@@ -22,30 +29,25 @@ def main():
22
 
23
  st.title("Modelling")
24
 
25
- model_options = ["Logistic Regression", "Decision Trees"]
26
-
27
- # Returns list
28
  models_selected_list = st.sidebar.multiselect(
29
  label="Select model", options=model_options, default=model_options
30
  )
31
 
32
  models_selected_set = set(models_selected_list)
33
- model_views = OrderedDict()
34
-
35
- if "Logistic Regression" in models_selected_set:
36
- logistic_model_view = logistic_view(split_dataset, currency)
37
- model_views["Logistic Regression"] = logistic_model_view
38
-
39
- if "Decision Trees" in models_selected_set:
40
- decision_tree_model_view = decisiontree_view(split_dataset, currency)
41
- model_views["Decision Trees"] = decision_tree_model_view
42
-
43
- if models_selected_list:
44
- model_comparison_view(
45
- split_dataset,
46
- model_views,
47
- )
48
- strategy_table_view(currency, model_views)
49
 
50
 
51
  if __name__ == "__main__":
 
 
1
  import streamlit as st
2
+ from typing import OrderedDict
3
+
4
+
5
+ from src.features.build_features import initialise_data
6
+
7
+ from src.models.xgboost_model import xgboost_class
8
+ from src.models.logistic_model import logistic_class
9
+
10
+ from src.models.util_model_comparison import model_comparison_view
11
+
12
+ from src.models.util_strategy_table import strategy_table_view
13
 
14
 
15
  def main():
16
  currency_options = ["USD", "KES", "GBP"]
17
 
18
+ model_options = ["XGBoost", "Logistic"]
19
+
20
  currency = st.sidebar.selectbox(
21
  label="What currency will you be using?", options=currency_options
22
  )
 
29
 
30
  st.title("Modelling")
31
 
 
 
 
32
  models_selected_list = st.sidebar.multiselect(
33
  label="Select model", options=model_options, default=model_options
34
  )
35
 
36
  models_selected_set = set(models_selected_list)
37
+
38
+ model_classes = OrderedDict()
39
+
40
+ if "Logistic" in models_selected_set:
41
+ logistic_model_class = logistic_class(split_dataset, currency)
42
+ model_classes["Logistic"] = logistic_model_class
43
+
44
+ if "XGBoost" in models_selected_set:
45
+ xgboost_model_class = xgboost_class(split_dataset, currency)
46
+ model_classes["XGBoost"] = xgboost_model_class
47
+
48
+ model_comparison_view(split_dataset, model_classes)
49
+
50
+ strategy_table_view(currency, model_classes)
 
 
51
 
52
 
53
  if __name__ == "__main__":
common/util.py DELETED
@@ -1,391 +0,0 @@
1
- # DATA MANIPULATION & ANALYSIS
2
-
3
- import pickle
4
- import streamlit as st
5
-
6
- # Arrays
7
- import numpy as np
8
-
9
- # DataFrames and Series
10
- import pandas as pd
11
-
12
- # Returns the indices of the maximum values along an axis
13
- from numpy import argmax
14
-
15
- # MODELLING
16
-
17
- # Logistic regression
18
- from sklearn.linear_model import LogisticRegression
19
-
20
- from sklearn.model_selection import StratifiedKFold
21
-
22
- # XGBoosted Decision Trees
23
- import xgboost as xgb
24
-
25
-
26
- # REPORTING, EVALUATION, AND INTERPRETATION
27
-
28
- # Classification report
29
- from sklearn.metrics import classification_report
30
-
31
- # Reciever Operator Curve
32
- from sklearn.metrics import roc_curve
33
-
34
-
35
- # Evaluate a score by cross-validation
36
- from sklearn.model_selection import cross_val_score
37
-
38
-
39
- # # Functions
40
-
41
-
42
- def drop_columns(df, columns):
43
- return df.drop(columns, axis=1)
44
-
45
-
46
- def remove_less_than_0_columns(df, column):
47
- df[column].dropna()
48
- return df.loc[(df[column] != 0).any(1)]
49
-
50
-
51
- def boolean_int_condition_label(df, label_column_name, condition):
52
- df[label_column_name] = condition
53
- y = df[label_column_name].astype(int)
54
- df = drop_columns(df, label_column_name)
55
- return y, df
56
-
57
-
58
- @st.cache(suppress_st_warning=True)
59
- def undersample_training_data(
60
- df: pd.DataFrame, column_name: str, split_dataset
61
- ):
62
- count_nondefault, count_default = split_dataset.X_y_train[
63
- column_name
64
- ].value_counts()
65
-
66
- nondefaults = df[df[column_name] == 0] # 0
67
-
68
- defaults = df[df[column_name] == 1]
69
-
70
- under_sample = min(count_nondefault, count_default)
71
-
72
- nondefaults_under = nondefaults.sample(under_sample)
73
-
74
- defaults_under = defaults.sample(under_sample)
75
-
76
- X_y_train_under = pd.concat(
77
- [
78
- nondefaults_under.reset_index(drop=True),
79
- defaults_under.reset_index(drop=True),
80
- ],
81
- axis=0,
82
- )
83
-
84
- X_train_under = X_y_train_under.drop([column_name], axis=1) # remove label
85
-
86
- y_train_under = X_y_train_under[column_name] # label only
87
-
88
- class_balance_default = X_y_train_under[column_name].value_counts()
89
-
90
- return [
91
- X_train_under,
92
- y_train_under,
93
- X_y_train_under,
94
- class_balance_default,
95
- ]
96
-
97
-
98
- def create_coeffient_feature_dictionary_logistic_model(
99
- logistic_model, training_data
100
- ):
101
- return {
102
- feat: coef
103
- for coef, feat in zip(
104
- logistic_model.coef_[0, :], training_data.columns
105
- )
106
- }
107
-
108
-
109
- @st.cache(suppress_st_warning=True)
110
- def test_variables_logistic(X_train, y_train):
111
- # Create and fit the logistic regression model
112
- return LogisticRegression(solver="lbfgs").fit(X_train, np.ravel(y_train))
113
-
114
-
115
- @st.cache(suppress_st_warning=True)
116
- def print_coeff_logistic(clf_logistic_model, split_dataset):
117
- # Dictionary of features and their coefficients
118
- return create_coeffient_feature_dictionary_logistic_model(
119
- clf_logistic_model, split_dataset.X_train
120
- )
121
-
122
-
123
- @st.cache(suppress_st_warning=True, hash_funcs={
124
- xgb.XGBClassifier: pickle.dumps
125
- })
126
- def test_variables_gbt(X_train, y_train):
127
- # Using hyperparameters learning_rate and max_depth
128
- return xgb.XGBClassifier(
129
- learning_rate=0.1,
130
- max_depth=7,
131
- use_label_encoder=False,
132
- eval_metric="logloss",
133
- ).fit(X_train, np.ravel(y_train), eval_metric="logloss")
134
-
135
-
136
- # In[398]:
137
-
138
-
139
- def get_df_trueStatus_probabilityDefault_threshStatus_loanAmount(
140
- model, X, y, threshold, loan_amount_col_name
141
- ):
142
- true_status = y.to_frame()
143
-
144
- loan_amount = X[loan_amount_col_name]
145
-
146
- clf_prediction_prob = model.predict_proba(np.ascontiguousarray(X))
147
-
148
- clf_prediction_prob_df = pd.DataFrame(
149
- clf_prediction_prob[:, 1], columns=["PROB_DEFAULT"]
150
- )
151
-
152
- clf_thresh_predicted_default_status = (
153
- clf_prediction_prob_df["PROB_DEFAULT"]
154
- .apply(lambda x: 1 if x > threshold else 0)
155
- .rename("PREDICT_DEFAULT_STATUS")
156
- )
157
-
158
- return pd.concat(
159
- [
160
- true_status.reset_index(drop=True),
161
- clf_prediction_prob_df.reset_index(drop=True),
162
- clf_thresh_predicted_default_status.reset_index(drop=True),
163
- loan_amount.reset_index(drop=True),
164
- ],
165
- axis=1,
166
- )
167
-
168
-
169
- def find_best_threshold_J_statistic(y, clf_prediction_prob_df):
170
- fpr, tpr, thresholds = roc_curve(y, clf_prediction_prob_df)
171
- # get the best threshold
172
- # Youden’s J statistic tpr-fpr
173
- # Argmax to get the index in
174
- # thresholds
175
- return thresholds[argmax(tpr - fpr)]
176
-
177
-
178
- # In[399]:
179
-
180
-
181
- # Function that makes dataframe with probability of default, predicted default status based on threshold
182
- # and actual default status
183
-
184
-
185
- def model_probability_values_df(model, X):
186
- return pd.DataFrame(model.predict_proba(X)[:, 1], columns=["PROB_DEFAULT"])
187
-
188
-
189
- def apply_threshold_to_probability_values(probability_values, threshold):
190
- return (
191
- probability_values["PROB_DEFAULT"]
192
- .apply(lambda x: 1 if x > threshold else 0)
193
- .rename("PREDICT_DEFAULT_STATUS")
194
- )
195
-
196
-
197
- @st.cache(suppress_st_warning=True)
198
- def find_best_threshold_J_statistic(y, clf_prediction_prob_df):
199
- fpr, tpr, thresholds = roc_curve(y, clf_prediction_prob_df)
200
- # get the best threshold
201
- J = tpr - fpr # Youden’s J statistic
202
- ix = argmax(J)
203
- return thresholds[ix]
204
-
205
-
206
- # In[401]:
207
-
208
-
209
- def create_cross_validation_df(
210
- X, y, eval_metric, seed, trees, n_folds, early_stopping_rounds
211
- ):
212
- # Test data x and y
213
- DTrain = xgb.DMatrix(X, label=y)
214
-
215
- # auc or logloss
216
- params = {
217
- "eval_metric": eval_metric,
218
- "objective": "binary:logistic", # logistic say 0 or 1 for loan status
219
- "seed": seed,
220
- }
221
-
222
- # Create the data frame of cross validations
223
- cv_df = xgb.cv(
224
- params,
225
- DTrain,
226
- num_boost_round=trees,
227
- nfold=n_folds,
228
- early_stopping_rounds=early_stopping_rounds,
229
- shuffle=True,
230
- )
231
-
232
- return [DTrain, cv_df]
233
-
234
-
235
- # In[450]:
236
-
237
-
238
- def cross_validation_scores(model, X, y, nfold, score, seed):
239
- # return cv scores of metric
240
- return cross_val_score(
241
- model,
242
- np.ascontiguousarray(X),
243
- np.ravel(np.ascontiguousarray(y)),
244
- cv=StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed),
245
- scoring=score,
246
- )
247
-
248
-
249
- def default_status_per_threshold(threshold_list, prob_default):
250
- threshold_default_status_list = []
251
- for threshold in threshold_list:
252
- threshold_default_status = prob_default.apply(
253
- lambda x: 1 if x > threshold else 0
254
- )
255
- threshold_default_status_list.append(threshold_default_status)
256
- return threshold_default_status_list
257
-
258
-
259
- def classification_report_per_threshold(
260
- threshold_list, threshold_default_status_list, y_test
261
- ):
262
- target_names = ["Non-Default", "Default"]
263
- classification_report_list = []
264
- for threshold_default_status in threshold_default_status_list:
265
- thresh_classification_report = classification_report(
266
- y_test,
267
- threshold_default_status,
268
- target_names=target_names,
269
- output_dict=True,
270
- zero_division=0,
271
- )
272
- classification_report_list.append(thresh_classification_report)
273
- # Return threshold classification report dict
274
- return dict(zip(threshold_list, classification_report_list))
275
-
276
-
277
- def thresh_classification_report_recall_accuracy(
278
- thresh_classification_report_dict,
279
- ):
280
- thresh_def_recalls_list = []
281
- thresh_nondef_recalls_list = []
282
- thresh_accs_list = []
283
- for x in [*thresh_classification_report_dict]:
284
- thresh_def_recall = thresh_classification_report_dict[x]["Default"][
285
- "recall"
286
- ]
287
- thresh_def_recalls_list.append(thresh_def_recall)
288
- thresh_nondef_recall = thresh_classification_report_dict[x][
289
- "Non-Default"
290
- ]["recall"]
291
- thresh_nondef_recalls_list.append(thresh_nondef_recall)
292
- thresh_accs = thresh_classification_report_dict[x]["accuracy"]
293
- thresh_accs_list.append(thresh_accs)
294
- return [
295
- thresh_def_recalls_list,
296
- thresh_nondef_recalls_list,
297
- thresh_accs_list,
298
- ]
299
-
300
-
301
- def create_accept_rate_list(start, end, samples):
302
- return np.linspace(start, end, samples, endpoint=True)
303
-
304
-
305
- def create_strategyTable_df(
306
- start, end, samples, actual_probability_predicted_acc_rate, true, currency
307
- ):
308
- accept_rates = create_accept_rate_list(start, end, samples)
309
- thresholds_strat = []
310
- bad_rates_start = []
311
- Avg_Loan_Amnt = actual_probability_predicted_acc_rate[true].mean()
312
- num_accepted_loans_start = []
313
-
314
- for rate in accept_rates:
315
- # Calculate the threshold for the acceptance rate
316
- thresh = np.quantile(
317
- actual_probability_predicted_acc_rate["PROB_DEFAULT"], rate
318
- ).round(3)
319
- # Add the threshold value to the list of thresholds
320
- thresholds_strat.append(
321
- np.quantile(
322
- actual_probability_predicted_acc_rate["PROB_DEFAULT"], rate
323
- ).round(3)
324
- )
325
-
326
- # Reassign the loan_status value using the threshold
327
- actual_probability_predicted_acc_rate[
328
- "PREDICT_DEFAULT_STATUS"
329
- ] = actual_probability_predicted_acc_rate["PROB_DEFAULT"].apply(
330
- lambda x: 1 if x > thresh else 0
331
- )
332
-
333
- # Create a set of accepted loans using this acceptance rate
334
- accepted_loans = actual_probability_predicted_acc_rate[
335
- actual_probability_predicted_acc_rate["PREDICT_DEFAULT_STATUS"]
336
- == 0
337
- ]
338
- # Calculate and append the bad rate using the acceptance rate
339
- bad_rates_start.append(
340
- np.sum((accepted_loans[true]) / len(accepted_loans[true])).round(3)
341
- )
342
- # Accepted loans
343
- num_accepted_loans_start.append(len(accepted_loans))
344
-
345
- # Calculate estimated value
346
- money_accepted_loans = [
347
- accepted_loans * Avg_Loan_Amnt
348
- for accepted_loans in num_accepted_loans_start
349
- ]
350
-
351
- money_bad_accepted_loans = [
352
- 2 * money_accepted_loan * bad_rate
353
- for money_accepted_loan, bad_rate in zip(
354
- money_accepted_loans, bad_rates_start
355
- )
356
- ]
357
-
358
- zip_object = zip(money_accepted_loans, money_bad_accepted_loans)
359
- estimated_value = [
360
- money_accepted_loan - money_bad_accepted_loan
361
- for money_accepted_loan, money_bad_accepted_loan in zip_object
362
- ]
363
-
364
- accept_rates = ["{:.2f}".format(elem) for elem in accept_rates]
365
-
366
- thresholds_strat = ["{:.2f}".format(elem) for elem in thresholds_strat]
367
-
368
- bad_rates_start = ["{:.2f}".format(elem) for elem in bad_rates_start]
369
-
370
- estimated_value = ["{:.2f}".format(elem) for elem in estimated_value]
371
-
372
- return (
373
- pd.DataFrame(
374
- zip(
375
- accept_rates,
376
- thresholds_strat,
377
- bad_rates_start,
378
- num_accepted_loans_start,
379
- estimated_value,
380
- ),
381
- columns=[
382
- "Acceptance Rate",
383
- "Threshold",
384
- "Bad Rate",
385
- "Num Accepted Loans",
386
- f"Estimated Value ({currency})",
387
- ],
388
- )
389
- .sort_values(by="Acceptance Rate", axis=0, ascending=False)
390
- .reset_index(drop=True)
391
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
common/views.py DELETED
@@ -1,361 +0,0 @@
1
- from typing import OrderedDict
2
- import streamlit as st # works on command prompt
3
- import matplotlib.pyplot as plt
4
- import numpy as np
5
- import pandas as pd
6
- import xgboost as xgb
7
- from sklearn.metrics import (
8
- roc_curve,
9
- )
10
- from sklearn.calibration import calibration_curve
11
- from xgboost import plot_tree
12
- from views.typing import ModelView
13
-
14
-
15
- def plot_logistic_coeff_barh(coef_dict, x, y):
16
- fig = plt.figure(figsize=(x, y))
17
- coef_dict_sorted = dict(
18
- sorted(coef_dict.items(), key=lambda item: item[1], reverse=False)
19
- )
20
- plt.barh(*zip(*coef_dict_sorted.items()))
21
- return fig
22
-
23
-
24
- def print_negative_coefficients_logistic_model(coef_dict):
25
- # Equal to or less than 0
26
- NegativeCoefficients = dict(
27
- filter(lambda x: x[1] <= 0.0, coef_dict.items())
28
- )
29
-
30
- NegativeCoefficientsSorted = sorted(
31
- NegativeCoefficients.items(), key=lambda x: x[1], reverse=False
32
- )
33
- text = (
34
- "\n\nFeatures the model found to be negatively correlated with probability of default are:"
35
- "\n{negative_features}:"
36
- )
37
- st.markdown(text.format(negative_features=NegativeCoefficientsSorted))
38
- st.markdown(type(NegativeCoefficientsSorted))
39
- st.markdown(NegativeCoefficients.items())
40
-
41
-
42
- def print_positive_coefficients_logistic_model(coef_dict):
43
- # Equal to or greater than 0
44
- PositiveCoefficients = dict(
45
- filter(lambda x: x[1] >= 0.0, coef_dict.items())
46
- )
47
-
48
- PositiveCoefficientsSorted = sorted(
49
- PositiveCoefficients.items(), key=lambda x: x[1], reverse=True
50
- )
51
- text = (
52
- "\n\nFeatures the model found to be positively correlated with probability of default are:"
53
- "\n{positive_features}:"
54
- )
55
- st.markdown(text.format(positive_features=PositiveCoefficientsSorted))
56
-
57
-
58
- def plot_importance_gbt(clf_gbt_model, barxsize, barysize):
59
- axobject1 = xgb.plot_importance(clf_gbt_model, importance_type="weight")
60
- fig1 = axobject1.figure
61
- st.write("Feature Importance Plot (Gradient Boosted Tree)")
62
- fig1.set_size_inches(barxsize, barysize)
63
- return fig1
64
-
65
-
66
- def download_importance_gbt(fig1, barxsize, barysize):
67
- if st.button(
68
- "Download Feature Importance Plot as png (Gradient Boosted Tree)"
69
- ):
70
- dpisize = max(barxsize, barysize)
71
- plt.savefig("bar.png", dpi=dpisize * 96, bbox_inches="tight")
72
- fig1.set_size_inches(barxsize, barysize)
73
-
74
-
75
- def plot_tree_gbt(treexsize, treeysize, clf_gbt_model):
76
- plot_tree(clf_gbt_model)
77
- fig2 = plt.gcf()
78
- fig2.set_size_inches(treexsize, treeysize)
79
- return fig2
80
-
81
-
82
- def download_tree_gbt(treexsize, treeysize):
83
- if st.button("Download Decision Tree Plot as png (Gradient Boosted Tree)"):
84
- dpisize = max(treexsize, treeysize)
85
- plt.savefig("tree.png", dpi=dpisize * 96, bbox_inches="tight")
86
-
87
-
88
- def cross_validation_graph(cv, eval_metric, trees):
89
-
90
- # Plot the test AUC scores for each iteration
91
- fig = plt.figure()
92
- plt.plot(cv[cv.columns[2]])
93
- plt.title(
94
- "Test {eval_metric} Score Over {it_numbr} Iterations".format(
95
- eval_metric=eval_metric, it_numbr=trees
96
- )
97
- )
98
- plt.xlabel("Iteration Number")
99
- plt.ylabel("Test {eval_metric} Score".format(eval_metric=eval_metric))
100
- return fig
101
-
102
-
103
- def recall_accuracy_threshold_tradeoff_fig(
104
- widthsize,
105
- heightsize,
106
- threshold_list,
107
- thresh_def_recalls_list,
108
- thresh_nondef_recalls_list,
109
- thresh_accs_list,
110
- ):
111
- fig = plt.figure(figsize=(widthsize, heightsize))
112
- plt.plot(threshold_list, thresh_def_recalls_list, label="Default Recall")
113
- plt.plot(
114
- threshold_list, thresh_nondef_recalls_list, label="Non-Default Recall"
115
- )
116
- plt.plot(threshold_list, thresh_accs_list, label="Model Accuracy")
117
- plt.xlabel("Probability Threshold")
118
- plt.ylabel("Score")
119
- plt.xlim(0, 1)
120
- plt.ylim(0, 1)
121
- plt.legend()
122
- plt.title("Recall and Accuracy Score Tradeoff with Probability Threshold")
123
- plt.grid(False)
124
- return fig
125
-
126
-
127
- def roc_auc_compare_n_models(y, model_views: OrderedDict[str, ModelView]):
128
- colors = ["blue", "green"]
129
- fig = plt.figure()
130
- for color_idx, (model_name, model_view) in enumerate(model_views.items()):
131
- fpr, tpr, _thresholds = roc_curve(
132
- y, model_view.prediction_probability_df
133
- )
134
- plt.plot(fpr, tpr, color=colors[color_idx], label=f"{model_name}")
135
- plt.plot([0, 1], [0, 1], linestyle="--", label="Random Prediction")
136
- model_names = list(model_views.keys())
137
- if not model_names:
138
- model_name_str = "None"
139
- elif len(model_names) == 1:
140
- model_name_str = model_names[0]
141
- else:
142
- model_name_str = " and ".join(
143
- [", ".join(model_names[:-1]), model_names[-1]]
144
- )
145
- plt.title(f"ROC Chart for {model_name_str} on the Probability of Default")
146
- plt.xlabel("False Positive Rate (FP Rate)")
147
- plt.ylabel("True Positive Rate (TP Rate)")
148
- plt.legend()
149
- plt.grid(False)
150
- plt.xlim(0, 1)
151
- plt.ylim(0, 1)
152
- return fig
153
-
154
-
155
- def calibration_curve_report_commented_n(
156
- y, model_views: OrderedDict[str, ModelView], bins: int
157
- ):
158
- fig = plt.figure()
159
- for model_name, model_view in model_views.items():
160
- frac_of_pos, mean_pred_val = calibration_curve(
161
- y,
162
- model_view.prediction_probability_df,
163
- n_bins=bins,
164
- normalize=True,
165
- )
166
- plt.plot(mean_pred_val, frac_of_pos, "s-", label=f"{model_name}")
167
-
168
- # Create the calibration curve plot with the guideline
169
- plt.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
170
-
171
- plt.ylabel("Fraction of positives")
172
- plt.xlabel("Average Predicted Probability")
173
- plt.title("Calibration Curve")
174
- plt.legend()
175
- plt.grid(False)
176
- plt.xlim(0, 1)
177
- plt.ylim(0, 1)
178
- return fig
179
-
180
-
181
- def acceptance_rate_threshold_fig(probability_default, acceptancerate, bins):
182
- # Probability distribution
183
- probability_stat_distribution = probability_default.describe()
184
-
185
- # Acceptance rate threshold
186
- acc_rate_thresh = np.quantile(probability_default, acceptancerate)
187
- fig = plt.figure()
188
-
189
- plt.hist(
190
- probability_default,
191
- color="blue",
192
- bins=bins,
193
- histtype="bar",
194
- ec="white",
195
- )
196
-
197
- # Add a reference line to the plot for the threshold
198
- plt.axvline(x=acc_rate_thresh, color="red")
199
- plt.title("Acceptance Rate Thershold")
200
-
201
- return (
202
- fig,
203
- probability_stat_distribution,
204
- acc_rate_thresh,
205
- )
206
-
207
-
208
- def streamlit_2columns_metrics_pct_df(
209
- column1name_label: str,
210
- column2name_label: str,
211
- df: pd.DataFrame,
212
- ):
213
- (
214
- column1name,
215
- column2name,
216
- ) = st.columns(2)
217
-
218
- with column1name:
219
- st.metric(
220
- label=column1name_label,
221
- value="{:.0%}".format(df.value_counts().get(1) / df.shape[0]),
222
- delta=None,
223
- delta_color="normal",
224
- )
225
-
226
- with column2name:
227
- st.metric(
228
- label=column2name_label,
229
- value="{:.0%}".format(df.value_counts().get(0) / df.shape[0]),
230
- delta=None,
231
- delta_color="normal",
232
- )
233
-
234
-
235
- def streamlit_2columns_metrics_df(
236
- column1name_label: str,
237
- column2name_label: str,
238
- df: pd.DataFrame,
239
- ):
240
- (
241
- column1name,
242
- column2name,
243
- ) = st.columns(2)
244
-
245
- with column1name:
246
- st.metric(
247
- label=column1name_label,
248
- value=df.value_counts().get(1),
249
- delta=None,
250
- delta_color="normal",
251
- )
252
-
253
- with column2name:
254
- st.metric(
255
- label=column2name_label,
256
- value=df.value_counts().get(0),
257
- delta=None,
258
- delta_color="normal",
259
- )
260
-
261
-
262
- def streamlit_2columns_metrics_df_shape(df: pd.DataFrame):
263
- (
264
- column1name,
265
- column2name,
266
- ) = st.columns(2)
267
-
268
- with column1name:
269
- st.metric(
270
- label="Rows",
271
- value=df.shape[0],
272
- delta=None,
273
- delta_color="normal",
274
- )
275
-
276
- with column2name:
277
- st.metric(
278
- label="Columns",
279
- value=df.shape[1],
280
- delta=None,
281
- delta_color="normal",
282
- )
283
-
284
-
285
- def streamlit_2columns_metrics_pct_series(
286
- column1name_label: str,
287
- column2name_label: str,
288
- series: pd.Series,
289
- ):
290
- (
291
- column1name,
292
- column2name,
293
- ) = st.columns(2)
294
- with column1name:
295
- st.metric(
296
- label=column1name_label,
297
- value="{:.0%}".format(series.get(1) / series.sum()),
298
- delta=None,
299
- delta_color="normal",
300
- )
301
-
302
- with column2name:
303
- st.metric(
304
- label=column2name_label,
305
- value="{:.0%}".format(series.get(0) / series.sum()),
306
- delta=None,
307
- delta_color="normal",
308
- )
309
-
310
-
311
- def streamlit_2columns_metrics_series(
312
- column1name_label: str,
313
- column2name_label: str,
314
- series: pd.Series,
315
- ):
316
- (
317
- column1name,
318
- column2name,
319
- ) = st.columns(2)
320
- with column1name:
321
- st.metric(
322
- label=column1name_label,
323
- value=series.get(1),
324
- delta=None,
325
- delta_color="normal",
326
- )
327
-
328
- with column2name:
329
- st.metric(
330
- label=column2name_label,
331
- value=series.get(0),
332
- delta=None,
333
- delta_color="normal",
334
- )
335
-
336
-
337
- def streamlit_chart_setting_height_width(
338
- title: str,
339
- default_widthvalue: int,
340
- default_heightvalue: int,
341
- widthkey: str,
342
- heightkey: str,
343
- ):
344
- with st.expander(title):
345
-
346
- lbarx_col, lbary_col = st.columns(2)
347
-
348
- with lbarx_col:
349
- width_size = st.number_input(
350
- label="Width in inches:",
351
- value=default_widthvalue,
352
- key=widthkey,
353
- )
354
-
355
- with lbary_col:
356
- height_size = st.number_input(
357
- label="Height in inches:",
358
- value=default_heightvalue,
359
- key=heightkey,
360
- )
361
- return width_size, height_size
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
{common → src}/__init__.py RENAMED
File without changes
{views → src/features}/__init__.py RENAMED
File without changes
data_setup.py → src/features/build_features.py RENAMED
@@ -1,13 +1,19 @@
1
- from typing import Tuple, cast
2
-
 
3
  import pandas as pd
 
4
  import streamlit as st
5
 
6
- from common.data import Dataset, SplitDataset
7
- from common.util import (
 
 
8
  undersample_training_data,
9
- )
10
- from common.views import (
 
 
11
  streamlit_2columns_metrics_df_shape,
12
  streamlit_2columns_metrics_series,
13
  streamlit_2columns_metrics_pct_series,
@@ -16,22 +22,9 @@ from common.views import (
16
  )
17
 
18
 
19
- # Initialize dataframe session state
20
  def initialise_data() -> Tuple[Dataset, SplitDataset]:
21
- if "input_data_frame" not in st.session_state:
22
- st.session_state.input_data_frame = pd.read_csv(
23
- r"./data/processed/cr_loan_w2.csv"
24
- )
25
- if "dataset" not in st.session_state:
26
- df = cast(pd.DataFrame, st.session_state.input_data_frame)
27
- dataset = Dataset(
28
- df=df,
29
- random_state=123235,
30
- test_size=40,
31
- )
32
- st.session_state.dataset = dataset
33
- else:
34
- dataset = st.session_state.dataset
35
 
36
  st.write(
37
  "Assuming data is already cleaned and relevant features (predictors) added."
@@ -41,31 +34,12 @@ def initialise_data() -> Tuple[Dataset, SplitDataset]:
41
  st.dataframe(dataset.df)
42
  streamlit_2columns_metrics_df_shape(dataset.df)
43
 
44
- st.header("Predictors")
45
 
46
- possible_columns = dataset.x_values_column_names
47
-
48
- selected_columns = st.sidebar.multiselect(
49
- label="Select Predictors",
50
- options=possible_columns,
51
- default=possible_columns,
52
- )
53
-
54
- selected_x_values = dataset.x_values_filtered_columns(selected_columns)
55
-
56
- st.sidebar.metric(
57
- label="# of Predictors Selected",
58
- value=selected_x_values.shape[1],
59
- delta=None,
60
- delta_color="normal",
61
- )
62
  with st.expander("Predictors Dataframe (X)"):
63
  st.dataframe(selected_x_values)
64
  streamlit_2columns_metrics_df_shape(selected_x_values)
65
 
66
- # 40% of data used for training
67
- # 14321 as random seed for reproducability
68
-
69
  st.header("Split Testing and Training Data")
70
 
71
  test_size_slider_col, seed_col = st.columns(2)
@@ -88,7 +62,6 @@ def initialise_data() -> Tuple[Dataset, SplitDataset]:
88
 
89
  split_dataset = dataset.train_test_split(selected_x_values)
90
 
91
- # Series
92
  true_status = split_dataset.y_test.to_frame().value_counts()
93
 
94
  st.sidebar.metric(
 
1
+ from typing import List, Union, cast, Tuple
2
+ from dataclasses import dataclass
3
+ from sklearn.model_selection import train_test_split
4
  import pandas as pd
5
+
6
  import streamlit as st
7
 
8
+
9
+ from src.features.util_build_features import (
10
+ Dataset,
11
+ SplitDataset,
12
  undersample_training_data,
13
+ select_predictors,
14
+ import_data)
15
+
16
+ from src.visualization.metrics import (
17
  streamlit_2columns_metrics_df_shape,
18
  streamlit_2columns_metrics_series,
19
  streamlit_2columns_metrics_pct_series,
 
22
  )
23
 
24
 
 
25
  def initialise_data() -> Tuple[Dataset, SplitDataset]:
26
+
27
+ dataset = import_data()
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
  st.write(
30
  "Assuming data is already cleaned and relevant features (predictors) added."
 
34
  st.dataframe(dataset.df)
35
  streamlit_2columns_metrics_df_shape(dataset.df)
36
 
37
+ selected_x_values = select_predictors(dataset)
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  with st.expander("Predictors Dataframe (X)"):
40
  st.dataframe(selected_x_values)
41
  streamlit_2columns_metrics_df_shape(selected_x_values)
42
 
 
 
 
43
  st.header("Split Testing and Training Data")
44
 
45
  test_size_slider_col, seed_col = st.columns(2)
 
62
 
63
  split_dataset = dataset.train_test_split(selected_x_values)
64
 
 
65
  true_status = split_dataset.y_test.to_frame().value_counts()
66
 
67
  st.sidebar.metric(
common/data.py → src/features/util_build_features.py RENAMED
@@ -1,9 +1,116 @@
 
 
1
  from typing import List, Union, cast
 
2
  from dataclasses import dataclass
 
3
  from sklearn.model_selection import train_test_split
 
4
  import pandas as pd
5
 
6
- from common.util import drop_columns
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
 
9
  @dataclass
@@ -92,3 +199,91 @@ class Dataset:
92
  y_train=cast(pd.Series, y_train),
93
  y_test=cast(pd.Series, y_test),
94
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
  from typing import List, Union, cast
4
+
5
  from dataclasses import dataclass
6
+
7
  from sklearn.model_selection import train_test_split
8
+
9
  import pandas as pd
10
 
11
+
12
@dataclass
class SplitDataset:
    """Train/test partition of the predictors (X) and the target (y)."""

    X_test: pd.DataFrame
    X_train: pd.DataFrame
    y_test: pd.Series
    y_train: pd.Series

    @staticmethod
    def _side_by_side(features, target) -> pd.DataFrame:
        """Join *features* and *target* column-wise after renumbering rows.

        Both inputs get a fresh 0..n-1 index first so that the concatenation
        pairs rows by position rather than by their original labels.
        """
        pieces = cast(
            List[Union[pd.DataFrame, pd.Series]],
            [features.reset_index(drop=True), target.reset_index(drop=True)],
        )
        return pd.concat(pieces, axis=1)

    @property
    def X_y_test(self) -> pd.DataFrame:
        """Test predictors with the test target appended as the last column."""
        return self._side_by_side(self.X_test, self.y_test)

    @property
    def X_y_train(self) -> pd.DataFrame:
        """Training predictors with the training target appended last."""
        return self._side_by_side(self.X_train, self.y_train)
44
+
45
+
46
@dataclass
class Dataset:
    """A loan DataFrame together with the settings used to split it."""

    # Full input table; must contain a "loan_status" column (read by y_value).
    df: pd.DataFrame
    # Seed forwarded to sklearn's train_test_split.
    random_state: int
    # Share of rows held out for testing, expressed as a percentage
    # (train_test_split below divides it by 100).
    test_size: int

    @property
    def y_value(self) -> pd.Series:
        """Return the ``loan_status`` column of ``df`` (the target)."""
        return self.df["loan_status"]

    @property
    def x_values(self) -> pd.DataFrame:
        """Return ``df`` without the target and the ``loan_grade_*`` columns."""
        return cast(
            pd.DataFrame,
            drop_columns(
                self.df,
                [
                    "loan_status",
                    "loan_grade_A",
                    "loan_grade_B",
                    "loan_grade_C",
                    "loan_grade_D",
                    "loan_grade_E",
                    "loan_grade_F",
                    "loan_grade_G",
                ],
            ),
        )

    @property
    def x_values_column_names(self) -> list:
        """Return the column labels of ``x_values`` as a list."""
        return self.x_values.columns.tolist()

    def x_values_filtered_columns(self, columns: List[str]) -> pd.DataFrame:
        """Return the subset of ``df`` restricted to the given column labels."""
        return self.df.filter(columns)

    def train_test_split(
        self, selected_x_values: pd.DataFrame
    ) -> SplitDataset:
        """Split the selected predictors and ``y_value`` into train/test parts.

        The call inside the body resolves to the module-level
        ``train_test_split`` imported from sklearn, not to this method,
        because method bodies do not look names up in the class scope.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            selected_x_values,
            self.y_value,
            test_size=self.test_size / 100,  # test_size is stored as a percentage
            random_state=self.random_state,
        )

        return SplitDataset(
            X_train=cast(pd.DataFrame, X_train),
            X_test=cast(pd.DataFrame, X_test),
            y_train=cast(pd.Series, y_train),
            y_test=cast(pd.Series, y_test),
        )
98
+
99
+
100
def drop_columns(df, columns):
    """Return a new DataFrame equal to *df* minus the given column label(s)."""
    return df.drop(columns=columns)
102
+
103
+
104
def remove_less_than_0_columns(df, column):
    """Return the rows of *df* where at least one selected column is non-zero.

    Args:
        df: DataFrame to filter; it is not modified.
        column: A single column label or a list-like of column labels.

    Returns:
        The rows of *df* for which any selected column differs from 0.
        Missing values compare as "not equal to 0", so such rows are kept.

    NOTE(review): despite the name, the test is ``!= 0`` rather than ``< 0``;
    the comparison is kept as found - confirm the intended rule.
    """
    # Accept one label as well as a collection of labels; a tuple is treated
    # as a single (possibly MultiIndex) label, as plain ``df[...]`` would.
    if isinstance(column, (str, tuple)) or not pd.api.types.is_list_like(column):
        selected = [column]
    else:
        selected = list(column)

    # ``axis`` is passed by keyword: recent pandas rejects it positionally.
    keep = (df[selected] != 0).any(axis=1)
    return df.loc[keep]
107
+
108
+
109
def boolean_int_condition_label(df, label_column_name, condition):
    """Turn *condition* into a 0/1 label aligned with *df*.

    Args:
        df: Source DataFrame; it is left untouched.
        label_column_name: Name given to the label series. A column of that
            name, if present in *df*, is absent from the returned frame.
        condition: Values assignable as a DataFrame column (for example a
            boolean Series) that are castable to ``int``.

    Returns:
        ``(y, df_without_label)``: the integer label series and a copy of
        *df* that does not contain the label column.
    """
    # Work on a copy so the caller's frame does not silently gain a column.
    labelled = df.copy()
    labelled[label_column_name] = condition
    y = labelled[label_column_name].astype(int)
    return y, labelled.drop(label_column_name, axis=1)
114
 
115
 
116
  @dataclass
 
199
  y_train=cast(pd.Series, y_train),
200
  y_test=cast(pd.Series, y_test),
201
  )
202
+
203
+
204
def drop_columns(df, columns):
    """Return a new DataFrame equal to *df* minus the given column label(s)."""
    return df.drop(columns=columns)
206
+
207
+
208
def remove_less_than_0_columns(df, column):
    """Return the rows of *df* where at least one selected column is non-zero.

    Args:
        df: DataFrame to filter; it is not modified.
        column: A single column label or a list-like of column labels.

    Returns:
        The rows of *df* for which any selected column differs from 0.
        Missing values compare as "not equal to 0", so such rows are kept.

    NOTE(review): despite the name, the test is ``!= 0`` rather than ``< 0``;
    the comparison is kept as found - confirm the intended rule.
    """
    # Accept one label as well as a collection of labels; a tuple is treated
    # as a single (possibly MultiIndex) label, as plain ``df[...]`` would.
    if isinstance(column, (str, tuple)) or not pd.api.types.is_list_like(column):
        selected = [column]
    else:
        selected = list(column)

    # ``axis`` is passed by keyword: recent pandas rejects it positionally.
    keep = (df[selected] != 0).any(axis=1)
    return df.loc[keep]
211
+
212
+
213
def boolean_int_condition_label(df, label_column_name, condition):
    """Turn *condition* into a 0/1 label aligned with *df*.

    Args:
        df: Source DataFrame; it is left untouched.
        label_column_name: Name given to the label series. A column of that
            name, if present in *df*, is absent from the returned frame.
        condition: Values assignable as a DataFrame column (for example a
            boolean Series) that are castable to ``int``.

    Returns:
        ``(y, df_without_label)``: the integer label series and a copy of
        *df* that does not contain the label column.
    """
    # Work on a copy so the caller's frame does not silently gain a column.
    labelled = df.copy()
    labelled[label_column_name] = condition
    y = labelled[label_column_name].astype(int)
    return y, labelled.drop(label_column_name, axis=1)
218
+
219
+
220
@st.cache(suppress_st_warning=True)
def undersample_training_data(
    df: pd.DataFrame, column_name: str, split_dataset
):
    """Balance the two classes of *df* by random undersampling.

    Args:
        df: Frame holding predictors and the label column *column_name*;
            label 0 is treated as non-default and 1 as default.
        column_name: Name of the label column.
        split_dataset: Object whose ``X_y_train`` frame supplies the class
            counts that determine the per-class sample size.

    Returns:
        ``[X_train_under, y_train_under, X_y_train_under,
        class_balance_default]``: predictors, labels, the combined balanced
        frame, and the label value counts of that frame.

    The sampling is unseeded, so the selected rows differ between
    (uncached) executions.
    """
    class_counts = split_dataset.X_y_train[column_name].value_counts()
    # Size of the rarest class. Taking the minimum does not depend on the
    # order in which value_counts() happens to list the classes.
    under_sample = class_counts.min()

    nondefaults = df[df[column_name] == 0]
    defaults = df[df[column_name] == 1]

    nondefaults_under = nondefaults.sample(under_sample)
    defaults_under = defaults.sample(under_sample)

    # ignore_index gives the stacked frame one unique 0..n-1 index instead of
    # two overlapping 0..k-1 ranges.
    X_y_train_under = pd.concat(
        [nondefaults_under, defaults_under],
        axis=0,
        ignore_index=True,
    )

    X_train_under = X_y_train_under.drop([column_name], axis=1)  # predictors only
    y_train_under = X_y_train_under[column_name]  # label only
    class_balance_default = X_y_train_under[column_name].value_counts()

    return [
        X_train_under,
        y_train_under,
        X_y_train_under,
        class_balance_default,
    ]
258
+
259
+
260
def select_predictors(dataset):
    """Let the user choose predictor columns in the sidebar.

    Renders a "Predictors" header and a sidebar multiselect that offers
    every name in ``dataset.x_values_column_names`` (all pre-selected), then
    returns ``dataset.x_values_filtered_columns`` for the chosen names.
    """
    st.header("Predictors")

    candidates = dataset.x_values_column_names
    chosen = st.sidebar.multiselect(
        label="Select Predictors",
        options=candidates,
        default=candidates,
    )

    return dataset.x_values_filtered_columns(chosen)
271
+
272
+
273
def import_data():
    """Return the session's ``Dataset``, creating it on first use.

    The CSV is read once and kept in ``st.session_state.input_data_frame``;
    the ``Dataset`` built from it is kept in ``st.session_state.dataset`` and
    returned unchanged on later calls.
    """
    state = st.session_state

    if "input_data_frame" not in state:
        state.input_data_frame = pd.read_csv(
            r"./data/processed/cr_loan_w2.csv"
        )

    # Reuse the dataset stored earlier in this session, if any.
    if "dataset" in state:
        return state.dataset

    dataset = Dataset(
        df=cast(pd.DataFrame, state.input_data_frame),
        random_state=123235,
        test_size=40,
    )
    state.dataset = dataset
    return dataset
src/models/__init__.py ADDED
File without changes
src/models/logistic_model.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.features.build_features import SplitDataset
2
+
3
+ from src.models.logistic_train_model import logistic_train_model
4
+ from src.models.logistic_predict_model import logistic_predict_model
5
+ from src.models.logistic_test_model import logistic_test_model
6
+
7
+ from src.models.util_model_class import ModelClass
8
+
9
+
10
def logistic_class(split_dataset: SplitDataset, currency: str) -> ModelClass:
    """Run the logistic pipeline (train, predict, test) and bundle the results.

    The three steps run in that order; the threshold and predicted statuses
    produced by the prediction step are fed into the test step and also
    stored on the returned ``ModelClass``.
    """
    # Train
    trained_model = logistic_train_model(split_dataset)

    # Predict with the trained model
    predictions = logistic_predict_model(trained_model, split_dataset)

    # Test and evaluate
    evaluation_df = logistic_test_model(
        trained_model,
        split_dataset,
        currency,
        predictions.probability_threshold_selected,
        predictions.predicted_default_status,
    )

    return ModelClass(
        model=trained_model,
        trueStatus_probabilityDefault_threshStatus_loanAmount_df=evaluation_df,
        probability_threshold_selected=predictions.probability_threshold_selected,
        predicted_default_status=predictions.predicted_default_status,
        prediction_probability_df=predictions.prediction_probability_df,
    )
src/models/logistic_predict_model.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from src.models.util_predict_model import make_prediction_view
2
+
3
# Prediction step for the logistic model, built by the shared factory.
# Arguments: short model name, then the model's display name
# (previously misspelled "Logisitic Model").
logistic_predict_model = make_prediction_view(
    "Logistic", "Logistic Model")
src/models/logistic_test_model.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from src.models.util_test import make_tests_view
2
+
3
# Test/evaluation step for the logistic model, built by the shared factory.
# Arguments: short model name, then generic model name.
logistic_test_model = make_tests_view(
    "Logistic", "Logistic Model")
src/models/logistic_train_model.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import numpy as np
3
+ from sklearn.linear_model import LogisticRegression
4
+ from src.features.build_features import SplitDataset
5
+ import streamlit as st
6
+ import pandas as pd
7
+
8
+ from src.visualization.graphs_logistic import plot_logistic_coeff_barh
9
+
10
+
11
@st.cache(suppress_st_warning=True)
def create_clf_logistic_model(X_train, y_train):
    """Fit and return an lbfgs ``LogisticRegression`` on the training data.

    *y_train* is flattened with ``np.ravel`` before fitting.
    """
    flat_labels = np.ravel(y_train)
    classifier = LogisticRegression(solver="lbfgs")
    return classifier.fit(X_train, flat_labels)
15
+
16
+
17
@st.cache(suppress_st_warning=True)
def create_coeff_dict_logistic_model(
    logistic_model, training_data
):
    """Map each column name of *training_data* to its fitted coefficient.

    Names and the first row of ``logistic_model.coef_`` are paired by
    position; pairing stops at the shorter of the two.
    """
    first_row_coefficients = logistic_model.coef_[0, :]
    return dict(zip(training_data.columns, first_row_coefficients))
27
+
28
+
29
def coeff_dict_to_sorted_df(coef_dict):
    """Return a two-column frame of (name, value) pairs, ascending by value.

    The name goes in the "Coefficient" column and the number in "Value".
    """
    ascending_pairs = sorted(coef_dict.items(), key=lambda pair: pair[1])
    return pd.DataFrame(ascending_pairs, columns=["Coefficient", "Value"])
38
+
39
+
40
def interpret_clf_logistic_model(clf_logistic_model, split_dataset):
    """Render the coefficient count and a coefficient bar chart.

    Args:
        clf_logistic_model: Fitted model exposing ``n_features_in_`` and
            ``coef_``.
        split_dataset: Split data; the column names of ``X_train`` label the
            coefficients.
    """
    st.metric(
        label="# of Coefficients in Logistic Regression",
        value=clf_logistic_model.n_features_in_,
        delta=None,
        delta_color="normal",
    )

    st.subheader("Logistic Regression Coefficient Values")

    # Use the predictor columns only: the model has one coefficient per
    # training feature, so the label column appended by X_y_train is not
    # needed here. The debug dumps of split_dataset and its type that used to
    # be written to the page were removed.
    coef_dict = create_coeff_dict_logistic_model(
        clf_logistic_model, split_dataset.X_train)

    df = coeff_dict_to_sorted_df(coef_dict)

    fig = plot_logistic_coeff_barh(df)

    st.plotly_chart(fig)
60
+
61
+
62
def logistic_train_model(split_dataset: SplitDataset):
    """Fit the logistic model on the training split, display it, return it."""
    st.header("Logistic Regression Model")

    fitted_model = create_clf_logistic_model(
        split_dataset.X_train, split_dataset.y_train
    )
    interpret_clf_logistic_model(fitted_model, split_dataset)

    return fitted_model
views/typing.py → src/models/util_model_class.py RENAMED
@@ -7,7 +7,7 @@ from sklearn.linear_model import LogisticRegression
7
 
8
 
9
  @dataclass(frozen=True)
10
- class ModelView:
11
  model: Union[XGBClassifier, LogisticRegression]
12
  probability_threshold_selected: float
13
  predicted_default_status: pd.Series
 
7
 
8
 
9
  @dataclass(frozen=True)
10
+ class ModelClass:
11
  model: Union[XGBClassifier, LogisticRegression]
12
  probability_threshold_selected: float
13
  predicted_default_status: pd.Series
views/model_comparison.py → src/models/util_model_comparison.py RENAMED
@@ -1,16 +1,21 @@
1
  from typing import OrderedDict
2
  import streamlit as st
3
  from sklearn.metrics import roc_auc_score
4
- from common.data import SplitDataset
5
- from common.views import (
 
 
 
 
6
  roc_auc_compare_n_models,
7
- streamlit_chart_setting_height_width,
8
- calibration_curve_report_commented_n,
9
  )
10
- from views.typing import ModelView
11
 
12
 
13
- def roc_auc_for_model(split_dataset: SplitDataset, model_view: ModelView):
 
 
 
14
  roc_auc_model = roc_auc_score(
15
  split_dataset.y_test, model_view.predicted_default_status
16
  )
@@ -31,7 +36,7 @@ def roc_auc_for_model(split_dataset: SplitDataset, model_view: ModelView):
31
 
32
  def model_comparison_view(
33
  split_dataset: SplitDataset,
34
- model_views: OrderedDict[str, ModelView],
35
  ):
36
  st.header("Model Comparison")
37
 
@@ -43,7 +48,7 @@ def model_comparison_view(
43
  f"Receiver Operating Characteristic (ROC) Curve - {model_name}"
44
  )
45
  st.markdown(
46
- f'Area Under the Receiver Operating Characteristic Curve from prediction scores from "{model_name}" model is {roc_auc_model}.\n'
47
  )
48
  st.markdown(
49
  f'The score of {"{:.2f}".format(roc_auc_model)} is in the {roc_auc_lvl} ROC AUC score category.'
@@ -78,4 +83,4 @@ def model_comparison_view(
78
 
79
  fig2.set_size_inches(xsize_cal, ysize_cal)
80
 
81
- st.pyplot(fig2.figure)
 
1
  from typing import OrderedDict
2
  import streamlit as st
3
  from sklearn.metrics import roc_auc_score
4
+ from src.features.util_build_features import SplitDataset
5
+ from src.visualization.graphs_settings import (
6
+ streamlit_chart_setting_height_width
7
+ )
8
+
9
+ from src.visualization.graphs_test import (
10
  roc_auc_compare_n_models,
11
+ calibration_curve_report_commented_n
 
12
  )
 
13
 
14
 
15
+ from src.models.util_model_class import ModelClass
16
+
17
+
18
+ def roc_auc_for_model(split_dataset: SplitDataset, model_view: ModelClass):
19
  roc_auc_model = roc_auc_score(
20
  split_dataset.y_test, model_view.predicted_default_status
21
  )
 
36
 
37
  def model_comparison_view(
38
  split_dataset: SplitDataset,
39
+ model_views: OrderedDict[str, ModelClass],
40
  ):
41
  st.header("Model Comparison")
42
 
 
48
  f"Receiver Operating Characteristic (ROC) Curve - {model_name}"
49
  )
50
  st.markdown(
51
+ f'Area Under the Receiver Operating Characteristic Curve from prediction scores from {model_name} model is {roc_auc_model}.\n'
52
  )
53
  st.markdown(
54
  f'The score of {"{:.2f}".format(roc_auc_model)} is in the {roc_auc_lvl} ROC AUC score category.'
 
83
 
84
  fig2.set_size_inches(xsize_cal, ysize_cal)
85
 
86
+ st.pyplot(fig2)
src/models/util_predict_model.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Union, cast
2
+ from sklearn.linear_model import LogisticRegression
3
+
4
+
5
+ import pandas as pd
6
+
7
+ from dataclasses import dataclass
8
+
9
+ from xgboost import XGBClassifier
10
+ from src.features.util_build_features import SplitDataset
11
+
12
+ from src.models.util_predict_model_threshold import (
13
+ user_defined_probability_threshold,
14
+ J_statistic_driven_probability_threshold,
15
+ tradeoff_threshold,
16
+ acceptance_rate_driven_threshold,
17
+ select_probability_threshold,
18
+ model_probability_values_df)
19
+
20
+ import streamlit as st
21
+
22
+
23
def probability_threshold_explainer(model_name):
    """Write a short explanation of how the probability threshold is used.

    Args:
        model_name: Model name interpolated into the explanatory text.
    """
    st.write(
        f"""
        The {model_name} model (obtained using training data) is applied on testing data to predict the loans probabilities of defaulting.\n
        Probabilities of defaulting of the loans are compared to a probability threshold.\n
        A loan is predicted to default if its predicted probability of defaulting is greater than the probability threshold.
        """
    )
31
+
32
+
33
@dataclass(frozen=True)
class Threshold:
    """Immutable result of the prediction step for one model."""

    # Probability cut-off chosen by the user among the offered options.
    probability_threshold_selected: float
    # Predicted default status that belongs to the selected threshold.
    predicted_default_status: pd.Series
    # Model-predicted default probabilities for the test predictors.
    prediction_probability_df: pd.DataFrame
38
+
39
+
40
def make_prediction_view(
    model_name_short: str,
    model_name: str,
):
    """Build the prediction step for one model.

    Args:
        model_name_short: Short name handed to the threshold helpers.
        model_name: Display name used in the explanatory text.

    Returns:
        A ``view(clf_xgbt_model, split_dataset)`` callable that renders the
        threshold sections in order and returns a ``Threshold``.
    """

    def view(
        clf_xgbt_model: Union[XGBClassifier, LogisticRegression],
        split_dataset: SplitDataset,
    ) -> Threshold:
        probability_threshold_explainer(model_name)

        probabilities = model_probability_values_df(
            clf_xgbt_model,
            split_dataset.X_test,
        )

        # 1. threshold typed in by the user
        user_status, user_threshold = user_defined_probability_threshold(
            model_name_short, clf_xgbt_model, split_dataset
        )

        # 2. threshold maximising Youden's J statistic
        j_status, j_threshold = J_statistic_driven_probability_threshold(
            probabilities, clf_xgbt_model, split_dataset
        )

        # recall/accuracy trade-off chart (display only)
        tradeoff_threshold(probabilities, split_dataset)

        # 3. threshold implied by a target acceptance rate
        acceptance_threshold, acceptance_status = (
            acceptance_rate_driven_threshold(model_name_short, probabilities)
        )

        # final choice between the three candidates
        selected_threshold, selected_status = select_probability_threshold(
            model_name_short,
            user_threshold,
            user_status,
            j_threshold,
            j_status,
            acceptance_threshold,
            acceptance_status,
        )

        return Threshold(
            probability_threshold_selected=cast(float, selected_threshold),
            predicted_default_status=selected_status,
            prediction_probability_df=probabilities,
        )

    return view
src/models/util_predict_model_threshold.py ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ from sklearn.metrics import classification_report, roc_curve
4
+
5
+ import numpy as np
6
+
7
+ import plotly.express as px
8
+
9
+ import pandas as pd
10
+
11
+ from numpy import argmax
12
+
13
+ from src.visualization.metrics import streamlit_2columns_metrics_df, streamlit_2columns_metrics_pct_df
14
+
15
+ from src.visualization.graphs_threshold import acceptance_rate_driven_threshold_graph
16
+
17
+
18
def model_probability_values_df(model, X):
    """Return column 1 of ``model.predict_proba(X)`` as a one-column frame.

    The single column is named "PROB_DEFAULT".
    """
    second_class_probabilities = model.predict_proba(X)[:, 1]
    return pd.DataFrame(second_class_probabilities, columns=["PROB_DEFAULT"])
20
+
21
+
22
def find_best_threshold_J_statistic(y, clf_prediction_prob_df):
    """Return the ROC threshold that maximises Youden's J (tpr - fpr).

    NOTE: a function with the same name is defined again further down in
    this module; that later definition replaces this one at import time.
    """
    fpr, tpr, thresholds = roc_curve(y, clf_prediction_prob_df)
    # get the best threshold
    # Youden's J statistic tpr-fpr
    # Argmax to get the index in
    # thresholds
    return thresholds[argmax(tpr - fpr)]
29
+
30
+ # Function that makes dataframe with probability of default, predicted default status based on threshold
31
+ # and actual default status
32
+
33
+
34
def classification_report_per_threshold(
    threshold_list, threshold_default_status_list, y_test
):
    """Return ``{threshold: classification report dict}``.

    One report is computed per entry of *threshold_default_status_list*
    against *y_test*; reports are paired with *threshold_list* by position.
    """
    class_labels = ["Non-Default", "Default"]
    reports = [
        classification_report(
            y_test,
            predicted_status,
            target_names=class_labels,
            output_dict=True,
            zero_division=0,
        )
        for predicted_status in threshold_default_status_list
    ]
    return dict(zip(threshold_list, reports))
50
+
51
+
52
def thresh_classification_report_recall_accuracy(
    thresh_classification_report_dict,
):
    """Pull recall and accuracy figures out of per-threshold reports.

    Returns:
        ``[default_recalls, non_default_recalls, accuracies]``, each a list
        ordered like the entries of the input dict.
    """
    reports = list(thresh_classification_report_dict.values())
    default_recalls = [report["Default"]["recall"] for report in reports]
    non_default_recalls = [
        report["Non-Default"]["recall"] for report in reports
    ]
    accuracies = [report["accuracy"] for report in reports]
    return [default_recalls, non_default_recalls, accuracies]
74
+
75
+
76
def apply_threshold_to_probability_values(probability_values, threshold):
    """Classify each probability as default (1) or non-default (0).

    Args:
        probability_values: DataFrame with a "PROB_DEFAULT" column.
        threshold: Cut-off; values strictly greater than it map to 1.

    Returns:
        Integer Series named "PREDICT_DEFAULT_STATUS" with the input's index.
    """
    # One vectorised comparison instead of a Python call per row; it also
    # yields an integer dtype when the input is empty.
    return (
        (probability_values["PROB_DEFAULT"] > threshold)
        .astype(int)
        .rename("PREDICT_DEFAULT_STATUS")
    )
82
+
83
+
84
@st.cache(suppress_st_warning=True)
def find_best_threshold_J_statistic(y, clf_prediction_prob_df):
    """Return the ROC threshold with the largest Youden's J statistic.

    J is the true-positive rate minus the false-positive rate, evaluated at
    every threshold reported by ``roc_curve``.
    """
    false_positive_rate, true_positive_rate, candidates = roc_curve(
        y, clf_prediction_prob_df
    )
    youden_j = true_positive_rate - false_positive_rate
    return candidates[argmax(youden_j)]
91
+
92
+
93
def default_status_per_threshold(threshold_list, prob_default):
    """Return one 0/1 status Series per threshold.

    Args:
        threshold_list: Iterable of probability cut-offs.
        prob_default: Series of predicted default probabilities.

    Returns:
        List, ordered like *threshold_list*, of integer Series in which a
        value is 1 where the probability is strictly above the threshold.
    """
    # Vectorised comparison per threshold instead of a per-row lambda.
    return [
        (prob_default > threshold).astype(int)
        for threshold in threshold_list
    ]
101
+
102
+
103
def threshold_and_predictions(clf_xgbt_model, split_dataset, threshold):
    """Predict default statuses at *threshold* and show summary metrics.

    Probabilities are computed for ``split_dataset.X_test``, thresholded,
    and summarised as counts and then percentages in two-column metrics.

    Returns:
        The thresholded default-status Series.
    """
    test_probabilities = model_probability_values_df(
        clf_xgbt_model,
        split_dataset.X_test,
    )
    predicted_status = apply_threshold_to_probability_values(
        test_probabilities,
        threshold,
    )

    # Counts first, percentages second.
    metric_rows = (
        (
            streamlit_2columns_metrics_df,
            "# of Predicted Defaults",
            "# of Predicted Non-Default",
        ),
        (
            streamlit_2columns_metrics_pct_df,
            "% of Loans Predicted to Default",
            "% of Loans Predicted not to Default",
        ),
    )
    for render, default_label, non_default_label in metric_rows:
        render(default_label, non_default_label, predicted_status)

    return predicted_status
129
+
130
+
131
def user_defined_probability_threshold(model_name_short, clf_xgbt_model, split_dataset):
    """Ask the user for a threshold and predict default statuses with it.

    Returns:
        ``(predicted_default_status, threshold)``.
    """
    st.subheader("Classification Probability Threshold - User Defined")

    slider_key = f"threshold_{model_name_short}_default"
    user_defined_threshold = st.slider(
        label="Default Probability Threshold:",
        min_value=0.0,
        max_value=1.0,
        value=0.8,
        key=slider_key,
    )

    predicted_status = threshold_and_predictions(
        clf_xgbt_model, split_dataset, user_defined_threshold
    )
    return predicted_status, user_defined_threshold
146
+
147
+
148
def J_statistic_driven_probability_threshold(clf_prediction_prob_df_gbt, clf_xgbt_model, split_dataset):
    """Derive the threshold from Youden's J and predict statuses with it.

    The best threshold is computed from ``split_dataset.y_test`` and the
    given probabilities, shown as a metric, and then applied.

    Returns:
        ``(predicted_default_status, best_threshold)``.
    """
    st.subheader("J Statistic Driven Classification Probability Threshold")

    best_threshold = find_best_threshold_J_statistic(
        split_dataset.y_test, clf_prediction_prob_df_gbt
    )
    st.metric(
        label="Youden's J statistic calculated best threshold",
        value=best_threshold,
    )

    predicted_status = threshold_and_predictions(
        clf_xgbt_model, split_dataset, best_threshold
    )
    return predicted_status, best_threshold
163
+
164
+
165
def create_tradeoff_graph(df):
    """Plot recall and accuracy scores against the probability threshold.

    *df* must provide "Threshold", "Default Recall", "Non Default Recall"
    and "Accuracy" columns; the y axis is fixed to the 0..1 range.
    """
    score_columns = ["Default Recall", "Non Default Recall", "Accuracy"]
    figure = px.line(data_frame=df, x="Threshold", y=score_columns)

    figure.update_layout(
        title="Recall and Accuracy score Trade-off with Probability Threshold",
        xaxis_title="Probability Threshold",
        yaxis_title="Score",
    )
    figure.update_yaxes(range=[0.0, 1.0])

    st.plotly_chart(figure)
180
+
181
+
182
def tradeoff_threshold(clf_prediction_prob_df_gbt, split_dataset):
    """Chart recall and accuracy for thresholds 0, 0.025, ... below 1.

    For each threshold the "PROB_DEFAULT" column is thresholded, a
    classification report is computed against ``split_dataset.y_test``, and
    the recalls and accuracy are collected and plotted.
    """
    st.subheader(
        "Recall and Accuracy Tradeoff with given Probability Threshold"
    )

    threshold_list = np.arange(0, 1, 0.025).round(decimals=3).tolist()

    statuses_per_threshold = default_status_per_threshold(
        threshold_list, clf_prediction_prob_df_gbt["PROB_DEFAULT"]
    )
    reports_per_threshold = classification_report_per_threshold(
        threshold_list,
        statuses_per_threshold,
        split_dataset.y_test,
    )

    default_recalls, non_default_recalls, accuracies = (
        thresh_classification_report_recall_accuracy(reports_per_threshold)
    )

    # One row per threshold, one column per score.
    scores = pd.DataFrame(
        {
            "Default Recall": default_recalls,
            "Non Default Recall": non_default_recalls,
            "Accuracy": accuracies,
            "Threshold": threshold_list,
        }
    )

    create_tradeoff_graph(scores)
229
+
230
+
231
def select_probability_threshold(model_name_short,
                                 user_defined_threshold,
                                 clf_thresh_predicted_default_status_user_gbt,
                                 J_statistic_best_threshold,
                                 clf_thresh_predicted_default_status_Jstatistic_gbt,
                                 acc_rate_thresh_gbt,
                                 clf_thresh_predicted_default_status_acceptance_gbt):
    """Let the user pick which candidate threshold to carry forward.

    Returns:
        ``(threshold, predicted_default_status)`` of the chosen option; any
        choice other than the first two falls back to the acceptance-rate
        pair.
    """
    st.subheader("Selected Probability Threshold")

    chosen_option = st.radio(
        label="Selected Probability Threshold",
        options=[
            "User Defined",
            "J Statistic Driven",
            "Acceptance Rate Driven",
        ],
        key=f"{model_name_short}_radio_thresh",
    )

    named_candidates = {
        "User Defined": (
            user_defined_threshold,
            clf_thresh_predicted_default_status_user_gbt,
        ),
        "J Statistic Driven": (
            J_statistic_best_threshold,
            clf_thresh_predicted_default_status_Jstatistic_gbt,
        ),
    }
    acceptance_candidate = (
        acc_rate_thresh_gbt,
        clf_thresh_predicted_default_status_acceptance_gbt,
    )
    selected_threshold, selected_status = named_candidates.get(
        chosen_option, acceptance_candidate
    )

    st.write(
        f"Selected probability threshold is {selected_threshold}"
    )

    return selected_threshold, selected_status
272
+
273
+
274
def acceptance_rate_driven_threshold(model_name_short, clf_prediction_prob_df_gbt):
    """Derive a threshold from a user-chosen loan acceptance rate.

    The slider value (a percentage) is turned into a fraction, and the
    threshold is the quantile of "PROB_DEFAULT" at that fraction. The result
    is reported, charted, and applied to the probabilities.

    Returns:
        ``(threshold, predicted_default_status)``.
    """
    st.subheader("Acceptance Rate Driven Probability Threshold")

    accepted_pct = st.slider(
        label="% of loans accepted (acceptance rate):",
        min_value=0,
        max_value=100,
        value=85,
        key=f"acceptance_rate_{model_name_short}",
        format="%f%%",
    )
    acceptance_rate = accepted_pct / 100

    quantile_threshold = np.quantile(
        clf_prediction_prob_df_gbt["PROB_DEFAULT"], acceptance_rate
    )

    st.write(
        f"An acceptance rate of {acceptance_rate} results in probability threshold of {quantile_threshold}"
    )

    acceptance_rate_driven_threshold_graph(
        clf_prediction_prob_df_gbt, quantile_threshold
    )

    predicted_status = apply_threshold_to_probability_values(
        clf_prediction_prob_df_gbt,
        quantile_threshold,
    )

    return quantile_threshold, predicted_status
views/strategy_table.py → src/models/util_strategy_table.py RENAMED
@@ -2,12 +2,12 @@ from typing import OrderedDict
2
  import plotly.express as px
3
  import numpy as np
4
  import streamlit as st
5
- from common.util import create_strategyTable_df
6
- from views.typing import ModelView
7
 
8
 
9
  def strategy_table_view(
10
- currency: str, model_views: OrderedDict[str, ModelView]
11
  ):
12
  st.header("Strategy Table")
13
 
@@ -89,7 +89,7 @@ def strategy_table_view(
89
  )
90
 
91
  st.metric(
92
- label=f"Total expected loss:",
93
  value=f"{currency} {tot_exp_loss:,.2f}",
94
  delta=None,
95
  delta_color="normal",
 
2
  import plotly.express as px
3
  import numpy as np
4
  import streamlit as st
5
+ from src.models.util_test import create_strategyTable_df
6
+ from src.models.util_model_class import ModelClass
7
 
8
 
9
  def strategy_table_view(
10
+ currency: str, model_views: OrderedDict[str, ModelClass]
11
  ):
12
  st.header("Strategy Table")
13
 
 
89
  )
90
 
91
  st.metric(
92
+ label='Total expected loss:',
93
  value=f"{currency} {tot_exp_loss:,.2f}",
94
  delta=None,
95
  delta_color="normal",
views/evaluation.py → src/models/util_test.py RENAMED
@@ -1,5 +1,6 @@
1
  from typing import Union
2
  import pandas as pd
 
3
  import streamlit as st
4
  import numpy as np
5
  from sklearn.metrics import (
@@ -7,24 +8,25 @@ from sklearn.metrics import (
7
  confusion_matrix,
8
  )
9
  from sklearn.linear_model import LogisticRegression
 
10
  from xgboost.sklearn import XGBClassifier
11
- from common.data import SplitDataset
12
- from common.util import (
13
  create_cross_validation_df,
14
  cross_validation_scores,
15
  get_df_trueStatus_probabilityDefault_threshStatus_loanAmount,
16
- )
17
- from common.views import (
18
  cross_validation_graph,
19
  )
20
 
21
 
22
- def make_evaluation_view(
23
  model_name_short: str,
24
  model_name_generic: str,
25
  ):
26
  def view(
27
- clf_gbt_model: Union[XGBClassifier, LogisticRegression],
28
  split_dataset: SplitDataset,
29
  currency: str,
30
  prob_thresh_selected,
@@ -40,7 +42,7 @@ def make_evaluation_view(
40
  train on each fold suggests performance will be stable."
41
  )
42
 
43
- st.write(f"XGBoost cross validation test:")
44
 
45
  stcol_seed, stcol_eval_metric = st.columns(2)
46
 
@@ -170,7 +172,7 @@ def make_evaluation_view(
170
  )
171
 
172
  cv_scores = cross_validation_scores(
173
- clf_gbt_model,
174
  split_dataset.X_test,
175
  split_dataset.y_test,
176
  nfolds_score,
@@ -325,7 +327,7 @@ def make_evaluation_view(
325
 
326
  df_trueStatus_probabilityDefault_threshStatus_loanAmount = (
327
  get_df_trueStatus_probabilityDefault_threshStatus_loanAmount(
328
- clf_gbt_model,
329
  split_dataset.X_test,
330
  split_dataset.y_test,
331
  prob_thresh_selected,
@@ -406,5 +408,161 @@ def make_evaluation_view(
406
  return view
407
 
408
 
409
- decision_tree_evaluation_view = make_evaluation_view("gbt", "Decision Tree")
410
- logistic_evaluation_view = make_evaluation_view("lg", "Logistic Regression")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from typing import Union
2
  import pandas as pd
3
+ from sklearn.model_selection import StratifiedKFold, cross_val_score
4
  import streamlit as st
5
  import numpy as np
6
  from sklearn.metrics import (
 
8
  confusion_matrix,
9
  )
10
  from sklearn.linear_model import LogisticRegression
11
+ import xgboost as xgb
12
  from xgboost.sklearn import XGBClassifier
13
+ from src.features.util_build_features import SplitDataset
14
+ """from src.models.model_utils import (
15
  create_cross_validation_df,
16
  cross_validation_scores,
17
  get_df_trueStatus_probabilityDefault_threshStatus_loanAmount,
18
+ )"""
19
+ from src.visualization.graphs_test import (
20
  cross_validation_graph,
21
  )
22
 
23
 
24
+ def make_tests_view(
25
  model_name_short: str,
26
  model_name_generic: str,
27
  ):
28
  def view(
29
+ clf_xgbt_model: Union[XGBClassifier, LogisticRegression],
30
  split_dataset: SplitDataset,
31
  currency: str,
32
  prob_thresh_selected,
 
42
  train on each fold suggests performance will be stable."
43
  )
44
 
45
+ st.write('xgb cross validation test:')
46
 
47
  stcol_seed, stcol_eval_metric = st.columns(2)
48
 
 
172
  )
173
 
174
  cv_scores = cross_validation_scores(
175
+ clf_xgbt_model,
176
  split_dataset.X_test,
177
  split_dataset.y_test,
178
  nfolds_score,
 
327
 
328
  df_trueStatus_probabilityDefault_threshStatus_loanAmount = (
329
  get_df_trueStatus_probabilityDefault_threshStatus_loanAmount(
330
+ clf_xgbt_model,
331
  split_dataset.X_test,
332
  split_dataset.y_test,
333
  prob_thresh_selected,
 
408
  return view
409
 
410
 
411
def cross_validation_scores(model, X, y, nfold, score, seed):
    """Return the per-fold cross-validation scores of *model*.

    Folds come from a shuffled ``StratifiedKFold`` with *nfold* splits seeded
    by *seed*; *score* is the scoring name handed to ``cross_val_score``.
    X and y are converted to contiguous arrays (y flattened) first.
    """
    features = np.ascontiguousarray(X)
    labels = np.ravel(np.ascontiguousarray(y))
    folds = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed)
    return cross_val_score(model, features, labels, cv=folds, scoring=score)
420
+
421
+
422
def create_cross_validation_df(
    X, y, eval_metric, seed, trees, n_folds, early_stopping_rounds
):
    """Run ``xgb.cv`` on (X, y) and return ``[DMatrix, cv results]``.

    The booster uses the "binary:logistic" objective with the given
    evaluation metric and seed, *trees* boosting rounds, *n_folds* shuffled
    folds and the given early-stopping patience.
    """
    data_matrix = xgb.DMatrix(X, label=y)

    booster_params = dict(
        eval_metric=eval_metric,  # e.g. auc or logloss
        objective="binary:logistic",  # binary 0/1 loan status
        seed=seed,
    )

    cv_results = xgb.cv(
        booster_params,
        data_matrix,
        num_boost_round=trees,
        nfold=n_folds,
        early_stopping_rounds=early_stopping_rounds,
        shuffle=True,
    )

    return [data_matrix, cv_results]
446
+
447
+
448
def create_accept_rate_list(start, end, samples):
    """Return ``samples`` evenly spaced rates from ``start`` to ``end`` inclusive."""
    # endpoint=True is numpy's default, so ``end`` is part of the result.
    rates = np.linspace(start, end, num=samples)
    return rates
450
+
451
+
452
def create_strategyTable_df(
    start, end, samples, actual_probability_predicted_acc_rate, true, currency
):
    """Tabulate threshold, bad rate and estimated value per acceptance rate.

    For every acceptance rate produced by ``create_accept_rate_list`` the
    matching quantile of the ``PROB_DEFAULT`` column (rounded to 3 decimals)
    is used as the threshold. Rows above the threshold are flagged as
    predicted defaults and the remaining rows are treated as accepted loans.

    Side effect: the ``PREDICT_DEFAULT_STATUS`` column of the passed frame is
    overwritten on every iteration, so after the call it reflects the last
    acceptance rate processed.

    Returns:
        A DataFrame with string-formatted (2 decimals) rate, threshold, bad
        rate and estimated value, plus the integer number of accepted loans,
        sorted by "Acceptance Rate" descending with a fresh index.
    """
    scored = actual_probability_predicted_acc_rate
    # NOTE(review): this value is used as the average loan amount but is the
    # mean of the ``true`` column - confirm the caller passes the intended
    # column name.
    avg_loan_amount = scored[true].mean()

    records = []
    for rate in create_accept_rate_list(start, end, samples):
        # Threshold for this acceptance rate.
        cutoff = np.quantile(scored["PROB_DEFAULT"], rate).round(3)

        # Re-flag every loan against the threshold (1 = predicted default).
        scored["PREDICT_DEFAULT_STATUS"] = (
            scored["PROB_DEFAULT"].gt(cutoff).astype("int64")
        )

        accepted = scored[scored["PREDICT_DEFAULT_STATUS"] == 0]
        bad_rate = np.sum((accepted[true]) / len(accepted[true])).round(3)
        n_accepted = len(accepted)

        # Estimated value = value of accepted loans minus twice the value
        # attributed to the bad share of those loans.
        accepted_value = n_accepted * avg_loan_amount
        bad_value = 2 * accepted_value * bad_rate

        records.append(
            (
                "{:.2f}".format(rate),
                "{:.2f}".format(cutoff),
                "{:.2f}".format(bad_rate),
                n_accepted,
                "{:.2f}".format(accepted_value - bad_value),
            )
        )

    table = pd.DataFrame(
        records,
        columns=[
            "Acceptance Rate",
            "Threshold",
            "Bad Rate",
            "Num Accepted Loans",
            f"Estimated Value ({currency})",
        ],
    )
    table = table.sort_values(by="Acceptance Rate", axis=0, ascending=False)
    return table.reset_index(drop=True)
539
+
540
+
541
def get_df_trueStatus_probabilityDefault_threshStatus_loanAmount(
    model, X, y, threshold, loan_amount_col_name
):
    """Combine true status, default probability, thresholded status and loan amount.

    ``model.predict_proba`` is evaluated on a contiguous copy of ``X`` and its
    second column is stored as ``PROB_DEFAULT``. ``PREDICT_DEFAULT_STATUS`` is
    1 where that probability is strictly greater than ``threshold`` and 0
    otherwise. All four pieces are re-indexed from 0 and concatenated
    column-wise in the order: ``y``, ``PROB_DEFAULT``,
    ``PREDICT_DEFAULT_STATUS``, ``X[loan_amount_col_name]``.
    """
    actual = y.to_frame()
    amounts = X[loan_amount_col_name]

    probabilities = model.predict_proba(np.ascontiguousarray(X))
    default_probability = pd.DataFrame(
        probabilities[:, 1], columns=["PROB_DEFAULT"]
    )

    thresholded = (
        default_probability["PROB_DEFAULT"]
        .gt(threshold)
        .astype("int64")
        .rename("PREDICT_DEFAULT_STATUS")
    )

    pieces = (actual, default_probability, thresholded, amounts)
    return pd.concat(
        [piece.reset_index(drop=True) for piece in pieces],
        axis=1,
    )
src/models/xgboost_model.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.features.build_features import SplitDataset
2
+
3
+ from src.models.xgboost_train_model import xgboost_train_model
4
+ from src.models.xgboost_predict_model import xgboost_predit_model
5
+ from src.models.xgboost_test_model import xgboost_test_model
6
+
7
+ from src.models.util_model_class import ModelClass
8
+
9
+
10
def xgboost_class(split_dataset: SplitDataset, currency: str):
    """Train, predict with and evaluate the XGBoost model, bundled as a ModelClass.

    The trained model is passed to the prediction step; the selected
    probability threshold and predicted default status from that step are
    then handed to the test step together with ``currency``.
    """
    # Train the model.
    trained_model = xgboost_train_model(split_dataset)

    # Predict using the trained model.
    predictions = xgboost_predit_model(trained_model, split_dataset)
    selected_threshold = predictions.probability_threshold_selected
    default_status = predictions.predicted_default_status

    # Test and evaluate the model.
    evaluation_df = xgboost_test_model(
        trained_model,
        split_dataset,
        currency,
        selected_threshold,
        default_status,
    )

    return ModelClass(
        model=trained_model,
        trueStatus_probabilityDefault_threshStatus_loanAmount_df=evaluation_df,
        probability_threshold_selected=selected_threshold,
        predicted_default_status=default_status,
        prediction_probability_df=predictions.prediction_probability_df,
    )
src/models/xgboost_predict_model.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from src.models.util_predict_model import make_prediction_view
2
+
3
# Module-level prediction view for the XGBoost model, produced by the shared
# make_prediction_view factory.
xgboost_predit_model = make_prediction_view(
    "XGBoost",
    "Gradient Boosted Tree with XGBoost",
)
src/models/xgboost_test_model.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from src.models.util_test import make_tests_view
2
+
3
# Module-level test/evaluation view for the XGBoost model, produced by the
# shared make_tests_view factory (short model name, generic model name).
xgboost_test_model = make_tests_view(
    "XGBoost",
    "Gradient Boosted Tree with XGBoost",
)
src/models/xgboost_train_model.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+
3
+ import numpy as np
4
+ import xgboost as xgb
5
+ from src.features.build_features import SplitDataset
6
+ import streamlit as st
7
+
8
+ from src.visualization.graphs_decision_tree import(plot_importance_gbt,
9
+ plot_tree_gbt)
10
+
11
+ from src.visualization.graphs_settings import streamlit_chart_setting_height_width
12
+
13
+ from src.visualization.graphs_download import (download_importance_gbt,
14
+ download_tree_gbt)
15
+
16
+
17
@st.cache(
    suppress_st_warning=True,
    hash_funcs={xgb.XGBClassifier: pickle.dumps},
)
def create_clf_xgbt_model(X_train, y_train):
    """Fit and return an XGBClassifier on the training data.

    The classifier uses ``learning_rate=0.1`` and ``max_depth=7``; the labels
    are flattened with ``np.ravel`` before fitting. The result is cached by
    Streamlit, hashing ``XGBClassifier`` objects through ``pickle.dumps``.
    """
    classifier = xgb.XGBClassifier(
        learning_rate=0.1,
        max_depth=7,
        use_label_encoder=False,
        eval_metric="logloss",
    )
    # NOTE(review): eval_metric is given both to the constructor and to fit();
    # confirm against the pinned xgboost version whether the fit-level
    # argument is still accepted.
    flat_labels = np.ravel(y_train)
    return classifier.fit(X_train, flat_labels, eval_metric="logloss")
28
+
29
+
30
def interpret_clf_xgbt_model(clf_xgbt_model):
    """Render the feature-importance and tree-structure sections for the model.

    Each section shows a subheader, a chart-settings control for width and
    height, the plot itself and a download control for it.
    """
    # --- Feature importance ---
    st.subheader("XGBoost Decision Tree Feature Importance")
    bar_width, bar_height = streamlit_chart_setting_height_width(
        "Chart Settings", 10, 15, "barxsize", "barysize"
    )
    importance_fig = plot_importance_gbt(clf_xgbt_model, bar_width, bar_height)
    st.pyplot(importance_fig)
    download_importance_gbt(importance_fig, bar_width, bar_height)

    # --- Tree structure ---
    st.subheader("XGBoost Decision Tree Structure")
    tree_width, tree_height = streamlit_chart_setting_height_width(
        "Chart Settings", 15, 10, "treexsize", "treeysize"
    )
    tree_fig = plot_tree_gbt(tree_width, tree_height, clf_xgbt_model)
    st.pyplot(tree_fig)
    download_tree_gbt(tree_width, tree_height)

    st.markdown(
        "Note: The downloaded XGBoost Decision Tree plot chart in png has higher resolution than that displayed here."
    )
57
+
58
+
59
def xgboost_train_model(split_dataset: SplitDataset):
    """Show the XGBoost header, fit the model on the training split and return it.

    The fitted model is also passed to ``interpret_clf_xgbt_model`` so its
    interpretation charts are rendered before returning.
    """
    st.header("XGBoost Decision Trees")

    features, labels = split_dataset.X_train, split_dataset.y_train
    fitted_model = create_clf_xgbt_model(features, labels)

    interpret_clf_xgbt_model(fitted_model)
    return fitted_model
src/visualization/__init__.py ADDED
File without changes
src/visualization/graphs_decision_tree.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import xgboost as xgb
3
+
4
+ import streamlit as st
5
+
6
+ import matplotlib.pyplot as plt
7
+
8
+ from xgboost import plot_tree
9
+
10
+
11
def plot_importance_gbt(clf_xgbt_model, barxsize, barysize):
    """Plot weight-based feature importance and return the sized figure.

    Also writes a caption line to the Streamlit page before resizing the
    figure to ``barxsize`` x ``barysize`` inches.
    """
    importance_axes = xgb.plot_importance(
        clf_xgbt_model,
        importance_type="weight",
    )
    importance_fig = importance_axes.figure
    st.write("Feature Importance Plot (Gradient Boosted Tree)")
    importance_fig.set_size_inches(barxsize, barysize)
    return importance_fig
17
+
18
+
19
def plot_tree_gbt(treexsize, treeysize, clf_xgbt_model):
    """Plot the model's tree and return pyplot's current figure, resized.

    The figure is resized to ``treexsize`` x ``treeysize`` inches.
    """
    plot_tree(clf_xgbt_model)
    # The tree was just drawn, so the current pyplot figure is the one to size.
    tree_fig = plt.gcf()
    tree_fig.set_size_inches(treexsize, treeysize)
    return tree_fig
src/visualization/graphs_download.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import matplotlib.pyplot as plt
3
+
4
+
5
def download_importance_gbt(fig1, barxsize, barysize):
    """Offer a button that saves the feature-importance figure as ``bar.png``.

    Args:
        fig1: The matplotlib figure to save.
        barxsize: Figure width in inches.
        barysize: Figure height in inches.

    When the button is pressed the figure is sized to the requested
    dimensions and written with a DPI of ``max(barxsize, barysize) * 96``.
    """
    if st.button(
        "Download Feature Importance Plot as png (Gradient Boosted Tree)"
    ):
        dpisize = max(barxsize, barysize)
        # Size first so the saved image uses the requested dimensions.
        fig1.set_size_inches(barxsize, barysize)
        # Save the figure that was passed in, not pyplot's "current" figure,
        # which may be a different one.
        fig1.savefig("bar.png", dpi=dpisize * 96, bbox_inches="tight")
12
+
13
+
14
def download_tree_gbt(treexsize, treeysize):
    """Offer a button that saves pyplot's current figure as ``tree.png``.

    The DPI used is ``max(treexsize, treeysize) * 96``.
    """
    clicked = st.button(
        "Download XGBoost Decision Tree Plot as png (Gradient Boosted Tree)"
    )
    if not clicked:
        return
    largest_side = max(treexsize, treeysize)
    plt.savefig("tree.png", dpi=largest_side * 96, bbox_inches="tight")
src/visualization/graphs_logistic.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import plotly.express as px
2
+
3
+
4
def plot_logistic_coeff_barh(df):
    """Return a horizontal Plotly bar chart of logistic regression coefficients.

    ``df`` must provide the columns "Value" (x axis) and "Coefficient"
    (y axis).
    """
    coeff_chart = px.bar(
        data_frame=df,
        x="Value",
        y="Coefficient",
        orientation="h",
    )
    layout = {
        "title": "Logistic Regression Coefficients",
        "xaxis_title": "Value",
        "yaxis_title": "Coefficient",
    }
    coeff_chart.update_layout(**layout)
    return coeff_chart
src/visualization/graphs_settings.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+
4
def streamlit_chart_setting_height_width(
    title: str,
    default_widthvalue: int,
    default_heightvalue: int,
    widthkey: str,
    heightkey: str,
):
    """Show width/height number inputs in an expander and return both values.

    The two inputs sit side by side in two columns inside an expander
    labelled ``title``; ``widthkey`` and ``heightkey`` are their widget keys.

    Returns:
        ``(width, height)`` as entered by the user.
    """
    with st.expander(title):
        width_col, height_col = st.columns(2)

        with width_col:
            chosen_width = st.number_input(
                label="Width in inches:",
                value=default_widthvalue,
                key=widthkey,
            )

        with height_col:
            chosen_height = st.number_input(
                label="Height in inches:",
                value=default_heightvalue,
                key=heightkey,
            )

    return chosen_width, chosen_height
src/visualization/graphs_test.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from matplotlib import pyplot as plt
2
+
3
+ from sklearn.metrics import roc_curve
4
+
5
+ from typing import OrderedDict
6
+
7
+ from src.models.util_model_class import ModelClass
8
+
9
+ from sklearn.calibration import calibration_curve
10
+
11
+
12
def cross_validation_graph(cv, eval_metric, trees):
    """Plot the third column of the cross-validation frame per iteration.

    ``eval_metric`` and ``trees`` are only used for the title and axis label.
    Returns the new matplotlib figure.
    """
    # NOTE(review): column index 2 is presumably the test-metric mean of the
    # xgb.cv output - confirm against the frame the caller passes.
    score_column = cv.columns[2]

    fig = plt.figure()
    plt.plot(cv[score_column])
    plt.title(f"Test {eval_metric} Score Over {trees} Iterations")
    plt.xlabel("Iteration Number")
    plt.ylabel(f"Test {eval_metric} Score")
    return fig
25
+
26
+
27
def roc_auc_compare_n_models(y, model_views: OrderedDict[str, ModelClass]):
    """Draw the ROC curves of every model in ``model_views`` on one figure.

    Args:
        y: True labels passed to ``roc_curve``.
        model_views: Mapping of model name to its model object; each entry's
            ``prediction_probability_df`` supplies the scores.

    Returns:
        The matplotlib figure, including a dashed diagonal labelled
        "Random Prediction" and a title that lists the model names.
    """
    # The first two colours match the original palette; further colours are
    # appended and the index wraps, so more than two models no longer raises
    # IndexError.
    colors = ["blue", "green", "red", "purple", "orange", "brown"]

    fig = plt.figure()
    for color_idx, (model_name, model_view) in enumerate(model_views.items()):
        fpr, tpr, _thresholds = roc_curve(
            y, model_view.prediction_probability_df
        )
        plt.plot(
            fpr,
            tpr,
            color=colors[color_idx % len(colors)],
            label=f"{model_name}",
        )

    # Reference diagonal, drawn once.
    plt.plot([0, 1], [0, 1], linestyle="--", label="Random Prediction")

    # Title lists models as "A", "A and B" or "A, B and C".
    model_names = list(model_views.keys())
    if not model_names:
        model_name_str = "None"
    elif len(model_names) == 1:
        model_name_str = model_names[0]
    else:
        model_name_str = " and ".join(
            [", ".join(model_names[:-1]), model_names[-1]]
        )

    plt.title(f"ROC Chart for {model_name_str} on the Probability of Default")
    plt.xlabel("False Positive Rate (FP Rate)")
    plt.ylabel("True Positive Rate (TP Rate)")
    plt.legend()
    plt.grid(False)
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    return fig
53
+
54
+
55
def calibration_curve_report_commented_n(
    y, model_views: OrderedDict[str, ModelClass], bins: int
):
    """Draw one calibration curve per model plus the perfect-calibration line.

    For each entry of ``model_views`` the curve is computed from ``y`` and the
    entry's ``prediction_probability_df`` using ``bins`` bins. Returns the
    matplotlib figure.
    """
    fig = plt.figure()

    for name, view in model_views.items():
        # NOTE(review): normalize=True is forwarded as in the original call;
        # confirm the installed scikit-learn still accepts this argument.
        fraction_positive, mean_predicted = calibration_curve(
            y,
            view.prediction_probability_df,
            n_bins=bins,
            normalize=True,
        )
        plt.plot(mean_predicted, fraction_positive, "s-", label=f"{name}")

    # Guideline for a perfectly calibrated model.
    plt.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")

    plt.ylabel("Fraction of positives")
    plt.xlabel("Average Predicted Probability")
    plt.title("Calibration Curve")
    plt.legend()
    plt.grid(False)
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    return fig
src/visualization/graphs_threshold.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import plotly.express as px
3
+
4
+ import streamlit as st
5
+
6
+ import matplotlib.pyplot as plt
7
+
8
+ import numpy as np
9
+
10
+
11
def acceptance_rate_driven_threshold_graph(clf_prediction_prob_df_gbt, acc_rate_thresh_gbt):
    """Render a histogram of ``PROB_DEFAULT`` with the threshold marked in red.

    The chart is a Plotly histogram with a solid vertical line at
    ``acc_rate_thresh_gbt`` and is shown directly on the Streamlit page;
    nothing is returned.
    """
    histogram = px.histogram(clf_prediction_prob_df_gbt["PROB_DEFAULT"])

    histogram.update_layout(
        title="Acceptance Rate Threshold vs. Loans Accepted",
        xaxis_title="Acceptance Rate Threshold",
        yaxis_title="Loans Accepted",
    )
    # White outlines separate adjacent bars.
    histogram.update_traces(marker_line_width=1, marker_line_color="white")

    threshold_line = {
        "x": acc_rate_thresh_gbt,
        "line_width": 3,
        "line_dash": "solid",
        "line_color": "red",
    }
    histogram.add_vline(**threshold_line)

    st.plotly_chart(histogram)
30
+
31
+
32
def recall_accuracy_threshold_tradeoff_fig(
    widthsize,
    heightsize,
    threshold_list,
    thresh_def_recalls_list,
    thresh_nondef_recalls_list,
    thresh_accs_list,
):
    """Plot default recall, non-default recall and accuracy against threshold.

    The figure is created with size ``(widthsize, heightsize)`` and both axes
    are limited to [0, 1]. Returns the matplotlib figure.
    """
    fig = plt.figure(figsize=(widthsize, heightsize))

    curves = (
        (thresh_def_recalls_list, "Default Recall"),
        (thresh_nondef_recalls_list, "Non-Default Recall"),
        (thresh_accs_list, "Model Accuracy"),
    )
    for scores, curve_label in curves:
        plt.plot(threshold_list, scores, label=curve_label)

    plt.xlabel("Probability Threshold")
    plt.ylabel("Score")
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.legend()
    plt.title("Recall and Accuracy Score Tradeoff with Probability Threshold")
    plt.grid(False)
    return fig
54
+
55
+
56
def acceptance_rate_threshold_fig(probability_default, acceptancerate, bins):
    """Plot the default-probability histogram with the acceptance threshold.

    Args:
        probability_default: Predicted default probabilities; must support
            ``.describe()``.
        acceptancerate: Quantile used as the acceptance-rate threshold.
        bins: Number of histogram bins.

    Returns:
        A tuple ``(fig, probability_stat_distribution, acc_rate_thresh)``:
        the figure, the result of ``describe()`` and the threshold value.
    """
    # Summary statistics of the probability distribution.
    probability_stat_distribution = probability_default.describe()

    # Threshold is the requested quantile of the predicted probabilities.
    acc_rate_thresh = np.quantile(probability_default, acceptancerate)

    fig = plt.figure()
    plt.hist(
        probability_default,
        color="blue",
        bins=bins,
        histtype="bar",
        ec="white",
    )

    # Reference line marking the threshold.
    plt.axvline(x=acc_rate_thresh, color="red")
    # Title spelling corrected (was "Thershold").
    plt.title("Acceptance Rate Threshold")

    return (
        fig,
        probability_stat_distribution,
        acc_rate_thresh,
    )
src/visualization/metrics.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import pandas as pd
3
+ import streamlit as st
4
+
5
+
6
def streamlit_2columns_metrics_pct_df(
    column1name_label: str,
    column2name_label: str,
    df: pd.DataFrame,
):
    """Show the share of 1s and of 0s in ``df`` as two percentage metrics.

    Args:
        column1name_label: Label for the share of rows equal to 1.
        column2name_label: Label for the share of rows equal to 0.
        df: Data whose ``value_counts()`` provides the counts.

    A value that does not occur counts as 0 instead of raising on a missing
    lookup, and an empty ``df`` is reported as 0% for both metrics.
    """
    counts = df.value_counts()
    total = df.shape[0]
    # Default of 0 covers the case where every row falls in a single class.
    shares = {
        label: (counts.get(label, 0) / total if total else 0.0)
        for label in (1, 0)
    }

    (
        column1name,
        column2name,
    ) = st.columns(2)

    with column1name:
        st.metric(
            label=column1name_label,
            value="{:.0%}".format(shares[1]),
            delta=None,
            delta_color="normal",
        )

    with column2name:
        st.metric(
            label=column2name_label,
            value="{:.0%}".format(shares[0]),
            delta=None,
            delta_color="normal",
        )
31
+
32
+
33
def streamlit_2columns_metrics_df(
    column1name_label: str,
    column2name_label: str,
    df: pd.DataFrame,
):
    """Show the number of 1s and of 0s in ``df`` as two count metrics.

    Args:
        column1name_label: Label for the count of rows equal to 1.
        column2name_label: Label for the count of rows equal to 0.
        df: Data whose ``value_counts()`` provides the counts.

    A value that does not occur is shown as 0 rather than as a missing value.
    """
    counts = df.value_counts()

    (
        column1name,
        column2name,
    ) = st.columns(2)

    with column1name:
        st.metric(
            label=column1name_label,
            # Default of 0: an absent class means zero rows.
            value=counts.get(1, 0),
            delta=None,
            delta_color="normal",
        )

    with column2name:
        st.metric(
            label=column2name_label,
            value=counts.get(0, 0),
            delta=None,
            delta_color="normal",
        )
58
+
59
+
60
def streamlit_2columns_metrics_df_shape(df: pd.DataFrame):
    """Show the row and column counts of ``df`` as two side-by-side metrics."""
    n_rows, n_cols = df.shape[0], df.shape[1]
    rows_col, cols_col = st.columns(2)

    with rows_col:
        st.metric(
            label="Rows",
            value=n_rows,
            delta=None,
            delta_color="normal",
        )

    with cols_col:
        st.metric(
            label="Columns",
            value=n_cols,
            delta=None,
            delta_color="normal",
        )
81
+
82
+
83
def streamlit_2columns_metrics_pct_series(
    column1name_label: str,
    column2name_label: str,
    series: pd.Series,
):
    """Show the entries at labels 1 and 0 as shares of the series total.

    Args:
        column1name_label: Label for the share held by ``series.get(1)``.
        column2name_label: Label for the share held by ``series.get(0)``.
        series: Series whose entries at labels 1 and 0 are compared to
            ``series.sum()``.

    A missing label counts as 0 instead of raising, and a zero total is
    reported as 0% for both metrics.
    """
    total = series.sum()
    # Default of 0 covers a label that is absent from the series.
    shares = {
        label: (series.get(label, 0) / total if total else 0.0)
        for label in (1, 0)
    }

    (
        column1name,
        column2name,
    ) = st.columns(2)

    with column1name:
        st.metric(
            label=column1name_label,
            value="{:.0%}".format(shares[1]),
            delta=None,
            delta_color="normal",
        )

    with column2name:
        st.metric(
            label=column2name_label,
            value="{:.0%}".format(shares[0]),
            delta=None,
            delta_color="normal",
        )
107
+
108
+
109
def streamlit_2columns_metrics_series(
    column1name_label: str,
    column2name_label: str,
    series: pd.Series,
):
    """Show the entries at labels 1 and 0 of ``series`` as two metrics.

    Args:
        column1name_label: Label for the value at label 1.
        column2name_label: Label for the value at label 0.
        series: Series read with ``.get``.

    A missing label is shown as 0 rather than as a missing value.
    """
    (
        column1name,
        column2name,
    ) = st.columns(2)

    with column1name:
        st.metric(
            label=column1name_label,
            # Default of 0: an absent label means nothing was counted for it.
            value=series.get(1, 0),
            delta=None,
            delta_color="normal",
        )

    with column2name:
        st.metric(
            label=column2name_label,
            value=series.get(0, 0),
            delta=None,
            delta_color="normal",
        )
views/decision_tree.py DELETED
@@ -1,70 +0,0 @@
1
- from common.data import SplitDataset
2
- import streamlit as st
3
- from common.util import (
4
- test_variables_gbt,
5
- )
6
- from common.views import (
7
- streamlit_chart_setting_height_width,
8
- plot_importance_gbt,
9
- plot_tree_gbt,
10
- download_importance_gbt,
11
- download_tree_gbt,
12
- )
13
- from views.typing import ModelView
14
- from views.threshold import decision_tree_threshold_view
15
- from views.evaluation import decision_tree_evaluation_view
16
-
17
-
18
- def decisiontree_view(split_dataset: SplitDataset, currency: str):
19
- st.header("Decision Trees")
20
-
21
- clf_gbt_model = test_variables_gbt(
22
- split_dataset.X_train, split_dataset.y_train
23
- )
24
-
25
- st.subheader("Decision Tree Feature Importance")
26
-
27
- (barxsize, barysize,) = streamlit_chart_setting_height_width(
28
- "Chart Settings", 10, 15, "barxsize", "barysize"
29
- )
30
-
31
- fig1 = plot_importance_gbt(clf_gbt_model, barxsize, barysize)
32
-
33
- st.pyplot(fig1)
34
-
35
- download_importance_gbt(fig1, barxsize, barysize)
36
-
37
- st.subheader("Decision Tree Structure")
38
-
39
- (treexsize, treeysize,) = streamlit_chart_setting_height_width(
40
- "Chart Settings", 15, 10, "treexsize", "treeysize"
41
- )
42
-
43
- fig2 = plot_tree_gbt(treexsize, treeysize, clf_gbt_model)
44
-
45
- st.pyplot(fig2)
46
-
47
- download_tree_gbt(treexsize, treeysize)
48
- st.markdown(
49
- "Note: The downloaded decision tree plot chart in png has higher resolution than that displayed here."
50
- )
51
-
52
- threshold = decision_tree_threshold_view(clf_gbt_model, split_dataset)
53
-
54
- df_trueStatus_probabilityDefault_threshStatus_loanAmount = (
55
- decision_tree_evaluation_view(
56
- clf_gbt_model,
57
- split_dataset,
58
- currency,
59
- threshold.probability_threshold_selected,
60
- threshold.predicted_default_status,
61
- )
62
- )
63
-
64
- return ModelView(
65
- model=clf_gbt_model,
66
- trueStatus_probabilityDefault_threshStatus_loanAmount_df=df_trueStatus_probabilityDefault_threshStatus_loanAmount,
67
- probability_threshold_selected=threshold.probability_threshold_selected,
68
- predicted_default_status=threshold.predicted_default_status,
69
- prediction_probability_df=threshold.prediction_probability_df,
70
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
views/logistic.py DELETED
@@ -1,119 +0,0 @@
1
- from common.data import SplitDataset
2
- import streamlit as st
3
- import pandas as pd
4
- import plotly.express as px
5
- from views.threshold import logistic_threshold_view
6
- from views.evaluation import logistic_evaluation_view
7
- from common.util import (
8
- test_variables_logistic,
9
- print_coeff_logistic,
10
- model_probability_values_df,
11
- apply_threshold_to_probability_values,
12
- )
13
- from common.views import (
14
- streamlit_2columns_metrics_df,
15
- streamlit_2columns_metrics_pct_df,
16
- )
17
- from views.typing import ModelView
18
-
19
-
20
- def logistic_view(split_dataset: SplitDataset, currency: str) -> ModelView:
21
- # ### Test and create variables logically
22
-
23
- st.header("Logistic Regression")
24
-
25
- clf_logistic_model = test_variables_logistic(
26
- split_dataset.X_train, split_dataset.y_train
27
- )
28
-
29
- st.metric(
30
- label="# of Coefficients in Logistic Regression",
31
- value=clf_logistic_model.n_features_in_,
32
- delta=None,
33
- delta_color="normal",
34
- )
35
-
36
- coef_dict = print_coeff_logistic(clf_logistic_model, split_dataset)
37
-
38
- st.subheader("Logistic Regression Coefficient Values")
39
-
40
- coef_dict_sorted = dict(
41
- sorted(coef_dict.items(), key=lambda item: item[1], reverse=False)
42
- )
43
-
44
- data_items = coef_dict_sorted.items()
45
- data_list = list(data_items)
46
-
47
- df = pd.DataFrame(data_list, columns=["Coefficient", "Value"])
48
-
49
- fig1 = px.bar(data_frame=df, x="Value", y="Coefficient", orientation="h")
50
-
51
- fig1.update_layout(
52
- title="Logistic Regression Coefficients",
53
- xaxis_title="Value",
54
- yaxis_title="Coefficient",
55
- )
56
-
57
- st.plotly_chart(fig1)
58
-
59
- st.subheader("Classification Probability Threshold")
60
-
61
- st.write(
62
- """
63
- The logistic regression model (obtained using training data) is applied on testing data to predict the loans probabilities of defaulting.\n
64
- Probabilities of defaulting of the loans are compared to a probability threshold.\n
65
- A loan is predicted to default if its predicted probability of defaulting is greater than the probability threshold.
66
- """
67
- )
68
-
69
- threshold = st.slider(
70
- label="Default Probability Threshold:",
71
- min_value=0.0,
72
- max_value=1.0,
73
- value=0.7,
74
- key="key_threshold",
75
- )
76
-
77
- clf_prediction_prob_df_log = model_probability_values_df(
78
- clf_logistic_model,
79
- split_dataset.X_test,
80
- )
81
-
82
- clf_thresh_predicted_default_status_user = (
83
- apply_threshold_to_probability_values(
84
- clf_prediction_prob_df_log,
85
- threshold,
86
- )
87
- )
88
-
89
- streamlit_2columns_metrics_df(
90
- "# of Predicted Defaults",
91
- "# of Predicted Non-Default",
92
- clf_thresh_predicted_default_status_user,
93
- )
94
-
95
- streamlit_2columns_metrics_pct_df(
96
- "% of Loans Predicted to Default",
97
- "% of Loans Predicted not to Default",
98
- clf_thresh_predicted_default_status_user,
99
- )
100
-
101
- threshold = logistic_threshold_view(clf_logistic_model, split_dataset)
102
-
103
- df_trueStatus_probabilityDefault_threshStatus_loanAmount = (
104
- logistic_evaluation_view(
105
- clf_logistic_model,
106
- split_dataset,
107
- currency,
108
- threshold.probability_threshold_selected,
109
- threshold.predicted_default_status,
110
- )
111
- )
112
-
113
- return ModelView(
114
- model=clf_logistic_model,
115
- trueStatus_probabilityDefault_threshStatus_loanAmount_df=df_trueStatus_probabilityDefault_threshStatus_loanAmount,
116
- probability_threshold_selected=threshold.probability_threshold_selected,
117
- predicted_default_status=threshold.predicted_default_status,
118
- prediction_probability_df=threshold.prediction_probability_df,
119
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
views/threshold.py DELETED
@@ -1,272 +0,0 @@
1
- from dataclasses import dataclass
2
- from typing import Union, cast
3
- import numpy as np
4
- import streamlit as st
5
- import plotly.express as px
6
- import pandas as pd
7
- from xgboost.sklearn import XGBClassifier
8
- from sklearn.linear_model import LogisticRegression
9
- from common.data import SplitDataset
10
- from common.util import (
11
- model_probability_values_df,
12
- apply_threshold_to_probability_values,
13
- find_best_threshold_J_statistic,
14
- default_status_per_threshold,
15
- classification_report_per_threshold,
16
- thresh_classification_report_recall_accuracy,
17
- )
18
- from common.views import (
19
- streamlit_2columns_metrics_df,
20
- streamlit_2columns_metrics_pct_df,
21
- )
22
-
23
-
24
- @dataclass(frozen=True)
25
- class Threshold:
26
- probability_threshold_selected: float
27
- predicted_default_status: pd.Series
28
- prediction_probability_df: pd.DataFrame
29
-
30
-
31
- def make_threshold_view(
32
- model_name_short: str,
33
- model_name: str,
34
- ):
35
- def view(
36
- clf_gbt_model: Union[XGBClassifier, LogisticRegression],
37
- split_dataset: SplitDataset,
38
- ) -> Threshold:
39
- st.subheader("Classification Probability Threshold - User Defined")
40
- st.write(
41
- f"""
42
- The {model_name} model (obtained using training data) is applied on testing data to predict the loans probabilities of defaulting.\n
43
- Probabilities of defaulting of the loans are compared to a probability threshold.\n
44
- A loan is predicted to default if its predicted probability of defaulting is greater than the probability threshold.
45
- """
46
- )
47
-
48
- threshold_gbt_default = st.slider(
49
- label="Default Probability Threshold:",
50
- min_value=0.0,
51
- max_value=1.0,
52
- value=0.8,
53
- key=f"threshold_{model_name_short}_default",
54
- )
55
-
56
- clf_prediction_prob_df_gbt = model_probability_values_df(
57
- clf_gbt_model,
58
- split_dataset.X_test,
59
- )
60
-
61
- clf_thresh_predicted_default_status_user_gbt = (
62
- apply_threshold_to_probability_values(
63
- clf_prediction_prob_df_gbt,
64
- threshold_gbt_default,
65
- )
66
- )
67
-
68
- streamlit_2columns_metrics_df(
69
- "# of Predicted Defaults",
70
- "# of Predicted Non-Default",
71
- clf_thresh_predicted_default_status_user_gbt,
72
- )
73
-
74
- streamlit_2columns_metrics_pct_df(
75
- "% of Loans Predicted to Default",
76
- "% of Loans Predicted not to Default",
77
- clf_thresh_predicted_default_status_user_gbt,
78
- )
79
-
80
- st.subheader("J Statistic Driven Classification Probability Threshold")
81
-
82
- J_statistic_best_threshold = find_best_threshold_J_statistic(
83
- split_dataset.y_test, clf_prediction_prob_df_gbt
84
- )
85
- st.metric(
86
- label="Youden's J statistic calculated best threshold",
87
- value=J_statistic_best_threshold,
88
- )
89
-
90
- clf_thresh_predicted_default_status_Jstatistic_gbt = (
91
- apply_threshold_to_probability_values(
92
- clf_prediction_prob_df_gbt,
93
- J_statistic_best_threshold,
94
- )
95
- )
96
-
97
- streamlit_2columns_metrics_df(
98
- "# of Predicted Defaults",
99
- "# of Predicted Non-Default",
100
- clf_thresh_predicted_default_status_Jstatistic_gbt,
101
- )
102
-
103
- streamlit_2columns_metrics_pct_df(
104
- "% of Loans Predicted to Default",
105
- "% of Loans Predicted not to Default",
106
- clf_thresh_predicted_default_status_Jstatistic_gbt,
107
- )
108
-
109
- st.subheader(
110
- "Recall and Accuracy Tradeoff with given Probability Threshold"
111
- )
112
- # Steps
113
- # Get list of thresholds
114
- # Get default status per threshold
115
- # Get classification report per threshold
116
- # Get recall, nondef recall, and accuracy per threshold
117
-
118
- threshold_list = np.arange(0, 1, 0.025).round(decimals=3).tolist()
119
-
120
- threshold_default_status_list = default_status_per_threshold(
121
- threshold_list, clf_prediction_prob_df_gbt["PROB_DEFAULT"]
122
- )
123
- thresh_classification_report_dict = (
124
- classification_report_per_threshold(
125
- threshold_list,
126
- threshold_default_status_list,
127
- split_dataset.y_test,
128
- )
129
- )
130
-
131
- (
132
- thresh_def_recalls_list,
133
- thresh_nondef_recalls_list,
134
- thresh_accs_list,
135
- ) = thresh_classification_report_recall_accuracy(
136
- thresh_classification_report_dict
137
- )
138
-
139
- namelist = [
140
- "Default Recall",
141
- "Non Default Recall",
142
- "Accuracy",
143
- "Threshold",
144
- ]
145
-
146
- df = pd.DataFrame(
147
- [
148
- thresh_def_recalls_list,
149
- thresh_nondef_recalls_list,
150
- thresh_accs_list,
151
- threshold_list,
152
- ],
153
- index=namelist,
154
- )
155
-
156
- df = df.T
157
-
158
- fig2 = px.line(
159
- data_frame=df,
160
- y=["Default Recall", "Non Default Recall", "Accuracy"],
161
- x="Threshold",
162
- )
163
-
164
- fig2.update_layout(
165
- title="Recall and Accuracy score Trade-off with Probability Threshold",
166
- xaxis_title="Probability Threshold",
167
- yaxis_title="Score",
168
- )
169
- fig2.update_yaxes(range=[0.0, 1.0])
170
-
171
- st.plotly_chart(fig2)
172
-
173
- st.subheader("Acceptance Rate Driven Probability Threshold")
174
- # Steps
175
- # Set acceptance rate
176
- # Get default status per threshold
177
- # Get classification report per threshold
178
- # Get recall, nondef recall, and accuracy per threshold
179
-
180
- acceptance_rate = (
181
- st.slider(
182
- label="% of loans accepted (acceptance rate):",
183
- min_value=0,
184
- max_value=100,
185
- value=85,
186
- key=f"acceptance_rate_{model_name_short}",
187
- format="%f%%",
188
- )
189
- / 100
190
- )
191
-
192
- acc_rate_thresh_gbt = np.quantile(
193
- clf_prediction_prob_df_gbt["PROB_DEFAULT"], acceptance_rate
194
- )
195
-
196
- st.write(
197
- f"An acceptance rate of {acceptance_rate} results in probability threshold of {acc_rate_thresh_gbt}"
198
- )
199
-
200
- figa = px.histogram(clf_prediction_prob_df_gbt["PROB_DEFAULT"])
201
-
202
- figa.update_layout(
203
- title="Acceptance Rate Threshold vs. Loans Accepted",
204
- xaxis_title="Acceptance Rate Threshold",
205
- yaxis_title="Loans Accepted",
206
- )
207
-
208
- figa.update_traces(marker_line_width=1, marker_line_color="white")
209
-
210
- figa.add_vline(
211
- x=acc_rate_thresh_gbt,
212
- line_width=3,
213
- line_dash="solid",
214
- line_color="red",
215
- )
216
-
217
- st.plotly_chart(figa)
218
-
219
- clf_thresh_predicted_default_status_acceptance_gbt = (
220
- apply_threshold_to_probability_values(
221
- clf_prediction_prob_df_gbt,
222
- acc_rate_thresh_gbt,
223
- )
224
- )
225
-
226
- st.write()
227
- st.subheader("Selected Probability Threshold")
228
-
229
- options = [
230
- "User Defined",
231
- "J Statistic Driven",
232
- "Acceptance Rate Driven",
233
- ]
234
- prob_thresh_option = st.radio(
235
- label="Selected Probability Threshold",
236
- options=options,
237
- key=f"{model_name_short}_radio_thresh",
238
- )
239
-
240
- if prob_thresh_option == "User Defined":
241
- prob_thresh_selected_gbt = threshold_gbt_default
242
- predicted_default_status_gbt = (
243
- clf_thresh_predicted_default_status_user_gbt
244
- )
245
- elif prob_thresh_option == "J Statistic Driven":
246
- prob_thresh_selected_gbt = J_statistic_best_threshold
247
- predicted_default_status_gbt = (
248
- clf_thresh_predicted_default_status_Jstatistic_gbt
249
- )
250
- else:
251
- prob_thresh_selected_gbt = acc_rate_thresh_gbt
252
- predicted_default_status_gbt = (
253
- clf_thresh_predicted_default_status_acceptance_gbt
254
- )
255
-
256
- st.write(
257
- f"Selected probability threshold is {prob_thresh_selected_gbt}"
258
- )
259
-
260
- return Threshold(
261
- probability_threshold_selected=cast(
262
- float, prob_thresh_selected_gbt
263
- ),
264
- predicted_default_status=predicted_default_status_gbt,
265
- prediction_probability_df=clf_prediction_prob_df_gbt,
266
- )
267
-
268
- return view
269
-
270
-
271
- decision_tree_threshold_view = make_threshold_view("gbt", "decision tree")
272
- logistic_threshold_view = make_threshold_view("lg", "logistic")