pkiage committed on
Commit
37b5a88
·
1 Parent(s): 232e5e5
common/__init__.py DELETED
File without changes
common/data.py DELETED
@@ -1,94 +0,0 @@
1
- from typing import List, Union, cast
2
- from dataclasses import dataclass
3
- from sklearn.model_selection import train_test_split
4
- import pandas as pd
5
-
6
- from common.util import drop_columns
7
-
8
-
9
@dataclass
class SplitDataset:
    """Hold the four pieces of a train/test split.

    ``X_test`` / ``X_train`` are the predictor frames and ``y_test`` /
    ``y_train`` the matching label series.
    """

    X_test: pd.DataFrame
    X_train: pd.DataFrame
    y_test: pd.Series
    y_train: pd.Series

    @staticmethod
    def _features_with_labels(
        features: pd.DataFrame, labels: pd.Series
    ) -> pd.DataFrame:
        """Join *features* and *labels* column-wise on a fresh 0..n-1 index."""
        # Both indexes are reset first so the concat pairs rows by position
        # instead of aligning on whatever index labels the split left behind.
        parts = cast(
            List[Union[pd.DataFrame, pd.Series]],
            [
                features.reset_index(drop=True),
                labels.reset_index(drop=True),
            ],
        )
        return pd.concat(parts, axis=1)

    @property
    def X_y_test(self) -> pd.DataFrame:
        """Test predictors with the test labels appended as the last column."""
        return self._features_with_labels(self.X_test, self.y_test)

    @property
    def X_y_train(self) -> pd.DataFrame:
        """Training predictors with the training labels appended as the last column."""
        return self._features_with_labels(self.X_train, self.y_train)
41
-
42
-
43
@dataclass
class Dataset:
    """Input frame plus the settings used to split it into train and test sets."""

    df: pd.DataFrame
    random_state: int
    # Percentage of rows held out for testing (40 means 40%); it is divided
    # by 100 in train_test_split() below.
    test_size: int

    @property
    def y_value(self) -> pd.Series:
        """Return the ``loan_status`` label column."""
        return self.df["loan_status"]

    @property
    def x_values(self) -> pd.DataFrame:
        """Return ``df`` without the label and the one-hot loan-grade columns."""
        return cast(
            pd.DataFrame,
            drop_columns(
                self.df,
                [
                    "loan_status",
                    "loan_grade_A",
                    "loan_grade_B",
                    "loan_grade_C",
                    "loan_grade_D",
                    "loan_grade_E",
                    "loan_grade_F",
                    "loan_grade_G",
                ],
            ),
        )

    @property
    def x_values_column_names(self):
        """Return the predictor column names as a list."""
        return self.x_values.columns.tolist()

    def x_values_filtered_columns(self, columns: List[str]) -> pd.DataFrame:
        """Return only the requested *columns* of ``df``.

        NOTE(review): this filters the full frame rather than ``x_values``,
        so a caller could select a label/grade column — confirm intended.
        """
        return self.df.filter(columns)

    def train_test_split(
        self, selected_x_values: pd.DataFrame
    ) -> "SplitDataset":
        """Split *selected_x_values* and ``y_value`` into train and test parts.

        The module-level ``train_test_split`` function is what gets called
        here; the method merely shares its name.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            selected_x_values,
            self.y_value,
            test_size=self.test_size / 100,  # test_size is stored as a percentage
            random_state=self.random_state,
        )

        return SplitDataset(
            X_train=cast(pd.DataFrame, X_train),
            X_test=cast(pd.DataFrame, X_test),
            y_train=cast(pd.Series, y_train),
            y_test=cast(pd.Series, y_test),
        )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
common/util.py DELETED
@@ -1,391 +0,0 @@
1
- # DATA MANIPULATION & ANALYSIS
2
-
3
- import pickle
4
- import streamlit as st
5
-
6
- # Arrays
7
- import numpy as np
8
-
9
- # DataFrames and Series
10
- import pandas as pd
11
-
12
- # Returns the indices of the maximum values along an axis
13
- from numpy import argmax
14
-
15
- # MODELLING
16
-
17
- # Logistic regression
18
- from sklearn.linear_model import LogisticRegression
19
-
20
- from sklearn.model_selection import StratifiedKFold
21
-
22
- # XGBoosted Decision Trees
23
- import xgboost as xgb
24
-
25
-
26
- # REPORTING, EVALUATION, AND INTERPRETATION
27
-
28
- # Classification report
29
- from sklearn.metrics import classification_report
30
-
31
- # Receiver Operating Characteristic (ROC) curve
32
- from sklearn.metrics import roc_curve
33
-
34
-
35
- # Evaluate a score by cross-validation
36
- from sklearn.model_selection import cross_val_score
37
-
38
-
39
- # # Functions
40
-
41
-
42
def drop_columns(df, columns):
    """Return a copy of *df* without the given column label(s)."""
    remaining = df.drop(columns=columns)
    return remaining
44
-
45
-
46
def remove_less_than_0_columns(df, column):
    """Keep the rows of *df* where at least one selected column is non-zero.

    Args:
        df: frame to filter; it is not modified.
        column: a single column label or a list of labels to inspect.

    Returns:
        The rows of *df* for which any inspected value differs from 0.

    NOTE(review): despite the name, the test is ``!= 0`` (not ``< 0``), and
    NaN compares as "not equal to 0", so rows with NaN are kept. The original
    called ``dropna()`` and discarded the result, so that call never had any
    effect; it is omitted here to keep the observable behaviour the same.
    """
    selected = df[column]
    if isinstance(selected, pd.Series):
        # A single label yields a Series; lift it so the row-wise any() works.
        selected = selected.to_frame()
    # ``axis`` must be passed by keyword: positional ``any(1)`` is rejected
    # by pandas 2.x.
    keep = (selected != 0).any(axis=1)
    return df.loc[keep]
49
-
50
-
51
def boolean_int_condition_label(df, label_column_name, condition):
    """Build an integer label from *condition* and return it with the features.

    Args:
        df: source frame; it is left untouched.
        label_column_name: name given to the label series.
        condition: value(s) assignable as a column of *df* (for example a
            boolean mask); converted with ``astype(int)``.

    Returns:
        ``(y, features)`` where ``y`` is the integer label series and
        ``features`` is *df* without a ``label_column_name`` column.
    """
    # Work on a copy: the original assigned the temporary label column
    # straight into the caller's frame and left it there.
    labelled = df.copy()
    labelled[label_column_name] = condition
    y = labelled[label_column_name].astype(int)
    features = labelled.drop(label_column_name, axis=1)
    return y, features
56
-
57
-
58
@st.cache(suppress_st_warning=True)
def undersample_training_data(
    df: pd.DataFrame, column_name: str, split_dataset
):
    """Balance the two classes of *df* by random undersampling.

    Args:
        df: frame holding predictors and the label column.
        column_name: label column; rows are selected where it equals 0 or 1.
        split_dataset: object whose ``X_y_train[column_name]`` supplies the
            class counts that fix the per-class sample size.

    Returns:
        ``[X_train_under, y_train_under, X_y_train_under, class_balance]``.

    Raises:
        ValueError: if class 0 or class 1 is absent from the training labels.

    NOTE(review): sampling uses no fixed ``random_state``, so the drawn rows
    differ between uncached runs.
    """
    class_counts = split_dataset.X_y_train[column_name].value_counts()

    # value_counts() orders by frequency, not by label, so look counts up by
    # label and fail with a clear message when one class is missing.
    missing = [label for label in (0, 1) if label not in class_counts.index]
    if missing:
        raise ValueError(
            f"Cannot undersample: class(es) {missing} missing from "
            f"'{column_name}' in the training data"
        )
    under_sample = int(min(class_counts.loc[0], class_counts.loc[1]))

    nondefaults = df[df[column_name] == 0]
    defaults = df[df[column_name] == 1]

    nondefaults_under = nondefaults.sample(under_sample)
    defaults_under = defaults.sample(under_sample)

    # ignore_index gives one unique 0..n-1 index; resetting each half
    # separately produced every index label twice.
    X_y_train_under = pd.concat(
        [nondefaults_under, defaults_under],
        axis=0,
        ignore_index=True,
    )

    X_train_under = X_y_train_under.drop([column_name], axis=1)  # predictors
    y_train_under = X_y_train_under[column_name]  # label only
    class_balance_default = y_train_under.value_counts()

    return [
        X_train_under,
        y_train_under,
        X_y_train_under,
        class_balance_default,
    ]
96
-
97
-
98
def create_coeffient_feature_dictionary_logistic_model(
    logistic_model, training_data
):
    """Map each column of *training_data* to the model's coefficient for it.

    Uses the first row of ``logistic_model.coef_`` paired positionally with
    ``training_data.columns``.
    """
    feature_names = training_data.columns
    coefficients = logistic_model.coef_[0, :]
    return dict(zip(feature_names, coefficients))
107
-
108
-
109
@st.cache(suppress_st_warning=True)
def test_variables_logistic(X_train, y_train):
    """Fit and return an ``lbfgs`` logistic regression on the training data."""
    # Labels are flattened to 1-D before fitting.
    flat_labels = np.ravel(y_train)
    classifier = LogisticRegression(solver="lbfgs")
    return classifier.fit(X_train, flat_labels)
113
-
114
-
115
@st.cache(suppress_st_warning=True)
def print_coeff_logistic(clf_logistic_model, split_dataset):
    """Return a feature -> coefficient dict for the fitted logistic model.

    Feature names come from ``split_dataset.X_train``.
    """
    training_frame = split_dataset.X_train
    return create_coeffient_feature_dictionary_logistic_model(
        logistic_model=clf_logistic_model,
        training_data=training_frame,
    )
121
-
122
-
123
@st.cache(suppress_st_warning=True, hash_funcs={
    xgb.XGBClassifier: pickle.dumps
})
def test_variables_gbt(X_train, y_train):
    """Fit and return an XGBoost classifier on the training data.

    Uses the hyperparameters ``learning_rate=0.1`` and ``max_depth=7`` with
    the ``logloss`` evaluation metric.
    """
    classifier = xgb.XGBClassifier(
        learning_rate=0.1,
        max_depth=7,
        use_label_encoder=False,
        eval_metric="logloss",
    )
    # eval_metric is already set on the estimator; repeating it in fit() is
    # deprecated and rejected by newer xgboost releases.
    return classifier.fit(X_train, np.ravel(y_train))
134
-
135
-
136
- # In[398]:
137
-
138
-
139
def get_df_trueStatus_probabilityDefault_threshStatus_loanAmount(
    model, X, y, threshold, loan_amount_col_name
):
    """Build one frame of true status, default probability, thresholded status and loan amount.

    ``model.predict_proba`` is called on a contiguous copy of *X*; column 1
    of its result becomes ``PROB_DEFAULT``. A row is flagged 1 in
    ``PREDICT_DEFAULT_STATUS`` when its probability is strictly greater than
    *threshold*. Every part is re-indexed 0..n-1 before joining column-wise.
    """
    actual = y.to_frame()
    amounts = X[loan_amount_col_name]

    class_probabilities = model.predict_proba(np.ascontiguousarray(X))
    prob_default = pd.DataFrame(
        class_probabilities[:, 1], columns=["PROB_DEFAULT"]
    )

    def _flag(probability):
        if probability > threshold:
            return 1
        return 0

    predicted = prob_default["PROB_DEFAULT"].apply(_flag)
    predicted = predicted.rename("PREDICT_DEFAULT_STATUS")

    pieces = [actual, prob_default, predicted, amounts]
    return pd.concat(
        [piece.reset_index(drop=True) for piece in pieces],
        axis=1,
    )
167
-
168
-
169
def find_best_threshold_J_statistic(y, clf_prediction_prob_df):
    """Return the ROC threshold that maximises Youden's J statistic (TPR - FPR).

    NOTE(review): this file defines a second, cached function with the same
    name further down. That later definition replaces this one when the
    module is imported, so this version is effectively dead code.
    """
    fpr, tpr, thresholds = roc_curve(y, clf_prediction_prob_df)
    # get the best threshold
    # Youden’s J statistic tpr-fpr
    # Argmax to get the index in
    # thresholds
    return thresholds[argmax(tpr - fpr)]
176
-
177
-
178
- # In[399]:
179
-
180
-
181
- # Function that makes dataframe with probability of default, predicted default status based on threshold
182
- # and actual default status
183
-
184
-
185
def model_probability_values_df(model, X):
    """Return column 1 of ``model.predict_proba(X)`` as a ``PROB_DEFAULT`` frame."""
    class_probabilities = model.predict_proba(X)
    positive_class = class_probabilities[:, 1]
    return pd.DataFrame(positive_class, columns=["PROB_DEFAULT"])
187
-
188
-
189
def apply_threshold_to_probability_values(probability_values, threshold):
    """Turn ``PROB_DEFAULT`` into a 0/1 series named ``PREDICT_DEFAULT_STATUS``.

    A value maps to 1 only when it is strictly greater than *threshold*.
    """

    def _flag(probability):
        if probability > threshold:
            return 1
        return 0

    flagged = probability_values["PROB_DEFAULT"].apply(_flag)
    return flagged.rename("PREDICT_DEFAULT_STATUS")
195
-
196
-
197
@st.cache(suppress_st_warning=True)
def find_best_threshold_J_statistic(y, clf_prediction_prob_df):
    """Return the ROC threshold with the largest Youden's J (TPR - FPR)."""
    false_positive_rate, true_positive_rate, candidates = roc_curve(
        y, clf_prediction_prob_df
    )
    youden_j = true_positive_rate - false_positive_rate
    # argmax picks the first position at which J peaks.
    best_position = argmax(youden_j)
    return candidates[best_position]
204
-
205
-
206
- # In[401]:
207
-
208
-
209
def create_cross_validation_df(
    X, y, eval_metric, seed, trees, n_folds, early_stopping_rounds
):
    """Run ``xgb.cv`` on (*X*, *y*) and return ``[DMatrix, cv results]``.

    *eval_metric* (the original comment suggests ``auc`` or ``logloss``) and
    *seed* go into the booster parameters together with the
    ``binary:logistic`` objective; *trees* is the number of boosting rounds.
    """
    dmatrix = xgb.DMatrix(X, label=y)

    booster_params = {
        "eval_metric": eval_metric,
        "objective": "binary:logistic",  # binary outcome for loan status
        "seed": seed,
    }

    cv_results = xgb.cv(
        booster_params,
        dmatrix,
        num_boost_round=trees,
        nfold=n_folds,
        early_stopping_rounds=early_stopping_rounds,
        shuffle=True,
    )

    return [dmatrix, cv_results]
233
-
234
-
235
- # In[450]:
236
-
237
-
238
def cross_validation_scores(model, X, y, nfold, score, seed):
    """Return the per-fold *score* values from shuffled stratified k-fold CV."""
    features = np.ascontiguousarray(X)
    labels = np.ravel(np.ascontiguousarray(y))
    folds = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed)
    return cross_val_score(model, features, labels, cv=folds, scoring=score)
247
-
248
-
249
def default_status_per_threshold(threshold_list, prob_default):
    """Return one 0/1 series per threshold, in the order of *threshold_list*.

    A probability maps to 1 only when it is strictly greater than the
    threshold being applied.
    """
    return [
        prob_default.apply(lambda p, cutoff=cutoff: 1 if p > cutoff else 0)
        for cutoff in threshold_list
    ]
257
-
258
-
259
def classification_report_per_threshold(
    threshold_list, threshold_default_status_list, y_test
):
    """Return a dict mapping each threshold to its classification report.

    Reports are built with ``output_dict=True`` and paired positionally with
    *threshold_list*.
    """
    class_labels = ["Non-Default", "Default"]
    reports = [
        classification_report(
            y_test,
            predicted_status,
            target_names=class_labels,
            output_dict=True,
            zero_division=0,
        )
        for predicted_status in threshold_default_status_list
    ]
    return dict(zip(threshold_list, reports))
275
-
276
-
277
def thresh_classification_report_recall_accuracy(
    thresh_classification_report_dict,
):
    """Collect default recall, non-default recall and accuracy per threshold.

    Returns three lists, in that order, each following the dict's key order.
    """
    default_recalls = []
    nondefault_recalls = []
    accuracies = []
    for report in thresh_classification_report_dict.values():
        default_recalls.append(report["Default"]["recall"])
        nondefault_recalls.append(report["Non-Default"]["recall"])
        accuracies.append(report["accuracy"])
    return [default_recalls, nondefault_recalls, accuracies]
299
-
300
-
301
def create_accept_rate_list(start, end, samples):
    """Return *samples* evenly spaced rates from *start* to *end*, both included."""
    rates = np.linspace(start, end, num=samples, endpoint=True)
    return rates
303
-
304
-
305
def create_strategyTable_df(
    start, end, samples, actual_probability_predicted_acc_rate, true, currency
):
    """Build the acceptance-rate strategy table.

    For each of *samples* acceptance rates between *start* and *end*
    (inclusive) the ``PROB_DEFAULT`` quantile, rounded to 3 decimals, is used
    as the threshold. Loans whose probability is not above it count as
    accepted.

    Args:
        start, end, samples: bounds and number of acceptance rates.
        actual_probability_predicted_acc_rate: frame with a ``PROB_DEFAULT``
            column and the column named by *true*. It is NOT modified; the
            original overwrote its ``PREDICT_DEFAULT_STATUS`` column on
            every iteration.
        true: name of the column averaged for the bad rate.
        currency: label put into the estimated-value column header.

    Returns:
        DataFrame sorted by acceptance rate, descending. All columns except
        ``Num Accepted Loans`` are strings formatted to two decimals.
    """
    frame = actual_probability_predicted_acc_rate
    prob_default = frame["PROB_DEFAULT"]

    # Same values create_accept_rate_list(start, end, samples) returns.
    accept_rates = np.linspace(start, end, samples, endpoint=True)

    # NOTE(review): this is the mean of the *true* column, which callers
    # appear to pass as the true-status column, not a loan amount. It looks
    # like the loan-amount column was intended; confirm before relying on
    # the estimated values. Behaviour is kept as in the original.
    avg_loan_amnt = frame[true].mean()

    thresholds_strat = []
    bad_rates = []
    num_accepted_loans = []
    for rate in accept_rates:
        # Quantile computed once per rate (the original computed it twice).
        thresh = np.quantile(prob_default, rate).round(3)
        thresholds_strat.append(thresh)

        # Accepted = predicted non-default = probability not above threshold.
        accepted_true = frame.loc[~(prob_default > thresh), true]

        bad_rates.append(
            np.sum(accepted_true / len(accepted_true)).round(3)
        )
        num_accepted_loans.append(len(accepted_true))

    money_accepted = [count * avg_loan_amnt for count in num_accepted_loans]
    money_bad = [
        2 * money * bad_rate
        for money, bad_rate in zip(money_accepted, bad_rates)
    ]
    estimated_value = [
        money - bad for money, bad in zip(money_accepted, money_bad)
    ]

    def _two_decimals(values):
        return ["{:.2f}".format(value) for value in values]

    table = pd.DataFrame(
        {
            "Acceptance Rate": _two_decimals(accept_rates),
            "Threshold": _two_decimals(thresholds_strat),
            "Bad Rate": _two_decimals(bad_rates),
            "Num Accepted Loans": num_accepted_loans,
            f"Estimated Value ({currency})": _two_decimals(estimated_value),
        }
    )
    return table.sort_values(
        by="Acceptance Rate", axis=0, ascending=False
    ).reset_index(drop=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
common/views.py DELETED
@@ -1,361 +0,0 @@
1
- from typing import OrderedDict
2
- import streamlit as st # works on command prompt
3
- import matplotlib.pyplot as plt
4
- import numpy as np
5
- import pandas as pd
6
- import xgboost as xgb
7
- from sklearn.metrics import (
8
- roc_curve,
9
- )
10
- from sklearn.calibration import calibration_curve
11
- from xgboost import plot_tree
12
- from views.typing import ModelView
13
-
14
-
15
def plot_logistic_coeff_barh(coef_dict, x, y):
    """Draw a horizontal bar chart of coefficients, smallest first, on a new *x*-by-*y* inch figure."""
    figure = plt.figure(figsize=(x, y))
    ascending_pairs = sorted(coef_dict.items(), key=lambda pair: pair[1])
    # zip(*pairs) transposes into (feature names, coefficient values).
    plt.barh(*zip(*ascending_pairs))
    return figure
22
-
23
-
24
def print_negative_coefficients_logistic_model(coef_dict):
    """Write the features with coefficient <= 0 to the page, most negative first.

    Args:
        coef_dict: mapping of feature name to logistic-regression coefficient.
    """
    # Equal to or less than 0
    negative_coefficients = {
        feature: coef for feature, coef in coef_dict.items() if coef <= 0.0
    }
    negative_coefficients_sorted = sorted(
        negative_coefficients.items(), key=lambda item: item[1]
    )
    text = (
        "\n\nFeatures the model found to be negatively correlated with probability of default are:"
        "\n{negative_features}:"
    )
    # The original also passed a type object and a dict view to st.markdown;
    # those were leftover debug output that the positive counterpart does not
    # emit, so they are removed.
    st.markdown(text.format(negative_features=negative_coefficients_sorted))
40
-
41
-
42
def print_positive_coefficients_logistic_model(coef_dict):
    """Write the features with coefficient >= 0 to the page, largest first."""
    # Equal to or greater than 0
    non_negative = [
        (feature, coef) for feature, coef in coef_dict.items() if coef >= 0.0
    ]
    ranked = sorted(non_negative, key=lambda pair: pair[1], reverse=True)
    template = (
        "\n\nFeatures the model found to be positively correlated with probability of default are:"
        "\n{positive_features}:"
    )
    st.markdown(template.format(positive_features=ranked))
56
-
57
-
58
def plot_importance_gbt(clf_gbt_model, barxsize, barysize):
    """Plot weight-based feature importance, write a caption, and return the resized figure."""
    importance_axes = xgb.plot_importance(
        clf_gbt_model, importance_type="weight"
    )
    figure = importance_axes.figure
    st.write("Feature Importance Plot (Gradient Boosted Tree)")
    figure.set_size_inches(barxsize, barysize)
    return figure
64
-
65
-
66
def download_importance_gbt(fig1, barxsize, barysize):
    """Save the feature-importance figure to ``bar.png`` when the button is pressed.

    Args:
        fig1: the figure to save.
        barxsize, barysize: figure width and height in inches; the larger
            one times 96 is used as the dpi.
    """
    if st.button(
        "Download Feature Importance Plot as png (Gradient Boosted Tree)"
    ):
        dpisize = max(barxsize, barysize)
        fig1.set_size_inches(barxsize, barysize)
        # Save the figure that was passed in. plt.savefig would write
        # whatever figure pyplot currently treats as "current", which need
        # not be fig1.
        fig1.savefig("bar.png", dpi=dpisize * 96, bbox_inches="tight")
73
-
74
-
75
def plot_tree_gbt(treexsize, treeysize, clf_gbt_model):
    """Plot a tree of the model and return pyplot's current figure, resized."""
    plot_tree(clf_gbt_model)
    # The figure is fetched from pyplot's current-figure state after plotting.
    tree_figure = plt.gcf()
    tree_figure.set_size_inches(treexsize, treeysize)
    return tree_figure
80
-
81
-
82
def download_tree_gbt(treexsize, treeysize):
    """Save pyplot's current figure to ``tree.png`` when the button is pressed.

    The dpi is 96 times the larger of the two sizes.
    """
    pressed = st.button(
        "Download Decision Tree Plot as png (Gradient Boosted Tree)"
    )
    if not pressed:
        return
    larger_side = max(treexsize, treeysize)
    plt.savefig("tree.png", dpi=larger_side * 96, bbox_inches="tight")
86
-
87
-
88
def cross_validation_graph(cv, eval_metric, trees):
    """Plot the third column of *cv* against iteration number on a new figure.

    NOTE(review): the original comment describes this column as the test
    scores; the code only relies on it being at position 2.
    """
    figure = plt.figure()
    plotted_column = cv.columns[2]
    plt.plot(cv[plotted_column])
    plt.title(f"Test {eval_metric} Score Over {trees} Iterations")
    plt.xlabel("Iteration Number")
    plt.ylabel(f"Test {eval_metric} Score")
    return figure
101
-
102
-
103
def recall_accuracy_threshold_tradeoff_fig(
    widthsize,
    heightsize,
    threshold_list,
    thresh_def_recalls_list,
    thresh_nondef_recalls_list,
    thresh_accs_list,
):
    """Plot default recall, non-default recall and accuracy against threshold.

    Both axes are limited to [0, 1]; the new figure is returned.
    """
    figure = plt.figure(figsize=(widthsize, heightsize))
    curves = (
        (thresh_def_recalls_list, "Default Recall"),
        (thresh_nondef_recalls_list, "Non-Default Recall"),
        (thresh_accs_list, "Model Accuracy"),
    )
    for values, curve_label in curves:
        plt.plot(threshold_list, values, label=curve_label)
    plt.xlabel("Probability Threshold")
    plt.ylabel("Score")
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.legend()
    plt.title("Recall and Accuracy Score Tradeoff with Probability Threshold")
    plt.grid(False)
    return figure
125
-
126
-
127
def roc_auc_compare_n_models(y, model_views: OrderedDict[str, ModelView]):
    """Plot one ROC curve per model plus the random-prediction diagonal.

    Args:
        y: true labels passed to ``roc_curve``.
        model_views: model name -> view; each view's
            ``prediction_probability_df`` is scored against *y*.

    Returns:
        The new matplotlib figure.
    """
    colors = ["blue", "green"]
    fig = plt.figure()
    for color_idx, (model_name, model_view) in enumerate(model_views.items()):
        fpr, tpr, _thresholds = roc_curve(
            y, model_view.prediction_probability_df
        )
        # The first two models keep the fixed colours. Further models fall
        # back to matplotlib's colour cycle instead of raising IndexError.
        color = colors[color_idx] if color_idx < len(colors) else None
        plt.plot(fpr, tpr, color=color, label=f"{model_name}")
    plt.plot([0, 1], [0, 1], linestyle="--", label="Random Prediction")

    # Build "A", "A and B", or "A, B and C" for the title.
    model_names = list(model_views.keys())
    if not model_names:
        model_name_str = "None"
    elif len(model_names) == 1:
        model_name_str = model_names[0]
    else:
        model_name_str = " and ".join(
            [", ".join(model_names[:-1]), model_names[-1]]
        )
    plt.title(f"ROC Chart for {model_name_str} on the Probability of Default")
    plt.xlabel("False Positive Rate (FP Rate)")
    plt.ylabel("True Positive Rate (TP Rate)")
    plt.legend()
    plt.grid(False)
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    return fig
153
-
154
-
155
def calibration_curve_report_commented_n(
    y, model_views: OrderedDict[str, ModelView], bins: int
):
    """Plot a calibration curve per model against the perfect-calibration line.

    Args:
        y: true labels.
        model_views: model name -> view; each view's
            ``prediction_probability_df`` is passed to ``calibration_curve``.
        bins: number of bins for ``calibration_curve``.

    Returns:
        The new matplotlib figure.
    """
    fig = plt.figure()
    for model_name, model_view in model_views.items():
        # ``normalize=True`` was deprecated and then removed from
        # scikit-learn's calibration_curve, so it is no longer passed.
        # NOTE(review): assumes prediction_probability_df already holds
        # probabilities in [0, 1]; on older scikit-learn the removed flag
        # min-max rescaled them, so curves may shift slightly — confirm.
        frac_of_pos, mean_pred_val = calibration_curve(
            y,
            model_view.prediction_probability_df,
            n_bins=bins,
        )
        plt.plot(mean_pred_val, frac_of_pos, "s-", label=f"{model_name}")

    # Guideline for a perfectly calibrated model
    plt.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")

    plt.ylabel("Fraction of positives")
    plt.xlabel("Average Predicted Probability")
    plt.title("Calibration Curve")
    plt.legend()
    plt.grid(False)
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    return fig
179
-
180
-
181
def acceptance_rate_threshold_fig(probability_default, acceptancerate, bins):
    """Plot a histogram of default probabilities with the acceptance threshold marked.

    Args:
        probability_default: values to histogram; must support ``describe()``.
        acceptancerate: quantile used as the acceptance threshold.
        bins: histogram bins.

    Returns:
        ``(figure, describe() summary, threshold value)``.
    """
    # Probability distribution
    probability_stat_distribution = probability_default.describe()

    # Acceptance rate threshold
    acc_rate_thresh = np.quantile(probability_default, acceptancerate)
    fig = plt.figure()

    plt.hist(
        probability_default,
        color="blue",
        bins=bins,
        histtype="bar",
        ec="white",
    )

    # Add a reference line to the plot for the threshold
    plt.axvline(x=acc_rate_thresh, color="red")
    # Title typo fixed ("Thershold" -> "Threshold").
    plt.title("Acceptance Rate Threshold")

    return (
        fig,
        probability_stat_distribution,
        acc_rate_thresh,
    )
206
-
207
-
208
def streamlit_2columns_metrics_pct_df(
    column1name_label: str,
    column2name_label: str,
    df: pd.DataFrame,
):
    """Show the share of value 1 and value 0 in *df* as two side-by-side metrics.

    Shares are ``value_counts()`` entries divided by the number of rows and
    formatted as whole percentages. A class that is absent shows as 0%
    instead of raising ``TypeError`` on ``None / int``. Empty input also
    shows 0%.
    """
    counts = df.value_counts()
    total = df.shape[0]

    def _share(label):
        return counts.get(label, 0) / total if total else 0.0

    (
        column1name,
        column2name,
    ) = st.columns(2)

    with column1name:
        st.metric(
            label=column1name_label,
            value="{:.0%}".format(_share(1)),
            delta=None,
            delta_color="normal",
        )

    with column2name:
        st.metric(
            label=column2name_label,
            value="{:.0%}".format(_share(0)),
            delta=None,
            delta_color="normal",
        )
233
-
234
-
235
def streamlit_2columns_metrics_df(
    column1name_label: str,
    column2name_label: str,
    df: pd.DataFrame,
):
    """Show the count of value 1 and value 0 in *df* as two side-by-side metrics."""
    left_column, right_column = st.columns(2)

    with left_column:
        ones = df.value_counts().get(1)
        st.metric(
            label=column1name_label,
            value=ones,
            delta=None,
            delta_color="normal",
        )

    with right_column:
        zeros = df.value_counts().get(0)
        st.metric(
            label=column2name_label,
            value=zeros,
            delta=None,
            delta_color="normal",
        )
260
-
261
-
262
def streamlit_2columns_metrics_df_shape(df: pd.DataFrame):
    """Show the row and column counts of *df* as two side-by-side metrics."""
    rows_column, cols_column = st.columns(2)

    with rows_column:
        st.metric(
            label="Rows",
            value=df.shape[0],
            delta=None,
            delta_color="normal",
        )

    with cols_column:
        st.metric(
            label="Columns",
            value=df.shape[1],
            delta=None,
            delta_color="normal",
        )
283
-
284
-
285
def streamlit_2columns_metrics_pct_series(
    column1name_label: str,
    column2name_label: str,
    series: pd.Series,
):
    """Show entries 1 and 0 of *series* as percentages of its sum, side by side.

    A missing entry shows as 0% instead of raising ``TypeError`` on
    ``None / number``. A zero sum also shows 0%.
    """
    total = series.sum()

    def _share(label):
        return series.get(label, 0) / total if total else 0.0

    (
        column1name,
        column2name,
    ) = st.columns(2)
    with column1name:
        st.metric(
            label=column1name_label,
            value="{:.0%}".format(_share(1)),
            delta=None,
            delta_color="normal",
        )

    with column2name:
        st.metric(
            label=column2name_label,
            value="{:.0%}".format(_share(0)),
            delta=None,
            delta_color="normal",
        )
309
-
310
-
311
def streamlit_2columns_metrics_series(
    column1name_label: str,
    column2name_label: str,
    series: pd.Series,
):
    """Show entries 1 and 0 of *series* as two side-by-side metrics."""
    left_column, right_column = st.columns(2)

    with left_column:
        entry_one = series.get(1)
        st.metric(
            label=column1name_label,
            value=entry_one,
            delta=None,
            delta_color="normal",
        )

    with right_column:
        entry_zero = series.get(0)
        st.metric(
            label=column2name_label,
            value=entry_zero,
            delta=None,
            delta_color="normal",
        )
335
-
336
-
337
def streamlit_chart_setting_height_width(
    title: str,
    default_widthvalue: int,
    default_heightvalue: int,
    widthkey: str,
    heightkey: str,
):
    """Render width/height number inputs inside an expander and return both values.

    Returns ``(width, height)`` as entered, seeded with the given defaults.
    """
    with st.expander(title):
        width_column, height_column = st.columns(2)

        with width_column:
            chosen_width = st.number_input(
                label="Width in inches:",
                value=default_widthvalue,
                key=widthkey,
            )

        with height_column:
            chosen_height = st.number_input(
                label="Height in inches:",
                value=default_heightvalue,
                key=heightkey,
            )

    return chosen_width, chosen_height
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data_setup.py DELETED
@@ -1,180 +0,0 @@
1
- from typing import Tuple, cast
2
-
3
- import pandas as pd
4
- import streamlit as st
5
-
6
- from common.data import Dataset, SplitDataset
7
- from common.util import (
8
- undersample_training_data,
9
- )
10
- from common.views import (
11
- streamlit_2columns_metrics_df_shape,
12
- streamlit_2columns_metrics_series,
13
- streamlit_2columns_metrics_pct_series,
14
- streamlit_2columns_metrics_df,
15
- streamlit_2columns_metrics_pct_df,
16
- )
17
-
18
-
19
# Initialise the dataset kept in Streamlit session state and render the data set-up page
def initialise_data() -> Tuple[Dataset, SplitDataset]:
    """Load the input data, render the set-up widgets and return the split.

    Reads ``./data/processed/cr_loan_w2.csv`` once per session, lets the user
    pick predictors, test-set percentage and random state, shows class
    balance metrics and optionally undersamples the training data.

    Returns:
        ``(dataset, split_dataset)``. The training parts of
        ``split_dataset`` are the undersampled ones when "Balance the
        Classes" is "Yes".
    """
    if "input_data_frame" not in st.session_state:
        st.session_state.input_data_frame = pd.read_csv(
            r"./data/processed/cr_loan_w2.csv"
        )
    if "dataset" not in st.session_state:
        df = cast(pd.DataFrame, st.session_state.input_data_frame)
        dataset = Dataset(
            df=df,
            random_state=123235,
            test_size=40,
        )
        st.session_state.dataset = dataset
    else:
        dataset = st.session_state.dataset

    st.write(
        "Assuming data is already cleaned and relevant features (predictors) added."
    )

    with st.expander("Input Dataframe (X and y)"):
        st.dataframe(dataset.df)
        streamlit_2columns_metrics_df_shape(dataset.df)

    st.header("Predictors")

    possible_columns = dataset.x_values_column_names

    selected_columns = st.sidebar.multiselect(
        label="Select Predictors",
        options=possible_columns,
        default=possible_columns,
    )

    selected_x_values = dataset.x_values_filtered_columns(selected_columns)

    st.sidebar.metric(
        label="# of Predictors Selected",
        value=selected_x_values.shape[1],
        delta=None,
        delta_color="normal",
    )
    with st.expander("Predictors Dataframe (X)"):
        st.dataframe(selected_x_values)
        streamlit_2columns_metrics_df_shape(selected_x_values)

    # Defaults set above: 40% of the rows are held out for testing and
    # 123235 is the random seed, for reproducibility.

    st.header("Split Testing and Training Data")

    test_size_slider_col, seed_col = st.columns(2)

    with test_size_slider_col:
        # Keep the percentage strictly between 0 and 100: a float test_size
        # of 0.0 or 1.0 is rejected by scikit-learn's train_test_split.
        dataset.test_size = st.slider(
            label="Test Size Percentage of Input Dataframe:",
            min_value=1,
            max_value=99,
            value=min(max(dataset.test_size, 1), 99),
            key="init_test_size",
            format="%d%%",  # integer slider; "%f%%" rendered "40.000000%"
        )

    with seed_col:
        dataset.random_state = int(
            st.number_input(label="Random State:", value=dataset.random_state)
        )

    split_dataset = dataset.train_test_split(selected_x_values)

    # Series of class counts for the test labels
    true_status = split_dataset.y_test.to_frame().value_counts()

    st.sidebar.metric(
        label="Testing Data # of Actual Default (=1)",
        value=true_status.get(1),
    )

    # A default of 0 keeps the percentage metrics from raising on a missing class.
    st.sidebar.metric(
        label="Testing Data % of Actual Default",
        value="{:.0%}".format(true_status.get(1, 0) / true_status.sum()),
    )

    st.sidebar.metric(
        label="Testing Data # of Actual Non-Default (=0)",
        value=true_status.get(0),
    )

    st.sidebar.metric(
        label="Testing Data % of Actual Non-Default",
        value="{:.0%}".format(true_status.get(0, 0) / true_status.sum()),
    )

    # Predictors joined with labels, for display and undersampling
    X_y_test = split_dataset.X_y_test
    X_y_train = split_dataset.X_y_train

    with st.expander("Testing Dataframe (X and y)"):
        st.dataframe(X_y_test)
        streamlit_2columns_metrics_df_shape(X_y_test)

    streamlit_2columns_metrics_series(
        "# Defaults(=1) (Testing Data)",
        "# Non-Defaults(=0) (Testing Data)",
        true_status,
    )

    streamlit_2columns_metrics_pct_series(
        "% Defaults (Testing Data)",
        "% Non-Defaults (Testing Data)",
        true_status,
    )

    st.header("Training Data")

    with st.expander("Training Dataframe (X and y)"):
        st.dataframe(X_y_train)
        streamlit_2columns_metrics_df_shape(X_y_train)

    st.subheader("Class Count")

    streamlit_2columns_metrics_df(
        "# Defaults (Training Data Class Balance Check)",
        "# Non-Defaults (Training Data Class Balance Check)",
        split_dataset.y_train,
    )

    streamlit_2columns_metrics_pct_df(
        "% Defaults (Training Data Class Balance Check)",
        "% Non-Defaults (Training Data Class Balance Check)",
        split_dataset.y_train,
    )

    balance_the_classes = st.radio(
        label="Balance the Classes:", options=("Yes", "No")
    )

    if balance_the_classes == "Yes":
        st.subheader("Balanced Classes (by Undersampling)")

        # Replace the training parts of the split with the undersampled ones.
        (
            split_dataset.X_train,
            split_dataset.y_train,
            _X_y_train,
            class_balance_default,
        ) = undersample_training_data(X_y_train, "loan_status", split_dataset)

        streamlit_2columns_metrics_series(
            "# Defaults (Training Data with Class Balance)",
            "# Non-Defaults (Training Data with Class Balance)",
            class_balance_default,
        )

        streamlit_2columns_metrics_pct_series(
            "% of Defaults (Training Data with Class Balance)",
            "% of Non-Defaults (Training Data with Class Balance)",
            class_balance_default,
        )

    return dataset, split_dataset
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
views/__init__.py DELETED
File without changes
views/decision_tree.py DELETED
@@ -1,70 +0,0 @@
1
- from common.data import SplitDataset
2
- import streamlit as st
3
- from common.util import (
4
- test_variables_gbt,
5
- )
6
- from common.views import (
7
- streamlit_chart_setting_height_width,
8
- plot_importance_gbt,
9
- plot_tree_gbt,
10
- download_importance_gbt,
11
- download_tree_gbt,
12
- )
13
- from views.typing import ModelView
14
- from views.threshold import decision_tree_threshold_view
15
- from views.evaluation import decision_tree_evaluation_view
16
-
17
-
18
def decisiontree_view(split_dataset: SplitDataset, currency: str) -> ModelView:
    """Render the "Decision Trees" section and return its model view.

    Fits a model on the training split via ``test_variables_gbt``, draws the
    feature-importance and tree-structure charts (each followed by its
    download helper), then delegates to the decision-tree threshold and
    evaluation views.

    Args:
        split_dataset: Train/test split; ``X_train``/``y_train`` are used for
            fitting, the whole split is forwarded to the sub-views.
        currency: Currency label forwarded to the evaluation view.

    Returns:
        A ``ModelView`` bundling the fitted model, the selected probability
        threshold, the thresholded predictions, the prediction-probability
        dataframe and the dataframe returned by the evaluation view.
    """
    st.header("Decision Trees")

    clf_gbt_model = test_variables_gbt(
        split_dataset.X_train, split_dataset.y_train
    )

    st.subheader("Decision Tree Feature Importance")

    barxsize, barysize = streamlit_chart_setting_height_width(
        "Chart Settings", 10, 15, "barxsize", "barysize"
    )
    importance_fig = plot_importance_gbt(clf_gbt_model, barxsize, barysize)
    st.pyplot(importance_fig)
    download_importance_gbt(importance_fig, barxsize, barysize)

    st.subheader("Decision Tree Structure")

    treexsize, treeysize = streamlit_chart_setting_height_width(
        "Chart Settings", 15, 10, "treexsize", "treeysize"
    )
    tree_fig = plot_tree_gbt(treexsize, treeysize, clf_gbt_model)
    st.pyplot(tree_fig)
    download_tree_gbt(treexsize, treeysize)
    st.markdown(
        "Note: The downloaded decision tree plot chart in png has higher resolution than that displayed here."
    )

    threshold = decision_tree_threshold_view(clf_gbt_model, split_dataset)

    evaluation_df = decision_tree_evaluation_view(
        clf_gbt_model,
        split_dataset,
        currency,
        threshold.probability_threshold_selected,
        threshold.predicted_default_status,
    )

    return ModelView(
        model=clf_gbt_model,
        trueStatus_probabilityDefault_threshStatus_loanAmount_df=evaluation_df,
        probability_threshold_selected=threshold.probability_threshold_selected,
        predicted_default_status=threshold.predicted_default_status,
        prediction_probability_df=threshold.prediction_probability_df,
    )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
views/evaluation.py DELETED
@@ -1,410 +0,0 @@
1
- from typing import Union
2
- import pandas as pd
3
- import streamlit as st
4
- import numpy as np
5
- from sklearn.metrics import (
6
- classification_report,
7
- confusion_matrix,
8
- )
9
- from sklearn.linear_model import LogisticRegression
10
- from xgboost.sklearn import XGBClassifier
11
- from common.data import SplitDataset
12
- from common.util import (
13
- create_cross_validation_df,
14
- cross_validation_scores,
15
- get_df_trueStatus_probabilityDefault_threshStatus_loanAmount,
16
- )
17
- from common.views import (
18
- cross_validation_graph,
19
- )
20
-
21
-
22
def make_evaluation_view(
    model_name_short: str,
    model_name_generic: str,
):
    """Build the Streamlit "Model Evaluation" view for one model family.

    Args:
        model_name_short: Suffix appended to every widget key so the views
            created for different models do not share widget state.
        model_name_generic: Human-readable model name shown in the header.

    Returns:
        A ``view(clf_gbt_model, split_dataset, currency,
        prob_thresh_selected, predicted_default_status)`` callable. It renders
        the cross-validation, classification-report, confusion-matrix and
        bad-rate sections and returns the dataframe produced by
        ``get_df_trueStatus_probabilityDefault_threshStatus_loanAmount``.
    """

    def int_input(label: str, value: int, key: str) -> int:
        """Render a number input keyed per model and return its value as int."""
        return int(
            st.number_input(
                label=label,
                value=value,
                key=f"{key}_{model_name_short}",
            )
        )

    def plain_metric(label: str, value: str) -> None:
        """Render a metric tile without a delta indicator."""
        st.metric(label=label, value=value, delta=None, delta_color="normal")

    def view(
        clf_gbt_model: Union[XGBClassifier, LogisticRegression],
        split_dataset: SplitDataset,
        currency: str,
        prob_thresh_selected,
        predicted_default_status,
    ):
        st.header(f"Model Evaluation - {model_name_generic}")

        # ---- Cross validation ------------------------------------------
        st.subheader("Cross Validation")

        st.write("Shows how our model will perform as new loans come in.")
        st.write(
            "If evaluation metric for test and train set improve as models "
            "train on each fold suggests performance will be stable."
        )

        st.write("XGBoost cross validation test:")

        stcol_seed, stcol_eval_metric = st.columns(2)

        with stcol_seed:
            cv_seed = int_input(
                "Random State Seed for Cross Validation:", 123235, "cv_seed"
            )

        with stcol_eval_metric:
            eval_metric = st.selectbox(
                label="Select evaluation metric",
                options=[
                    "auc",
                    "aucpr",
                    "rmse",
                    "mae",
                    "logloss",
                    "error",
                    "merror",
                    "mlogloss",
                ],
                key=f"eval_metric_{model_name_short}",
            )

        stcol_trees, stcol_eval_nfold, stcol_earlystoppingrounds = st.columns(
            3
        )

        with stcol_trees:
            trees = int_input("Number of trees", 5, "trees")

        with stcol_eval_nfold:
            nfolds = int_input("Number of folds", 5, "nfolds")

        with stcol_earlystoppingrounds:
            early_stopping_rounds = int_input(
                "Early stopping rounds", 10, "early_stopping_rounds"
            )

        # NOTE(review): cross validation is run on the *test* split here and
        # below; confirm that this is intended rather than the train split.
        # The first element of the returned pair is not used by this view.
        _, cv_df = create_cross_validation_df(
            split_dataset.X_test,
            split_dataset.y_test,
            eval_metric,
            cv_seed,
            trees,
            nfolds,
            early_stopping_rounds,
        )

        st.write(cv_df)

        overfit_test = st.radio(
            label="Overfit test:",
            options=("No", "Yes"),
            key=f"overfit_test_{model_name_short}",
        )

        if overfit_test == "Yes":
            st.write("Overfit test:")
            iterations = int_input(
                "Number of folds (iterations)", 500, "iterations"
            )

            # The iteration count is passed both as the number of trees and
            # as the early-stopping rounds, exactly as the original did.
            _, cv_df_it = create_cross_validation_df(
                split_dataset.X_test,
                split_dataset.y_test,
                eval_metric,
                cv_seed,
                iterations,
                nfolds,
                iterations,
            )

            fig_it = cross_validation_graph(cv_df_it, eval_metric, iterations)
            st.pyplot(fig_it)

        st.write("Sklearn cross validation test:")
        stcol_scoringmetric, st_nfold = st.columns(2)

        with stcol_scoringmetric:
            score_metric = st.selectbox(
                label="Select score",
                options=[
                    "roc_auc",
                    "accuracy",
                    "precision",
                    "recall",
                    "f1",
                    "jaccard",
                ],
                key=f"stcol_scoringmetric_{model_name_short}",
            )

        with st_nfold:
            nfolds_score = int_input("Number of folds", 5, "st_nfold")

        cv_scores = cross_validation_scores(
            clf_gbt_model,
            split_dataset.X_test,
            split_dataset.y_test,
            nfolds_score,
            score_metric,
            cv_seed,
        )

        stcol_vals, stcol_mean, st_std = st.columns(3)

        with stcol_vals:
            st.markdown(f"{score_metric} scores:")
            st.write(pd.DataFrame(cv_scores, columns=[score_metric]))

        with stcol_mean:
            plain_metric(
                f"Average {score_metric} score ", f"{cv_scores.mean():.4f}"
            )

        with st_std:
            plain_metric(
                f"{score_metric} standard deviation (+/-)",
                f"{cv_scores.std():.4f}",
            )

        # ---- Classification report -------------------------------------
        st.subheader("Classification Report")

        classification_report_dict = classification_report(
            split_dataset.y_test,
            predicted_default_status,
            target_names=["Non-Default", "Default"],
            output_dict=True,
        )
        default_precision = classification_report_dict["Default"]["precision"]
        default_recall = classification_report_dict["Default"]["recall"]
        default_f1 = classification_report_dict["Default"]["f1-score"]
        macro_f1 = classification_report_dict["macro avg"]["f1-score"]

        (
            stcol_defaultpres,
            stcol_defaultrecall,
            stcol_defaultf1score,
            stcol_f1score,
        ) = st.columns(4)

        with stcol_defaultpres:
            plain_metric("Default Precision", f"{default_precision:.0%}")

        with stcol_defaultrecall:
            plain_metric("Default Recall", f"{default_recall:.0%}")

        with stcol_defaultf1score:
            plain_metric("Default F1 Score", f"{default_f1:.2f}")

        with stcol_f1score:
            plain_metric(
                "Macro avg F1 Score (Model F1 Score):", f"{macro_f1:.2f}"
            )

        with st.expander("Classification Report Dictionary:"):
            st.write(classification_report_dict)

        st.markdown(
            f"Default precision: {default_precision:.0%} of loans predicted as default were actually default."
        )

        st.markdown(
            f"Default recall: {default_recall:.0%} of true defaults predicted correctly."
        )

        f1_gap = 1 - default_f1
        st.markdown(
            f"Default F1 score: {default_f1:.2f} "
            f"is {f1_gap:.2f} away from perfect precision and recall (no false positive rate)."
        )

        st.markdown(
            f"macro avg F1 score: {macro_f1:.2f} is the models F1 score."
        )

        # ---- Confusion matrix ------------------------------------------
        st.subheader("Confusion Matrix")

        # Computed once; the flattened form yields tn, fp, fn, tp in order.
        confusion = confusion_matrix(
            split_dataset.y_test, predicted_default_status
        )
        tn, fp, fn, tp = confusion.ravel()

        with st.expander(
            "Confusion matrix (column name = classification model prediction, row name = true status, values = number of loans"
        ):
            st.write(confusion)

        n_loans = len(predicted_default_status)
        for count, meaning in (
            (tp, "true positives (defaults correctly predicted as defaults)."),
            (
                fp,
                "false positives (non-defaults incorrectly predicted as defaults).",
            ),
            (
                fn,
                "false negatives (defaults incorrectly predicted as non-defaults).",
            ),
            (
                tn,
                "true negatives (non-defaults correctly predicted as non-defaults).",
            ),
        ):
            st.markdown(f"{count} , {count / n_loans:.0%} {meaning}")

        # ---- Bad rate ----------------------------------------------------
        st.subheader("Bad Rate")

        df_trueStatus_probabilityDefault_threshStatus_loanAmount = (
            get_df_trueStatus_probabilityDefault_threshStatus_loanAmount(
                clf_gbt_model,
                split_dataset.X_test,
                split_dataset.y_test,
                prob_thresh_selected,
                "loan_amnt",
            )
        )
        evaluation_df = df_trueStatus_probabilityDefault_threshStatus_loanAmount

        with st.expander(
            "Loan Status, Probability of Default, & Loan Amount DataFrame"
        ):
            st.write(evaluation_df)

        # Accepted loans are the ones the model classified as non-default.
        accepted_loans = evaluation_df[
            evaluation_df["PREDICT_DEFAULT_STATUS"] == 0
        ]

        bad_rate = (
            np.sum(accepted_loans["loan_status"])
            / accepted_loans["loan_status"].count()
        )

        with st.expander("Loan Amount Summary Statistics"):
            st.write(evaluation_df["loan_amnt"].describe())

        avg_loan = np.mean(evaluation_df["loan_amnt"])

        # Rows are the true status, columns the predicted status. Reindexing
        # guarantees a full 2x2 table even when a class is absent (e.g. a
        # threshold so high that no loan is predicted to default), which
        # previously raised KeyError in the lookups below.
        crosstab_df = (
            pd.crosstab(
                evaluation_df["loan_status"],  # row label
                evaluation_df["PREDICT_DEFAULT_STATUS"],  # column label
            ).reindex(index=[0, 1], columns=[0, 1], fill_value=0)
            * avg_loan
        )

        with st.expander(
            "Cross tabulation (column name = classification model prediction, row name = true status, values = number of loans * average loan value"
        ):
            st.write(crosstab_df)

        st.write(
            f"Bad rate: {bad_rate:.2%} of all the loans the model accepted (classified as non-default) from the test set were actually defaults."
        )

        # .loc[true_status, predicted_status]
        st.write(
            f"Estimated value of the bad rate is {currency} {crosstab_df.loc[1, 0]:,.2f}."
        )

        # NOTE(review): this sums the "predicted non-default" column (i.e. all
        # accepted loans), not the "actual non-default" row that the label
        # mentions; confirm which one is intended.
        st.write(
            f"Total estimated value of actual non-default loans is {currency} {crosstab_df.loc[0, 0] + crosstab_df.loc[1, 0]:,.2f}"
        )

        st.write(
            f"Estimated value of loans incorrectly predicted as default is {currency} {crosstab_df.loc[0, 1]:,.2f}"
        )

        st.write(
            f"Estimated value of loans correctly predicted as defaults is {currency} {crosstab_df.loc[1, 1]:,.2f}"
        )

        return df_trueStatus_probabilityDefault_threshStatus_loanAmount

    return view


# One evaluation view per model family; the short name namespaces widget keys.
decision_tree_evaluation_view = make_evaluation_view("gbt", "Decision Tree")
logistic_evaluation_view = make_evaluation_view("lg", "Logistic Regression")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
views/logistic.py DELETED
@@ -1,119 +0,0 @@
1
- from common.data import SplitDataset
2
- import streamlit as st
3
- import pandas as pd
4
- import plotly.express as px
5
- from views.threshold import logistic_threshold_view
6
- from views.evaluation import logistic_evaluation_view
7
- from common.util import (
8
- test_variables_logistic,
9
- print_coeff_logistic,
10
- model_probability_values_df,
11
- apply_threshold_to_probability_values,
12
- )
13
- from common.views import (
14
- streamlit_2columns_metrics_df,
15
- streamlit_2columns_metrics_pct_df,
16
- )
17
- from views.typing import ModelView
18
-
19
-
20
def logistic_view(split_dataset: SplitDataset, currency: str) -> ModelView:
    """Render the "Logistic Regression" section and return its model view.

    Fits the model on the training split via ``test_variables_logistic``,
    charts its coefficients, shows a user-defined threshold preview, then
    delegates to the logistic threshold and evaluation views.

    Args:
        split_dataset: Train/test split used for fitting and prediction.
        currency: Currency label forwarded to the evaluation view.

    Returns:
        A ``ModelView`` built from the fitted model and the results of the
        threshold and evaluation views.
    """
    st.header("Logistic Regression")

    logistic_model = test_variables_logistic(
        split_dataset.X_train, split_dataset.y_train
    )

    st.metric(
        label="# of Coefficients in Logistic Regression",
        value=logistic_model.n_features_in_,
        delta=None,
        delta_color="normal",
    )

    coefficients = print_coeff_logistic(logistic_model, split_dataset)

    st.subheader("Logistic Regression Coefficient Values")

    # Coefficients ordered from the smallest to the largest value.
    ascending_pairs = sorted(coefficients.items(), key=lambda pair: pair[1])
    coefficient_df = pd.DataFrame(
        ascending_pairs, columns=["Coefficient", "Value"]
    )

    coefficient_chart = px.bar(
        data_frame=coefficient_df,
        x="Value",
        y="Coefficient",
        orientation="h",
    )
    coefficient_chart.update_layout(
        title="Logistic Regression Coefficients",
        xaxis_title="Value",
        yaxis_title="Coefficient",
    )
    st.plotly_chart(coefficient_chart)

    st.subheader("Classification Probability Threshold")

    st.write(
        """
        The logistic regression model (obtained using training data) is applied on testing data to predict the loans probabilities of defaulting.\n
        Probabilities of defaulting of the loans are compared to a probability threshold.\n
        A loan is predicted to default if its predicted probability of defaulting is greater than the probability threshold.
        """
    )

    # Preview slider; the threshold actually used downstream comes from
    # logistic_threshold_view below.
    preview_threshold = st.slider(
        label="Default Probability Threshold:",
        min_value=0.0,
        max_value=1.0,
        value=0.7,
        key="key_threshold",
    )

    probability_df = model_probability_values_df(
        logistic_model,
        split_dataset.X_test,
    )
    preview_status = apply_threshold_to_probability_values(
        probability_df,
        preview_threshold,
    )

    streamlit_2columns_metrics_df(
        "# of Predicted Defaults",
        "# of Predicted Non-Default",
        preview_status,
    )
    streamlit_2columns_metrics_pct_df(
        "% of Loans Predicted to Default",
        "% of Loans Predicted not to Default",
        preview_status,
    )

    chosen = logistic_threshold_view(logistic_model, split_dataset)

    evaluation_df = logistic_evaluation_view(
        logistic_model,
        split_dataset,
        currency,
        chosen.probability_threshold_selected,
        chosen.predicted_default_status,
    )

    return ModelView(
        model=logistic_model,
        trueStatus_probabilityDefault_threshStatus_loanAmount_df=evaluation_df,
        probability_threshold_selected=chosen.probability_threshold_selected,
        predicted_default_status=chosen.predicted_default_status,
        prediction_probability_df=chosen.prediction_probability_df,
    )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
views/model_comparison.py DELETED
@@ -1,81 +0,0 @@
1
- from typing import OrderedDict
2
- import streamlit as st
3
- from sklearn.metrics import roc_auc_score
4
- from common.data import SplitDataset
5
- from common.views import (
6
- roc_auc_compare_n_models,
7
- streamlit_chart_setting_height_width,
8
- calibration_curve_report_commented_n,
9
- )
10
- from views.typing import ModelView
11
-
12
-
13
def roc_auc_for_model(split_dataset: SplitDataset, model_view: ModelView):
    """Compute a model's ROC AUC on the test split and name its category.

    Args:
        split_dataset: Provides the true test labels (``y_test``).
        model_view: Provides the model's ``predicted_default_status``.

    Returns:
        A ``(roc_auc_model, roc_auc_lvl)`` tuple: the score returned by
        ``roc_auc_score`` and a human-readable category string.
    """
    # NOTE(review): the score is computed from the thresholded predictions,
    # not from predicted probabilities; confirm this is intended.
    roc_auc_model = roc_auc_score(
        split_dataset.y_test, model_view.predicted_default_status
    )
    score = f"{roc_auc_model:.2f}"

    # Each branch only needs a lower bound because the higher bands were
    # already ruled out. Upper bounds are inclusive so that a score sitting
    # exactly on 0.9, 0.8 or 0.7 no longer falls through to "Fail".
    if roc_auc_model > 0.9:
        roc_auc_lvl = f"Very good ({score} > 0.9)"
    elif roc_auc_model > 0.8:
        roc_auc_lvl = f"Good (0.8 < {score} <= 0.9)"
    elif roc_auc_model > 0.7:
        roc_auc_lvl = f"Fair (0.7 < {score} <= 0.8)"
    elif roc_auc_model > 0.6:
        roc_auc_lvl = f"Poor (0.6 < {score} <= 0.7)"
    else:
        roc_auc_lvl = f"Fail ( {score} <= 0.6)"

    return roc_auc_model, roc_auc_lvl
30
-
31
-
32
def model_comparison_view(
    split_dataset: SplitDataset,
    model_views: OrderedDict[str, ModelView],
):
    """Render the "Model Comparison" section for all given models.

    Writes a ROC AUC summary per model, then one combined ROC chart and one
    combined calibration chart, each with its own size controls.

    Args:
        split_dataset: Provides the true test labels (``y_test``).
        model_views: Mapping of model display name to its ``ModelView``.
    """
    st.header("Model Comparison")

    # Per-model textual summary of the ROC AUC score.
    for name, view in model_views.items():
        auc_value, auc_category = roc_auc_for_model(split_dataset, view)
        st.subheader(
            f"Receiver Operating Characteristic (ROC) Curve - {name}"
        )
        st.markdown(
            f'Area Under the Receiver Operating Characteristic Curve from prediction scores from "{name}" model is {auc_value}.\n'
        )
        st.markdown(
            f"The score of {auc_value:.2f} is in the {auc_category} ROC AUC score category."
        )

    # Combined ROC chart for every model.
    roc_plot = roc_auc_compare_n_models(
        split_dataset.y_test,
        model_views,
    )
    roc_figure = roc_plot.figure

    roc_width, roc_height = streamlit_chart_setting_height_width(
        "Chart Settings", 7, 7, "xsize_roc", "ysize_roc"
    )
    roc_figure.set_size_inches(roc_width, roc_height)
    st.pyplot(roc_figure)

    st.subheader("Models Calibration Curve")

    # Combined calibration chart; the third argument is passed through as-is.
    calibration_plot = calibration_curve_report_commented_n(
        split_dataset.y_test,
        model_views,
        10,
    )
    calibration_figure = calibration_plot.figure

    cal_width, cal_height = streamlit_chart_setting_height_width(
        "Chart Settings", 7, 7, "xsize_cal", "ysize_cal"
    )
    calibration_figure.set_size_inches(cal_width, cal_height)
    st.pyplot(calibration_figure.figure)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
views/strategy_table.py DELETED
@@ -1,96 +0,0 @@
1
- from typing import OrderedDict
2
- import plotly.express as px
3
- import numpy as np
4
- import streamlit as st
5
- from common.util import create_strategyTable_df
6
- from views.typing import ModelView
7
-
8
-
9
def strategy_table_view(
    currency: str, model_views: OrderedDict[str, ModelView]
):
    """Render the "Strategy Table" section for every model.

    For each model this shows the strategy table built by
    ``create_strategyTable_df``, a box plot of its first three columns, the
    acceptance-rate/bad-rate and estimated-value curves, the row with the
    greatest estimated value, and the total expected loss.

    Args:
        currency: Currency label used in column names and the loss metric.
        model_views: Mapping of model display name to its ``ModelView``.
    """
    st.header("Strategy Table")

    value_column = f"Estimated Value ({currency})"

    for model_name, model_view in model_views.items():
        st.subheader(model_name)

        evaluation_df = (
            model_view.trueStatus_probabilityDefault_threshStatus_loanAmount_df
        )

        strat_df = create_strategyTable_df(
            0.05,
            1,
            20,
            evaluation_df,
            "loan_status",
            currency,
        )

        with st.expander("Strategy Table:"):
            st.write(strat_df)

        # The charts below need numeric columns; cast the whole table at once.
        strat_df = strat_df.astype(np.float64)

        strat_df_boxPlot_data = strat_df.iloc[:, 0:3]

        st.plotly_chart(px.box(data_frame=strat_df_boxPlot_data))

        # Plot the strategy curve
        st.plotly_chart(
            px.line(
                strat_df_boxPlot_data,
                x="Acceptance Rate",
                y="Bad Rate",
                title="Acceptance and Bad Rates",
            )
        )

        st.plotly_chart(
            px.line(
                strat_df,
                x="Acceptance Rate",
                y=value_column,
                title=f"Estimated Value ({currency}) by Acceptance Rate",
            )
        )

        st.write("Row with the greatest estimated value:")

        max_estimated_value = np.max(strat_df[value_column])
        st.write(strat_df.loc[strat_df[value_column] == max_estimated_value])

        # Expected loss per loan = P(default) * loss given default * amount.
        loss_given_default = 1
        expected_loss_per_loan = (
            evaluation_df["PROB_DEFAULT"]
            * loss_given_default
            * evaluation_df["loan_amnt"]
        )

        tot_exp_loss = round(np.sum(expected_loss_per_loan), 2)

        st.metric(
            label="Total expected loss:",
            value=f"{currency} {tot_exp_loss:,.2f}",
            delta=None,
            delta_color="normal",
        )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
views/threshold.py DELETED
@@ -1,272 +0,0 @@
1
- from dataclasses import dataclass
2
- from typing import Union, cast
3
- import numpy as np
4
- import streamlit as st
5
- import plotly.express as px
6
- import pandas as pd
7
- from xgboost.sklearn import XGBClassifier
8
- from sklearn.linear_model import LogisticRegression
9
- from common.data import SplitDataset
10
- from common.util import (
11
- model_probability_values_df,
12
- apply_threshold_to_probability_values,
13
- find_best_threshold_J_statistic,
14
- default_status_per_threshold,
15
- classification_report_per_threshold,
16
- thresh_classification_report_recall_accuracy,
17
- )
18
- from common.views import (
19
- streamlit_2columns_metrics_df,
20
- streamlit_2columns_metrics_pct_df,
21
- )
22
-
23
-
24
@dataclass(frozen=True)
class Threshold:
    """Immutable result returned by a threshold view."""

    # Probability cut-off chosen in the "Selected Probability Threshold" radio.
    probability_threshold_selected: float
    # Predicted default status obtained by applying that cut-off.
    predicted_default_status: pd.Series
    # Model prediction probabilities for the test set; the threshold view
    # reads its "PROB_DEFAULT" column.
    prediction_probability_df: pd.DataFrame
29
-
30
-
31
def make_threshold_view(
    model_name_short: str,
    model_name: str,
):
    """Build the Streamlit probability-threshold view for one model family.

    Args:
        model_name_short: Short name embedded in every widget key so the
            views created for different models do not share widget state.
        model_name: Human-readable model name used in the explanatory text.

    Returns:
        A ``view(clf_gbt_model, split_dataset)`` callable that renders the
        user-defined, J-statistic and acceptance-rate threshold sections and
        returns the ``Threshold`` the user selected.
    """

    def show_prediction_split(predicted_status) -> None:
        """Show counts and percentages of predicted defaults/non-defaults."""
        streamlit_2columns_metrics_df(
            "# of Predicted Defaults",
            "# of Predicted Non-Default",
            predicted_status,
        )
        streamlit_2columns_metrics_pct_df(
            "% of Loans Predicted to Default",
            "% of Loans Predicted not to Default",
            predicted_status,
        )

    def view(
        clf_gbt_model: Union[XGBClassifier, LogisticRegression],
        split_dataset: SplitDataset,
    ) -> Threshold:
        # ---- User-defined threshold --------------------------------------
        st.subheader("Classification Probability Threshold - User Defined")
        st.write(
            f"""
            The {model_name} model (obtained using training data) is applied on testing data to predict the loans probabilities of defaulting.\n
            Probabilities of defaulting of the loans are compared to a probability threshold.\n
            A loan is predicted to default if its predicted probability of defaulting is greater than the probability threshold.
            """
        )

        user_threshold = st.slider(
            label="Default Probability Threshold:",
            min_value=0.0,
            max_value=1.0,
            value=0.8,
            key=f"threshold_{model_name_short}_default",
        )

        prediction_prob_df = model_probability_values_df(
            clf_gbt_model,
            split_dataset.X_test,
        )

        user_status = apply_threshold_to_probability_values(
            prediction_prob_df,
            user_threshold,
        )
        show_prediction_split(user_status)

        # ---- J-statistic threshold ---------------------------------------
        st.subheader("J Statistic Driven Classification Probability Threshold")

        J_statistic_best_threshold = find_best_threshold_J_statistic(
            split_dataset.y_test, prediction_prob_df
        )
        st.metric(
            label="Youden's J statistic calculated best threshold",
            value=J_statistic_best_threshold,
        )

        j_status = apply_threshold_to_probability_values(
            prediction_prob_df,
            J_statistic_best_threshold,
        )
        show_prediction_split(j_status)

        # ---- Recall / accuracy trade-off ---------------------------------
        st.subheader(
            "Recall and Accuracy Tradeoff with given Probability Threshold"
        )
        # Steps
        # Get list of thresholds
        # Get default status per threshold
        # Get classification report per threshold
        # Get recall, nondef recall, and accuracy per threshold

        threshold_list = np.arange(0, 1, 0.025).round(decimals=3).tolist()

        threshold_default_status_list = default_status_per_threshold(
            threshold_list, prediction_prob_df["PROB_DEFAULT"]
        )
        thresh_classification_report_dict = (
            classification_report_per_threshold(
                threshold_list,
                threshold_default_status_list,
                split_dataset.y_test,
            )
        )

        (
            thresh_def_recalls_list,
            thresh_nondef_recalls_list,
            thresh_accs_list,
        ) = thresh_classification_report_recall_accuracy(
            thresh_classification_report_dict
        )

        # One row per series, transposed so every threshold becomes a row.
        tradeoff_df = pd.DataFrame(
            [
                thresh_def_recalls_list,
                thresh_nondef_recalls_list,
                thresh_accs_list,
                threshold_list,
            ],
            index=[
                "Default Recall",
                "Non Default Recall",
                "Accuracy",
                "Threshold",
            ],
        ).T

        tradeoff_fig = px.line(
            data_frame=tradeoff_df,
            y=["Default Recall", "Non Default Recall", "Accuracy"],
            x="Threshold",
        )
        tradeoff_fig.update_layout(
            title="Recall and Accuracy score Trade-off with Probability Threshold",
            xaxis_title="Probability Threshold",
            yaxis_title="Score",
        )
        tradeoff_fig.update_yaxes(range=[0.0, 1.0])

        st.plotly_chart(tradeoff_fig)

        # ---- Acceptance-rate threshold -----------------------------------
        st.subheader("Acceptance Rate Driven Probability Threshold")
        # Steps
        # Set acceptance rate
        # Get default status per threshold
        # Get classification report per threshold
        # Get recall, nondef recall, and accuracy per threshold

        # The slider works in whole percent; an integer format matches its
        # integer values (the previous "%f%%" was a float format).
        acceptance_rate = (
            st.slider(
                label="% of loans accepted (acceptance rate):",
                min_value=0,
                max_value=100,
                value=85,
                key=f"acceptance_rate_{model_name_short}",
                format="%d%%",
            )
            / 100
        )

        # The threshold is the default-probability quantile at the requested
        # acceptance rate.
        acceptance_threshold = np.quantile(
            prediction_prob_df["PROB_DEFAULT"], acceptance_rate
        )

        st.write(
            f"An acceptance rate of {acceptance_rate} results in probability threshold of {acceptance_threshold}"
        )

        acceptance_fig = px.histogram(prediction_prob_df["PROB_DEFAULT"])
        acceptance_fig.update_layout(
            title="Acceptance Rate Threshold vs. Loans Accepted",
            xaxis_title="Acceptance Rate Threshold",
            yaxis_title="Loans Accepted",
        )
        acceptance_fig.update_traces(
            marker_line_width=1, marker_line_color="white"
        )
        acceptance_fig.add_vline(
            x=acceptance_threshold,
            line_width=3,
            line_dash="solid",
            line_color="red",
        )

        st.plotly_chart(acceptance_fig)

        acceptance_status = apply_threshold_to_probability_values(
            prediction_prob_df,
            acceptance_threshold,
        )

        # ---- Final selection ---------------------------------------------
        st.subheader("Selected Probability Threshold")

        candidates = {
            "User Defined": (user_threshold, user_status),
            "J Statistic Driven": (J_statistic_best_threshold, j_status),
            "Acceptance Rate Driven": (acceptance_threshold, acceptance_status),
        }
        prob_thresh_option = st.radio(
            label="Selected Probability Threshold",
            options=list(candidates),
            key=f"{model_name_short}_radio_thresh",
        )

        # Any unexpected value falls back to the acceptance-rate threshold,
        # mirroring the original if/elif/else.
        selected_threshold, selected_status = candidates.get(
            prob_thresh_option, candidates["Acceptance Rate Driven"]
        )

        st.write(f"Selected probability threshold is {selected_threshold}")

        return Threshold(
            probability_threshold_selected=cast(float, selected_threshold),
            predicted_default_status=selected_status,
            prediction_probability_df=prediction_prob_df,
        )

    return view


# One threshold view per model family; the short name namespaces widget keys.
decision_tree_threshold_view = make_threshold_view("gbt", "decision tree")
logistic_threshold_view = make_threshold_view("lg", "logistic")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
views/typing.py DELETED
@@ -1,15 +0,0 @@
1
- from dataclasses import dataclass
2
- from typing import Union
3
-
4
- import pandas as pd
5
- from xgboost.sklearn import XGBClassifier
6
- from sklearn.linear_model import LogisticRegression
7
-
8
-
9
@dataclass(frozen=True)
class ModelView:
    """Immutable bundle of a fitted model and the results of its views."""

    # The fitted classifier.
    model: Union[XGBClassifier, LogisticRegression]
    # Probability cut-off selected for this model.
    probability_threshold_selected: float
    # Predicted default status obtained with the selected cut-off.
    predicted_default_status: pd.Series
    # Dataframe of true status, default probability, thresholded status and
    # loan amount (per the field name).
    trueStatus_probabilityDefault_threshStatus_loanAmount_df: pd.DataFrame
    # Model prediction probabilities.
    prediction_probability_df: pd.DataFrame