pkiage commited on
Commit
7f0977b
·
1 Parent(s): 0395eed

refactor: model comparison, utils, and clean up

Browse files
README.md CHANGED
@@ -21,6 +21,10 @@ An interactive tool demonstrating credit risk modelling.
21
 
22
  - Selecting optimal threshold using Youden's J statistic
23
 
 
 
 
 
24
  ## Political, Economic, Social, Technological, Legal and Environmental (PESTLE):
25
 
26
  [Europe fit for the Digital Age: Commission proposes new rules and actions for excellence and trust in Artificial Intelligence](https://ec.europa.eu/commission/presscorner/detail/en/ip_21_1682)
 
21
 
22
  - Selecting optimal threshold using Youden's J statistic
23
 
24
+ [Cookiecutter Data Science](https://drivendata.github.io/cookiecutter-data-science/)
25
+
26
+ - Project structure
27
+
28
  ## Political, Economic, Social, Technological, Legal and Environmental (PESTLE):
29
 
30
  [Europe fit for the Digital Age: Commission proposes new rules and actions for excellence and trust in Artificial Intelligence](https://ec.europa.eu/commission/presscorner/detail/en/ip_21_1682)
app.py CHANGED
@@ -7,6 +7,7 @@ from src.features.build_features import initialise_data
7
  from src.models.xgboost_model import xgboost_class
8
  from src.models.logistic_model import logistic_class
9
 
 
10
 
11
  from src.models.util_strategy_table import strategy_table_view
12
 
@@ -44,6 +45,8 @@ def main():
44
  xgboost_model_class = xgboost_class(split_dataset, currency)
45
  model_classes["XGBoost"] = xgboost_model_class
46
 
 
 
47
  strategy_table_view(currency, model_classes)
48
 
49
 
 
7
  from src.models.xgboost_model import xgboost_class
8
  from src.models.logistic_model import logistic_class
9
 
10
+ from src.models.util_model_comparison import model_comparison_view
11
 
12
  from src.models.util_strategy_table import strategy_table_view
13
 
 
45
  xgboost_model_class = xgboost_class(split_dataset, currency)
46
  model_classes["XGBoost"] = xgboost_model_class
47
 
48
+ model_comparison_view(split_dataset, model_classes)
49
+
50
  strategy_table_view(currency, model_classes)
51
 
52
 
src/features/build_features.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Union, cast, Tuple
2
+ from dataclasses import dataclass
3
+ from sklearn.model_selection import train_test_split
4
+ import pandas as pd
5
+
6
+ import streamlit as st
7
+
8
+
9
+ from src.features.util_build_features import (
10
+ Dataset,
11
+ SplitDataset,
12
+ undersample_training_data,
13
+ select_predictors,
14
+ import_data)
15
+
16
+ from src.visualization.metrics import (
17
+ streamlit_2columns_metrics_df_shape,
18
+ streamlit_2columns_metrics_series,
19
+ streamlit_2columns_metrics_pct_series,
20
+ streamlit_2columns_metrics_df,
21
+ streamlit_2columns_metrics_pct_df,
22
+ )
23
+
24
+
25
def initialise_data() -> Tuple[Dataset, SplitDataset]:
    """Render the data-preparation section and build the train/test split.

    Loads the dataset, lets the user pick predictors, the test-set size and
    the random state, splits the data, reports class counts for the testing
    and training sets, and optionally balances the training classes by
    undersampling.

    Returns:
        The ``Dataset`` (with the chosen test size and random state stored
        on it) and the resulting ``SplitDataset``.
    """
    dataset = import_data()

    st.write(
        "Assuming data is already cleaned and relevant features (predictors) added."
    )

    with st.expander("Input Dataframe (X and y)"):
        st.dataframe(dataset.df)
        streamlit_2columns_metrics_df_shape(dataset.df)

    selected_x_values = select_predictors(dataset)

    with st.expander("Predictors Dataframe (X)"):
        st.dataframe(selected_x_values)
        streamlit_2columns_metrics_df_shape(selected_x_values)

    st.header("Split Testing and Training Data")

    test_size_slider_col, seed_col = st.columns(2)

    with test_size_slider_col:
        # The percentage is divided by 100 and handed to train_test_split,
        # which rejects fractions of exactly 0.0 and 1.0, so the slider is
        # limited to 1-99 to keep either partition from being empty.
        dataset.test_size = st.slider(
            label="Test Size Percentage of Input Dataframe:",
            min_value=1,
            max_value=99,
            value=dataset.test_size,
            key="init_test_size",
            format="%f%%",
        )

    with seed_col:
        dataset.random_state = int(
            st.number_input(label="Random State:", value=dataset.random_state)
        )

    split_dataset = dataset.train_test_split(selected_x_values)

    true_status = split_dataset.y_test.to_frame().value_counts()
    tested_total = true_status.sum()

    # Sidebar summary: count and share of each actual class in the test set.
    for class_value, class_name in ((1, "Default"), (0, "Non-Default")):
        # Default to 0 so a class absent from the test set shows as zero
        # instead of raising on ``None / total``.
        class_count = true_status.get(class_value, 0)
        st.sidebar.metric(
            label=f"Testing Data # of Actual {class_name} (={class_value})",
            value=class_count,
        )
        st.sidebar.metric(
            label=f"Testing Data % of Actual {class_name}",
            value="{:.0%}".format(class_count / tested_total),
        )

    # Predictors and labels side by side for display.
    X_y_test = split_dataset.X_y_test
    X_y_train = split_dataset.X_y_train

    with st.expander("Testing Dataframe (X and y)"):
        st.dataframe(X_y_test)
        streamlit_2columns_metrics_df_shape(X_y_test)

    streamlit_2columns_metrics_series(
        "# Defaults(=1) (Testing Data)",
        "# Non-Defaults(=0) (Testing Data)",
        true_status,
    )

    streamlit_2columns_metrics_pct_series(
        "% Defaults (Testing Data)",
        "% Non-Defaults (Testing Data)",
        true_status,
    )

    st.header("Training Data")

    with st.expander("Training Dataframe (X and y)"):
        st.dataframe(X_y_train)
        streamlit_2columns_metrics_df_shape(X_y_train)

    st.subheader("Class Count")

    streamlit_2columns_metrics_df(
        "# Defaults (Training Data Class Balance Check)",
        "# Non-Defaults (Training Data Class Balance Check)",
        split_dataset.y_train,
    )

    streamlit_2columns_metrics_pct_df(
        "% Defaults (Training Data Class Balance Check)",
        "% Non-Defaults (Training Data Class Balance Check)",
        split_dataset.y_train,
    )

    balance_the_classes = st.radio(
        label="Balance the Classes:", options=("Yes", "No")
    )

    if balance_the_classes == "Yes":
        st.subheader("Balanced Classes (by Undersampling)")

        # The training partition is replaced in place by the undersampled
        # one; the combined frame returned third is not needed here.
        (
            split_dataset.X_train,
            split_dataset.y_train,
            _X_y_train,
            class_balance_default,
        ) = undersample_training_data(X_y_train, "loan_status", split_dataset)

        streamlit_2columns_metrics_series(
            "# Defaults (Training Data with Class Balance)",
            "# Non-Defaults (Training Data with Class Balance)",
            class_balance_default,
        )

        streamlit_2columns_metrics_pct_series(
            "% of Defaults (Training Data with Class Balance)",
            "% of Non-Defaults (Training Data with Class Balance)",
            class_balance_default,
        )

    return dataset, split_dataset
src/features/util_build_features.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ from typing import List, Union, cast
4
+
5
+ from dataclasses import dataclass
6
+
7
+ from sklearn.model_selection import train_test_split
8
+
9
+ import pandas as pd
10
+
11
+
12
@dataclass
class SplitDataset:
    """Train/test partition of the predictors (X) and the labels (y)."""

    X_test: pd.DataFrame
    X_train: pd.DataFrame
    y_test: pd.Series
    y_train: pd.Series

    @property
    def X_y_test(self) -> pd.DataFrame:
        """Testing predictors and labels side by side, re-indexed from 0."""
        predictors = self.X_test.reset_index(drop=True)
        labels = self.y_test.reset_index(drop=True)
        pieces = cast(List[Union[pd.DataFrame, pd.Series]], [predictors, labels])
        return pd.concat(pieces, axis=1)

    @property
    def X_y_train(self) -> pd.DataFrame:
        """Training predictors and labels side by side, re-indexed from 0."""
        predictors = self.X_train.reset_index(drop=True)
        labels = self.y_train.reset_index(drop=True)
        pieces = cast(List[Union[pd.DataFrame, pd.Series]], [predictors, labels])
        return pd.concat(pieces, axis=1)
44
+
45
+
46
@dataclass
class Dataset:
    """Input data together with the settings used to split it."""

    df: pd.DataFrame
    random_state: int
    # Size of the testing partition as a percentage (divided by 100 on use).
    test_size: int

    @property
    def y_value(self) -> pd.Series:
        """The ``loan_status`` label column."""
        return self.df["loan_status"]

    @property
    def x_values(self) -> pd.DataFrame:
        """All columns except the label and the ``loan_grade_*`` columns."""
        return cast(
            pd.DataFrame,
            drop_columns(
                self.df,
                [
                    "loan_status",
                    "loan_grade_A",
                    "loan_grade_B",
                    "loan_grade_C",
                    "loan_grade_D",
                    "loan_grade_E",
                    "loan_grade_F",
                    "loan_grade_G",
                ],
            ),
        )

    @property
    def x_values_column_names(self) -> List[str]:
        """Names of the candidate predictor columns."""
        return self.x_values.columns.tolist()

    def x_values_filtered_columns(self, columns: List[str]) -> pd.DataFrame:
        """Return only the requested columns of the input frame."""
        return self.df.filter(columns)

    def train_test_split(
        self, selected_x_values: pd.DataFrame
    ) -> SplitDataset:
        """Split the selected predictors and the label into train and test.

        Args:
            selected_x_values: Predictor columns to split alongside
                ``y_value``.

        Returns:
            A ``SplitDataset`` holding the four resulting partitions.
        """
        # Inside a method the bare name resolves to the module-level
        # scikit-learn function, not to this method.
        X_train, X_test, y_train, y_test = train_test_split(
            selected_x_values,
            self.y_value,
            test_size=self.test_size / 100,  # stored as a percentage
            random_state=self.random_state,
        )

        return SplitDataset(
            X_train=cast(pd.DataFrame, X_train),
            X_test=cast(pd.DataFrame, X_test),
            y_train=cast(pd.Series, y_train),
            y_test=cast(pd.Series, y_test),
        )
98
+
99
+
100
def drop_columns(df, columns):
    """Return ``df`` without the given column label(s); ``df`` is untouched."""
    return df.drop(columns=columns)
102
+
103
+
104
def remove_less_than_0_columns(df, column):
    """Keep the rows of ``df`` with a non-zero value in any inspected column.

    Args:
        df: Frame to filter; it is not modified.
        column: A single column label or a list of labels to inspect.

    Returns:
        The rows where at least one inspected column is not equal to 0.
        Missing values compare as "not equal to 0", so such rows are kept.
    """
    # A bare label would select a Series, which has no column axis to reduce
    # over, so always select a DataFrame.
    inspected = [column] if isinstance(column, str) else list(column)
    # ``axis`` is passed by keyword: pandas 2.x rejects the positional form.
    return df.loc[(df[inspected] != 0).any(axis=1)]
107
+
108
+
109
def boolean_int_condition_label(df, label_column_name, condition):
    """Turn a boolean condition into an integer label.

    Args:
        df: Source frame. It is left unmodified; the label is built on a
            copy so the caller's frame does not gain an extra column.
        label_column_name: Name given to the label.
        condition: Boolean value(s) assignable as a column of ``df``.

    Returns:
        A tuple ``(y, features)``: the condition cast to ``int`` and ``df``
        without the ``label_column_name`` column.
    """
    labelled = df.copy()
    labelled[label_column_name] = condition
    y = labelled[label_column_name].astype(int)
    features = labelled.drop(columns=label_column_name)
    return y, features
114
+
115
+
116
@dataclass
class SplitDataset:
    """Train/test partition of the predictors (X) and the labels (y)."""

    X_test: pd.DataFrame
    X_train: pd.DataFrame
    y_test: pd.Series
    y_train: pd.Series

    @property
    def X_y_test(self) -> pd.DataFrame:
        """Testing predictors and labels side by side, re-indexed from 0."""
        predictors = self.X_test.reset_index(drop=True)
        labels = self.y_test.reset_index(drop=True)
        pieces = cast(List[Union[pd.DataFrame, pd.Series]], [predictors, labels])
        return pd.concat(pieces, axis=1)

    @property
    def X_y_train(self) -> pd.DataFrame:
        """Training predictors and labels side by side, re-indexed from 0."""
        predictors = self.X_train.reset_index(drop=True)
        labels = self.y_train.reset_index(drop=True)
        pieces = cast(List[Union[pd.DataFrame, pd.Series]], [predictors, labels])
        return pd.concat(pieces, axis=1)
148
+
149
+
150
@dataclass
class Dataset:
    """Input data together with the settings used to split it."""

    df: pd.DataFrame
    random_state: int
    # Size of the testing partition as a percentage (divided by 100 on use).
    test_size: int

    @property
    def y_value(self) -> pd.Series:
        """The ``loan_status`` label column."""
        return self.df["loan_status"]

    @property
    def x_values(self) -> pd.DataFrame:
        """All columns except the label and the ``loan_grade_*`` columns."""
        return cast(
            pd.DataFrame,
            drop_columns(
                self.df,
                [
                    "loan_status",
                    "loan_grade_A",
                    "loan_grade_B",
                    "loan_grade_C",
                    "loan_grade_D",
                    "loan_grade_E",
                    "loan_grade_F",
                    "loan_grade_G",
                ],
            ),
        )

    @property
    def x_values_column_names(self) -> List[str]:
        """Names of the candidate predictor columns."""
        return self.x_values.columns.tolist()

    def x_values_filtered_columns(self, columns: List[str]) -> pd.DataFrame:
        """Return only the requested columns of the input frame."""
        return self.df.filter(columns)

    def train_test_split(
        self, selected_x_values: pd.DataFrame
    ) -> SplitDataset:
        """Split the selected predictors and the label into train and test.

        Args:
            selected_x_values: Predictor columns to split alongside
                ``y_value``.

        Returns:
            A ``SplitDataset`` holding the four resulting partitions.
        """
        # Inside a method the bare name resolves to the module-level
        # scikit-learn function, not to this method.
        X_train, X_test, y_train, y_test = train_test_split(
            selected_x_values,
            self.y_value,
            test_size=self.test_size / 100,  # stored as a percentage
            random_state=self.random_state,
        )

        return SplitDataset(
            X_train=cast(pd.DataFrame, X_train),
            X_test=cast(pd.DataFrame, X_test),
            y_train=cast(pd.Series, y_train),
            y_test=cast(pd.Series, y_test),
        )
202
+
203
+
204
def drop_columns(df, columns):
    """Return ``df`` without the given column label(s); ``df`` is untouched."""
    return df.drop(columns=columns)
206
+
207
+
208
def remove_less_than_0_columns(df, column):
    """Keep the rows of ``df`` with a non-zero value in any inspected column.

    Args:
        df: Frame to filter; it is not modified.
        column: A single column label or a list of labels to inspect.

    Returns:
        The rows where at least one inspected column is not equal to 0.
        Missing values compare as "not equal to 0", so such rows are kept.
    """
    # A bare label would select a Series, which has no column axis to reduce
    # over, so always select a DataFrame.
    inspected = [column] if isinstance(column, str) else list(column)
    # ``axis`` is passed by keyword: pandas 2.x rejects the positional form.
    return df.loc[(df[inspected] != 0).any(axis=1)]
211
+
212
+
213
def boolean_int_condition_label(df, label_column_name, condition):
    """Turn a boolean condition into an integer label.

    Args:
        df: Source frame. It is left unmodified; the label is built on a
            copy so the caller's frame does not gain an extra column.
        label_column_name: Name given to the label.
        condition: Boolean value(s) assignable as a column of ``df``.

    Returns:
        A tuple ``(y, features)``: the condition cast to ``int`` and ``df``
        without the ``label_column_name`` column.
    """
    labelled = df.copy()
    labelled[label_column_name] = condition
    y = labelled[label_column_name].astype(int)
    features = labelled.drop(columns=label_column_name)
    return y, features
218
+
219
+
220
@st.cache(suppress_st_warning=True)
def undersample_training_data(
    df: pd.DataFrame, column_name: str, split_dataset
):
    """Balance the two label classes by randomly undersampling.

    Args:
        df: Training frame holding predictors and the label column.
        column_name: Name of the label column (0 = non-default, 1 = default).
        split_dataset: Split whose ``X_y_train`` label counts decide how many
            rows are drawn from each class.

    Returns:
        A list ``[X_train_under, y_train_under, X_y_train_under,
        class_balance_default]``: balanced predictors, balanced labels, the
        combined balanced frame, and the label counts of that frame.
    """
    # The smallest class count is the sample size for both classes. Taking
    # the minimum directly avoids assuming value_counts() yields exactly two
    # entries in a particular order.
    class_counts = split_dataset.X_y_train[column_name].value_counts()
    under_sample = int(class_counts.min())

    nondefaults = df[df[column_name] == 0]
    defaults = df[df[column_name] == 1]

    nondefaults_under = nondefaults.sample(under_sample)
    defaults_under = defaults.sample(under_sample)

    # ignore_index gives the stacked frame one unique 0..n-1 index instead of
    # repeating each class's own 0..k-1 labels.
    X_y_train_under = pd.concat(
        [nondefaults_under, defaults_under],
        axis=0,
        ignore_index=True,
    )

    X_train_under = X_y_train_under.drop([column_name], axis=1)  # predictors
    y_train_under = X_y_train_under[column_name]  # label only
    class_balance_default = X_y_train_under[column_name].value_counts()

    return [
        X_train_under,
        y_train_under,
        X_y_train_under,
        class_balance_default,
    ]
258
+
259
+
260
def select_predictors(dataset):
    """Let the user choose predictor columns from a sidebar multiselect.

    Every candidate column is selected by default.

    Returns:
        The dataset's frame restricted to the chosen columns.
    """
    st.header("Predictors")

    candidate_columns = dataset.x_values_column_names
    chosen_columns = st.sidebar.multiselect(
        label="Select Predictors",
        options=candidate_columns,
        default=candidate_columns,
    )

    return dataset.x_values_filtered_columns(chosen_columns)
271
+
272
+
273
def import_data():
    """Return the session's ``Dataset``, creating it on first use.

    The CSV is read only if no frame is stored in the Streamlit session yet,
    and the ``Dataset`` wrapper is built only if none is stored yet.
    """
    state = st.session_state

    if "input_data_frame" not in state:
        state.input_data_frame = pd.read_csv(
            r"./data/processed/cr_loan_w2.csv"
        )

    # Reuse the stored dataset so earlier widget choices are kept.
    if "dataset" in state:
        return state.dataset

    dataset = Dataset(
        df=cast(pd.DataFrame, state.input_data_frame),
        random_state=123235,
        test_size=40,
    )
    state.dataset = dataset
    return dataset
src/models/util_model_class.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import Union
3
+
4
+ import pandas as pd
5
+ from xgboost.sklearn import XGBClassifier
6
+ from sklearn.linear_model import LogisticRegression
7
+
8
+
9
@dataclass(frozen=True)
class ModelClass:
    """Immutable bundle of a fitted model and the outputs derived from it."""

    # The fitted classifier.
    model: Union[XGBClassifier, LogisticRegression]
    # Probability threshold chosen for classifying a loan as default.
    probability_threshold_selected: float
    # Default status predicted with the selected threshold.
    predicted_default_status: pd.Series
    # Combined frame; judging by its name it holds true status, default
    # probability, thresholded status and loan amount (TODO confirm).
    trueStatus_probabilityDefault_threshStatus_loanAmount_df: pd.DataFrame
    # Predicted default probabilities.
    prediction_probability_df: pd.DataFrame
src/models/util_model_comparison.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import OrderedDict
2
+ import streamlit as st
3
+ from sklearn.metrics import roc_auc_score
4
+ from src.features.util_build_features import SplitDataset
5
+ from src.visualization.graphs_settings import (
6
+ streamlit_chart_setting_height_width
7
+ )
8
+
9
+ from src.visualization.graphs_test import (
10
+ roc_auc_compare_n_models,
11
+ calibration_curve_report_commented_n
12
+ )
13
+
14
+
15
+ from src.models.util_model_class import ModelClass
16
+
17
+
18
def _roc_auc_category(roc_auc_model):
    """Describe the quality band a ROC AUC score falls into."""
    score_text = "{:.2f}".format(roc_auc_model)

    # Bands are half-open (lower, upper] so a score that sits exactly on a
    # boundary is classified instead of falling through to "Fail".
    if roc_auc_model > 0.9:
        return f"Very good ({score_text} > 0.9)"
    if roc_auc_model > 0.8:
        return f"Good (0.8 < {score_text} <= 0.9)"
    if roc_auc_model > 0.7:
        return f"Fair (0.7 < {score_text} <= 0.8)"
    if roc_auc_model > 0.6:
        return f"Poor (0.6 < {score_text} <= 0.7)"
    return f"Fail ({score_text} <= 0.6)"


def roc_auc_for_model(split_dataset: SplitDataset, model_view: ModelClass):
    """Score a model's predictions on the test labels and categorise them.

    Args:
        split_dataset: Provides the true test labels (``y_test``).
        model_view: Provides the model's ``predicted_default_status``.

    Returns:
        A tuple ``(roc_auc_model, roc_auc_lvl)``: the ROC AUC score and a
        text describing the band it falls into.
    """
    # NOTE(review): the score is computed from the thresholded status, not
    # from predicted probabilities - confirm this is intended.
    roc_auc_model = roc_auc_score(
        split_dataset.y_test, model_view.predicted_default_status
    )

    return roc_auc_model, _roc_auc_category(roc_auc_model)
35
+
36
+
37
def model_comparison_view(
    split_dataset: SplitDataset,
    model_views: OrderedDict[str, ModelClass],
):
    """Render the model-comparison section.

    Writes a ROC AUC summary for every model, then draws one ROC chart and
    one calibration chart covering all models, each with its own size
    controls.
    """

    def _show_resized(plot, x_key, y_key):
        # Ask for the chart size, apply it to the figure, then render it.
        figure = plot.figure
        width, height = streamlit_chart_setting_height_width(
            "Chart Settings", 7, 7, x_key, y_key
        )
        figure.set_size_inches(width, height)
        st.pyplot(figure)

    st.header("Model Comparison")

    # NOTE(review): only the per-model text is assumed to be inside the loop;
    # the charts take all models and use fixed widget keys - confirm against
    # the original indentation.
    for model_name, model_view in model_views.items():
        score, category = roc_auc_for_model(split_dataset, model_view)
        st.subheader(
            f"Receiver Operating Characteristic (ROC) Curve - {model_name}"
        )
        st.markdown(
            f'Area Under the Receiver Operating Characteristic Curve from prediction scores from {model_name} model is {score}.\n'
        )
        st.markdown(
            f'The score of {score:.2f} is in the {category} ROC AUC score category.'
        )

    roc_plot = roc_auc_compare_n_models(split_dataset.y_test, model_views)
    _show_resized(roc_plot, "xsize_roc", "ysize_roc")

    st.subheader("Models Calibration Curve")

    calibration_plot = calibration_curve_report_commented_n(
        split_dataset.y_test,
        model_views,
        10,
    )
    _show_resized(calibration_plot, "xsize_cal", "ysize_cal")
src/models/util_predict_model.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Union, cast
2
+ from sklearn.linear_model import LogisticRegression
3
+
4
+
5
+ import pandas as pd
6
+
7
+ from dataclasses import dataclass
8
+
9
+ from xgboost import XGBClassifier
10
+ from src.features.util_build_features import SplitDataset
11
+
12
+ from src.models.util_predict_model_threshold import (
13
+ user_defined_probability_threshold,
14
+ J_statistic_driven_probability_threshold,
15
+ tradeoff_threshold,
16
+ acceptance_rate_driven_threshold,
17
+ select_probability_threshold,
18
+ model_probability_values_df)
19
+
20
+ import streamlit as st
21
+
22
+
23
def probability_threshold_explainer(model_name):
    """Write a short explanation of how the probability threshold is used."""
    explanation = f"""
    The {model_name} model (obtained using training data) is applied on testing data to predict the loans probabilities of defaulting.\n
    Probabilities of defaulting of the loans are compared to a probability threshold.\n
    A loan is predicted to default if its predicted probability of defaulting is greater than the probability threshold.
    """
    st.write(explanation)
31
+
32
+
33
@dataclass(frozen=True)
class Threshold:
    """Immutable result of choosing a probability threshold for a model."""

    # The probability threshold that was selected.
    probability_threshold_selected: float
    # Default status predicted with that threshold.
    predicted_default_status: pd.Series
    # Predicted default probabilities the status was derived from.
    prediction_probability_df: pd.DataFrame
38
+
39
+
40
def make_prediction_view(
    model_name_short: str,
    model_name: str,
):
    """Build the threshold-selection view for one model.

    Args:
        model_name_short: Short name passed to the threshold widgets.
        model_name: Display name used in the explanatory text.

    Returns:
        A ``view(clf_xgbt_model, split_dataset)`` callable that renders the
        threshold sections and returns the chosen ``Threshold``.
    """

    def view(
        clf_xgbt_model: Union[XGBClassifier, LogisticRegression],
        split_dataset: SplitDataset,
    ) -> Threshold:
        probability_threshold_explainer(model_name)

        probabilities = model_probability_values_df(
            clf_xgbt_model,
            split_dataset.X_test,
        )

        # Each section renders its own widgets and yields one candidate
        # threshold with the default status it implies.
        user_status, user_threshold = user_defined_probability_threshold(
            model_name_short, clf_xgbt_model, split_dataset
        )

        j_status, j_threshold = J_statistic_driven_probability_threshold(
            probabilities, clf_xgbt_model, split_dataset
        )

        tradeoff_threshold(probabilities, split_dataset)

        acceptance_threshold, acceptance_status = (
            acceptance_rate_driven_threshold(model_name_short, probabilities)
        )

        chosen_threshold, chosen_status = select_probability_threshold(
            model_name_short,
            user_threshold,
            user_status,
            j_threshold,
            j_status,
            acceptance_threshold,
            acceptance_status,
        )

        return Threshold(
            probability_threshold_selected=cast(float, chosen_threshold),
            predicted_default_status=chosen_status,
            prediction_probability_df=probabilities,
        )

    return view
src/models/util_predict_model_threshold.py ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ from sklearn.metrics import classification_report, roc_curve
4
+
5
+ import numpy as np
6
+
7
+ import plotly.express as px
8
+
9
+ import pandas as pd
10
+
11
+ from numpy import argmax
12
+
13
+ from src.visualization.metrics import streamlit_2columns_metrics_df, streamlit_2columns_metrics_pct_df
14
+
15
+ from src.visualization.graphs_threshold import acceptance_rate_driven_threshold_graph
16
+
17
+
18
def model_probability_values_df(model, X):
    """Return the second ``predict_proba`` column as a ``PROB_DEFAULT`` frame."""
    default_probabilities = model.predict_proba(X)[:, 1]
    return pd.DataFrame({"PROB_DEFAULT": default_probabilities})
20
+
21
+
22
def find_best_threshold_J_statistic(y, clf_prediction_prob_df):
    """Return the ROC threshold that maximises Youden's J (TPR - FPR)."""
    false_positive_rate, true_positive_rate, candidates = roc_curve(
        y, clf_prediction_prob_df
    )
    youden_j = true_positive_rate - false_positive_rate
    # argmax gives the position of the best J within the thresholds array.
    return candidates[argmax(youden_j)]
29
+
30
+ # Function that makes dataframe with probability of default, predicted default status based on threshold
31
+ # and actual default status
32
+
33
+
34
def classification_report_per_threshold(
    threshold_list, threshold_default_status_list, y_test
):
    """Build one classification report per threshold.

    Args:
        threshold_list: Thresholds, used as keys of the result.
        threshold_default_status_list: Predicted statuses, one entry per
            threshold, in the same order.
        y_test: True labels the predictions are scored against.

    Returns:
        A dict mapping each threshold to its report dict.
    """
    class_names = ["Non-Default", "Default"]
    reports = [
        classification_report(
            y_test,
            predicted_status,
            target_names=class_names,
            output_dict=True,
            zero_division=0,
        )
        for predicted_status in threshold_default_status_list
    ]
    return dict(zip(threshold_list, reports))
50
+
51
+
52
def thresh_classification_report_recall_accuracy(
    thresh_classification_report_dict,
):
    """Collect default recall, non-default recall and accuracy per report.

    Args:
        thresh_classification_report_dict: Mapping of threshold to a report
            dict with ``"Default"``, ``"Non-Default"`` and ``"accuracy"``.

    Returns:
        A list of three lists, in the mapping's order: default recalls,
        non-default recalls and accuracies.
    """
    reports = list(thresh_classification_report_dict.values())
    default_recalls = [report["Default"]["recall"] for report in reports]
    nondefault_recalls = [
        report["Non-Default"]["recall"] for report in reports
    ]
    accuracies = [report["accuracy"] for report in reports]
    return [default_recalls, nondefault_recalls, accuracies]
74
+
75
+
76
def apply_threshold_to_probability_values(probability_values, threshold):
    """Label each ``PROB_DEFAULT`` value 1 if above ``threshold``, else 0.

    Returns:
        A Series named ``PREDICT_DEFAULT_STATUS``.
    """

    def _status(probability):
        # Strictly greater than: a probability equal to the threshold is 0.
        if probability > threshold:
            return 1
        return 0

    statuses = probability_values["PROB_DEFAULT"].apply(_status)
    return statuses.rename("PREDICT_DEFAULT_STATUS")
82
+
83
+
84
@st.cache(suppress_st_warning=True)
def find_best_threshold_J_statistic(y, clf_prediction_prob_df):
    """Return the ROC threshold that maximises Youden's J (TPR - FPR).

    This cached definition replaces the earlier one of the same name.
    """
    fpr, tpr, thresholds = roc_curve(y, clf_prediction_prob_df)
    # Index of the largest TPR - FPR difference selects the threshold.
    return thresholds[argmax(tpr - fpr)]
91
+
92
+
93
def default_status_per_threshold(threshold_list, prob_default):
    """Return, per threshold, the 0/1 status of every probability.

    A probability is labelled 1 only when strictly greater than the
    threshold. The result keeps the order of ``threshold_list``.
    """
    return [
        prob_default.apply(lambda x: 1 if x > threshold else 0)
        for threshold in threshold_list
    ]
101
+
102
+
103
def threshold_and_predictions(clf_xgbt_model, split_dataset, threshold):
    """Predict default status at ``threshold`` and show the resulting counts.

    Scores ``split_dataset.X_test`` with the model, applies the threshold,
    renders count and percentage metrics, and returns the predicted status.
    """
    test_probabilities = model_probability_values_df(
        clf_xgbt_model,
        split_dataset.X_test,
    )
    predicted_status = apply_threshold_to_probability_values(
        test_probabilities,
        threshold,
    )

    streamlit_2columns_metrics_df(
        "# of Predicted Defaults",
        "# of Predicted Non-Default",
        predicted_status,
    )
    streamlit_2columns_metrics_pct_df(
        "% of Loans Predicted to Default",
        "% of Loans Predicted not to Default",
        predicted_status,
    )

    return predicted_status
129
+
130
+
131
def user_defined_probability_threshold(model_name_short, clf_xgbt_model, split_dataset):
    """Let the user pick a threshold with a slider and apply it.

    Returns:
        A tuple ``(predicted_status, threshold)`` for the chosen value.
    """
    st.subheader("Classification Probability Threshold - User Defined")

    # The widget key includes the model name so each model has its own slider.
    slider_key = f"threshold_{model_name_short}_default"
    chosen_threshold = st.slider(
        label="Default Probability Threshold:",
        min_value=0.0,
        max_value=1.0,
        value=0.8,
        key=slider_key,
    )

    predicted_status = threshold_and_predictions(
        clf_xgbt_model, split_dataset, chosen_threshold
    )
    return predicted_status, chosen_threshold
146
+
147
+
148
def J_statistic_driven_probability_threshold(clf_prediction_prob_df_gbt, clf_xgbt_model, split_dataset):
    """Derive the threshold from Youden's J statistic and apply it.

    Returns:
        A tuple ``(predicted_status, threshold)`` for the J-optimal value.
    """
    st.subheader("J Statistic Driven Classification Probability Threshold")

    best_threshold = find_best_threshold_J_statistic(
        split_dataset.y_test, clf_prediction_prob_df_gbt
    )
    st.metric(
        label="Youden's J statistic calculated best threshold",
        value=best_threshold,
    )

    predicted_status = threshold_and_predictions(
        clf_xgbt_model, split_dataset, best_threshold
    )
    return predicted_status, best_threshold
163
+
164
+
165
def create_tradeoff_graph(df):
    """Plot recall and accuracy against the probability threshold.

    ``df`` must provide the columns ``Threshold``, ``Default Recall``,
    ``Non Default Recall`` and ``Accuracy``.
    """
    score_columns = ["Default Recall", "Non Default Recall", "Accuracy"]
    tradeoff_chart = px.line(data_frame=df, y=score_columns, x="Threshold")

    tradeoff_chart.update_layout(
        title="Recall and Accuracy score Trade-off with Probability Threshold",
        xaxis_title="Probability Threshold",
        yaxis_title="Score",
    )
    # Fix the score axis to the full 0-1 range.
    tradeoff_chart.update_yaxes(range=[0.0, 1.0])

    st.plotly_chart(tradeoff_chart)
180
+
181
+
182
def tradeoff_threshold(clf_prediction_prob_df_gbt, split_dataset):
    """Chart how recall and accuracy change over a grid of thresholds.

    Evaluates thresholds from 0 up to (but excluding) 1 in steps of 0.025
    against ``split_dataset.y_test`` and plots the resulting scores.
    """
    st.subheader(
        "Recall and Accuracy Tradeoff with given Probability Threshold"
    )

    threshold_list = np.arange(0, 1, 0.025).round(decimals=3).tolist()

    status_per_threshold = default_status_per_threshold(
        threshold_list, clf_prediction_prob_df_gbt["PROB_DEFAULT"]
    )
    report_per_threshold = classification_report_per_threshold(
        threshold_list,
        status_per_threshold,
        split_dataset.y_test,
    )
    default_recalls, nondefault_recalls, accuracies = (
        thresh_classification_report_recall_accuracy(report_per_threshold)
    )

    # One row per score series; transposing yields one column per series.
    scores_by_name = {
        "Default Recall": default_recalls,
        "Non Default Recall": nondefault_recalls,
        "Accuracy": accuracies,
        "Threshold": threshold_list,
    }
    tradeoff_df = pd.DataFrame(
        list(scores_by_name.values()),
        index=list(scores_by_name),
    ).T

    create_tradeoff_graph(tradeoff_df)
229
+
230
+
231
def select_probability_threshold(model_name_short,
                                 user_defined_threshold,
                                 clf_thresh_predicted_default_status_user_gbt,
                                 J_statistic_best_threshold,
                                 clf_thresh_predicted_default_status_Jstatistic_gbt,
                                 acc_rate_thresh_gbt,
                                 clf_thresh_predicted_default_status_acceptance_gbt):
    """Let the user choose which candidate threshold to use.

    Returns:
        A tuple ``(threshold, predicted_status)`` for the chosen candidate.
    """
    st.subheader("Selected Probability Threshold")

    # Option label -> (threshold, predicted status), in display order.
    candidates = {
        "User Defined": (
            user_defined_threshold,
            clf_thresh_predicted_default_status_user_gbt,
        ),
        "J Statistic Driven": (
            J_statistic_best_threshold,
            clf_thresh_predicted_default_status_Jstatistic_gbt,
        ),
        "Acceptance Rate Driven": (
            acc_rate_thresh_gbt,
            clf_thresh_predicted_default_status_acceptance_gbt,
        ),
    }

    chosen_option = st.radio(
        label="Selected Probability Threshold",
        options=list(candidates),
        key=f"{model_name_short}_radio_thresh",
    )

    # Any unrecognised choice falls back to the acceptance-rate candidate.
    selected_threshold, selected_status = candidates.get(
        chosen_option, candidates["Acceptance Rate Driven"]
    )

    st.write(
        f"Selected probability threshold is {selected_threshold}"
    )

    return selected_threshold, selected_status
272
+
273
+
274
def acceptance_rate_driven_threshold(model_name_short, clf_prediction_prob_df_gbt):
    """Derive the threshold from a user-chosen loan acceptance rate.

    The threshold is the quantile of ``PROB_DEFAULT`` at the acceptance rate.

    Returns:
        A tuple ``(threshold, predicted_status)``.
    """
    st.subheader("Acceptance Rate Driven Probability Threshold")

    # The slider works in percent; quantile needs a fraction.
    acceptance_percent = st.slider(
        label="% of loans accepted (acceptance rate):",
        min_value=0,
        max_value=100,
        value=85,
        key=f"acceptance_rate_{model_name_short}",
        format="%f%%",
    )
    acceptance_rate = acceptance_percent / 100

    acc_rate_thresh_gbt = np.quantile(
        clf_prediction_prob_df_gbt["PROB_DEFAULT"], acceptance_rate
    )

    st.write(
        f"An acceptance rate of {acceptance_rate} results in probability threshold of {acc_rate_thresh_gbt}"
    )

    acceptance_rate_driven_threshold_graph(
        clf_prediction_prob_df_gbt, acc_rate_thresh_gbt
    )

    predicted_status = apply_threshold_to_probability_values(
        clf_prediction_prob_df_gbt,
        acc_rate_thresh_gbt,
    )

    return acc_rate_thresh_gbt, predicted_status
src/models/util_strategy_table.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import OrderedDict
2
+ import plotly.express as px
3
+ import numpy as np
4
+ import streamlit as st
5
+ from src.models.util_test import create_strategyTable_df
6
+ from src.models.util_model_class import ModelClass
7
+
8
+
9
def strategy_table_view(
    currency: str, model_views: OrderedDict[str, ModelClass]
):
    """Render the strategy table, its charts and the expected loss per model.

    For every model the view shows the strategy table, a box plot of its
    first three columns, the acceptance/bad-rate curve, the estimated-value
    curve, the row(s) with the greatest estimated value and the total
    expected loss.

    Args:
        currency: Label used in the estimated-value column name and in the
            expected-loss metric.
        model_views: Mapping of display name to a model object exposing
            ``trueStatus_probabilityDefault_threshStatus_loanAmount_df``.
    """
    st.header("Strategy Table")

    value_column = f"Estimated Value ({currency})"

    for model_name, model_view in model_views.items():
        st.subheader(model_name)

        scored_loans = (
            model_view.trueStatus_probabilityDefault_threshStatus_loanAmount_df
        )

        strat_df = create_strategyTable_df(
            0.05,
            1,
            20,
            scored_loans,
            "loan_status",
            currency,
        )

        with st.expander("Strategy Table:"):
            st.write(strat_df)

        # The table is converted column by column to float64 so the
        # plots and the maximum below work on numbers.
        for column in strat_df.columns:
            strat_df[column] = strat_df[column].astype(np.float64)

        strat_df_boxPlot_data = strat_df.iloc[:, 0:3]

        st.plotly_chart(px.box(data_frame=strat_df_boxPlot_data))

        # Plot the strategy curve
        st.plotly_chart(
            px.line(
                strat_df_boxPlot_data,
                x="Acceptance Rate",
                y="Bad Rate",
                title="Acceptance and Bad Rates",
            )
        )

        st.plotly_chart(
            px.line(
                strat_df,
                x="Acceptance Rate",
                y=value_column,
                title=f"Estimated Value ({currency}) by Acceptance Rate",
            )
        )

        st.write("Row with the greatest estimated value:")

        # Computed once: the column is already float64 at this point.
        max_estimated_value = np.max(strat_df[value_column])
        st.write(strat_df.loc[strat_df[value_column] == max_estimated_value])

        # Expected loss per loan = P(default) * loss given default * exposure.
        loss_given_default = 1
        expected_loss_per_loan = (
            scored_loans["PROB_DEFAULT"]
            * loss_given_default
            * scored_loans["loan_amnt"]
        )

        tot_exp_loss = round(np.sum(expected_loss_per_loan), 2)

        st.metric(
            label='Total expected loss:',
            value=f"{currency} {tot_exp_loss:,.2f}",
            delta=None,
            delta_color="normal",
        )
src/models/util_test.py ADDED
@@ -0,0 +1,568 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Union
2
+ import pandas as pd
3
+ from sklearn.model_selection import StratifiedKFold, cross_val_score
4
+ import streamlit as st
5
+ import numpy as np
6
+ from sklearn.metrics import (
7
+ classification_report,
8
+ confusion_matrix,
9
+ )
10
+ from sklearn.linear_model import LogisticRegression
11
+ import xgboost as xgb
12
+ from xgboost.sklearn import XGBClassifier
13
+ from src.features.util_build_features import SplitDataset
14
+ """from src.models.model_utils import (
15
+ create_cross_validation_df,
16
+ cross_validation_scores,
17
+ get_df_trueStatus_probabilityDefault_threshStatus_loanAmount,
18
+ )"""
19
+ from src.visualization.graphs_test import (
20
+ cross_validation_graph,
21
+ )
22
+
23
+
24
def make_tests_view(
    model_name_short: str,
    model_name_generic: str,
):
    """Create the Streamlit model-evaluation view for one model.

    Args:
        model_name_short: Suffix added to every widget key so several models
            can render this view on the same page.
        model_name_generic: Model name shown in the section header.

    Returns:
        A ``view`` function that renders the evaluation widgets and returns
        the scored test-set DataFrame.
    """

    def view(
        clf_xgbt_model: Union[XGBClassifier, LogisticRegression],
        split_dataset: SplitDataset,
        currency: str,
        prob_thresh_selected,
        predicted_default_status,
    ):
        """Render cross validation, classification and bad-rate diagnostics.

        Args:
            clf_xgbt_model: Fitted classifier exposing ``predict_proba``.
            split_dataset: Provides ``X_test`` and ``y_test``.
            currency: Label prepended to monetary values.
            prob_thresh_selected: Probability threshold used to flag defaults
                in the returned DataFrame.
            predicted_default_status: Predicted statuses compared against
                ``split_dataset.y_test``.

        Returns:
            The DataFrame built by
            ``get_df_trueStatus_probabilityDefault_threshStatus_loanAmount``.
        """
        X_test = split_dataset.X_test
        y_test = split_dataset.y_test

        st.header(f"Model Evaluation - {model_name_generic}")

        # ---- Cross validation (xgboost) ---------------------------------
        st.subheader("Cross Validation")

        st.write("Shows how our model will perform as new loans come in.")
        st.write(
            "If evaluation metric for test and train set improve as models "
            "train on each fold suggests performance will be stable."
        )

        st.write('xgb cross validation test:')

        stcol_seed, stcol_eval_metric = st.columns(2)

        with stcol_seed:
            cv_seed = int(
                st.number_input(
                    label="Random State Seed for Cross Validation:",
                    value=123235,
                    key=f"cv_seed_{model_name_short}",
                )
            )

        with stcol_eval_metric:
            eval_metric = st.selectbox(
                label="Select evaluation metric",
                options=[
                    "auc",
                    "aucpr",
                    "rmse",
                    "mae",
                    "logloss",
                    "error",
                    "merror",
                    "mlogloss",
                ],
                key=f"eval_metric_{model_name_short}",
            )

        stcol_trees, stcol_eval_nfold, stcol_earlystoppingrounds = st.columns(
            3
        )

        with stcol_trees:
            trees = int(
                st.number_input(
                    label="Number of trees",
                    value=5,
                    key=f"trees_{model_name_short}",
                )
            )

        with stcol_eval_nfold:
            nfolds = int(
                st.number_input(
                    label="Number of folds",
                    value=5,
                    key=f"nfolds_{model_name_short}",
                )
            )

        with stcol_earlystoppingrounds:
            early_stopping_rounds = int(
                st.number_input(
                    label="Early stopping rounds",
                    value=10,
                    key=f"early_stopping_rounds_{model_name_short}",
                )
            )

        # Only the result table is used; the DMatrix is discarded.
        _, cv_df = create_cross_validation_df(
            X_test,
            y_test,
            eval_metric,
            cv_seed,
            trees,
            nfolds,
            early_stopping_rounds,
        )

        st.write(cv_df)

        scoring_options = [
            "roc_auc",
            "accuracy",
            "precision",
            "recall",
            "f1",
            "jaccard",
        ]

        overfit_test = st.radio(
            label="Overfit test:",
            options=("No", "Yes"),
            key=f"overfit_test_{model_name_short}",
        )

        if overfit_test == "Yes":
            st.write("Overfit test:")
            iterations = int(
                st.number_input(
                    label="Number of folds (iterations)",
                    value=500,
                    key=f"iterations_{model_name_short}",
                )
            )

            # The iteration count is passed both as the number of boosting
            # rounds and as the early-stopping patience.
            _, cv_df_it = create_cross_validation_df(
                X_test,
                y_test,
                eval_metric,
                cv_seed,
                iterations,
                nfolds,
                iterations,
            )

            fig_it = cross_validation_graph(cv_df_it, eval_metric, iterations)
            st.pyplot(fig_it)

        # ---- Cross validation (scikit-learn) ----------------------------
        st.write("Sklearn cross validation test:")
        stcol_scoringmetric, st_nfold = st.columns(2)

        with stcol_scoringmetric:
            score_metric = st.selectbox(
                label="Select score",
                options=scoring_options,
                key=f"stcol_scoringmetric_{model_name_short}",
            )

        with st_nfold:
            nfolds_score = int(
                st.number_input(
                    label="Number of folds",
                    value=5,
                    key=f"st_nfold_{model_name_short}",
                )
            )

        cv_scores = cross_validation_scores(
            clf_xgbt_model,
            X_test,
            y_test,
            nfolds_score,
            score_metric,
            cv_seed,
        )

        stcol_vals, stcol_mean, st_std = st.columns(3)

        with stcol_vals:
            st.markdown(f"{score_metric} scores:")
            st.write(
                pd.DataFrame(
                    cv_scores,
                    columns=[score_metric],
                )
            )

        with stcol_mean:
            st.metric(
                label=f"Average {score_metric} score ",
                value="{:.4f}".format(cv_scores.mean()),
                delta=None,
                delta_color="normal",
            )

        with st_std:
            st.metric(
                label=f"{score_metric} standard deviation (+/-)",
                value="{:.4f}".format(cv_scores.std()),
                delta=None,
                delta_color="normal",
            )

        # ---- Classification report --------------------------------------
        st.subheader("Classification Report")

        target_names = ["Non-Default", "Default"]

        classification_report_dict = classification_report(
            y_test,
            predicted_default_status,
            target_names=target_names,
            output_dict=True,
        )

        default_precision = classification_report_dict["Default"]["precision"]
        default_recall = classification_report_dict["Default"]["recall"]
        default_f1 = classification_report_dict["Default"]["f1-score"]
        macro_f1 = classification_report_dict["macro avg"]["f1-score"]

        (
            stcol_defaultpres,
            stcol_defaultrecall,
            stcol_defaultf1score,
            stcol_f1score,
        ) = st.columns(4)

        with stcol_defaultpres:
            st.metric(
                label="Default Precision",
                value=f"{default_precision:.0%}",
                delta=None,
                delta_color="normal",
            )

        with stcol_defaultrecall:
            st.metric(
                label="Default Recall",
                value=f"{default_recall:.0%}",
                delta=None,
                delta_color="normal",
            )

        with stcol_defaultf1score:
            st.metric(
                label="Default F1 Score",
                value=f"{default_f1:.2f}",
                delta=None,
                delta_color="normal",
            )

        with stcol_f1score:
            st.metric(
                label="Macro avg F1 Score (Model F1 Score):",
                value=f"{macro_f1:.2f}",
                delta=None,
                delta_color="normal",
            )

        with st.expander("Classification Report Dictionary:"):
            st.write(classification_report_dict)

        st.markdown(
            f"Default precision: {default_precision:.0%} of loans predicted as default were actually default."
        )

        st.markdown(
            f"Default recall: {default_recall:.0%} of true defaults predicted correctly."
        )

        f1_gap = 1 - default_f1
        st.markdown(
            f"Default F1 score: {default_f1:.2f} "
            f"is {f1_gap:.2f} away from perfect precision and recall (no false positive rate)."
        )

        st.markdown(
            f"macro avg F1 score: {macro_f1:.2f} is the models F1 score."
        )

        # ---- Confusion matrix -------------------------------------------
        st.subheader("Confusion Matrix")

        # Computed once and reused for both the table and the four counts.
        confusion = confusion_matrix(y_test, predicted_default_status)
        tn, fp, fn, tp = confusion.ravel()
        n_predictions = len(predicted_default_status)

        with st.expander(
            "Confusion matrix (column name = classification model prediction, row name = true status, values = number of loans"
        ):
            st.write(confusion)

        st.markdown(
            f"{tp} , {tp / n_predictions:.0%} "
            "true positives (defaults correctly predicted as defaults)."
        )

        st.markdown(
            f"{fp} , {fp / n_predictions:.0%} "
            "false positives (non-defaults incorrectly predicted as defaults)."
        )

        st.markdown(
            f"{fn} , {fn / n_predictions:.0%} "
            "false negatives (defaults incorrectly predicted as non-defaults)."
        )

        st.markdown(
            f"{tn} , {tn / n_predictions:.0%} "
            "true negatives (non-defaults correctly predicted as non-defaults)."
        )

        # ---- Bad rate ---------------------------------------------------
        st.subheader("Bad Rate")

        df_trueStatus_probabilityDefault_threshStatus_loanAmount = (
            get_df_trueStatus_probabilityDefault_threshStatus_loanAmount(
                clf_xgbt_model,
                X_test,
                y_test,
                prob_thresh_selected,
                "loan_amnt",
            )
        )
        scored = df_trueStatus_probabilityDefault_threshStatus_loanAmount

        with st.expander(
            "Loan Status, Probability of Default, & Loan Amount DataFrame"
        ):
            st.write(scored)

        # Accepted loans are those the model classifies as non-default.
        accepted_loans = scored[scored["PREDICT_DEFAULT_STATUS"] == 0]

        bad_rate = (
            np.sum(accepted_loans["loan_status"])
            / accepted_loans["loan_status"].count()
        )

        with st.expander("Loan Amount Summary Statistics"):
            st.write(scored["loan_amnt"].describe())

        avg_loan = np.mean(scored["loan_amnt"])

        # Rows = true status, columns = predicted status. Reindexing to both
        # classes keeps the look-ups below valid even when the threshold
        # leaves one predicted class empty (previously a KeyError).
        crosstab_df = (
            pd.crosstab(
                scored["loan_status"],
                scored["PREDICT_DEFAULT_STATUS"],
            ).reindex(index=[0, 1], columns=[0, 1], fill_value=0)
            * avg_loan
        )

        with st.expander(
            "Cross tabulation (column name = classification model prediction, row name = true status, values = number of loans * average loan value"
        ):
            st.write(crosstab_df)

        accepted_bad_value = crosstab_df.loc[1, 0]
        accepted_good_value = crosstab_df.loc[0, 0]
        rejected_good_value = crosstab_df.loc[0, 1]
        rejected_bad_value = crosstab_df.loc[1, 1]

        st.write(
            f"Bad rate: {bad_rate:.2%} of all the loans the model accepted (classified as non-default) from the test set were actually defaults."
        )

        st.write(
            f"Estimated value of the bad rate is {currency} {accepted_bad_value:,.2f}."
        )

        # NOTE(review): this sum covers every loan *predicted* non-default
        # (true non-defaults plus missed defaults), which does not match the
        # "actual non-default" wording - confirm which one is intended.
        st.write(
            f"Total estimated value of actual non-default loans is {currency} {accepted_good_value + accepted_bad_value:,.2f}"
        )

        st.write(
            f"Estimated value of loans incorrectly predicted as default is {currency} {rejected_good_value:,.2f}"
        )

        st.write(
            f"Estimated value of loans correctly predicted as defaults is {currency} {rejected_bad_value:,.2f}"
        )

        return df_trueStatus_probabilityDefault_threshStatus_loanAmount

    return view
409
+
410
+
411
def cross_validation_scores(model, X, y, nfold, score, seed):
    """Score ``model`` with shuffled, stratified k-fold cross validation.

    Args:
        model: Estimator passed to ``cross_val_score``.
        X: Feature data; converted to a contiguous array.
        y: Target data; converted to a contiguous array and flattened.
        nfold: Number of stratified folds.
        score: Name of the scoring metric.
        seed: Random state used when shuffling the folds.

    Returns:
        Whatever ``cross_val_score`` returns for the given folds.
    """
    features = np.ascontiguousarray(X)
    target = np.ascontiguousarray(y).ravel()
    folds = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed)
    return cross_val_score(model, features, target, cv=folds, scoring=score)
420
+
421
+
422
def create_cross_validation_df(
    X, y, eval_metric, seed, trees, n_folds, early_stopping_rounds
):
    """Run xgboost's built-in cross validation for a binary-logistic model.

    Args:
        X: Feature data used to build the ``DMatrix``.
        y: Labels attached to the ``DMatrix``.
        eval_metric: xgboost evaluation metric name (e.g. auc or logloss).
        seed: Seed used for the booster parameters and for generating the
            folds.
        trees: Number of boosting rounds.
        n_folds: Number of cross-validation folds.
        early_stopping_rounds: Early-stopping patience passed to ``xgb.cv``.

    Returns:
        Two-item list ``[DMatrix, cv_result]``.
    """
    DTrain = xgb.DMatrix(X, label=y)

    params = {
        "eval_metric": eval_metric,
        "objective": "binary:logistic",  # logistic say 0 or 1 for loan status
        "seed": seed,
    }

    # ``seed`` in ``params`` only configures the booster; the fold split is
    # driven by the ``seed`` argument of ``xgb.cv``. Passing it here makes
    # the caller's seed actually change the folds.
    cv_df = xgb.cv(
        params,
        DTrain,
        num_boost_round=trees,
        nfold=n_folds,
        early_stopping_rounds=early_stopping_rounds,
        shuffle=True,
        seed=seed,
    )

    return [DTrain, cv_df]
446
+
447
+
448
def create_accept_rate_list(start, end, samples):
    """Return ``samples`` evenly spaced acceptance rates from start to end.

    Both ``start`` and ``end`` are included in the result.
    """
    evenly_spaced_rates = np.linspace(start, end, num=samples, endpoint=True)
    return evenly_spaced_rates
450
+
451
+
452
def create_strategyTable_df(
    start,
    end,
    samples,
    actual_probability_predicted_acc_rate,
    true,
    currency,
    loan_amount_col="loan_amnt",
):
    """Build the strategy table: one row per candidate acceptance rate.

    For each acceptance rate the matching quantile of ``PROB_DEFAULT`` is
    used as the threshold; loans whose probability does not exceed it are
    accepted. The bad rate is the share of accepted loans whose true status
    is default, and the estimated value is
    ``accepted_value - 2 * accepted_value * bad_rate`` where
    ``accepted_value = accepted_count * average_loan_amount``.

    Args:
        start: Lowest acceptance rate.
        end: Highest acceptance rate (included).
        samples: Number of evenly spaced acceptance rates.
        actual_probability_predicted_acc_rate: DataFrame with the columns
            ``PROB_DEFAULT``, ``true`` and ``loan_amount_col``. It is not
            modified.
        true: Name of the true-status column.
        currency: Label used in the estimated-value column name.
        loan_amount_col: Name of the loan-amount column used for the average
            loan amount.

    Returns:
        DataFrame with string-formatted ``Acceptance Rate``, ``Threshold``,
        ``Bad Rate`` and ``Estimated Value (<currency>)`` columns plus an
        integer ``Num Accepted Loans`` column, sorted by acceptance rate in
        descending order.
    """
    scored = actual_probability_predicted_acc_rate
    probabilities = scored["PROB_DEFAULT"]
    statuses = scored[true]

    # The average exposure comes from the loan-amount column; averaging the
    # status column instead would yield the default rate.
    avg_loan_amount = scored[loan_amount_col].mean()

    accept_rates = np.linspace(start, end, samples, endpoint=True)
    thresholds = []
    bad_rates = []
    accepted_counts = []

    for rate in accept_rates:
        threshold = np.quantile(probabilities, rate).round(3)
        thresholds.append(threshold)

        # Accept every loan that is not flagged as default; the mask is kept
        # local so the caller's DataFrame is left untouched.
        accepted_statuses = statuses[~(probabilities > threshold)]
        accepted_count = len(accepted_statuses)

        # Rounding the threshold can push it below the smallest probability,
        # leaving nothing accepted; report a bad rate of 0 in that case.
        if accepted_count:
            bad_rate = np.round(accepted_statuses.sum() / accepted_count, 3)
        else:
            bad_rate = 0.0

        bad_rates.append(bad_rate)
        accepted_counts.append(accepted_count)

    estimated_values = []
    for accepted_count, bad_rate in zip(accepted_counts, bad_rates):
        accepted_value = accepted_count * avg_loan_amount
        estimated_values.append(
            accepted_value - 2 * accepted_value * bad_rate
        )

    table = pd.DataFrame(
        {
            "Acceptance Rate": [f"{value:.2f}" for value in accept_rates],
            "Threshold": [f"{value:.2f}" for value in thresholds],
            "Bad Rate": [f"{value:.2f}" for value in bad_rates],
            "Num Accepted Loans": accepted_counts,
            f"Estimated Value ({currency})": [
                f"{value:.2f}" for value in estimated_values
            ],
        }
    )

    return table.sort_values(
        by="Acceptance Rate", axis=0, ascending=False
    ).reset_index(drop=True)
539
+
540
+
541
def get_df_trueStatus_probabilityDefault_threshStatus_loanAmount(
    model, X, y, threshold, loan_amount_col_name
):
    """Combine true status, default probability, flag and loan amount.

    Args:
        model: Classifier exposing ``predict_proba``; column 1 of its output
            is used as the default probability.
        X: Feature DataFrame containing ``loan_amount_col_name``.
        y: True status as an object with ``to_frame``.
        threshold: Probabilities strictly above this value are flagged 1.
        loan_amount_col_name: Name of the loan-amount column in ``X``.

    Returns:
        DataFrame with a fresh 0..n-1 index holding, in order, the true
        status, ``PROB_DEFAULT``, ``PREDICT_DEFAULT_STATUS`` and the loan
        amount.
    """

    def flag_default(probability):
        return 1 if probability > threshold else 0

    status_frame = y.to_frame()
    amounts = X[loan_amount_col_name]

    class_probabilities = model.predict_proba(np.ascontiguousarray(X))
    probability_frame = pd.DataFrame(
        class_probabilities[:, 1], columns=["PROB_DEFAULT"]
    )

    flags = probability_frame["PROB_DEFAULT"].apply(flag_default)
    flags = flags.rename("PREDICT_DEFAULT_STATUS")

    # Every piece is re-indexed so rows line up by position, not by label.
    pieces = (status_frame, probability_frame, flags, amounts)
    return pd.concat(
        [piece.reset_index(drop=True) for piece in pieces],
        axis=1,
    )
src/models/xgboost_model.py CHANGED
@@ -3,19 +3,20 @@ from src.features.build_features import SplitDataset
3
  from src.models.xgboost_train_model import xgboost_train_model
4
  from src.models.xgboost_predict_model import xgboost_predit_model
5
  from src.models.xgboost_test_model import xgboost_test_model
 
6
  from src.models.util_model_class import ModelClass
7
 
8
 
9
  def xgboost_class(split_dataset: SplitDataset, currency: str):
10
 
11
  # Train Model
12
- clf_xgbt_model = xgboost_train_model(split_dataset, currency)
13
 
14
  # Predit using Trained Model
15
  clf_xgbt_predictions = xgboost_predit_model(
16
  clf_xgbt_model, split_dataset)
17
 
18
- # Test Predictions of Trained Model
19
  df_trueStatus_probabilityDefault_threshStatus_loanAmount_xgbt = xgboost_test_model(
20
  clf_xgbt_model,
21
  split_dataset,
 
3
  from src.models.xgboost_train_model import xgboost_train_model
4
  from src.models.xgboost_predict_model import xgboost_predit_model
5
  from src.models.xgboost_test_model import xgboost_test_model
6
+
7
  from src.models.util_model_class import ModelClass
8
 
9
 
10
  def xgboost_class(split_dataset: SplitDataset, currency: str):
11
 
12
  # Train Model
13
+ clf_xgbt_model = xgboost_train_model(split_dataset)
14
 
15
  # Predit using Trained Model
16
  clf_xgbt_predictions = xgboost_predit_model(
17
  clf_xgbt_model, split_dataset)
18
 
19
+ # Test and Evaluate Model
20
  df_trueStatus_probabilityDefault_threshStatus_loanAmount_xgbt = xgboost_test_model(
21
  clf_xgbt_model,
22
  split_dataset,
src/visualization/graphs_decision_tree.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import xgboost as xgb
3
+
4
+ import streamlit as st
5
+
6
+ import matplotlib.pyplot as plt
7
+
8
+ from xgboost import plot_tree
9
+
10
+
11
def plot_importance_gbt(clf_xgbt_model, barxsize, barysize):
    """Plot weight-based feature importance and return the sized figure.

    Args:
        clf_xgbt_model: Model handed to ``xgb.plot_importance``.
        barxsize: Figure width in inches.
        barysize: Figure height in inches.

    Returns:
        The figure that owns the importance plot's axes.
    """
    importance_axes = xgb.plot_importance(
        clf_xgbt_model, importance_type="weight"
    )
    importance_figure = importance_axes.figure
    st.write("Feature Importance Plot (Gradient Boosted Tree)")
    importance_figure.set_size_inches(barxsize, barysize)
    return importance_figure
17
+
18
+
19
def plot_tree_gbt(treexsize, treeysize, clf_xgbt_model):
    """Draw the model's tree plot and return the current figure, resized.

    Args:
        treexsize: Figure width in inches.
        treeysize: Figure height in inches.
        clf_xgbt_model: Model handed to ``plot_tree``.

    Returns:
        The current matplotlib figure after the tree has been drawn.
    """
    plot_tree(clf_xgbt_model)
    # The figure is taken from pyplot's current-figure state after drawing.
    tree_figure = plt.gcf()
    tree_figure.set_size_inches(treexsize, treeysize)
    return tree_figure
src/visualization/graphs_download.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import matplotlib.pyplot as plt
3
+
4
+
5
def download_importance_gbt(fig1, barxsize, barysize):
    """Save the feature-importance figure to ``bar.png`` on button press.

    Args:
        fig1: The feature-importance figure to save.
        barxsize: Figure width in inches.
        barysize: Figure height in inches.
    """
    if st.button(
        "Download Feature Importance Plot as png (Gradient Boosted Tree)"
    ):
        dpisize = max(barxsize, barysize)
        # Size the figure before writing it, and save ``fig1`` itself:
        # ``plt.savefig`` would write whichever figure happens to be
        # current, which may be a different plot.
        fig1.set_size_inches(barxsize, barysize)
        fig1.savefig("bar.png", dpi=dpisize * 96, bbox_inches="tight")
12
+
13
+
14
def download_tree_gbt(treexsize, treeysize):
    """Save the current matplotlib figure to ``tree.png`` on button press.

    Args:
        treexsize: Width in inches; used only to derive the DPI.
        treeysize: Height in inches; used only to derive the DPI.
    """
    pressed = st.button(
        "Download XGBoost Decision Tree Plot as png (Gradient Boosted Tree)"
    )
    if not pressed:
        return

    largest_side = max(treexsize, treeysize)
    plt.savefig("tree.png", dpi=largest_side * 96, bbox_inches="tight")
src/visualization/graphs_settings.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+
4
def streamlit_chart_setting_height_width(
    title: str,
    default_widthvalue: int,
    default_heightvalue: int,
    widthkey: str,
    heightkey: str,
):
    """Ask for a chart width and height inside a collapsible section.

    Args:
        title: Expander title.
        default_widthvalue: Initial value of the width input.
        default_heightvalue: Initial value of the height input.
        widthkey: Widget key of the width input.
        heightkey: Widget key of the height input.

    Returns:
        Tuple ``(width, height)`` as entered by the user.
    """
    with st.expander(title):
        width_slot, height_slot = st.columns(2)

        with width_slot:
            chosen_width = st.number_input(
                label="Width in inches:",
                value=default_widthvalue,
                key=widthkey,
            )

        with height_slot:
            chosen_height = st.number_input(
                label="Height in inches:",
                value=default_heightvalue,
                key=heightkey,
            )

    return chosen_width, chosen_height
src/visualization/graphs_test.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from matplotlib import pyplot as plt
2
+
3
+ from sklearn.metrics import roc_curve
4
+
5
+ from typing import OrderedDict
6
+
7
+ from src.models.util_model_class import ModelClass
8
+
9
+ from sklearn.calibration import calibration_curve
10
+
11
+
12
def cross_validation_graph(cv, eval_metric, trees):
    """Plot the third column of a cross-validation table per iteration.

    Args:
        cv: DataFrame of cross-validation results; its third column is
            plotted.
        eval_metric: Metric name used in the title and y-axis label.
        trees: Iteration count shown in the title.

    Returns:
        The newly created matplotlib figure.
    """
    figure = plt.figure()

    # Plot the test scores for each iteration
    test_score_column = cv.columns[2]
    plt.plot(cv[test_score_column])

    plt.title(f"Test {eval_metric} Score Over {trees} Iterations")
    plt.xlabel("Iteration Number")
    plt.ylabel(f"Test {eval_metric} Score")
    return figure
25
+
26
+
27
def roc_auc_compare_n_models(y, model_views: OrderedDict[str, ModelClass]):
    """Draw the ROC curves of any number of models on one chart.

    Args:
        y: True labels passed to ``roc_curve``.
        model_views: Mapping of display name to a model object exposing
            ``prediction_probability_df``.

    Returns:
        The matplotlib figure holding the ROC chart.
    """
    # The first two entries match the colours used for two models; the
    # palette is cycled so additional models no longer raise IndexError.
    colors = ["blue", "green", "red", "purple", "orange", "brown"]

    fig = plt.figure()
    for color_idx, (model_name, model_view) in enumerate(model_views.items()):
        fpr, tpr, _thresholds = roc_curve(
            y, model_view.prediction_probability_df
        )
        plt.plot(
            fpr,
            tpr,
            color=colors[color_idx % len(colors)],
            label=f"{model_name}",
        )
    plt.plot([0, 1], [0, 1], linestyle="--", label="Random Prediction")

    # Build "A", "A and B" or "A, B and C" for the title.
    model_names = list(model_views.keys())
    if not model_names:
        model_name_str = "None"
    elif len(model_names) == 1:
        model_name_str = model_names[0]
    else:
        model_name_str = " and ".join(
            [", ".join(model_names[:-1]), model_names[-1]]
        )

    plt.title(f"ROC Chart for {model_name_str} on the Probability of Default")
    plt.xlabel("False Positive Rate (FP Rate)")
    plt.ylabel("True Positive Rate (TP Rate)")
    plt.legend()
    plt.grid(False)
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    return fig
53
+
54
+
55
def calibration_curve_report_commented_n(
    y, model_views: OrderedDict[str, ModelClass], bins: int
):
    """Plot one calibration curve per model plus the perfect-calibration line.

    Args:
        y: True labels passed to ``calibration_curve``.
        model_views: Mapping of display name to a model object exposing
            ``prediction_probability_df``.
        bins: Number of bins used for each calibration curve.

    Returns:
        The matplotlib figure holding the calibration chart.
    """
    figure = plt.figure()

    for name, view in model_views.items():
        # NOTE(review): ``normalize`` is deprecated in recent scikit-learn
        # releases - confirm the pinned version still accepts it.
        positive_fraction, mean_predicted = calibration_curve(
            y,
            view.prediction_probability_df,
            n_bins=bins,
            normalize=True,
        )
        plt.plot(mean_predicted, positive_fraction, "s-", label=f"{name}")

    # Diagonal guideline: predicted probability equals observed frequency.
    plt.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")

    plt.ylabel("Fraction of positives")
    plt.xlabel("Average Predicted Probability")
    plt.title("Calibration Curve")
    plt.legend()
    plt.grid(False)
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    return figure
src/visualization/graphs_threshold.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import plotly.express as px
3
+
4
+ import streamlit as st
5
+
6
+ import matplotlib.pyplot as plt
7
+
8
+ import numpy as np
9
+
10
+
11
def acceptance_rate_driven_threshold_graph(clf_prediction_prob_df_gbt, acc_rate_thresh_gbt):
    """Show a histogram of default probabilities with the threshold marked.

    Args:
        clf_prediction_prob_df_gbt: Object indexable by ``"PROB_DEFAULT"``.
        acc_rate_thresh_gbt: x-position of the red vertical threshold line.
    """
    histogram = px.histogram(clf_prediction_prob_df_gbt["PROB_DEFAULT"])

    histogram.update_layout(
        title="Acceptance Rate Threshold vs. Loans Accepted",
        xaxis_title="Acceptance Rate Threshold",
        yaxis_title="Loans Accepted",
    )

    # White outlines keep neighbouring bars visually separated.
    histogram.update_traces(marker_line_width=1, marker_line_color="white")

    histogram.add_vline(
        x=acc_rate_thresh_gbt,
        line_width=3,
        line_dash="solid",
        line_color="red",
    )

    st.plotly_chart(histogram)
30
+
31
+
32
def recall_accuracy_threshold_tradeoff_fig(
    widthsize,
    heightsize,
    threshold_list,
    thresh_def_recalls_list,
    thresh_nondef_recalls_list,
    thresh_accs_list,
):
    """Plot default recall, non-default recall and accuracy per threshold.

    Args:
        widthsize: Figure width in inches.
        heightsize: Figure height in inches.
        threshold_list: Probability thresholds (x-axis values).
        thresh_def_recalls_list: Default recall for each threshold.
        thresh_nondef_recalls_list: Non-default recall for each threshold.
        thresh_accs_list: Model accuracy for each threshold.

    Returns:
        The newly created matplotlib figure.
    """
    figure = plt.figure(figsize=(widthsize, heightsize))

    curves = (
        (thresh_def_recalls_list, "Default Recall"),
        (thresh_nondef_recalls_list, "Non-Default Recall"),
        (thresh_accs_list, "Model Accuracy"),
    )
    for scores, curve_label in curves:
        plt.plot(threshold_list, scores, label=curve_label)

    plt.xlabel("Probability Threshold")
    plt.ylabel("Score")
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.legend()
    plt.title("Recall and Accuracy Score Tradeoff with Probability Threshold")
    plt.grid(False)
    return figure
54
+
55
+
56
def acceptance_rate_threshold_fig(probability_default, acceptancerate, bins):
    """Plot the default-probability histogram with the acceptance threshold.

    Args:
        probability_default: Predicted default probabilities; must provide
            ``describe``.
        acceptancerate: Quantile (0-1) of the probabilities used as the
            threshold.
        bins: Histogram bins passed to ``plt.hist``.

    Returns:
        Tuple ``(figure, describe() summary, threshold)``.
    """
    # Probability distribution
    probability_stat_distribution = probability_default.describe()

    # Acceptance rate threshold
    acc_rate_thresh = np.quantile(probability_default, acceptancerate)
    fig = plt.figure()

    plt.hist(
        probability_default,
        color="blue",
        bins=bins,
        histtype="bar",
        ec="white",
    )

    # Add a reference line to the plot for the threshold
    plt.axvline(x=acc_rate_thresh, color="red")
    # Title spelling corrected (was "Thershold").
    plt.title("Acceptance Rate Threshold")

    return (
        fig,
        probability_stat_distribution,
        acc_rate_thresh,
    )
src/visualization/metrics.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import pandas as pd
3
+ import streamlit as st
4
+
5
+
6
def streamlit_2columns_metrics_pct_df(
    column1name_label: str,
    column2name_label: str,
    df: pd.DataFrame,
):
    """Show the share of 1-values and 0-values as two percentage metrics.

    Args:
        column1name_label: Label of the metric for the value ``1``.
        column2name_label: Label of the metric for the value ``0``.
        df: Data whose ``value_counts()`` is keyed by ``0`` and ``1``.
    """
    total_rows = df.shape[0]

    def share_of(value):
        # A missing class counts as 0 instead of ``None`` (which made the
        # division raise TypeError); empty input yields 0% rather than a
        # division by zero.
        if not total_rows:
            return 0.0
        return df.value_counts().get(value, 0) / total_rows

    (
        column1name,
        column2name,
    ) = st.columns(2)

    with column1name:
        st.metric(
            label=column1name_label,
            value="{:.0%}".format(share_of(1)),
            delta=None,
            delta_color="normal",
        )

    with column2name:
        st.metric(
            label=column2name_label,
            value="{:.0%}".format(share_of(0)),
            delta=None,
            delta_color="normal",
        )
31
+
32
+
33
def streamlit_2columns_metrics_df(
    column1name_label: str,
    column2name_label: str,
    df: pd.DataFrame,
):
    """Show the count of 1-values and 0-values as two side-by-side metrics.

    Args:
        column1name_label: Label of the metric for the value ``1``.
        column2name_label: Label of the metric for the value ``0``.
        df: Data whose ``value_counts()`` is looked up with ``1`` and ``0``.
    """
    first_slot, second_slot = st.columns(2)

    layout = (
        (first_slot, column1name_label, 1),
        (second_slot, column2name_label, 0),
    )
    for slot, metric_label, counted_value in layout:
        with slot:
            st.metric(
                label=metric_label,
                value=df.value_counts().get(counted_value),
                delta=None,
                delta_color="normal",
            )
58
+
59
+
60
def streamlit_2columns_metrics_df_shape(df: pd.DataFrame):
    """Show the number of rows and columns of ``df`` as two metrics."""
    rows_slot, columns_slot = st.columns(2)

    layout = (
        (rows_slot, "Rows", 0),
        (columns_slot, "Columns", 1),
    )
    for slot, metric_label, axis in layout:
        with slot:
            st.metric(
                label=metric_label,
                value=df.shape[axis],
                delta=None,
                delta_color="normal",
            )
81
+
82
+
83
def streamlit_2columns_metrics_pct_series(
    column1name_label: str,
    column2name_label: str,
    series: pd.Series,
):
    """Show entries ``1`` and ``0`` of a series as shares of its sum.

    Args:
        column1name_label: Label of the metric for the entry keyed ``1``.
        column2name_label: Label of the metric for the entry keyed ``0``.
        series: Series looked up with the keys ``1`` and ``0``.
    """
    total = series.sum()

    def share_of(key):
        # A missing key counts as 0 instead of ``None`` (which made the
        # division raise TypeError); a zero total yields 0% rather than a
        # nan/inf percentage.
        if not total:
            return 0.0
        return series.get(key, 0) / total

    (
        column1name,
        column2name,
    ) = st.columns(2)

    with column1name:
        st.metric(
            label=column1name_label,
            value="{:.0%}".format(share_of(1)),
            delta=None,
            delta_color="normal",
        )

    with column2name:
        st.metric(
            label=column2name_label,
            value="{:.0%}".format(share_of(0)),
            delta=None,
            delta_color="normal",
        )
107
+
108
+
109
def streamlit_2columns_metrics_series(
    column1name_label: str,
    column2name_label: str,
    series: pd.Series,
):
    """Show entries ``1`` and ``0`` of a series as two side-by-side metrics.

    Args:
        column1name_label: Label of the metric for the entry keyed ``1``.
        column2name_label: Label of the metric for the entry keyed ``0``.
        series: Series looked up with the keys ``1`` and ``0``.
    """
    first_slot, second_slot = st.columns(2)

    layout = (
        (first_slot, column1name_label, 1),
        (second_slot, column2name_label, 0),
    )
    for slot, metric_label, key in layout:
        with slot:
            st.metric(
                label=metric_label,
                value=series.get(key),
                delta=None,
                delta_color="normal",
            )