Spaces:

pkiage
/

credit_risk_modeling_demo

Build error

App Files Files Community

pkiage committed on Feb 8, 2022

Commit

7f0977b

1 Parent(s): 0395eed

refactor: model comparison, utils, and clean up

Browse files

Files changed (17) hide show

README.md +4 -0
app.py +3 -0
src/features/build_features.py +153 -0
src/features/util_build_features.py +289 -0
src/models/util_model_class.py +15 -0
src/models/util_model_comparison.py +86 -0
src/models/util_predict_model.py +87 -0
src/models/util_predict_model_threshold.py +310 -0
src/models/util_strategy_table.py +96 -0
src/models/util_test.py +568 -0
src/models/xgboost_model.py +3 -2
src/visualization/graphs_decision_tree.py +23 -0
src/visualization/graphs_download.py +17 -0
src/visualization/graphs_settings.py +28 -0
src/visualization/graphs_test.py +78 -0
src/visualization/graphs_threshold.py +80 -0
src/visualization/metrics.py +132 -0

README.md CHANGED Viewed

@@ -21,6 +21,10 @@ An interactive tool demonstrating credit risk modelling.
 - Selecting optimal threshold using Youden's J statistic
 ## Political, Economic, Social, Technological, Legal and Environmental(PESTLE):
 [Europe fit for the Digital Age: Commission proposes new rules and actions for excellence and trust in Artificial Intelligence](https://ec.europa.eu/commission/presscorner/detail/en/ip_21_1682)

 - Selecting optimal threshold using Youden's J statistic
+[Cookiecutter Data Science](https://drivendata.github.io/cookiecutter-data-science/)
+- Project structure
 ## Political, Economic, Social, Technological, Legal and Environmental(PESTLE):
 [Europe fit for the Digital Age: Commission proposes new rules and actions for excellence and trust in Artificial Intelligence](https://ec.europa.eu/commission/presscorner/detail/en/ip_21_1682)

app.py CHANGED Viewed

@@ -7,6 +7,7 @@ from src.features.build_features import initialise_data
 from src.models.xgboost_model import xgboost_class
 from src.models.logistic_model import logistic_class
 from src.models.util_strategy_table import strategy_table_view
@@ -44,6 +45,8 @@ def main():
         xgboost_model_class = xgboost_class(split_dataset, currency)
         model_classes["XGBoost"] = xgboost_model_class
     strategy_table_view(currency, model_classes)

 from src.models.xgboost_model import xgboost_class
 from src.models.logistic_model import logistic_class
+from src.models.util_model_comparison import model_comparison_view
 from src.models.util_strategy_table import strategy_table_view
         xgboost_model_class = xgboost_class(split_dataset, currency)
         model_classes["XGBoost"] = xgboost_model_class
+    model_comparison_view(split_dataset, model_classes)
     strategy_table_view(currency, model_classes)

src/features/build_features.py ADDED Viewed

	@@ -0,0 +1,153 @@

+from typing import List, Union, cast, Tuple
+from dataclasses import dataclass
+from sklearn.model_selection import train_test_split
+import pandas as pd
+import streamlit as st
+from src.features.util_build_features import (
+    Dataset,
+    SplitDataset,
+    undersample_training_data,
+    select_predictors,
+    import_data)
+from src.visualization.metrics import (
+    streamlit_2columns_metrics_df_shape,
+    streamlit_2columns_metrics_series,
+    streamlit_2columns_metrics_pct_series,
+    streamlit_2columns_metrics_df,
+    streamlit_2columns_metrics_pct_df,
+)
+def initialise_data() -> Tuple[Dataset, SplitDataset]:
+    dataset = import_data()
+    st.write(
+        "Assuming data is already cleaned and relevant features (predictors) added."
+    )
+    with st.expander("Input Dataframe (X and y)"):
+        st.dataframe(dataset.df)
+        streamlit_2columns_metrics_df_shape(dataset.df)
+    selected_x_values = select_predictors(dataset)
+    with st.expander("Predictors Dataframe (X)"):
+        st.dataframe(selected_x_values)
+        streamlit_2columns_metrics_df_shape(selected_x_values)
+    st.header("Split Testing and Training Data")
+    test_size_slider_col, seed_col = st.columns(2)
+    with test_size_slider_col:
+        # Initialize test size
+        dataset.test_size = st.slider(
+            label="Test Size Percentage of Input Dataframe:",
+            min_value=0,
+            max_value=100,
+            value=dataset.test_size,
+            key="init_test_size",
+            format="%f%%",
+        )
+    with seed_col:
+        dataset.random_state = int(
+            st.number_input(label="Random State:", value=dataset.random_state)
+        )
+    split_dataset = dataset.train_test_split(selected_x_values)
+    true_status = split_dataset.y_test.to_frame().value_counts()
+    st.sidebar.metric(
+        label="Testing Data # of Actual Default (=1)",
+        value=true_status.get(1),
+    )
+    st.sidebar.metric(
+        label="Testing Data % of Actual Default",
+        value="{:.0%}".format(true_status.get(1) / true_status.sum()),
+    )
+    st.sidebar.metric(
+        label="Testing Data # of Actual Non-Default (=0)",
+        value=true_status.get(0),
+    )
+    st.sidebar.metric(
+        label="Testing Data % of Actual Non-Default",
+        value="{:.0%}".format(true_status.get(0) / true_status.sum()),
+    )
+    # Concat the testing sets
+    X_y_test = split_dataset.X_y_test
+    X_y_train = split_dataset.X_y_train
+    with st.expander("Testing Dataframe (X and y)"):
+        st.dataframe(X_y_test)
+        streamlit_2columns_metrics_df_shape(X_y_test)
+    streamlit_2columns_metrics_series(
+        "# Defaults(=1) (Testing Data)",
+        "# Non-Defaults(=0) (Testing Data)",
+        true_status,
+    )
+    streamlit_2columns_metrics_pct_series(
+        "% Defaults (Testing Data)",
+        "% Non-Defaults (Testing Data)",
+        true_status,
+    )
+    st.header("Training Data")
+    with st.expander("Training Dataframe (X and y)"):
+        st.dataframe(X_y_train)
+        streamlit_2columns_metrics_df_shape(X_y_train)
+    st.subheader("Class Count")
+    streamlit_2columns_metrics_df(
+        "# Defaults (Training Data Class Balance Check)",
+        "# Non-Defaults (Training Data Class Balance Check)",
+        split_dataset.y_train,
+    )
+    streamlit_2columns_metrics_pct_df(
+        "% Defaults (Training Data Class Balance Check)",
+        "% Non-Defaults (Training Data Class Balance Check)",
+        split_dataset.y_train,
+    )
+    balance_the_classes = st.radio(
+        label="Balance the Classes:", options=("Yes", "No")
+    )
+    if balance_the_classes == "Yes":
+        st.subheader("Balanced Classes (by Undersampling)")
+        (
+            split_dataset.X_train,
+            split_dataset.y_train,
+            _X_y_train,
+            class_balance_default,
+        ) = undersample_training_data(X_y_train, "loan_status", split_dataset)
+        streamlit_2columns_metrics_series(
+            "# Defaults (Training Data with Class Balance)",
+            "# Non-Defaults (Training Data with Class Balance)",
+            class_balance_default,
+        )
+        streamlit_2columns_metrics_pct_series(
+            "% of Defaults (Training Data with Class Balance)",
+            "% of Non-Defaults (Training Data with Class Balance)",
+            class_balance_default,
+        )
+    return dataset, split_dataset

src/features/util_build_features.py ADDED Viewed

	@@ -0,0 +1,289 @@

+import streamlit as st
+from typing import List, Union, cast
+from dataclasses import dataclass
+from sklearn.model_selection import train_test_split
+import pandas as pd
+@dataclass
+class SplitDataset:
+    X_test: pd.DataFrame
+    X_train: pd.DataFrame
+    y_test: pd.Series
+    y_train: pd.Series
+    @property
+    def X_y_test(self) -> pd.DataFrame:
+        return pd.concat(
+            cast(
+                List[Union[pd.DataFrame, pd.Series]],
+                [
+                    self.X_test.reset_index(drop=True),
+                    self.y_test.reset_index(drop=True),
+                ],
+            ),
+            axis=1,
+        )
+    @property
+    def X_y_train(self) -> pd.DataFrame:
+        return pd.concat(
+            cast(
+                List[Union[pd.DataFrame, pd.Series]],
+                [
+                    self.X_train.reset_index(drop=True),
+                    self.y_train.reset_index(drop=True),
+                ],
+            ),
+            axis=1,
+        )
+@dataclass
+class Dataset:
+    df: pd.DataFrame
+    random_state: int
+    test_size: int
+    @property
+    def y_value(self) -> pd.DataFrame:
+        return self.df["loan_status"]
+    @property
+    def x_values(self) -> pd.DataFrame:
+        return cast(
+            pd.DataFrame,
+            drop_columns(
+                self.df,
+                [
+                    "loan_status",
+                    "loan_grade_A",
+                    "loan_grade_B",
+                    "loan_grade_C",
+                    "loan_grade_D",
+                    "loan_grade_E",
+                    "loan_grade_F",
+                    "loan_grade_G",
+                ],
+            ),
+        )
+    @property
+    def x_values_column_names(self):
+        return self.x_values.columns.tolist()
+    def x_values_filtered_columns(self, columns: List[str]) -> pd.DataFrame:
+        return self.df.filter(columns)
+    def train_test_split(
+        self, selected_x_values: pd.DataFrame
+    ) -> SplitDataset:
+        X_train, X_test, y_train, y_test = train_test_split(
+            selected_x_values,
+            self.y_value,
+            test_size=self.test_size / 100,  # since up was given as pct
+            random_state=self.random_state,
+        )
+        return SplitDataset(
+            X_train=cast(pd.DataFrame, X_train),
+            X_test=cast(pd.DataFrame, X_test),
+            y_train=cast(pd.Series, y_train),
+            y_test=cast(pd.Series, y_test),
+        )
+def drop_columns(df, columns):
+    return df.drop(columns, axis=1)
+def remove_less_than_0_columns(df, column):
+    df[column].dropna()
+    return df.loc[(df[column] != 0).any(1)]
+def boolean_int_condition_label(df, label_column_name, condition):
+    df[label_column_name] = condition
+    y = df[label_column_name].astype(int)
+    df = drop_columns(df, label_column_name)
+    return y, df
+@dataclass
+class SplitDataset:
+    X_test: pd.DataFrame
+    X_train: pd.DataFrame
+    y_test: pd.Series
+    y_train: pd.Series
+    @property
+    def X_y_test(self) -> pd.DataFrame:
+        return pd.concat(
+            cast(
+                List[Union[pd.DataFrame, pd.Series]],
+                [
+                    self.X_test.reset_index(drop=True),
+                    self.y_test.reset_index(drop=True),
+                ],
+            ),
+            axis=1,
+        )
+    @property
+    def X_y_train(self) -> pd.DataFrame:
+        return pd.concat(
+            cast(
+                List[Union[pd.DataFrame, pd.Series]],
+                [
+                    self.X_train.reset_index(drop=True),
+                    self.y_train.reset_index(drop=True),
+                ],
+            ),
+            axis=1,
+        )
+@dataclass
+class Dataset:
+    df: pd.DataFrame
+    random_state: int
+    test_size: int
+    @property
+    def y_value(self) -> pd.DataFrame:
+        return self.df["loan_status"]
+    @property
+    def x_values(self) -> pd.DataFrame:
+        return cast(
+            pd.DataFrame,
+            drop_columns(
+                self.df,
+                [
+                    "loan_status",
+                    "loan_grade_A",
+                    "loan_grade_B",
+                    "loan_grade_C",
+                    "loan_grade_D",
+                    "loan_grade_E",
+                    "loan_grade_F",
+                    "loan_grade_G",
+                ],
+            ),
+        )
+    @property
+    def x_values_column_names(self):
+        return self.x_values.columns.tolist()
+    def x_values_filtered_columns(self, columns: List[str]) -> pd.DataFrame:
+        return self.df.filter(columns)
+    def train_test_split(
+        self, selected_x_values: pd.DataFrame
+    ) -> SplitDataset:
+        X_train, X_test, y_train, y_test = train_test_split(
+            selected_x_values,
+            self.y_value,
+            test_size=self.test_size / 100,  # since up was given as pct
+            random_state=self.random_state,
+        )
+        return SplitDataset(
+            X_train=cast(pd.DataFrame, X_train),
+            X_test=cast(pd.DataFrame, X_test),
+            y_train=cast(pd.Series, y_train),
+            y_test=cast(pd.Series, y_test),
+        )
+def drop_columns(df, columns):
+    return df.drop(columns, axis=1)
+def remove_less_than_0_columns(df, column):
+    df[column].dropna()
+    return df.loc[(df[column] != 0).any(1)]
+def boolean_int_condition_label(df, label_column_name, condition):
+    df[label_column_name] = condition
+    y = df[label_column_name].astype(int)
+    df = drop_columns(df, label_column_name)
+    return y, df
+@st.cache(suppress_st_warning=True)
+def undersample_training_data(
+    df: pd.DataFrame, column_name: str, split_dataset
+):
+    count_nondefault, count_default = split_dataset.X_y_train[
+        column_name
+    ].value_counts()
+    nondefaults = df[df[column_name] == 0]  # 0
+    defaults = df[df[column_name] == 1]
+    under_sample = min(count_nondefault, count_default)
+    nondefaults_under = nondefaults.sample(under_sample)
+    defaults_under = defaults.sample(under_sample)
+    X_y_train_under = pd.concat(
+        [
+            nondefaults_under.reset_index(drop=True),
+            defaults_under.reset_index(drop=True),
+        ],
+        axis=0,
+    )
+    X_train_under = X_y_train_under.drop([column_name], axis=1)  # remove label
+    y_train_under = X_y_train_under[column_name]  # label only
+    class_balance_default = X_y_train_under[column_name].value_counts()
+    return [
+        X_train_under,
+        y_train_under,
+        X_y_train_under,
+        class_balance_default,
+    ]
+def select_predictors(dataset):
+    st.header("Predictors")
+    possible_columns = dataset.x_values_column_names
+    selected_columns = st.sidebar.multiselect(
+        label="Select Predictors",
+        options=possible_columns,
+        default=possible_columns,
+    )
+    return dataset.x_values_filtered_columns(selected_columns)
+def import_data():
+    if "input_data_frame" not in st.session_state:
+        st.session_state.input_data_frame = pd.read_csv(
+            r"./data/processed/cr_loan_w2.csv"
+        )
+    if "dataset" not in st.session_state:
+        df = cast(pd.DataFrame, st.session_state.input_data_frame)
+        dataset = Dataset(
+            df=df,
+            random_state=123235,
+            test_size=40,
+        )
+        st.session_state.dataset = dataset
+    else:
+        dataset = st.session_state.dataset
+    return dataset

src/models/util_model_class.py ADDED Viewed

	@@ -0,0 +1,15 @@

+from dataclasses import dataclass
+from typing import Union
+import pandas as pd
+from xgboost.sklearn import XGBClassifier
+from sklearn.linear_model import LogisticRegression
+@dataclass(frozen=True)
+class ModelClass:
+    """Immutable bundle of a classifier and the outputs derived from it."""
+    # The classifier: either an XGBoost or a logistic-regression estimator.
+    model: Union[XGBClassifier, LogisticRegression]
+    # presumably the probability cut-off chosen for this model - confirm against caller
+    probability_threshold_selected: float
+    # presumably the 0/1 default prediction per loan at that cut-off - confirm against caller
+    predicted_default_status: pd.Series
+    # Read elsewhere for its "PROB_DEFAULT" and "loan_amnt" columns; full schema not visible here.
+    trueStatus_probabilityDefault_threshStatus_loanAmount_df: pd.DataFrame
+    # presumably the predicted default probabilities - confirm against caller
+    prediction_probability_df: pd.DataFrame

src/models/util_model_comparison.py ADDED Viewed

	@@ -0,0 +1,86 @@

+from typing import OrderedDict
+import streamlit as st
+from sklearn.metrics import roc_auc_score
+from src.features.util_build_features import SplitDataset
+from src.visualization.graphs_settings import (
+    streamlit_chart_setting_height_width
+)
+from src.visualization.graphs_test import (
+    roc_auc_compare_n_models,
+    calibration_curve_report_commented_n
+)
+from src.models.util_model_class import ModelClass
+def roc_auc_for_model(split_dataset: SplitDataset, model_view: ModelClass):
+    roc_auc_model = roc_auc_score(
+        split_dataset.y_test, model_view.predicted_default_status
+    )
+    if roc_auc_model > 0.9:
+        roc_auc_lvl = f'Very good {"{:.2f}".format(roc_auc_model)} > 0.9)'
+    elif 0.8 < roc_auc_model < 0.9:
+        roc_auc_lvl = f'Good (0.8 < {"{:.2f}".format(roc_auc_model)} <0.9)'
+    elif 0.7 < roc_auc_model < 0.8:
+        roc_auc_lvl = f'Fair (0.7 <  {"{:.2f}".format(roc_auc_model)} < 0.8)'
+    elif 0.6 < roc_auc_model < 0.7:
+        roc_auc_lvl = f'Poor (0.6 <  {"{:.2f}".format(roc_auc_model)} < 0.7)'
+    else:
+        roc_auc_lvl = f'Fail ( {"{:.2f}".format(roc_auc_model)} < 0.6)'
+    return roc_auc_model, roc_auc_lvl
+def model_comparison_view(
+    split_dataset: SplitDataset,
+    model_views: OrderedDict[str, ModelClass],
+):
+    """Render the "Model Comparison" section: ROC AUC text, ROC plot, calibration plot.
+
+    Args:
+        split_dataset: Supplies the true test labels (``y_test``).
+        model_views: Mapping of model name to its ``ModelClass``.
+    """
+    st.header("Model Comparison")
+    # One ROC AUC summary per model.
+    for model_name, model_view in model_views.items():
+        roc_auc_model, roc_auc_lvl = roc_auc_for_model(
+            split_dataset, model_view
+        )
+        st.subheader(
+            f"Receiver Operating Characteristic (ROC) Curve - {model_name}"
+        )
+        st.markdown(
+            f'Area Under the Receiver Operating Characteristic Curve from prediction scores from {model_name} model is {roc_auc_model}.\n'
+        )
+        st.markdown(
+            f'The score of {"{:.2f}".format(roc_auc_model)} is in the {roc_auc_lvl} ROC AUC score category.'
+        )
+    # ROC comparison figure for all models, sized by user-adjustable settings.
+    fig1 = roc_auc_compare_n_models(
+        split_dataset.y_test,
+        model_views,
+    )
+    fig1 = fig1.figure
+    (xsize_roc, ysize_roc) = streamlit_chart_setting_height_width(
+        "Chart Settings", 7, 7, "xsize_roc", "ysize_roc"
+    )
+    fig1.set_size_inches(xsize_roc, ysize_roc)
+    st.pyplot(fig1)
+    st.subheader("Models Calibration Curve")
+    # Calibration figure; the literal 10 is passed through to the helper
+    # (presumably the number of bins - confirm in graphs_test).
+    fig2 = calibration_curve_report_commented_n(
+        split_dataset.y_test,
+        model_views,
+        10,
+    )
+    fig2 = fig2.figure
+    (xsize_cal, ysize_cal) = streamlit_chart_setting_height_width(
+        "Chart Settings", 7, 7, "xsize_cal", "ysize_cal"
+    )
+    fig2.set_size_inches(xsize_cal, ysize_cal)
+    st.pyplot(fig2)

src/models/util_predict_model.py ADDED Viewed

	@@ -0,0 +1,87 @@

+from typing import Union, cast
+from sklearn.linear_model import LogisticRegression
+import pandas as pd
+from dataclasses import dataclass
+from xgboost import XGBClassifier
+from src.features.util_build_features import SplitDataset
+from src.models.util_predict_model_threshold import (
+    user_defined_probability_threshold,
+    J_statistic_driven_probability_threshold,
+    tradeoff_threshold,
+    acceptance_rate_driven_threshold,
+    select_probability_threshold,
+    model_probability_values_df)
+import streamlit as st
+def probability_threshold_explainer(model_name):
+    st.write(
+        f"""
+            The {model_name} model (obtained using training data) is applied on testing data to predict the loans probabilities of defaulting.\n
+            Probabilities of defaulting of the loans are compared to a probability threshold.\n
+            A loan is predicted to default if its predicted probability of defaulting is greater than the probability threshold.
+            """
+    )
+@dataclass(frozen=True)
+class Threshold:
+    """Immutable result of the threshold-selection view."""
+    # Probability threshold picked by the user among the offered options.
+    probability_threshold_selected: float
+    # Predicted default status that goes with the selected threshold.
+    predicted_default_status: pd.Series
+    # Predicted default probabilities for the test set ("PROB_DEFAULT" column).
+    prediction_probability_df: pd.DataFrame
+def make_prediction_view(
+    model_name_short: str,
+    model_name: str,
+):
+    """Build the threshold-selection view function for one model.
+
+    Args:
+        model_name_short: Short name used to build unique widget keys.
+        model_name: Display name used in the explanatory text.
+
+    Returns:
+        A ``view(clf_xgbt_model, split_dataset)`` function that renders the
+        threshold sections and returns a ``Threshold``.
+    """
+    def view(
+        clf_xgbt_model: Union[XGBClassifier, LogisticRegression],
+        split_dataset: SplitDataset,
+    ) -> Threshold:
+        probability_threshold_explainer(model_name)
+        # Predicted default probabilities on the test predictors.
+        clf_prediction_prob_df_gbt = model_probability_values_df(
+            clf_xgbt_model,
+            split_dataset.X_test,
+        )
+        # Option 1: threshold chosen by the user with a slider.
+        (clf_thresh_predicted_default_status_user_gbt,
+         user_threshold
+         ) = user_defined_probability_threshold(
+            model_name_short, clf_xgbt_model, split_dataset)
+        # Option 2: threshold maximising Youden's J statistic.
+        (clf_thresh_predicted_default_status_Jstatistic_gbt,
+         J_statistic_best_threshold) = J_statistic_driven_probability_threshold(
+            clf_prediction_prob_df_gbt, clf_xgbt_model, split_dataset)
+        # Recall/accuracy trade-off chart (display only, nothing returned).
+        tradeoff_threshold(clf_prediction_prob_df_gbt, split_dataset)
+        # Option 3: threshold derived from a target acceptance rate.
+        (acc_rate_thresh_gbt,
+         clf_thresh_predicted_default_status_acceptance_gbt) = acceptance_rate_driven_threshold(model_name_short, clf_prediction_prob_df_gbt)
+        # Let the user pick which of the three thresholds to carry forward.
+        (prob_thresh_selected_gbt,
+         predicted_default_status_gbt) = select_probability_threshold(model_name_short,
+                                                                      user_threshold,
+                                                                      clf_thresh_predicted_default_status_user_gbt,
+                                                                      J_statistic_best_threshold,
+                                                                      clf_thresh_predicted_default_status_Jstatistic_gbt,
+                                                                      acc_rate_thresh_gbt,
+                                                                      clf_thresh_predicted_default_status_acceptance_gbt)
+        return Threshold(
+            probability_threshold_selected=cast(
+                float, prob_thresh_selected_gbt
+            ),
+            predicted_default_status=predicted_default_status_gbt,
+            prediction_probability_df=clf_prediction_prob_df_gbt,
+        )
+    return view

src/models/util_predict_model_threshold.py ADDED Viewed

	@@ -0,0 +1,310 @@

+import streamlit as st
+from sklearn.metrics import classification_report, roc_curve
+import numpy as np
+import plotly.express as px
+import pandas as pd
+from numpy import argmax
+from src.visualization.metrics import streamlit_2columns_metrics_df, streamlit_2columns_metrics_pct_df
+from src.visualization.graphs_threshold import acceptance_rate_driven_threshold_graph
+def model_probability_values_df(model, X):
+    return pd.DataFrame(model.predict_proba(X)[:, 1], columns=["PROB_DEFAULT"])
+def find_best_threshold_J_statistic(y, clf_prediction_prob_df):
+    fpr, tpr, thresholds = roc_curve(y, clf_prediction_prob_df)
+    # get the best threshold
+    # Youden’s J statistic tpr-fpr
+    # Argmax to get the index in
+    # thresholds
+    return thresholds[argmax(tpr - fpr)]
+# Function that makes dataframe with probability of default, predicted default status based on threshold
+# and actual default status
+def classification_report_per_threshold(
+    threshold_list, threshold_default_status_list, y_test
+):
+    target_names = ["Non-Default", "Default"]
+    classification_report_list = []
+    for threshold_default_status in threshold_default_status_list:
+        thresh_classification_report = classification_report(
+            y_test,
+            threshold_default_status,
+            target_names=target_names,
+            output_dict=True,
+            zero_division=0,
+        )
+        classification_report_list.append(thresh_classification_report)
+    # Return threshold classification report dict
+    return dict(zip(threshold_list, classification_report_list))
+def thresh_classification_report_recall_accuracy(
+    thresh_classification_report_dict,
+):
+    thresh_def_recalls_list = []
+    thresh_nondef_recalls_list = []
+    thresh_accs_list = []
+    for x in [*thresh_classification_report_dict]:
+        thresh_def_recall = thresh_classification_report_dict[x]["Default"][
+            "recall"
+        ]
+        thresh_def_recalls_list.append(thresh_def_recall)
+        thresh_nondef_recall = thresh_classification_report_dict[x][
+            "Non-Default"
+        ]["recall"]
+        thresh_nondef_recalls_list.append(thresh_nondef_recall)
+        thresh_accs = thresh_classification_report_dict[x]["accuracy"]
+        thresh_accs_list.append(thresh_accs)
+    return [
+        thresh_def_recalls_list,
+        thresh_nondef_recalls_list,
+        thresh_accs_list,
+    ]
+def apply_threshold_to_probability_values(probability_values, threshold):
+    return (
+        probability_values["PROB_DEFAULT"]
+        .apply(lambda x: 1 if x > threshold else 0)
+        .rename("PREDICT_DEFAULT_STATUS")
+    )
+@st.cache(suppress_st_warning=True)
+def find_best_threshold_J_statistic(y, clf_prediction_prob_df):
+    fpr, tpr, thresholds = roc_curve(y, clf_prediction_prob_df)
+    # get the best threshold
+    J = tpr - fpr  # Youden’s J statistic
+    ix = argmax(J)
+    return thresholds[ix]
+def default_status_per_threshold(threshold_list, prob_default):
+    threshold_default_status_list = []
+    for threshold in threshold_list:
+        threshold_default_status = prob_default.apply(
+            lambda x: 1 if x > threshold else 0
+        )
+        threshold_default_status_list.append(threshold_default_status)
+    return threshold_default_status_list
+def threshold_and_predictions(clf_xgbt_model, split_dataset, threshold):
+    clf_prediction_prob_df_gbt = model_probability_values_df(
+        clf_xgbt_model,
+        split_dataset.X_test,
+    )
+    clf_thresh_predicted_default_status = (
+        apply_threshold_to_probability_values(
+            clf_prediction_prob_df_gbt,
+            threshold,
+        )
+    )
+    streamlit_2columns_metrics_df(
+        "# of Predicted Defaults",
+        "# of Predicted Non-Default",
+        clf_thresh_predicted_default_status,
+    )
+    streamlit_2columns_metrics_pct_df(
+        "% of Loans Predicted to Default",
+        "% of Loans Predicted not to Default",
+        clf_thresh_predicted_default_status,
+    )
+    return clf_thresh_predicted_default_status
+def user_defined_probability_threshold(model_name_short, clf_xgbt_model, split_dataset):
+    st.subheader("Classification Probability Threshold - User Defined")
+    user_defined_threshold = st.slider(
+        label="Default Probability Threshold:",
+        min_value=0.0,
+        max_value=1.0,
+        value=0.8,
+        key=f"threshold_{model_name_short}_default",
+    )
+    clf_thresh_predicted_default_status = threshold_and_predictions(
+        clf_xgbt_model, split_dataset, user_defined_threshold)
+    return clf_thresh_predicted_default_status, user_defined_threshold
+def J_statistic_driven_probability_threshold(clf_prediction_prob_df_gbt, clf_xgbt_model, split_dataset):
+    st.subheader("J Statistic Driven Classification Probability Threshold")
+    J_statistic_best_threshold = find_best_threshold_J_statistic(
+        split_dataset.y_test, clf_prediction_prob_df_gbt
+    )
+    st.metric(
+        label="Youden's J statistic calculated best threshold",
+        value=J_statistic_best_threshold,
+    )
+    clf_thresh_predicted_default_status = threshold_and_predictions(
+        clf_xgbt_model, split_dataset, J_statistic_best_threshold)
+    return clf_thresh_predicted_default_status, J_statistic_best_threshold
+def create_tradeoff_graph(df):
+    fig2 = px.line(
+        data_frame=df,
+        y=["Default Recall", "Non Default Recall", "Accuracy"],
+        x="Threshold",
+    )
+    fig2.update_layout(
+        title="Recall and Accuracy score Trade-off with Probability Threshold",
+        xaxis_title="Probability Threshold",
+        yaxis_title="Score",
+    )
+    fig2.update_yaxes(range=[0.0, 1.0])
+    st.plotly_chart(fig2)
+def tradeoff_threshold(clf_prediction_prob_df_gbt, split_dataset):
+    st.subheader(
+        "Recall and Accuracy Tradeoff with given Probability Threshold"
+    )
+    threshold_list = np.arange(
+        0, 1, 0.025).round(decimals=3).tolist()
+    threshold_default_status_list = default_status_per_threshold(
+        threshold_list, clf_prediction_prob_df_gbt["PROB_DEFAULT"]
+    )
+    thresh_classification_report_dict = (
+        classification_report_per_threshold(
+            threshold_list,
+            threshold_default_status_list,
+            split_dataset.y_test,
+        )
+    )
+    (
+        thresh_def_recalls_list,
+        thresh_nondef_recalls_list,
+        thresh_accs_list,
+    ) = thresh_classification_report_recall_accuracy(
+        thresh_classification_report_dict
+    )
+    namelist = [
+        "Default Recall",
+        "Non Default Recall",
+        "Accuracy",
+        "Threshold",
+    ]
+    df = pd.DataFrame(
+        [
+            thresh_def_recalls_list,
+            thresh_nondef_recalls_list,
+            thresh_accs_list,
+            threshold_list,
+        ],
+        index=namelist,
+    )
+    df = df.T
+    create_tradeoff_graph(df)
+def select_probability_threshold(model_name_short,
+                                 user_defined_threshold,
+                                 clf_thresh_predicted_default_status_user_gbt,
+                                 J_statistic_best_threshold,
+                                 clf_thresh_predicted_default_status_Jstatistic_gbt,
+                                 acc_rate_thresh_gbt,
+                                 clf_thresh_predicted_default_status_acceptance_gbt):
+    st.subheader("Selected Probability Threshold")
+    options = [
+        "User Defined",
+        "J Statistic Driven",
+        "Acceptance Rate Driven",
+    ]
+    prob_thresh_option = st.radio(
+        label="Selected Probability Threshold",
+        options=options,
+        key=f"{model_name_short}_radio_thresh",
+    )
+    if prob_thresh_option == "User Defined":
+        prob_thresh_selected_gbt = user_defined_threshold
+        predicted_default_status_gbt = (
+            clf_thresh_predicted_default_status_user_gbt
+        )
+    elif prob_thresh_option == "J Statistic Driven":
+        prob_thresh_selected_gbt = J_statistic_best_threshold
+        predicted_default_status_gbt = (
+            clf_thresh_predicted_default_status_Jstatistic_gbt
+        )
+    else:
+        prob_thresh_selected_gbt = acc_rate_thresh_gbt
+        predicted_default_status_gbt = (
+            clf_thresh_predicted_default_status_acceptance_gbt
+        )
+    st.write(
+        f"Selected probability threshold is {prob_thresh_selected_gbt}"
+    )
+    return prob_thresh_selected_gbt, predicted_default_status_gbt
+def acceptance_rate_driven_threshold(model_name_short, clf_prediction_prob_df_gbt):
+    st.subheader("Acceptance Rate Driven Probability Threshold")
+    # Steps
+    # Set acceptance rate
+    # Get default status per threshold
+    # Get classification report per threshold
+    # Get recall, nondef recall, and accuracy per threshold
+    acceptance_rate = (
+        st.slider(
+            label="% of loans accepted (acceptance rate):",
+            min_value=0,
+            max_value=100,
+            value=85,
+            key=f"acceptance_rate_{model_name_short}",
+            format="%f%%",
+        )
+        / 100
+    )
+    acc_rate_thresh_gbt = np.quantile(
+        clf_prediction_prob_df_gbt["PROB_DEFAULT"], acceptance_rate
+    )
+    st.write(
+        f"An acceptance rate of {acceptance_rate} results in probability threshold of {acc_rate_thresh_gbt}"
+    )
+    acceptance_rate_driven_threshold_graph(
+        clf_prediction_prob_df_gbt, acc_rate_thresh_gbt)
+    clf_thresh_predicted_default_status_acceptance_gbt = apply_threshold_to_probability_values(
+        clf_prediction_prob_df_gbt,
+        acc_rate_thresh_gbt,
+    )
+    return acc_rate_thresh_gbt, clf_thresh_predicted_default_status_acceptance_gbt

src/models/util_strategy_table.py ADDED Viewed

	@@ -0,0 +1,96 @@

+from typing import OrderedDict
+import plotly.express as px
+import numpy as np
+import streamlit as st
+from src.models.util_test import create_strategyTable_df
+from src.models.util_model_class import ModelClass
+def strategy_table_view(
+    currency: str, model_views: OrderedDict[str, ModelClass]
+):
+    st.header("Strategy Table")
+    for (model_name, model_view) in model_views.items():
+        st.subheader(model_name)
+        strat_df = create_strategyTable_df(
+            0.05,
+            1,
+            20,
+            model_view.trueStatus_probabilityDefault_threshStatus_loanAmount_df,
+            "loan_status",
+            currency,
+        )
+        columns = strat_df.columns
+        with st.expander("Strategy Table:"):
+            st.write(strat_df)
+        for i in columns:
+            strat_df[i] = strat_df[i].astype(np.float64)
+        strat_df_boxPlot_data = strat_df.iloc[:, 0:3]
+        plot = px.box(data_frame=strat_df_boxPlot_data)
+        st.plotly_chart(plot)
+        # Plot the strategy curve
+        fig1 = px.line(
+            strat_df_boxPlot_data,
+            x="Acceptance Rate",
+            y="Bad Rate",
+            title="Acceptance and Bad Rates",
+        )
+        st.plotly_chart(fig1)
+        fig2 = px.line(
+            strat_df,
+            x="Acceptance Rate",
+            y=f"Estimated Value ({currency})",
+            title=f"Estimated Value ({currency}) by Acceptance Rate",
+        )
+        st.plotly_chart(fig2)
+        st.write("Row with the greatest estimated value:")
+        max_estimated_value = np.max(
+            strat_df[f"Estimated Value ({currency})"].astype(np.float64)
+        )
+        columns = strat_df.columns
+        max_estimated_value = np.max(strat_df[f"Estimated Value ({currency})"])
+        st.write(
+            strat_df.loc[
+                strat_df[f"Estimated Value ({currency})"]
+                == max_estimated_value
+            ]
+        )
+        loss_given_default = 1
+        df_trueStatus_probabilityDefault_threshStatus_loanAmount = (
+            model_view.trueStatus_probabilityDefault_threshStatus_loanAmount_df[
+                "PROB_DEFAULT"
+            ]
+            * loss_given_default
+            * model_view.trueStatus_probabilityDefault_threshStatus_loanAmount_df[
+                "loan_amnt"
+            ]
+        )
+        tot_exp_loss = round(
+            np.sum(df_trueStatus_probabilityDefault_threshStatus_loanAmount),
+            2,
+        )
+        st.metric(
+            label='Total expected loss:',
+            value=f"{currency} {tot_exp_loss:,.2f}",
+            delta=None,
+            delta_color="normal",
+        )

src/models/util_test.py ADDED Viewed

	@@ -0,0 +1,568 @@

+from typing import Union
+import pandas as pd
+from sklearn.model_selection import StratifiedKFold, cross_val_score
+import streamlit as st
+import numpy as np
+from sklearn.metrics import (
+    classification_report,
+    confusion_matrix,
+)
+from sklearn.linear_model import LogisticRegression
+import xgboost as xgb
+from xgboost.sklearn import XGBClassifier
+from src.features.util_build_features import SplitDataset
+"""from src.models.model_utils import (
+    create_cross_validation_df,
+    cross_validation_scores,
+    get_df_trueStatus_probabilityDefault_threshStatus_loanAmount,
+)"""
+from src.visualization.graphs_test import (
+    cross_validation_graph,
+)
+def make_tests_view(
+    model_name_short: str,
+    model_name_generic: str,
+):
+    """Build the Streamlit "Model Evaluation" view for one model.
+
+    Args:
+        model_name_short: Suffix appended to every widget ``key`` in the view.
+        model_name_generic: Name shown in the page header.
+
+    Returns:
+        The ``view`` callable defined below.
+    """
+    def view(
+        clf_xgbt_model: Union[XGBClassifier, LogisticRegression],
+        split_dataset: SplitDataset,
+        currency: str,
+        prob_thresh_selected,
+        predicted_default_status,
+    ):
+        """Render the cross validation, classification report, confusion matrix and bad rate sections.
+
+        Args:
+            clf_xgbt_model: Classifier passed to ``cross_validation_scores``
+                and to the helper that calls ``predict_proba``.
+            split_dataset: Provides ``X_test`` and ``y_test``; every
+                computation in this view uses the test split.
+            currency: Label placed in front of monetary amounts.
+            prob_thresh_selected: Probability threshold forwarded to
+                ``get_df_trueStatus_probabilityDefault_threshStatus_loanAmount``.
+            predicted_default_status: Predicted labels compared with
+                ``y_test`` in the classification report and confusion matrix.
+
+        Returns:
+            The DataFrame built by
+            ``get_df_trueStatus_probabilityDefault_threshStatus_loanAmount``.
+        """
+        st.header(f"Model Evaluation - {model_name_generic}")
+        st.subheader("Cross Validation")
+        st.write("Shows how our model will perform as new loans come in.")
+        st.write(
+            "If evaluation metric for test and train set improve as models \
+            train on each fold suggests performance will be stable."
+        )
+        st.write('xgb cross validation test:')
+        # Inputs for the xgb.cv run: seed and evaluation metric.
+        stcol_seed, stcol_eval_metric = st.columns(2)
+        with stcol_seed:
+            cv_seed = int(
+                st.number_input(
+                    label="Random State Seed for Cross Validation:",
+                    value=123235,
+                    key=f"cv_seed_{model_name_short}",
+                )
+            )
+        with stcol_eval_metric:
+            eval_metric = st.selectbox(
+                label="Select evaluation metric",
+                options=[
+                    "auc",
+                    "aucpr",
+                    "rmse",
+                    "mae",
+                    "logloss",
+                    "error",
+                    "merror",
+                    "mlogloss",
+                ],
+                key=f"eval_metric_{model_name_short}",
+            )
+        # Remaining xgb.cv inputs: boosting rounds, folds, early stopping.
+        stcol_trees, stcol_eval_nfold, stcol_earlystoppingrounds = st.columns(
+            3
+        )
+        with stcol_trees:
+            trees = int(
+                st.number_input(
+                    label="Number of trees",
+                    value=5,
+                    key=f"trees_{model_name_short}",
+                )
+            )
+        with stcol_eval_nfold:
+            nfolds = int(
+                st.number_input(
+                    label="Number of folds",
+                    value=5,
+                    key=f"nfolds_{model_name_short}",
+                )
+            )
+        with stcol_earlystoppingrounds:
+            early_stopping_rounds = int(
+                st.number_input(
+                    label="Early stopping rounds",
+                    value=10,
+                    key=f"early_stopping_rounds_{model_name_short}",
+                )
+            )
+        # Cross validation is run on the test split; the returned DTrain is
+        # not used afterwards in this function.
+        DTrain, cv_df = create_cross_validation_df(
+            split_dataset.X_test,
+            split_dataset.y_test,
+            eval_metric,
+            cv_seed,
+            trees,
+            nfolds,
+            early_stopping_rounds,
+        )
+        st.write(cv_df)
+        # Scoring names offered for the sklearn cross validation further down.
+        scoring_options = [
+            "roc_auc",
+            "accuracy",
+            "precision",
+            "recall",
+            "f1",
+            "jaccard",
+        ]
+        overfit_test = st.radio(
+            label="Overfit test:",
+            options=("No", "Yes"),
+            key=f"overfit_test_{model_name_short}",
+        )
+        if overfit_test == "Yes":
+            st.write("Overfit test:")
+            iterations = int(
+                st.number_input(
+                    label="Number of folds (iterations)",
+                    value=500,
+                    key=f"iterations_{model_name_short}",
+                )
+            )
+            # `iterations` is passed both as the number of boosting rounds
+            # and as the early-stopping window.
+            DTrain, cv_df_it = create_cross_validation_df(
+                split_dataset.X_test,
+                split_dataset.y_test,
+                eval_metric,
+                cv_seed,
+                iterations,
+                nfolds,
+                iterations,
+            )
+            fig_it = cross_validation_graph(cv_df_it, eval_metric, iterations)
+            st.pyplot(fig_it)
+        st.write("Sklearn cross validation test:")
+        stcol_scoringmetric, st_nfold = st.columns(2)
+        with stcol_scoringmetric:
+            score_metric = st.selectbox(
+                label="Select score",
+                options=scoring_options,
+                key=f"stcol_scoringmetric_{model_name_short}",
+            )
+        with st_nfold:
+            nfolds_score = int(
+                st.number_input(
+                    label="Number of folds",
+                    value=5,
+                    key=f"st_nfold_{model_name_short}",
+                )
+            )
+        # Per-fold scores for the chosen metric, reusing the seed chosen above.
+        cv_scores = cross_validation_scores(
+            clf_xgbt_model,
+            split_dataset.X_test,
+            split_dataset.y_test,
+            nfolds_score,
+            score_metric,
+            cv_seed,
+        )
+        stcol_vals, stcol_mean, st_std = st.columns(3)
+        with stcol_vals:
+            st.markdown(f"{score_metric} scores:")
+            st.write(
+                pd.DataFrame(
+                    cv_scores,
+                    columns=[score_metric],
+                )
+            )
+        with stcol_mean:
+            st.metric(
+                label=f"Average {score_metric} score ",
+                value="{:.4f}".format(cv_scores.mean()),
+                delta=None,
+                delta_color="normal",
+            )
+        with st_std:
+            st.metric(
+                label=f"{score_metric} standard deviation (+/-)",
+                value="{:.4f}".format(cv_scores.std()),
+                delta=None,
+                delta_color="normal",
+            )
+        st.subheader("Classification Report")
+        target_names = ["Non-Default", "Default"]
+        # output_dict=True so individual metrics can be looked up by class name.
+        classification_report_dict = classification_report(
+            split_dataset.y_test,
+            predicted_default_status,
+            target_names=target_names,
+            output_dict=True,
+        )
+        (
+            stcol_defaultpres,
+            stcol_defaultrecall,
+            stcol_defaultf1score,
+            stcol_f1score,
+        ) = st.columns(4)
+        with stcol_defaultpres:
+            st.metric(
+                label="Default Precision",
+                value="{:.0%}".format(
+                    classification_report_dict["Default"]["precision"]
+                ),
+                delta=None,
+                delta_color="normal",
+            )
+        with stcol_defaultrecall:
+            st.metric(
+                label="Default Recall",
+                value="{:.0%}".format(
+                    classification_report_dict["Default"]["recall"]
+                ),
+                delta=None,
+                delta_color="normal",
+            )
+        with stcol_defaultf1score:
+            st.metric(
+                label="Default F1 Score",
+                value="{:.2f}".format(
+                    classification_report_dict["Default"]["f1-score"]
+                ),
+                delta=None,
+                delta_color="normal",
+            )
+        with stcol_f1score:
+            st.metric(
+                label="Macro avg F1 Score (Model F1 Score):",
+                value="{:.2f}".format(
+                    classification_report_dict["macro avg"]["f1-score"]
+                ),
+                delta=None,
+                delta_color="normal",
+            )
+        with st.expander("Classification Report Dictionary:"):
+            st.write(classification_report_dict)
+        # Plain-language restatement of the metrics shown above.
+        st.markdown(
+            f'Default precision: {"{:.0%}".format(classification_report_dict["Default"]["precision"])} of loans predicted as default were actually default.'
+        )
+        st.markdown(
+            f'Default recall: {"{:.0%}".format(classification_report_dict["Default"]["recall"])} of true defaults predicted correctly.'
+        )
+        f1_gap = 1 - classification_report_dict["Default"]["f1-score"]
+        st.markdown(
+            f'Default F1 score: {"{:.2f}".format(classification_report_dict["Default"]["f1-score"])}\
+                is {"{:.2f}".format(f1_gap)} away from perfect precision and recall (no false positive rate).'
+        )
+        st.markdown(
+            f'macro avg F1 score: {"{:.2f}".format(classification_report_dict["macro avg"]["f1-score"])} is the models F1 score.'
+        )
+        st.subheader("Confusion Matrix")
+        # The matrix is computed twice: once for display and once flattened
+        # into the four counts (ravel() order: tn, fp, fn, tp).
+        confuctiomatrix_dict = confusion_matrix(
+            split_dataset.y_test, predicted_default_status
+        )
+        tn, fp, fn, tp = confusion_matrix(
+            split_dataset.y_test, predicted_default_status
+        ).ravel()
+        with st.expander(
+            "Confusion matrix (column name = classification model prediction, row name = true status, values = number of loans"
+        ):
+            st.write(confuctiomatrix_dict)
+        # Each count is also shown as a share of all predictions.
+        st.markdown(
+            f'{tp} ,\
+            {"{:.0%}".format(tp / len(predicted_default_status))} \
+                true positives (defaults correctly predicted as defaults).'
+        )
+        st.markdown(
+            f'{fp} ,\
+            {"{:.0%}".format(fp / len(predicted_default_status))} \
+                false positives (non-defaults incorrectly predicted as defaults).'
+        )
+        st.markdown(
+            f'{fn} ,\
+            {"{:.0%}".format(fn / len(predicted_default_status))} \
+                false negatives (defaults incorrectly predicted as non-defaults).'
+        )
+        st.markdown(
+            f'{tn} ,\
+            {"{:.0%}".format(tn / len(predicted_default_status))} \
+                true negatives (non-defaults correctly predicted as non-defaults).'
+        )
+        st.subheader("Bad Rate")
+        # One row per test loan: true status, default probability, status at
+        # the selected threshold, and loan amount.
+        df_trueStatus_probabilityDefault_threshStatus_loanAmount = (
+            get_df_trueStatus_probabilityDefault_threshStatus_loanAmount(
+                clf_xgbt_model,
+                split_dataset.X_test,
+                split_dataset.y_test,
+                prob_thresh_selected,
+                "loan_amnt",
+            )
+        )
+        with st.expander(
+            "Loan Status, Probability of Default, & Loan Amount DataFrame"
+        ):
+            st.write(df_trueStatus_probabilityDefault_threshStatus_loanAmount)
+        # Accepted loans are the rows predicted as non-default (status 0).
+        accepted_loans = (
+            df_trueStatus_probabilityDefault_threshStatus_loanAmount[
+                df_trueStatus_probabilityDefault_threshStatus_loanAmount[
+                    "PREDICT_DEFAULT_STATUS"
+                ]
+                == 0
+            ]
+        )
+        # Sum of loan_status over the number of accepted loans; this is the
+        # share of defaults assuming loan_status is 0/1 with 1 = default.
+        bad_rate = (
+            np.sum(accepted_loans["loan_status"])
+            / accepted_loans["loan_status"].count()
+        )
+        with st.expander("Loan Amount Summary Statistics"):
+            st.write(
+                df_trueStatus_probabilityDefault_threshStatus_loanAmount[
+                    "loan_amnt"
+                ].describe()
+            )
+        avg_loan = np.mean(
+            df_trueStatus_probabilityDefault_threshStatus_loanAmount[
+                "loan_amnt"
+            ]
+        )
+        # Loan counts per (true status, predicted status) cell, scaled by the
+        # average loan amount to give an estimated monetary value.
+        crosstab_df = pd.crosstab(
+            df_trueStatus_probabilityDefault_threshStatus_loanAmount[
+                "loan_status"
+            ],  # row label
+            df_trueStatus_probabilityDefault_threshStatus_loanAmount[
+                "PREDICT_DEFAULT_STATUS"
+            ],
+        ).apply(
+            lambda x: x * avg_loan, axis=0
+        )  # column label
+        with st.expander(
+            "Cross tabulation (column name = classification model prediction, row name = true status, values = number of loans * average loan value"
+        ):
+            st.write(crosstab_df)
+        # Below, crosstab_df[col][row] reads column = predicted status and
+        # row = true status; a status value missing from the data would make
+        # the lookup fail.
+        st.write(
+            f'Bad rate: {"{:.2%}".format(bad_rate)} of all the loans the model accepted (classified as non-default) from the test set were actually defaults.'
+        )
+        st.write(
+            f'Estimated value of the bad rate is {currency} {"{:,.2f}".format(crosstab_df[0][1])}.'
+        )
+        # NOTE(review): this sum covers the predicted-non-default column
+        # (true non-default + true default), while the label says "actual
+        # non-default" - confirm which is intended.
+        st.write(
+            f'Total estimated value of actual non-default loans is {currency} {"{:,.2f}".format(crosstab_df[0][0]+crosstab_df[0][1])}'
+        )
+        st.write(
+            f'Estimated value of loans incorrectly predicted as default is {currency} {"{:,.2f}".format(crosstab_df[1][0])}'
+        )
+        st.write(
+            f'Estimated value of loans correctly predicted as defaults is {currency} {"{:,.2f}".format(crosstab_df[1][1])}'
+        )
+        return df_trueStatus_probabilityDefault_threshStatus_loanAmount
+    return view
+def cross_validation_scores(model, X, y, nfold, score, seed):
+    # return cv scores of metric
+    return cross_val_score(
+        model,
+        np.ascontiguousarray(X),
+        np.ravel(np.ascontiguousarray(y)),
+        cv=StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed),
+        scoring=score,
+    )
+def create_cross_validation_df(
+    X, y, eval_metric, seed, trees, n_folds, early_stopping_rounds
+):
+    # Test data x and y
+    DTrain = xgb.DMatrix(X, label=y)
+    # auc or logloss
+    params = {
+        "eval_metric": eval_metric,
+        "objective": "binary:logistic",  # logistic say 0 or 1 for loan status
+        "seed": seed,
+    }
+    # Create the data frame of cross validations
+    cv_df = xgb.cv(
+        params,
+        DTrain,
+        num_boost_round=trees,
+        nfold=n_folds,
+        early_stopping_rounds=early_stopping_rounds,
+        shuffle=True,
+    )
+    return [DTrain, cv_df]
+def create_accept_rate_list(start, end, samples):
+    return np.linspace(start, end, samples, endpoint=True)
+def create_strategyTable_df(
+    start, end, samples, actual_probability_predicted_acc_rate, true, currency
+):
+    accept_rates = create_accept_rate_list(start, end, samples)
+    thresholds_strat = []
+    bad_rates_start = []
+    Avg_Loan_Amnt = actual_probability_predicted_acc_rate[true].mean()
+    num_accepted_loans_start = []
+    for rate in accept_rates:
+        # Calculate the threshold for the acceptance rate
+        thresh = np.quantile(
+            actual_probability_predicted_acc_rate["PROB_DEFAULT"], rate
+        ).round(3)
+        # Add the threshold value to the list of thresholds
+        thresholds_strat.append(
+            np.quantile(
+                actual_probability_predicted_acc_rate["PROB_DEFAULT"], rate
+            ).round(3)
+        )
+        # Reassign the loan_status value using the threshold
+        actual_probability_predicted_acc_rate[
+            "PREDICT_DEFAULT_STATUS"
+        ] = actual_probability_predicted_acc_rate["PROB_DEFAULT"].apply(
+            lambda x: 1 if x > thresh else 0
+        )
+        # Create a set of accepted loans using this acceptance rate
+        accepted_loans = actual_probability_predicted_acc_rate[
+            actual_probability_predicted_acc_rate["PREDICT_DEFAULT_STATUS"]
+            == 0
+        ]
+        # Calculate and append the bad rate using the acceptance rate
+        bad_rates_start.append(
+            np.sum((accepted_loans[true]) / len(accepted_loans[true])).round(3)
+        )
+        # Accepted loans
+        num_accepted_loans_start.append(len(accepted_loans))
+    # Calculate estimated value
+    money_accepted_loans = [
+        accepted_loans * Avg_Loan_Amnt
+        for accepted_loans in num_accepted_loans_start
+    ]
+    money_bad_accepted_loans = [
+        2 * money_accepted_loan * bad_rate
+        for money_accepted_loan, bad_rate in zip(
+            money_accepted_loans, bad_rates_start
+        )
+    ]
+    zip_object = zip(money_accepted_loans, money_bad_accepted_loans)
+    estimated_value = [
+        money_accepted_loan - money_bad_accepted_loan
+        for money_accepted_loan, money_bad_accepted_loan in zip_object
+    ]
+    accept_rates = ["{:.2f}".format(elem) for elem in accept_rates]
+    thresholds_strat = ["{:.2f}".format(elem) for elem in thresholds_strat]
+    bad_rates_start = ["{:.2f}".format(elem) for elem in bad_rates_start]
+    estimated_value = ["{:.2f}".format(elem) for elem in estimated_value]
+    return (
+        pd.DataFrame(
+            zip(
+                accept_rates,
+                thresholds_strat,
+                bad_rates_start,
+                num_accepted_loans_start,
+                estimated_value,
+            ),
+            columns=[
+                "Acceptance Rate",
+                "Threshold",
+                "Bad Rate",
+                "Num Accepted Loans",
+                f"Estimated Value ({currency})",
+            ],
+        )
+        .sort_values(by="Acceptance Rate", axis=0, ascending=False)
+        .reset_index(drop=True)
+    )
+def get_df_trueStatus_probabilityDefault_threshStatus_loanAmount(
+    model, X, y, threshold, loan_amount_col_name
+):
+    true_status = y.to_frame()
+    loan_amount = X[loan_amount_col_name]
+    clf_prediction_prob = model.predict_proba(np.ascontiguousarray(X))
+    clf_prediction_prob_df = pd.DataFrame(
+        clf_prediction_prob[:, 1], columns=["PROB_DEFAULT"]
+    )
+    clf_thresh_predicted_default_status = (
+        clf_prediction_prob_df["PROB_DEFAULT"]
+        .apply(lambda x: 1 if x > threshold else 0)
+        .rename("PREDICT_DEFAULT_STATUS")
+    )
+    return pd.concat(
+        [
+            true_status.reset_index(drop=True),
+            clf_prediction_prob_df.reset_index(drop=True),
+            clf_thresh_predicted_default_status.reset_index(drop=True),
+            loan_amount.reset_index(drop=True),
+        ],
+        axis=1,
+    )

src/models/xgboost_model.py CHANGED Viewed

@@ -3,19 +3,20 @@ from src.features.build_features import SplitDataset
 from src.models.xgboost_train_model import xgboost_train_model
 from src.models.xgboost_predict_model import xgboost_predit_model
 from src.models.xgboost_test_model import xgboost_test_model
 from src.models.util_model_class import ModelClass
 def xgboost_class(split_dataset: SplitDataset, currency: str):
     # Train Model
-    clf_xgbt_model = xgboost_train_model(split_dataset, currency)
     # Predit using Trained Model
     clf_xgbt_predictions = xgboost_predit_model(
         clf_xgbt_model, split_dataset)
-    # Test Predictions of Trained Model
     df_trueStatus_probabilityDefault_threshStatus_loanAmount_xgbt = xgboost_test_model(
         clf_xgbt_model,
         split_dataset,

 from src.models.xgboost_train_model import xgboost_train_model
 from src.models.xgboost_predict_model import xgboost_predit_model
 from src.models.xgboost_test_model import xgboost_test_model
 from src.models.util_model_class import ModelClass
 def xgboost_class(split_dataset: SplitDataset, currency: str):
     # Train Model
+    clf_xgbt_model = xgboost_train_model(split_dataset)
     # Predit using Trained Model
     clf_xgbt_predictions = xgboost_predit_model(
         clf_xgbt_model, split_dataset)
+    # Test and Evaluate Model
     df_trueStatus_probabilityDefault_threshStatus_loanAmount_xgbt = xgboost_test_model(
         clf_xgbt_model,
         split_dataset,

src/visualization/graphs_decision_tree.py ADDED Viewed

	@@ -0,0 +1,23 @@

+import xgboost as xgb
+import streamlit as st
+import matplotlib.pyplot as plt
+from xgboost import plot_tree
+def plot_importance_gbt(clf_xgbt_model, barxsize, barysize):
+    axobject1 = xgb.plot_importance(clf_xgbt_model, importance_type="weight")
+    fig1 = axobject1.figure
+    st.write("Feature Importance Plot (Gradient Boosted Tree)")
+    fig1.set_size_inches(barxsize, barysize)
+    return fig1
+def plot_tree_gbt(treexsize, treeysize, clf_xgbt_model):
+    plot_tree(clf_xgbt_model)
+    fig2 = plt.gcf()
+    fig2.set_size_inches(treexsize, treeysize)
+    return fig2

src/visualization/graphs_download.py ADDED Viewed

	@@ -0,0 +1,17 @@

+import streamlit as st
+import matplotlib.pyplot as plt
+def download_importance_gbt(fig1, barxsize, barysize):
+    if st.button(
+        "Download Feature Importance Plot as png (Gradient Boosted Tree)"
+    ):
+        dpisize = max(barxsize, barysize)
+        plt.savefig("bar.png", dpi=dpisize * 96, bbox_inches="tight")
+        fig1.set_size_inches(barxsize, barysize)
+def download_tree_gbt(treexsize, treeysize):
+    if st.button("Download XGBoost Decision Tree Plot as png (Gradient Boosted Tree)"):
+        dpisize = max(treexsize, treeysize)
+        plt.savefig("tree.png", dpi=dpisize * 96, bbox_inches="tight")

src/visualization/graphs_settings.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import streamlit as st
+def streamlit_chart_setting_height_width(
+    title: str,
+    default_widthvalue: int,
+    default_heightvalue: int,
+    widthkey: str,
+    heightkey: str,
+):
+    with st.expander(title):
+        lbarx_col, lbary_col = st.columns(2)
+        with lbarx_col:
+            width_size = st.number_input(
+                label="Width in inches:",
+                value=default_widthvalue,
+                key=widthkey,
+            )
+        with lbary_col:
+            height_size = st.number_input(
+                label="Height in inches:",
+                value=default_heightvalue,
+                key=heightkey,
+            )
+    return width_size, height_size

src/visualization/graphs_test.py ADDED Viewed

	@@ -0,0 +1,78 @@

+from matplotlib import pyplot as plt
+from sklearn.metrics import roc_curve
+from typing import OrderedDict
+from src.models.util_model_class import ModelClass
+from sklearn.calibration import calibration_curve
+def cross_validation_graph(cv, eval_metric, trees):
+    # Plot the test AUC scores for each iteration
+    fig = plt.figure()
+    plt.plot(cv[cv.columns[2]])
+    plt.title(
+        "Test {eval_metric} Score Over {it_numbr} Iterations".format(
+            eval_metric=eval_metric, it_numbr=trees
+        )
+    )
+    plt.xlabel("Iteration Number")
+    plt.ylabel("Test {eval_metric} Score".format(eval_metric=eval_metric))
+    return fig
+def roc_auc_compare_n_models(y, model_views: OrderedDict[str, ModelClass]):
+    colors = ["blue", "green"]
+    fig = plt.figure()
+    for color_idx, (model_name, model_view) in enumerate(model_views.items()):
+        fpr, tpr, _thresholds = roc_curve(
+            y, model_view.prediction_probability_df
+        )
+        plt.plot(fpr, tpr, color=colors[color_idx], label=f"{model_name}")
+    plt.plot([0, 1], [0, 1], linestyle="--", label="Random Prediction")
+    model_names = list(model_views.keys())
+    if not model_names:
+        model_name_str = "None"
+    elif len(model_names) == 1:
+        model_name_str = model_names[0]
+    else:
+        model_name_str = " and ".join(
+            [", ".join(model_names[:-1]), model_names[-1]]
+        )
+    plt.title(f"ROC Chart for {model_name_str} on the Probability of Default")
+    plt.xlabel("False Positive Rate (FP Rate)")
+    plt.ylabel("True Positive Rate (TP Rate)")
+    plt.legend()
+    plt.grid(False)
+    plt.xlim(0, 1)
+    plt.ylim(0, 1)
+    return fig
+def calibration_curve_report_commented_n(
+    y, model_views: OrderedDict[str, ModelClass], bins: int
+):
+    fig = plt.figure()
+    for model_name, model_view in model_views.items():
+        frac_of_pos, mean_pred_val = calibration_curve(
+            y,
+            model_view.prediction_probability_df,
+            n_bins=bins,
+            normalize=True,
+        )
+        plt.plot(mean_pred_val, frac_of_pos, "s-", label=f"{model_name}")
+    # Create the calibration curve plot with the guideline
+    plt.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
+    plt.ylabel("Fraction of positives")
+    plt.xlabel("Average Predicted Probability")
+    plt.title("Calibration Curve")
+    plt.legend()
+    plt.grid(False)
+    plt.xlim(0, 1)
+    plt.ylim(0, 1)
+    return fig

src/visualization/graphs_threshold.py ADDED Viewed

	@@ -0,0 +1,80 @@

+import plotly.express as px
+import streamlit as st
+import matplotlib.pyplot as plt
+import numpy as np
+def acceptance_rate_driven_threshold_graph(clf_prediction_prob_df_gbt, acc_rate_thresh_gbt):
+    figa = px.histogram(clf_prediction_prob_df_gbt["PROB_DEFAULT"])
+    figa.update_layout(
+        title="Acceptance Rate Threshold vs. Loans Accepted",
+        xaxis_title="Acceptance Rate Threshold",
+        yaxis_title="Loans Accepted",
+    )
+    figa.update_traces(marker_line_width=1, marker_line_color="white")
+    figa.add_vline(
+        x=acc_rate_thresh_gbt,
+        line_width=3,
+        line_dash="solid",
+        line_color="red",
+    )
+    st.plotly_chart(figa)
+def recall_accuracy_threshold_tradeoff_fig(
+    widthsize,
+    heightsize,
+    threshold_list,
+    thresh_def_recalls_list,
+    thresh_nondef_recalls_list,
+    thresh_accs_list,
+):
+    fig = plt.figure(figsize=(widthsize, heightsize))
+    plt.plot(threshold_list, thresh_def_recalls_list, label="Default Recall")
+    plt.plot(
+        threshold_list, thresh_nondef_recalls_list, label="Non-Default Recall"
+    )
+    plt.plot(threshold_list, thresh_accs_list, label="Model Accuracy")
+    plt.xlabel("Probability Threshold")
+    plt.ylabel("Score")
+    plt.xlim(0, 1)
+    plt.ylim(0, 1)
+    plt.legend()
+    plt.title("Recall and Accuracy Score Tradeoff with Probability Threshold")
+    plt.grid(False)
+    return fig
+def acceptance_rate_threshold_fig(probability_default, acceptancerate, bins):
+    # Probability distribution
+    probability_stat_distribution = probability_default.describe()
+    # Acceptance rate threshold
+    acc_rate_thresh = np.quantile(probability_default, acceptancerate)
+    fig = plt.figure()
+    plt.hist(
+        probability_default,
+        color="blue",
+        bins=bins,
+        histtype="bar",
+        ec="white",
+    )
+    # Add a reference line to the plot for the threshold
+    plt.axvline(x=acc_rate_thresh, color="red")
+    plt.title("Acceptance Rate Thershold")
+    return (
+        fig,
+        probability_stat_distribution,
+        acc_rate_thresh,
+    )

src/visualization/metrics.py ADDED Viewed

	@@ -0,0 +1,132 @@

+import pandas as pd
+import streamlit as st
+def streamlit_2columns_metrics_pct_df(
+    column1name_label: str,
+    column2name_label: str,
+    df: pd.DataFrame,
+):
+    (
+        column1name,
+        column2name,
+    ) = st.columns(2)
+    with column1name:
+        st.metric(
+            label=column1name_label,
+            value="{:.0%}".format(df.value_counts().get(1) / df.shape[0]),
+            delta=None,
+            delta_color="normal",
+        )
+    with column2name:
+        st.metric(
+            label=column2name_label,
+            value="{:.0%}".format(df.value_counts().get(0) / df.shape[0]),
+            delta=None,
+            delta_color="normal",
+        )
+def streamlit_2columns_metrics_df(
+    column1name_label: str,
+    column2name_label: str,
+    df: pd.DataFrame,
+):
+    (
+        column1name,
+        column2name,
+    ) = st.columns(2)
+    with column1name:
+        st.metric(
+            label=column1name_label,
+            value=df.value_counts().get(1),
+            delta=None,
+            delta_color="normal",
+        )
+    with column2name:
+        st.metric(
+            label=column2name_label,
+            value=df.value_counts().get(0),
+            delta=None,
+            delta_color="normal",
+        )
+def streamlit_2columns_metrics_df_shape(df: pd.DataFrame):
+    (
+        column1name,
+        column2name,
+    ) = st.columns(2)
+    with column1name:
+        st.metric(
+            label="Rows",
+            value=df.shape[0],
+            delta=None,
+            delta_color="normal",
+        )
+    with column2name:
+        st.metric(
+            label="Columns",
+            value=df.shape[1],
+            delta=None,
+            delta_color="normal",
+        )
+def streamlit_2columns_metrics_pct_series(
+    column1name_label: str,
+    column2name_label: str,
+    series: pd.Series,
+):
+    (
+        column1name,
+        column2name,
+    ) = st.columns(2)
+    with column1name:
+        st.metric(
+            label=column1name_label,
+            value="{:.0%}".format(series.get(1) / series.sum()),
+            delta=None,
+            delta_color="normal",
+        )
+    with column2name:
+        st.metric(
+            label=column2name_label,
+            value="{:.0%}".format(series.get(0) / series.sum()),
+            delta=None,
+            delta_color="normal",
+        )
+def streamlit_2columns_metrics_series(
+    column1name_label: str,
+    column2name_label: str,
+    series: pd.Series,
+):
+    (
+        column1name,
+        column2name,
+    ) = st.columns(2)
+    with column1name:
+        st.metric(
+            label=column1name_label,
+            value=series.get(1),
+            delta=None,
+            delta_color="normal",
+        )
+    with column2name:
+        st.metric(
+            label=column2name_label,
+            value=series.get(0),
+            delta=None,
+            delta_color="normal",
+        )