Spaces:

pkiage
/

credit_risk_modeling_demo

Build error

App Files Files Community

pkiage committed on Feb 8, 2022

Commit

37b5a88

1 Parent(s): 232e5e5

clean up

Browse files

Files changed (13) hide show

common/__init__.py +0 -0
common/data.py +0 -94
common/util.py +0 -391
common/views.py +0 -361
data_setup.py +0 -180
views/__init__.py +0 -0
views/decision_tree.py +0 -70
views/evaluation.py +0 -410
views/logistic.py +0 -119
views/model_comparison.py +0 -81
views/strategy_table.py +0 -96
views/threshold.py +0 -272
views/typing.py +0 -15

common/__init__.py DELETED Viewed

File without changes

common/data.py DELETED Viewed

@@ -1,94 +0,0 @@
-from typing import List, Union, cast
-from dataclasses import dataclass
-from sklearn.model_selection import train_test_split
-import pandas as pd
-from common.util import drop_columns
-@dataclass
-class SplitDataset:
-    """Train/test partition of the predictors (X) and the labels (y)."""
-    X_test: pd.DataFrame
-    X_train: pd.DataFrame
-    y_test: pd.Series
-    y_train: pd.Series
-    @staticmethod
-    def _side_by_side(features: pd.DataFrame, labels: pd.Series) -> pd.DataFrame:
-        # Both indexes are reset so rows are paired by position, not by label.
-        parts = cast(
-            List[Union[pd.DataFrame, pd.Series]],
-            [features.reset_index(drop=True), labels.reset_index(drop=True)],
-        )
-        return pd.concat(parts, axis=1)
-    @property
-    def X_y_test(self) -> pd.DataFrame:
-        """Test predictors with the test labels appended as the last column."""
-        return self._side_by_side(self.X_test, self.y_test)
-    @property
-    def X_y_train(self) -> pd.DataFrame:
-        """Training predictors with the training labels appended as the last column."""
-        return self._side_by_side(self.X_train, self.y_train)
-@dataclass
-class Dataset:
-    """Input frame together with the settings used to split it."""
-    df: pd.DataFrame
-    random_state: int
-    # Held as a percentage; train_test_split() below divides it by 100.
-    test_size: int
-    @property
-    def y_value(self) -> pd.Series:
-        """The ``loan_status`` column of ``df`` (a single column, i.e. a Series)."""
-        return self.df["loan_status"]
-    @property
-    def x_values(self) -> pd.DataFrame:
-        """``df`` without the label and the ``loan_grade_*`` columns."""
-        return cast(
-            pd.DataFrame,
-            drop_columns(
-                self.df,
-                [
-                    "loan_status",
-                    "loan_grade_A",
-                    "loan_grade_B",
-                    "loan_grade_C",
-                    "loan_grade_D",
-                    "loan_grade_E",
-                    "loan_grade_F",
-                    "loan_grade_G",
-                ],
-            ),
-        )
-    @property
-    def x_values_column_names(self) -> List[str]:
-        """Column names of ``x_values`` as a plain list."""
-        return self.x_values.columns.tolist()
-    def x_values_filtered_columns(self, columns: List[str]) -> pd.DataFrame:
-        """Return the requested columns, selected from ``df`` (not ``x_values``)."""
-        return self.df.filter(columns)
-    def train_test_split(
-        self, selected_x_values: pd.DataFrame
-    ) -> SplitDataset:
-        """Split ``selected_x_values`` and ``y_value`` using the stored settings."""
-        X_train, X_test, y_train, y_test = train_test_split(
-            selected_x_values,
-            self.y_value,
-            test_size=self.test_size / 100,  # stored as a percentage
-            random_state=self.random_state,
-        )
-        return SplitDataset(
-            X_train=cast(pd.DataFrame, X_train),
-            X_test=cast(pd.DataFrame, X_test),
-            y_train=cast(pd.Series, y_train),
-            y_test=cast(pd.Series, y_test),
-        )

common/util.py DELETED Viewed

@@ -1,391 +0,0 @@
-# DATA MANIPULATION & ANALYSIS
-import pickle
-import streamlit as st
-# Arrays
-import numpy as np
-# DataFrames and Series
-import pandas as pd
-# Returns the indices of the maximum values along an axis
-from numpy import argmax
-# MODELLING
-# Logistic regression
-from sklearn.linear_model import LogisticRegression
-from sklearn.model_selection import StratifiedKFold
-# XGBoosted Decision Trees
-import xgboost as xgb
-# REPORTING, EVALUATION, AND INTERPRETATION
-# Classification report
-from sklearn.metrics import classification_report
-# Receiver Operating Characteristic (ROC) curve
-from sklearn.metrics import roc_curve
-# Evaluate a score by cross-validation
-from sklearn.model_selection import cross_val_score
-# # Functions
-def drop_columns(df, columns):
-    """Return a new frame equal to ``df`` minus the given column label(s)."""
-    return df.drop(columns=columns)
-def remove_less_than_0_columns(df, column):
-    """Keep the rows of ``df`` with a non-zero value in ``column``.
-
-    ``column`` may be one label or a list of labels; with a list, a row is
-    kept when any of the listed columns is non-zero.
-
-    NOTE(review): despite the name, the test is ``!= 0`` rather than
-    ``>= 0``; that behaviour is kept. Missing values compare unequal to 0,
-    so such rows are kept as well.
-    """
-    # The original also called ``df[column].dropna()`` and threw the result
-    # away; that no-op is removed.
-    nonzero = df[column] != 0
-    if nonzero.ndim == 2:
-        # ``axis`` is passed by keyword: pandas 2.x rejects the positional form.
-        nonzero = nonzero.any(axis=1)
-    return df.loc[nonzero]
-def boolean_int_condition_label(df, label_column_name, condition):
-    """Turn a boolean condition into an integer label.
-
-    Returns ``(y, features)`` where ``y`` is ``condition`` cast to int and
-    named ``label_column_name``, and ``features`` is ``df`` without a column
-    of that name.
-    """
-    # Work on a copy: the original wrote the label column into the caller's
-    # frame and left it there.
-    labelled = df.copy()
-    labelled[label_column_name] = condition
-    y = labelled[label_column_name].astype(int)
-    return y, labelled.drop(label_column_name, axis=1)
-@st.cache(suppress_st_warning=True)
-def undersample_training_data(
-    df: pd.DataFrame, column_name: str, split_dataset
-):
-    """Balance the 0 and 1 classes of ``df`` by sampling both down.
-
-    The per-class sample size is the smaller of the two class counts found
-    in ``split_dataset.X_y_train[column_name]``.
-
-    Returns a list ``[X, y, X_y, class_counts]``: the sampled rows without
-    the label, the label alone, the sampled rows with the label, and the
-    label's value counts after sampling.
-    """
-    class_counts = split_dataset.X_y_train[column_name].value_counts()
-    # The original unpacked exactly two counts and raised ValueError when
-    # only one class was present; an absent class now counts as zero.
-    under_sample = min(class_counts.get(0, 0), class_counts.get(1, 0))
-    nondefaults = df[df[column_name] == 0]
-    defaults = df[df[column_name] == 1]
-    # NOTE(review): sampling is unseeded, so the chosen rows vary per call.
-    nondefaults_under = nondefaults.sample(under_sample)
-    defaults_under = defaults.sample(under_sample)
-    X_y_train_under = pd.concat(
-        [
-            nondefaults_under.reset_index(drop=True),
-            defaults_under.reset_index(drop=True),
-        ],
-        axis=0,
-    )
-    X_train_under = X_y_train_under.drop([column_name], axis=1)  # predictors only
-    y_train_under = X_y_train_under[column_name]  # label only
-    class_balance_default = y_train_under.value_counts()
-    return [
-        X_train_under,
-        y_train_under,
-        X_y_train_under,
-        class_balance_default,
-    ]
-def create_coeffient_feature_dictionary_logistic_model(
-    logistic_model, training_data
-):
-    """Map each column of ``training_data`` to the matching entry of ``coef_[0]``."""
-    first_row_coefficients = logistic_model.coef_[0, :]
-    return dict(zip(training_data.columns, first_row_coefficients))
-@st.cache(suppress_st_warning=True)
-def test_variables_logistic(X_train, y_train):
-    """Fit an lbfgs logistic regression on the training data and return it."""
-    classifier = LogisticRegression(solver="lbfgs")
-    # The labels are flattened to one dimension before fitting.
-    flat_labels = np.ravel(y_train)
-    return classifier.fit(X_train, flat_labels)
-@st.cache(suppress_st_warning=True)
-def print_coeff_logistic(clf_logistic_model, split_dataset):
-    """Return the feature-to-coefficient dictionary for the training columns."""
-    training_features = split_dataset.X_train
-    return create_coeffient_feature_dictionary_logistic_model(
-        clf_logistic_model, training_features
-    )
-@st.cache(suppress_st_warning=True, hash_funcs={
-    xgb.XGBClassifier: pickle.dumps
-})
-def test_variables_gbt(X_train, y_train):
-    """Fit an XGBoost classifier (learning_rate 0.1, max_depth 7) and return it."""
-    classifier = xgb.XGBClassifier(
-        learning_rate=0.1,
-        max_depth=7,
-        use_label_encoder=False,
-        eval_metric="logloss",
-    )
-    # ``eval_metric`` is already set on the estimator above; repeating it in
-    # fit() was redundant and is not accepted by newer xgboost releases.
-    return classifier.fit(X_train, np.ravel(y_train))
-# In[398]:
-def get_df_trueStatus_probabilityDefault_threshStatus_loanAmount(
-    model, X, y, threshold, loan_amount_col_name
-):
-    """Assemble one frame of labels, model output and loan amount.
-
-    The columns are, in order: ``y`` as a frame, ``PROB_DEFAULT`` (second
-    column of ``model.predict_proba``), ``PREDICT_DEFAULT_STATUS`` (1 where
-    the probability is strictly above ``threshold``, else 0) and
-    ``X[loan_amount_col_name]``. Every piece is re-indexed from 0 so the
-    rows line up by position.
-    """
-    def _flag(probability):
-        return 1 if probability > threshold else 0
-    true_status = y.to_frame()
-    loan_amount = X[loan_amount_col_name]
-    probabilities = model.predict_proba(np.ascontiguousarray(X))
-    prob_default_df = pd.DataFrame(
-        probabilities[:, 1], columns=["PROB_DEFAULT"]
-    )
-    predicted_status = prob_default_df["PROB_DEFAULT"].apply(_flag)
-    predicted_status = predicted_status.rename("PREDICT_DEFAULT_STATUS")
-    pieces = [true_status, prob_default_df, predicted_status, loan_amount]
-    return pd.concat(
-        [piece.reset_index(drop=True) for piece in pieces], axis=1
-    )
-def find_best_threshold_J_statistic(y, clf_prediction_prob_df):
-    """Return the ROC threshold with the largest Youden's J (tpr - fpr).
-
-    NOTE(review): this file defines a function of the same name again
-    further down; the later definition replaces this one at import time.
-    """
-    false_positive_rate, true_positive_rate, thresholds = roc_curve(
-        y, clf_prediction_prob_df
-    )
-    youden_j = true_positive_rate - false_positive_rate
-    best_index = argmax(youden_j)
-    return thresholds[best_index]
-# In[399]:
-# Helpers that build the probability-of-default frame and the predicted
-# default status obtained by applying a threshold to it
-def model_probability_values_df(model, X):
-    """Return the second ``predict_proba`` column as a ``PROB_DEFAULT`` frame."""
-    class_probabilities = model.predict_proba(X)
-    second_column = class_probabilities[:, 1]
-    return pd.DataFrame(second_column, columns=["PROB_DEFAULT"])
-def apply_threshold_to_probability_values(probability_values, threshold):
-    """Flag each ``PROB_DEFAULT`` strictly above ``threshold`` as 1, else 0."""
-    def _exceeds(probability):
-        return 1 if probability > threshold else 0
-    flags = probability_values["PROB_DEFAULT"].apply(_exceeds)
-    return flags.rename("PREDICT_DEFAULT_STATUS")
-@st.cache(suppress_st_warning=True)
-def find_best_threshold_J_statistic(y, clf_prediction_prob_df):
-    """Return the ROC threshold at which Youden's J (tpr - fpr) peaks."""
-    fpr, tpr, thresholds = roc_curve(y, clf_prediction_prob_df)
-    # argmax picks the first position where the statistic is largest.
-    return thresholds[argmax(tpr - fpr)]
-# In[401]:
-def create_cross_validation_df(
-    X, y, eval_metric, seed, trees, n_folds, early_stopping_rounds
-):
-    """Run ``xgb.cv`` on ``X``/``y`` and return ``[DMatrix, cv results]``."""
-    # Wrap the supplied X and y in the matrix type xgb.cv consumes.
-    DTrain = xgb.DMatrix(X, label=y)
-    booster_params = {
-        "eval_metric": eval_metric,
-        "objective": "binary:logistic",
-        "seed": seed,
-    }
-    cv_results = xgb.cv(
-        booster_params,
-        DTrain,
-        num_boost_round=trees,
-        nfold=n_folds,
-        early_stopping_rounds=early_stopping_rounds,
-        shuffle=True,
-    )
-    return [DTrain, cv_results]
-# In[450]:
-def cross_validation_scores(model, X, y, nfold, score, seed):
-    """Return ``cross_val_score`` results over shuffled, stratified folds."""
-    features = np.ascontiguousarray(X)
-    labels = np.ravel(np.ascontiguousarray(y))
-    folds = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed)
-    return cross_val_score(model, features, labels, cv=folds, scoring=score)
-def default_status_per_threshold(threshold_list, prob_default):
-    """Return one 0/1 series per threshold, in the order of ``threshold_list``.
-
-    A value maps to 1 when it is strictly greater than the threshold.
-    """
-    return [
-        prob_default.apply(lambda p, cutoff=threshold: 1 if p > cutoff else 0)
-        for threshold in threshold_list
-    ]
-def classification_report_per_threshold(
-    threshold_list, threshold_default_status_list, y_test
-):
-    """Return ``{threshold: classification report dict}``.
-
-    Thresholds and predicted-status series are paired by position.
-    """
-    reports = [
-        classification_report(
-            y_test,
-            predicted_status,
-            target_names=["Non-Default", "Default"],
-            output_dict=True,
-            zero_division=0,
-        )
-        for predicted_status in threshold_default_status_list
-    ]
-    return dict(zip(threshold_list, reports))
-def thresh_classification_report_recall_accuracy(
-    thresh_classification_report_dict,
-):
-    """Pull default recall, non-default recall and accuracy out of each report.
-
-    Returns three lists, in that order, each following the dictionary's
-    iteration order.
-    """
-    reports = list(thresh_classification_report_dict.values())
-    default_recalls = [report["Default"]["recall"] for report in reports]
-    nondefault_recalls = [
-        report["Non-Default"]["recall"] for report in reports
-    ]
-    accuracies = [report["accuracy"] for report in reports]
-    return [default_recalls, nondefault_recalls, accuracies]
-def create_accept_rate_list(start, end, samples):
-    """Return ``samples`` evenly spaced values from ``start`` to ``end`` inclusive."""
-    # linspace includes the end point by default.
-    return np.linspace(start, end, num=samples)
-def create_strategyTable_df(
-    start, end, samples, actual_probability_predicted_acc_rate, true, currency
-):
-    """Build the strategy table: one row per acceptance rate.
-
-    For each of ``samples`` acceptance rates between ``start`` and ``end``
-    the threshold is that quantile of ``PROB_DEFAULT`` (rounded to 3
-    places); loans not above the threshold count as accepted. Each row
-    holds the acceptance rate, threshold, bad rate (mean of the ``true``
-    column among accepted loans), number of accepted loans and an
-    estimated value, sorted by acceptance rate, highest first.
-
-    The input frame is not modified. The original overwrote its
-    ``PREDICT_DEFAULT_STATUS`` column on every pass of the loop.
-    """
-    prob_default = actual_probability_predicted_acc_rate["PROB_DEFAULT"]
-    true_status = actual_probability_predicted_acc_rate[true]
-    # NOTE(review): this is the mean of the ``true`` column, although the
-    # original name says "average loan amount" -- confirm which column was
-    # intended. The computation is kept as it was.
-    avg_loan_amnt = true_status.mean()
-    accept_rates = np.linspace(start, end, samples, endpoint=True)
-    thresholds_strat = []
-    bad_rates = []
-    num_accepted_loans = []
-    for rate in accept_rates:
-        # The quantile is computed once per rate (the original did it twice).
-        thresh = np.quantile(prob_default, rate).round(3)
-        thresholds_strat.append(thresh)
-        # Accepted means "not predicted to default", i.e. not above threshold.
-        accepted_true = true_status[~(prob_default > thresh)]
-        bad_rates.append(
-            np.sum(accepted_true / len(accepted_true)).round(3)
-        )
-        num_accepted_loans.append(len(accepted_true))
-    estimated_value = []
-    for accepted, bad_rate in zip(num_accepted_loans, bad_rates):
-        money_accepted = accepted * avg_loan_amnt
-        # The bad share is subtracted at twice its amount, as in the original.
-        money_bad = 2 * money_accepted * bad_rate
-        estimated_value.append(money_accepted - money_bad)
-    two_decimals = "{:.2f}".format
-    table = pd.DataFrame(
-        zip(
-            [two_decimals(elem) for elem in accept_rates],
-            [two_decimals(elem) for elem in thresholds_strat],
-            [two_decimals(elem) for elem in bad_rates],
-            num_accepted_loans,
-            [two_decimals(elem) for elem in estimated_value],
-        ),
-        columns=[
-            "Acceptance Rate",
-            "Threshold",
-            "Bad Rate",
-            "Num Accepted Loans",
-            f"Estimated Value ({currency})",
-        ],
-    )
-    return table.sort_values(
-        by="Acceptance Rate", axis=0, ascending=False
-    ).reset_index(drop=True)

common/views.py DELETED Viewed

@@ -1,361 +0,0 @@
-from typing import OrderedDict
-import streamlit as st  # works on command prompt
-import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-import xgboost as xgb
-from sklearn.metrics import (
-    roc_curve,
-)
-from sklearn.calibration import calibration_curve
-from xgboost import plot_tree
-from views.typing import ModelView
-def plot_logistic_coeff_barh(coef_dict, x, y):
-    """Draw a horizontal bar chart of the coefficients, smallest first.
-
-    ``x`` and ``y`` are the figure width and height; the figure is returned.
-    """
-    fig = plt.figure(figsize=(x, y))
-    ascending_pairs = sorted(coef_dict.items(), key=lambda pair: pair[1])
-    # Transpose the (feature, coefficient) pairs into the two barh arguments.
-    plt.barh(*zip(*ascending_pairs))
-    return fig
-def print_negative_coefficients_logistic_model(coef_dict):
-    """Write the features whose coefficient is <= 0, most negative first."""
-    negative_pairs = sorted(
-        ((feat, coef) for feat, coef in coef_dict.items() if coef <= 0.0),
-        key=lambda pair: pair[1],
-    )
-    text = (
-        "\n\nFeatures the model found to be negatively correlated with probability of default are:"
-        "\n{negative_features}:"
-    )
-    # The original also wrote the list's type and the raw dict items to the
-    # page; that leftover debug output is removed, matching the positive twin.
-    st.markdown(text.format(negative_features=negative_pairs))
-def print_positive_coefficients_logistic_model(coef_dict):
-    """Write the features whose coefficient is >= 0, largest first."""
-    positive_pairs = [
-        (feat, coef) for feat, coef in coef_dict.items() if coef >= 0.0
-    ]
-    positive_pairs = sorted(
-        positive_pairs, key=lambda pair: pair[1], reverse=True
-    )
-    text = (
-        "\n\nFeatures the model found to be positively correlated with probability of default are:"
-        "\n{positive_features}:"
-    )
-    st.markdown(text.format(positive_features=positive_pairs))
-def plot_importance_gbt(clf_gbt_model, barxsize, barysize):
-    """Plot weight-based feature importance, resize the figure and return it."""
-    importance_axes = xgb.plot_importance(
-        clf_gbt_model, importance_type="weight"
-    )
-    importance_figure = importance_axes.figure
-    st.write("Feature Importance Plot (Gradient Boosted Tree)")
-    importance_figure.set_size_inches(barxsize, barysize)
-    return importance_figure
-def download_importance_gbt(fig1, barxsize, barysize):
-    """Save ``fig1`` to ``bar.png`` when the button is pressed."""
-    if st.button(
-        "Download Feature Importance Plot as png (Gradient Boosted Tree)"
-    ):
-        dpisize = max(barxsize, barysize)
-        # Size first, then save the figure that was passed in. The original
-        # saved whichever figure happened to be current and resized afterwards.
-        fig1.set_size_inches(barxsize, barysize)
-        fig1.savefig("bar.png", dpi=dpisize * 96, bbox_inches="tight")
-def plot_tree_gbt(treexsize, treeysize, clf_gbt_model):
-    """Draw the model's tree plot, resize the current figure and return it."""
-    plot_tree(clf_gbt_model)
-    tree_figure = plt.gcf()
-    tree_figure.set_size_inches(treexsize, treeysize)
-    return tree_figure
-def download_tree_gbt(treexsize, treeysize):
-    """Save the current figure to ``tree.png`` when the button is pressed."""
-    pressed = st.button(
-        "Download Decision Tree Plot as png (Gradient Boosted Tree)"
-    )
-    if not pressed:
-        return
-    # Resolution scales with the larger of the two requested dimensions.
-    plt.savefig(
-        "tree.png", dpi=max(treexsize, treeysize) * 96, bbox_inches="tight"
-    )
-def cross_validation_graph(cv, eval_metric, trees):
-    """Plot the third column of ``cv`` against iteration number; return the figure."""
-    fig = plt.figure()
-    third_column = cv.columns[2]
-    plt.plot(cv[third_column])
-    plt.title(f"Test {eval_metric} Score Over {trees} Iterations")
-    plt.xlabel("Iteration Number")
-    plt.ylabel(f"Test {eval_metric} Score")
-    return fig
-def recall_accuracy_threshold_tradeoff_fig(
-    widthsize,
-    heightsize,
-    threshold_list,
-    thresh_def_recalls_list,
-    thresh_nondef_recalls_list,
-    thresh_accs_list,
-):
-    """Plot both recalls and the accuracy against the probability threshold."""
-    fig = plt.figure(figsize=(widthsize, heightsize))
-    curves = (
-        (thresh_def_recalls_list, "Default Recall"),
-        (thresh_nondef_recalls_list, "Non-Default Recall"),
-        (thresh_accs_list, "Model Accuracy"),
-    )
-    for scores, curve_label in curves:
-        plt.plot(threshold_list, scores, label=curve_label)
-    plt.xlabel("Probability Threshold")
-    plt.ylabel("Score")
-    # Both axes are clipped to the unit interval.
-    plt.xlim(0, 1)
-    plt.ylim(0, 1)
-    plt.legend()
-    plt.title("Recall and Accuracy Score Tradeoff with Probability Threshold")
-    plt.grid(False)
-    return fig
-def roc_auc_compare_n_models(y, model_views: OrderedDict[str, ModelView]):
-    """Plot one ROC curve per model view plus the diagonal reference line.
-
-    Each curve comes from ``roc_curve(y, view.prediction_probability_df)``.
-    The title lists the model names ("A", "A and B", "A, B and C").
-    """
-    colors = ["blue", "green"]
-    fig = plt.figure()
-    for color_idx, (model_name, model_view) in enumerate(model_views.items()):
-        fpr, tpr, _thresholds = roc_curve(
-            y, model_view.prediction_probability_df
-        )
-        # Wrap around the palette: a third model used to raise IndexError.
-        color = colors[color_idx % len(colors)]
-        plt.plot(fpr, tpr, color=color, label=f"{model_name}")
-    plt.plot([0, 1], [0, 1], linestyle="--", label="Random Prediction")
-    model_names = list(model_views.keys())
-    if not model_names:
-        model_name_str = "None"
-    elif len(model_names) == 1:
-        model_name_str = model_names[0]
-    else:
-        model_name_str = " and ".join(
-            [", ".join(model_names[:-1]), model_names[-1]]
-        )
-    plt.title(f"ROC Chart for {model_name_str} on the Probability of Default")
-    plt.xlabel("False Positive Rate (FP Rate)")
-    plt.ylabel("True Positive Rate (TP Rate)")
-    plt.legend()
-    plt.grid(False)
-    plt.xlim(0, 1)
-    plt.ylim(0, 1)
-    return fig
-def calibration_curve_report_commented_n(
-    y, model_views: OrderedDict[str, ModelView], bins: int
-):
-    """Plot one calibration curve per model view plus the diagonal guideline."""
-    fig = plt.figure()
-    for model_name in model_views:
-        predicted = model_views[model_name].prediction_probability_df
-        # NOTE(review): ``normalize`` may be unsupported by newer
-        # scikit-learn releases -- confirm against the pinned version.
-        frac_of_pos, mean_pred_val = calibration_curve(
-            y, predicted, n_bins=bins, normalize=True
-        )
-        plt.plot(mean_pred_val, frac_of_pos, "s-", label=f"{model_name}")
-    plt.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
-    plt.ylabel("Fraction of positives")
-    plt.xlabel("Average Predicted Probability")
-    plt.title("Calibration Curve")
-    plt.legend()
-    plt.grid(False)
-    plt.xlim(0, 1)
-    plt.ylim(0, 1)
-    return fig
-def acceptance_rate_threshold_fig(probability_default, acceptancerate, bins):
-    """Histogram of ``probability_default`` with the acceptance threshold marked.
-
-    Returns ``(figure, describe() summary, threshold)`` where the threshold
-    is the ``acceptancerate`` quantile of ``probability_default``.
-    """
-    probability_stat_distribution = probability_default.describe()
-    acc_rate_thresh = np.quantile(probability_default, acceptancerate)
-    fig = plt.figure()
-    plt.hist(
-        probability_default,
-        color="blue",
-        bins=bins,
-        histtype="bar",
-        ec="white",
-    )
-    # Vertical reference line at the threshold.
-    plt.axvline(x=acc_rate_thresh, color="red")
-    # The title previously read "Thershold".
-    plt.title("Acceptance Rate Threshold")
-    return (
-        fig,
-        probability_stat_distribution,
-        acc_rate_thresh,
-    )
-def streamlit_2columns_metrics_pct_df(
-    column1name_label: str,
-    column2name_label: str,
-    df: pd.DataFrame,
-):
-    """Show the share of 1s and of 0s in ``df`` as two side-by-side metrics.
-
-    Shares are counts from ``value_counts()`` divided by the row count and
-    rendered as whole percentages.
-    """
-    counts = df.value_counts()
-    total = df.shape[0]
-    def _share(label) -> str:
-        # An absent class counts as zero (the original divided None by the
-        # row count), and an empty input shows 0% instead of dividing by zero.
-        hits = counts.get(label, 0)
-        return "{:.0%}".format(hits / total if total else 0)
-    (
-        column1name,
-        column2name,
-    ) = st.columns(2)
-    with column1name:
-        st.metric(
-            label=column1name_label,
-            value=_share(1),
-            delta=None,
-            delta_color="normal",
-        )
-    with column2name:
-        st.metric(
-            label=column2name_label,
-            value=_share(0),
-            delta=None,
-            delta_color="normal",
-        )
-def streamlit_2columns_metrics_df(
-    column1name_label: str,
-    column2name_label: str,
-    df: pd.DataFrame,
-):
-    """Show the count of 1s and of 0s in ``df`` as two side-by-side metrics."""
-    left_column, right_column = st.columns(2)
-    counts = df.value_counts()
-    shown = (
-        (left_column, column1name_label, 1),
-        (right_column, column2name_label, 0),
-    )
-    for column, metric_label, class_value in shown:
-        with column:
-            st.metric(
-                label=metric_label,
-                value=counts.get(class_value),
-                delta=None,
-                delta_color="normal",
-            )
-def streamlit_2columns_metrics_df_shape(df: pd.DataFrame):
-    """Show the row and column counts of ``df`` as two side-by-side metrics."""
-    rows_column, cols_column = st.columns(2)
-    row_count, column_count = df.shape[0], df.shape[1]
-    with rows_column:
-        st.metric(
-            label="Rows", value=row_count, delta=None, delta_color="normal"
-        )
-    with cols_column:
-        st.metric(
-            label="Columns",
-            value=column_count,
-            delta=None,
-            delta_color="normal",
-        )
-def streamlit_2columns_metrics_pct_series(
-    column1name_label: str,
-    column2name_label: str,
-    series: pd.Series,
-):
-    """Show entries 1 and 0 of ``series`` as shares of its sum, side by side."""
-    total = series.sum()
-    def _share(label) -> str:
-        # A missing entry counts as zero (the original divided None by the
-        # sum), and a zero sum shows 0% instead of dividing by zero.
-        part = series.get(label, 0)
-        return "{:.0%}".format(part / total if total else 0)
-    (
-        column1name,
-        column2name,
-    ) = st.columns(2)
-    with column1name:
-        st.metric(
-            label=column1name_label,
-            value=_share(1),
-            delta=None,
-            delta_color="normal",
-        )
-    with column2name:
-        st.metric(
-            label=column2name_label,
-            value=_share(0),
-            delta=None,
-            delta_color="normal",
-        )
-def streamlit_2columns_metrics_series(
-    column1name_label: str,
-    column2name_label: str,
-    series: pd.Series,
-):
-    """Show entries 1 and 0 of ``series`` as two side-by-side metrics."""
-    left_column, right_column = st.columns(2)
-    shown = (
-        (left_column, column1name_label, 1),
-        (right_column, column2name_label, 0),
-    )
-    for column, metric_label, key in shown:
-        with column:
-            st.metric(
-                label=metric_label,
-                value=series.get(key),
-                delta=None,
-                delta_color="normal",
-            )
-def streamlit_chart_setting_height_width(
-    title: str,
-    default_widthvalue: int,
-    default_heightvalue: int,
-    widthkey: str,
-    heightkey: str,
-):
-    """Render a width/height input pair inside an expander.
-
-    Returns ``(width, height)`` as entered in the two number inputs.
-    """
-    with st.expander(title):
-        width_column, height_column = st.columns(2)
-        with width_column:
-            chosen_width = st.number_input(
-                label="Width in inches:",
-                value=default_widthvalue,
-                key=widthkey,
-            )
-        with height_column:
-            chosen_height = st.number_input(
-                label="Height in inches:",
-                value=default_heightvalue,
-                key=heightkey,
-            )
-    return chosen_width, chosen_height

data_setup.py DELETED Viewed

@@ -1,180 +0,0 @@
-from typing import Tuple, cast
-import pandas as pd
-import streamlit as st
-from common.data import Dataset, SplitDataset
-from common.util import (
-    undersample_training_data,
-)
-from common.views import (
-    streamlit_2columns_metrics_df_shape,
-    streamlit_2columns_metrics_series,
-    streamlit_2columns_metrics_pct_series,
-    streamlit_2columns_metrics_df,
-    streamlit_2columns_metrics_pct_df,
-)
-# Initialize dataframe session state
-def initialise_data() -> Tuple[Dataset, SplitDataset]:
-    """Load the loan data, render the data-setup page and return the split.
-
-    The CSV and the ``Dataset`` are cached in ``st.session_state``. The page
-    lets the user pick predictors, the test-size percentage and the random
-    state, shows class counts, and can balance the training classes by
-    undersampling.
-
-    Returns the ``Dataset`` and the resulting ``SplitDataset``.
-    """
-    if "input_data_frame" not in st.session_state:
-        st.session_state.input_data_frame = pd.read_csv(
-            r"./data/processed/cr_loan_w2.csv"
-        )
-    if "dataset" not in st.session_state:
-        df = cast(pd.DataFrame, st.session_state.input_data_frame)
-        dataset = Dataset(
-            df=df,
-            random_state=123235,
-            test_size=40,
-        )
-        st.session_state.dataset = dataset
-    else:
-        dataset = st.session_state.dataset
-    st.write(
-        "Assuming data is already cleaned and relevant features (predictors) added."
-    )
-    with st.expander("Input Dataframe (X and y)"):
-        st.dataframe(dataset.df)
-        streamlit_2columns_metrics_df_shape(dataset.df)
-    st.header("Predictors")
-    possible_columns = dataset.x_values_column_names
-    selected_columns = st.sidebar.multiselect(
-        label="Select Predictors",
-        options=possible_columns,
-        default=possible_columns,
-    )
-    selected_x_values = dataset.x_values_filtered_columns(selected_columns)
-    st.sidebar.metric(
-        label="# of Predictors Selected",
-        value=selected_x_values.shape[1],
-        delta=None,
-        delta_color="normal",
-    )
-    with st.expander("Predictors Dataframe (X)"):
-        st.dataframe(selected_x_values)
-        streamlit_2columns_metrics_df_shape(selected_x_values)
-    # Defaults set above: 40% of the data held out for testing, with
-    # 123235 as the random state.
-    st.header("Split Testing and Training Data")
-    test_size_slider_col, seed_col = st.columns(2)
-    with test_size_slider_col:
-        # Bounds are 1..99: 0% and 100% become the fractions 0.0 and 1.0,
-        # which leave one side of the split empty.
-        dataset.test_size = st.slider(
-            label="Test Size Percentage of Input Dataframe:",
-            min_value=1,
-            max_value=99,
-            value=dataset.test_size,
-            key="init_test_size",
-            format="%d%%",  # integer percentage; "%f" rendered 40.000000%
-        )
-    with seed_col:
-        dataset.random_state = int(
-            st.number_input(label="Random State:", value=dataset.random_state)
-        )
-    split_dataset = dataset.train_test_split(selected_x_values)
-    # Class counts of the test labels.
-    true_status = split_dataset.y_test.to_frame().value_counts()
-    st.sidebar.metric(
-        label="Testing Data # of Actual Default (=1)",
-        value=true_status.get(1),
-    )
-    st.sidebar.metric(
-        label="Testing Data % of Actual Default",
-        value="{:.0%}".format(true_status.get(1) / true_status.sum()),
-    )
-    st.sidebar.metric(
-        label="Testing Data # of Actual Non-Default (=0)",
-        value=true_status.get(0),
-    )
-    st.sidebar.metric(
-        label="Testing Data % of Actual Non-Default",
-        value="{:.0%}".format(true_status.get(0) / true_status.sum()),
-    )
-    # Predictors and labels joined column-wise for display.
-    X_y_test = split_dataset.X_y_test
-    X_y_train = split_dataset.X_y_train
-    with st.expander("Testing Dataframe (X and y)"):
-        st.dataframe(X_y_test)
-        streamlit_2columns_metrics_df_shape(X_y_test)
-    streamlit_2columns_metrics_series(
-        "# Defaults(=1) (Testing Data)",
-        "# Non-Defaults(=0) (Testing Data)",
-        true_status,
-    )
-    streamlit_2columns_metrics_pct_series(
-        "% Defaults (Testing Data)",
-        "% Non-Defaults (Testing Data)",
-        true_status,
-    )
-    st.header("Training Data")
-    with st.expander("Training Dataframe (X and y)"):
-        st.dataframe(X_y_train)
-        streamlit_2columns_metrics_df_shape(X_y_train)
-    st.subheader("Class Count")
-    streamlit_2columns_metrics_df(
-        "# Defaults (Training Data Class Balance Check)",
-        "# Non-Defaults (Training Data Class Balance Check)",
-        split_dataset.y_train,
-    )
-    streamlit_2columns_metrics_pct_df(
-        "% Defaults (Training Data Class Balance Check)",
-        "% Non-Defaults (Training Data Class Balance Check)",
-        split_dataset.y_train,
-    )
-    balance_the_classes = st.radio(
-        label="Balance the Classes:", options=("Yes", "No")
-    )
-    if balance_the_classes == "Yes":
-        st.subheader("Balanced Classes (by Undersampling)")
-        # The undersampled training data replaces the training half of the split.
-        (
-            split_dataset.X_train,
-            split_dataset.y_train,
-            _X_y_train,
-            class_balance_default,
-        ) = undersample_training_data(X_y_train, "loan_status", split_dataset)
-        streamlit_2columns_metrics_series(
-            "# Defaults (Training Data with Class Balance)",
-            "# Non-Defaults (Training Data with Class Balance)",
-            class_balance_default,
-        )
-        streamlit_2columns_metrics_pct_series(
-            "% of Defaults (Training Data with Class Balance)",
-            "% of Non-Defaults (Training Data with Class Balance)",
-            class_balance_default,
-        )
-    return dataset, split_dataset

views/__init__.py DELETED Viewed

File without changes

views/decision_tree.py DELETED Viewed

@@ -1,70 +0,0 @@
-from common.data import SplitDataset
-import streamlit as st
-from common.util import (
-    test_variables_gbt,
-)
-from common.views import (
-    streamlit_chart_setting_height_width,
-    plot_importance_gbt,
-    plot_tree_gbt,
-    download_importance_gbt,
-    download_tree_gbt,
-)
-from views.typing import ModelView
-from views.threshold import decision_tree_threshold_view
-from views.evaluation import decision_tree_evaluation_view
-def decisiontree_view(split_dataset: SplitDataset, currency: str):
-    """Render the decision-tree page and return its ``ModelView``.
-
-    Fits the gradient boosted model on the training split, shows the
-    feature-importance and tree plots (each with size settings and a save
-    button), then runs the threshold and evaluation views.
-    """
-    st.header("Decision Trees")
-    gbt_model = test_variables_gbt(
-        split_dataset.X_train, split_dataset.y_train
-    )
-    st.subheader("Decision Tree Feature Importance")
-    bar_width, bar_height = streamlit_chart_setting_height_width(
-        "Chart Settings", 10, 15, "barxsize", "barysize"
-    )
-    importance_figure = plot_importance_gbt(gbt_model, bar_width, bar_height)
-    st.pyplot(importance_figure)
-    download_importance_gbt(importance_figure, bar_width, bar_height)
-    st.subheader("Decision Tree Structure")
-    tree_width, tree_height = streamlit_chart_setting_height_width(
-        "Chart Settings", 15, 10, "treexsize", "treeysize"
-    )
-    tree_figure = plot_tree_gbt(tree_width, tree_height, gbt_model)
-    st.pyplot(tree_figure)
-    download_tree_gbt(tree_width, tree_height)
-    st.markdown(
-        "Note: The downloaded decision tree plot chart in png has higher resolution than that displayed here."
-    )
-    threshold = decision_tree_threshold_view(gbt_model, split_dataset)
-    selected_threshold = threshold.probability_threshold_selected
-    predicted_status = threshold.predicted_default_status
-    evaluation_df = decision_tree_evaluation_view(
-        gbt_model,
-        split_dataset,
-        currency,
-        selected_threshold,
-        predicted_status,
-    )
-    return ModelView(
-        model=gbt_model,
-        trueStatus_probabilityDefault_threshStatus_loanAmount_df=evaluation_df,
-        probability_threshold_selected=selected_threshold,
-        predicted_default_status=predicted_status,
-        prediction_probability_df=threshold.prediction_probability_df,
-    )

views/evaluation.py DELETED Viewed

@@ -1,410 +0,0 @@
-from typing import Union
-import pandas as pd
-import streamlit as st
-import numpy as np
-from sklearn.metrics import (
-    classification_report,
-    confusion_matrix,
-)
-from sklearn.linear_model import LogisticRegression
-from xgboost.sklearn import XGBClassifier
-from common.data import SplitDataset
-from common.util import (
-    create_cross_validation_df,
-    cross_validation_scores,
-    get_df_trueStatus_probabilityDefault_threshStatus_loanAmount,
-)
-from common.views import (
-    cross_validation_graph,
-)
-def make_evaluation_view(
-    model_name_short: str,
-    model_name_generic: str,
-):
-    """Build a Streamlit evaluation view for one model family.
-    Args:
-        model_name_short: Suffix appended to every widget key so that the
-            views created by this factory do not share widget keys.
-        model_name_generic: Model name shown in the section header.
-    Returns:
-        A ``view`` callable that renders the evaluation section and returns
-        the DataFrame produced by
-        ``get_df_trueStatus_probabilityDefault_threshStatus_loanAmount``.
-    """
-    def view(
-        clf_gbt_model: Union[XGBClassifier, LogisticRegression],
-        split_dataset: SplitDataset,
-        currency: str,
-        prob_thresh_selected,
-        predicted_default_status,
-    ):
-        """Render cross validation, classification report, confusion matrix
-        and bad-rate sections for ``clf_gbt_model`` on the test split.
-        Args:
-            clf_gbt_model: Fitted classifier to evaluate.
-            split_dataset: Split whose X_test/y_test are evaluated.
-            currency: Currency label used in the monetary messages.
-            prob_thresh_selected: Probability threshold forwarded to
-                ``get_df_trueStatus_probabilityDefault_threshStatus_loanAmount``.
-            predicted_default_status: Thresholded predictions compared
-                against ``split_dataset.y_test``.
-        Returns:
-            DataFrame with true status, default probability, thresholded
-            status and loan amount, as returned by the helper above.
-        """
-        st.header(f"Model Evaluation - {model_name_generic}")
-        st.subheader("Cross Validation")
-        st.write("Shows how our model will perform as new loans come in.")
-        st.write(
-            "If evaluation metric for test and train set improve as models \
-            train on each fold suggests performance will be stable."
-        )
-        # NOTE(review): this label is shown for every model built by the
-        # factory, including logistic regression - confirm that is intended.
-        st.write("XGBoost cross validation test:")
-        stcol_seed, stcol_eval_metric = st.columns(2)
-        with stcol_seed:
-            cv_seed = int(
-                st.number_input(
-                    label="Random State Seed for Cross Validation:",
-                    value=123235,
-                    key=f"cv_seed_{model_name_short}",
-                )
-            )
-        with stcol_eval_metric:
-            eval_metric = st.selectbox(
-                label="Select evaluation metric",
-                options=[
-                    "auc",
-                    "aucpr",
-                    "rmse",
-                    "mae",
-                    "logloss",
-                    "error",
-                    "merror",
-                    "mlogloss",
-                ],
-                key=f"eval_metric_{model_name_short}",
-            )
-        stcol_trees, stcol_eval_nfold, stcol_earlystoppingrounds = st.columns(
-            3
-        )
-        with stcol_trees:
-            trees = int(
-                st.number_input(
-                    label="Number of trees",
-                    value=5,
-                    key=f"trees_{model_name_short}",
-                )
-            )
-        with stcol_eval_nfold:
-            nfolds = int(
-                st.number_input(
-                    label="Number of folds",
-                    value=5,
-                    key=f"nfolds_{model_name_short}",
-                )
-            )
-        with stcol_earlystoppingrounds:
-            early_stopping_rounds = int(
-                st.number_input(
-                    label="Early stopping rounds",
-                    value=10,
-                    key=f"early_stopping_rounds_{model_name_short}",
-                )
-            )
-        # NOTE(review): cross validation is run on the test split here, not
-        # the training split - confirm this is intentional.
-        # The first element of the returned pair is not used by this view.
-        _, cv_df = create_cross_validation_df(
-            split_dataset.X_test,
-            split_dataset.y_test,
-            eval_metric,
-            cv_seed,
-            trees,
-            nfolds,
-            early_stopping_rounds,
-        )
-        st.write(cv_df)
-        scoring_options = [
-            "roc_auc",
-            "accuracy",
-            "precision",
-            "recall",
-            "f1",
-            "jaccard",
-        ]
-        overfit_test = st.radio(
-            label="Overfit test:",
-            options=("No", "Yes"),
-            key=f"overfit_test_{model_name_short}",
-        )
-        if overfit_test == "Yes":
-            st.write("Overfit test:")
-            iterations = int(
-                st.number_input(
-                    label="Number of folds (iterations)",
-                    value=500,
-                    key=f"iterations_{model_name_short}",
-                )
-            )
-            # ``iterations`` is passed both as the tree count and as the
-            # early-stopping rounds, mirroring the argument order used above.
-            _, cv_df_it = create_cross_validation_df(
-                split_dataset.X_test,
-                split_dataset.y_test,
-                eval_metric,
-                cv_seed,
-                iterations,
-                nfolds,
-                iterations,
-            )
-            fig_it = cross_validation_graph(cv_df_it, eval_metric, iterations)
-            st.pyplot(fig_it)
-        st.write("Sklearn cross validation test:")
-        stcol_scoringmetric, st_nfold = st.columns(2)
-        with stcol_scoringmetric:
-            score_metric = st.selectbox(
-                label="Select score",
-                options=scoring_options,
-                key=f"stcol_scoringmetric_{model_name_short}",
-            )
-        with st_nfold:
-            nfolds_score = int(
-                st.number_input(
-                    label="Number of folds",
-                    value=5,
-                    key=f"st_nfold_{model_name_short}",
-                )
-            )
-        cv_scores = cross_validation_scores(
-            clf_gbt_model,
-            split_dataset.X_test,
-            split_dataset.y_test,
-            nfolds_score,
-            score_metric,
-            cv_seed,
-        )
-        stcol_vals, stcol_mean, st_std = st.columns(3)
-        with stcol_vals:
-            st.markdown(f"{score_metric} scores:")
-            st.write(
-                pd.DataFrame(
-                    cv_scores,
-                    columns=[score_metric],
-                )
-            )
-        with stcol_mean:
-            st.metric(
-                label=f"Average {score_metric} score ",
-                value="{:.4f}".format(cv_scores.mean()),
-                delta=None,
-                delta_color="normal",
-            )
-        with st_std:
-            st.metric(
-                label=f"{score_metric} standard deviation (+/-)",
-                value="{:.4f}".format(cv_scores.std()),
-                delta=None,
-                delta_color="normal",
-            )
-        st.subheader("Classification Report")
-        target_names = ["Non-Default", "Default"]
-        classification_report_dict = classification_report(
-            split_dataset.y_test,
-            predicted_default_status,
-            target_names=target_names,
-            output_dict=True,
-        )
-        (
-            stcol_defaultpres,
-            stcol_defaultrecall,
-            stcol_defaultf1score,
-            stcol_f1score,
-        ) = st.columns(4)
-        with stcol_defaultpres:
-            st.metric(
-                label="Default Precision",
-                value="{:.0%}".format(
-                    classification_report_dict["Default"]["precision"]
-                ),
-                delta=None,
-                delta_color="normal",
-            )
-        with stcol_defaultrecall:
-            st.metric(
-                label="Default Recall",
-                value="{:.0%}".format(
-                    classification_report_dict["Default"]["recall"]
-                ),
-                delta=None,
-                delta_color="normal",
-            )
-        with stcol_defaultf1score:
-            st.metric(
-                label="Default F1 Score",
-                value="{:.2f}".format(
-                    classification_report_dict["Default"]["f1-score"]
-                ),
-                delta=None,
-                delta_color="normal",
-            )
-        with stcol_f1score:
-            st.metric(
-                label="Macro avg F1 Score (Model F1 Score):",
-                value="{:.2f}".format(
-                    classification_report_dict["macro avg"]["f1-score"]
-                ),
-                delta=None,
-                delta_color="normal",
-            )
-        with st.expander("Classification Report Dictionary:"):
-            st.write(classification_report_dict)
-        st.markdown(
-            f'Default precision: {"{:.0%}".format(classification_report_dict["Default"]["precision"])} of loans predicted as default were actually default.'
-        )
-        st.markdown(
-            f'Default recall: {"{:.0%}".format(classification_report_dict["Default"]["recall"])} of true defaults predicted correctly.'
-        )
-        f1_gap = 1 - classification_report_dict["Default"]["f1-score"]
-        st.markdown(
-            f'Default F1 score: {"{:.2f}".format(classification_report_dict["Default"]["f1-score"])}\
-                is {"{:.2f}".format(f1_gap)} away from perfect precision and recall (no false positive rate).'
-        )
-        st.markdown(
-            f'macro avg F1 score: {"{:.2f}".format(classification_report_dict["macro avg"]["f1-score"])} is the models F1 score.'
-        )
-        st.subheader("Confusion Matrix")
-        # Computed once and reused. labels=[0, 1] keeps the matrix 2x2, so
-        # ravel() always unpacks into four counts even when a class is
-        # missing from the inputs.
-        confusion_matrix_values = confusion_matrix(
-            split_dataset.y_test, predicted_default_status, labels=[0, 1]
-        )
-        tn, fp, fn, tp = confusion_matrix_values.ravel()
-        with st.expander(
-            "Confusion matrix (column name = classification model prediction, row name = true status, values = number of loans"
-        ):
-            st.write(confusion_matrix_values)
-        st.markdown(
-            f'{tp} ,\
-            {"{:.0%}".format(tp / len(predicted_default_status))} \
-                true positives (defaults correctly predicted as defaults).'
-        )
-        st.markdown(
-            f'{fp} ,\
-            {"{:.0%}".format(fp / len(predicted_default_status))} \
-                false positives (non-defaults incorrectly predicted as defaults).'
-        )
-        st.markdown(
-            f'{fn} ,\
-            {"{:.0%}".format(fn / len(predicted_default_status))} \
-                false negatives (defaults incorrectly predicted as non-defaults).'
-        )
-        st.markdown(
-            f'{tn} ,\
-            {"{:.0%}".format(tn / len(predicted_default_status))} \
-                true negatives (non-defaults correctly predicted as non-defaults).'
-        )
-        st.subheader("Bad Rate")
-        df_trueStatus_probabilityDefault_threshStatus_loanAmount = (
-            get_df_trueStatus_probabilityDefault_threshStatus_loanAmount(
-                clf_gbt_model,
-                split_dataset.X_test,
-                split_dataset.y_test,
-                prob_thresh_selected,
-                "loan_amnt",
-            )
-        )
-        with st.expander(
-            "Loan Status, Probability of Default, & Loan Amount DataFrame"
-        ):
-            st.write(df_trueStatus_probabilityDefault_threshStatus_loanAmount)
-        # Accepted loans are the ones the model predicts as non-default.
-        accepted_loans = (
-            df_trueStatus_probabilityDefault_threshStatus_loanAmount[
-                df_trueStatus_probabilityDefault_threshStatus_loanAmount[
-                    "PREDICT_DEFAULT_STATUS"
-                ]
-                == 0
-            ]
-        )
-        # Share of accepted loans whose true status is default (1).
-        bad_rate = (
-            np.sum(accepted_loans["loan_status"])
-            / accepted_loans["loan_status"].count()
-        )
-        with st.expander("Loan Amount Summary Statistics"):
-            st.write(
-                df_trueStatus_probabilityDefault_threshStatus_loanAmount[
-                    "loan_amnt"
-                ].describe()
-            )
-        avg_loan = np.mean(
-            df_trueStatus_probabilityDefault_threshStatus_loanAmount[
-                "loan_amnt"
-            ]
-        )
-        crosstab_df = (
-            pd.crosstab(
-                df_trueStatus_probabilityDefault_threshStatus_loanAmount[
-                    "loan_status"
-                ],  # row label
-                df_trueStatus_probabilityDefault_threshStatus_loanAmount[
-                    "PREDICT_DEFAULT_STATUS"
-                ],  # column label
-            )
-            # Force a full 2x2 table: with an extreme threshold only one
-            # class is predicted, the other column is missing and the
-            # crosstab_df[1][...] lookups below would raise KeyError.
-            .reindex(index=[0, 1], columns=[0, 1], fill_value=0)
-            * avg_loan
-        )
-        with st.expander(
-            "Cross tabulation (column name = classification model prediction, row name = true status, values = number of loans * average loan value"
-        ):
-            st.write(crosstab_df)
-        # crosstab_df[predicted][actual]: the first key selects the predicted
-        # status column, the second the true status row.
-        st.write(
-            f'Bad rate: {"{:.2%}".format(bad_rate)} of all the loans the model accepted (classified as non-default) from the test set were actually defaults.'
-        )
-        st.write(
-            f'Estimated value of the bad rate is {currency} {"{:,.2f}".format(crosstab_df[0][1])}.'
-        )
-        # NOTE(review): this sums the predicted-non-default column (all
-        # accepted loans), while the message says "actual non-default" -
-        # confirm which of the two is intended.
-        st.write(
-            f'Total estimated value of actual non-default loans is {currency} {"{:,.2f}".format(crosstab_df[0][0]+crosstab_df[0][1])}'
-        )
-        st.write(
-            f'Estimated value of loans incorrectly predicted as default is {currency} {"{:,.2f}".format(crosstab_df[1][0])}'
-        )
-        st.write(
-            f'Estimated value of loans correctly predicted as defaults is {currency} {"{:,.2f}".format(crosstab_df[1][1])}'
-        )
-        return df_trueStatus_probabilityDefault_threshStatus_loanAmount
-    return view
-# One evaluation view per model family; the short names keep widget keys apart.
-decision_tree_evaluation_view = make_evaluation_view("gbt", "Decision Tree")
-logistic_evaluation_view = make_evaluation_view("lg", "Logistic Regression")

views/logistic.py DELETED Viewed

@@ -1,119 +0,0 @@
-from common.data import SplitDataset
-import streamlit as st
-import pandas as pd
-import plotly.express as px
-from views.threshold import logistic_threshold_view
-from views.evaluation import logistic_evaluation_view
-from common.util import (
-    test_variables_logistic,
-    print_coeff_logistic,
-    model_probability_values_df,
-    apply_threshold_to_probability_values,
-)
-from common.views import (
-    streamlit_2columns_metrics_df,
-    streamlit_2columns_metrics_pct_df,
-)
-from views.typing import ModelView
-def logistic_view(split_dataset: SplitDataset, currency: str) -> ModelView:
-    """Render the "Logistic Regression" section and return its ModelView.
-    Builds the model with ``test_variables_logistic`` from the training
-    split, charts its coefficients, then runs the logistic threshold view
-    followed by the logistic evaluation view.
-    The stand-alone threshold slider that used to sit between the
-    coefficient chart and the threshold view was removed: the threshold view
-    renders the same user-defined threshold controls and metrics, and the
-    stand-alone slider's value was discarded before the evaluation ran.
-    Args:
-        split_dataset: Train/test split; X_train/y_train feed the model.
-        currency: Currency label forwarded to the evaluation view.
-    Returns:
-        ModelView holding the model, the evaluation DataFrame and the
-        threshold view's selected threshold, statuses and probabilities.
-    """
-    st.header("Logistic Regression")
-    clf_logistic_model = test_variables_logistic(
-        split_dataset.X_train, split_dataset.y_train
-    )
-    st.metric(
-        label="# of Coefficients in Logistic Regression",
-        value=clf_logistic_model.n_features_in_,
-        delta=None,
-        delta_color="normal",
-    )
-    coef_dict = print_coeff_logistic(clf_logistic_model, split_dataset)
-    st.subheader("Logistic Regression Coefficient Values")
-    # Ascending by coefficient value.
-    df = pd.DataFrame(
-        sorted(coef_dict.items(), key=lambda item: item[1]),
-        columns=["Coefficient", "Value"],
-    )
-    fig1 = px.bar(data_frame=df, x="Value", y="Coefficient", orientation="h")
-    fig1.update_layout(
-        title="Logistic Regression Coefficients",
-        xaxis_title="Value",
-        yaxis_title="Coefficient",
-    )
-    st.plotly_chart(fig1)
-    # The threshold view owns all threshold widgets and picks the cut-off
-    # that the evaluation below is run with.
-    threshold = logistic_threshold_view(clf_logistic_model, split_dataset)
-    df_trueStatus_probabilityDefault_threshStatus_loanAmount = (
-        logistic_evaluation_view(
-            clf_logistic_model,
-            split_dataset,
-            currency,
-            threshold.probability_threshold_selected,
-            threshold.predicted_default_status,
-        )
-    )
-    return ModelView(
-        model=clf_logistic_model,
-        trueStatus_probabilityDefault_threshStatus_loanAmount_df=df_trueStatus_probabilityDefault_threshStatus_loanAmount,
-        probability_threshold_selected=threshold.probability_threshold_selected,
-        predicted_default_status=threshold.predicted_default_status,
-        prediction_probability_df=threshold.prediction_probability_df,
-    )

views/model_comparison.py DELETED Viewed

@@ -1,81 +0,0 @@
-from typing import OrderedDict
-import streamlit as st
-from sklearn.metrics import roc_auc_score
-from common.data import SplitDataset
-from common.views import (
-    roc_auc_compare_n_models,
-    streamlit_chart_setting_height_width,
-    calibration_curve_report_commented_n,
-)
-from views.typing import ModelView
-def roc_auc_for_model(split_dataset: SplitDataset, model_view: ModelView):
-    """Compute a model's test-set ROC AUC and describe its quality band.
-    The AUC is computed from the predicted default probabilities (the
-    "PROB_DEFAULT" column of ``model_view.prediction_probability_df``), not
-    from the thresholded 0/1 statuses, so it does not depend on the selected
-    probability threshold.
-    Args:
-        split_dataset: Split whose ``y_test`` holds the true statuses.
-        model_view: View result providing the probability DataFrame.
-    Returns:
-        Tuple ``(roc_auc_model, roc_auc_lvl)``: the AUC value and a label
-        naming its band (Very good / Good / Fair / Poor / Fail).
-    """
-    roc_auc_model = roc_auc_score(
-        split_dataset.y_test,
-        model_view.prediction_probability_df["PROB_DEFAULT"],
-    )
-    score = f"{roc_auc_model:.2f}"
-    # Each branch only needs a lower bound: the upper bound is implied by the
-    # branches before it, so exact boundary values (0.9, 0.8, 0.7) land in
-    # the band below them instead of falling through to "Fail".
-    if roc_auc_model > 0.9:
-        roc_auc_lvl = f"Very good ({score} > 0.9)"
-    elif roc_auc_model > 0.8:
-        roc_auc_lvl = f"Good (0.8 < {score} <= 0.9)"
-    elif roc_auc_model > 0.7:
-        roc_auc_lvl = f"Fair (0.7 < {score} <= 0.8)"
-    elif roc_auc_model > 0.6:
-        roc_auc_lvl = f"Poor (0.6 < {score} <= 0.7)"
-    else:
-        roc_auc_lvl = f"Fail ({score} <= 0.6)"
-    return roc_auc_model, roc_auc_lvl
-def model_comparison_view(
-    split_dataset: SplitDataset,
-    model_views: OrderedDict[str, ModelView],
-):
-    """Render the "Model Comparison" section for all given models.
-    Writes one ROC AUC summary per model, then shows the combined ROC chart
-    and the combined calibration chart, each with chart-size controls.
-    Args:
-        split_dataset: Split whose ``y_test`` is passed to the chart helpers.
-        model_views: Mapping of display name to that model's ModelView.
-    """
-    st.header("Model Comparison")
-    for name, view in model_views.items():
-        auc_value, auc_level = roc_auc_for_model(split_dataset, view)
-        st.subheader(f"Receiver Operating Characteristic (ROC) Curve - {name}")
-        st.markdown(
-            f'Area Under the Receiver Operating Characteristic Curve from prediction scores from "{name}" model is {auc_value}.\n'
-        )
-        st.markdown(
-            f"The score of {auc_value:.2f} is in the {auc_level} ROC AUC score category."
-        )
-    # The chart helpers return an object exposing ``.figure``; the figure is
-    # resized to the user's settings before it is drawn.
-    roc_fig = roc_auc_compare_n_models(split_dataset.y_test, model_views).figure
-    roc_x, roc_y = streamlit_chart_setting_height_width(
-        "Chart Settings", 7, 7, "xsize_roc", "ysize_roc"
-    )
-    roc_fig.set_size_inches(roc_x, roc_y)
-    st.pyplot(roc_fig)
-    st.subheader("Models Calibration Curve")
-    calibration_fig = calibration_curve_report_commented_n(
-        split_dataset.y_test, model_views, 10
-    ).figure
-    cal_x, cal_y = streamlit_chart_setting_height_width(
-        "Chart Settings", 7, 7, "xsize_cal", "ysize_cal"
-    )
-    calibration_fig.set_size_inches(cal_x, cal_y)
-    st.pyplot(calibration_fig.figure)

views/strategy_table.py DELETED Viewed

@@ -1,96 +0,0 @@
-from typing import OrderedDict
-import plotly.express as px
-import numpy as np
-import streamlit as st
-from common.util import create_strategyTable_df
-from views.typing import ModelView
-def strategy_table_view(
-    currency: str, model_views: OrderedDict[str, ModelView]
-):
-    """Render the "Strategy Table" section for every model.
-    For each model this shows the strategy table, a box plot and line chart
-    of its first three columns, the estimated value by acceptance rate, the
-    row(s) with the greatest estimated value and the total expected loss.
-    Args:
-        currency: Currency label used in column names and the loss metric.
-        model_views: Mapping of display name to that model's ModelView.
-    """
-    st.header("Strategy Table")
-    value_column = f"Estimated Value ({currency})"
-    for model_name, model_view in model_views.items():
-        st.subheader(model_name)
-        loans_df = (
-            model_view.trueStatus_probabilityDefault_threshStatus_loanAmount_df
-        )
-        strat_df = create_strategyTable_df(
-            0.05, 1, 20, loans_df, "loan_status", currency
-        )
-        # The table is displayed as returned, before the float conversion.
-        with st.expander("Strategy Table:"):
-            st.write(strat_df)
-        for column_name in strat_df.columns:
-            strat_df[column_name] = strat_df[column_name].astype(np.float64)
-        first_three_columns = strat_df.iloc[:, 0:3]
-        st.plotly_chart(px.box(data_frame=first_three_columns))
-        # Strategy curve: bad rate as a function of acceptance rate.
-        st.plotly_chart(
-            px.line(
-                first_three_columns,
-                x="Acceptance Rate",
-                y="Bad Rate",
-                title="Acceptance and Bad Rates",
-            )
-        )
-        st.plotly_chart(
-            px.line(
-                strat_df,
-                x="Acceptance Rate",
-                y=value_column,
-                title=f"Estimated Value ({currency}) by Acceptance Rate",
-            )
-        )
-        st.write("Row with the greatest estimated value:")
-        best_value = np.max(strat_df[value_column])
-        st.write(strat_df.loc[strat_df[value_column] == best_value])
-        # Expected loss per loan = probability of default * LGD * amount.
-        loss_given_default = 1
-        expected_losses = (
-            loans_df["PROB_DEFAULT"] * loss_given_default * loans_df["loan_amnt"]
-        )
-        total_expected_loss = round(np.sum(expected_losses), 2)
-        st.metric(
-            label="Total expected loss:",
-            value=f"{currency} {total_expected_loss:,.2f}",
-            delta=None,
-            delta_color="normal",
-        )

views/threshold.py DELETED Viewed

@@ -1,272 +0,0 @@
-from dataclasses import dataclass
-from typing import Union, cast
-import numpy as np
-import streamlit as st
-import plotly.express as px
-import pandas as pd
-from xgboost.sklearn import XGBClassifier
-from sklearn.linear_model import LogisticRegression
-from common.data import SplitDataset
-from common.util import (
-    model_probability_values_df,
-    apply_threshold_to_probability_values,
-    find_best_threshold_J_statistic,
-    default_status_per_threshold,
-    classification_report_per_threshold,
-    thresh_classification_report_recall_accuracy,
-)
-from common.views import (
-    streamlit_2columns_metrics_df,
-    streamlit_2columns_metrics_pct_df,
-)
-@dataclass(frozen=True)
-class Threshold:
-    """Immutable result returned by the views built by make_threshold_view."""
-    # Threshold picked via the "Selected Probability Threshold" radio.
-    probability_threshold_selected: float
-    # Thresholded default statuses that belong to the selected threshold.
-    predicted_default_status: pd.Series
-    # Output of model_probability_values_df for the test features.
-    prediction_probability_df: pd.DataFrame
-def make_threshold_view(
-    model_name_short: str,
-    model_name: str,
-):
-    """Build a Streamlit threshold-selection view for one model family.
-    Args:
-        model_name_short: Fragment embedded in every widget key so that the
-            views created by this factory do not share widget keys.
-        model_name: Model name inserted into the explanatory text.
-    Returns:
-        A ``view`` callable that renders the threshold section and returns
-        the chosen ``Threshold``.
-    """
-    def show_predicted_default_metrics(default_status):
-        # Count and percentage metrics shown under each candidate threshold.
-        streamlit_2columns_metrics_df(
-            "# of Predicted Defaults",
-            "# of Predicted Non-Default",
-            default_status,
-        )
-        streamlit_2columns_metrics_pct_df(
-            "% of Loans Predicted to Default",
-            "% of Loans Predicted not to Default",
-            default_status,
-        )
-    def view(
-        clf_gbt_model: Union[XGBClassifier, LogisticRegression],
-        split_dataset: SplitDataset,
-    ) -> Threshold:
-        """Render three candidate thresholds and return the selected one.
-        The candidates are a user-defined slider value, the value from
-        ``find_best_threshold_J_statistic`` and the quantile of the predicted
-        probabilities at the chosen acceptance rate.
-        """
-        st.subheader("Classification Probability Threshold - User Defined")
-        st.write(
-            f"""
-            The {model_name} model (obtained using training data) is applied on testing data to predict the loans probabilities of defaulting.\n
-            Probabilities of defaulting of the loans are compared to a probability threshold.\n
-            A loan is predicted to default if its predicted probability of defaulting is greater than the probability threshold.
-            """
-        )
-        user_threshold = st.slider(
-            label="Default Probability Threshold:",
-            min_value=0.0,
-            max_value=1.0,
-            value=0.8,
-            key=f"threshold_{model_name_short}_default",
-        )
-        probability_df = model_probability_values_df(
-            clf_gbt_model, split_dataset.X_test
-        )
-        user_status = apply_threshold_to_probability_values(
-            probability_df, user_threshold
-        )
-        show_predicted_default_metrics(user_status)
-        st.subheader("J Statistic Driven Classification Probability Threshold")
-        j_threshold = find_best_threshold_J_statistic(
-            split_dataset.y_test, probability_df
-        )
-        st.metric(
-            label="Youden's J statistic calculated best threshold",
-            value=j_threshold,
-        )
-        j_status = apply_threshold_to_probability_values(
-            probability_df, j_threshold
-        )
-        show_predicted_default_metrics(j_status)
-        st.subheader(
-            "Recall and Accuracy Tradeoff with given Probability Threshold"
-        )
-        # Sweep a grid of thresholds, classify at each one, build one
-        # classification report per threshold and pull out default recall,
-        # non-default recall and accuracy for the trade-off chart.
-        thresholds = np.arange(0, 1, 0.025).round(decimals=3).tolist()
-        status_per_threshold = default_status_per_threshold(
-            thresholds, probability_df["PROB_DEFAULT"]
-        )
-        report_per_threshold = classification_report_per_threshold(
-            thresholds,
-            status_per_threshold,
-            split_dataset.y_test,
-        )
-        (
-            default_recalls,
-            nondefault_recalls,
-            accuracies,
-        ) = thresh_classification_report_recall_accuracy(report_per_threshold)
-        # One row per metric, transposed so that each metric is a column.
-        tradeoff_df = pd.DataFrame(
-            [default_recalls, nondefault_recalls, accuracies, thresholds],
-            index=[
-                "Default Recall",
-                "Non Default Recall",
-                "Accuracy",
-                "Threshold",
-            ],
-        ).T
-        tradeoff_fig = px.line(
-            data_frame=tradeoff_df,
-            y=["Default Recall", "Non Default Recall", "Accuracy"],
-            x="Threshold",
-        )
-        tradeoff_fig.update_layout(
-            title="Recall and Accuracy score Trade-off with Probability Threshold",
-            xaxis_title="Probability Threshold",
-            yaxis_title="Score",
-        )
-        tradeoff_fig.update_yaxes(range=[0.0, 1.0])
-        st.plotly_chart(tradeoff_fig)
-        st.subheader("Acceptance Rate Driven Probability Threshold")
-        # The slider is in whole percent; the quantile call needs a fraction.
-        acceptance_percent = st.slider(
-            label="% of loans accepted (acceptance rate):",
-            min_value=0,
-            max_value=100,
-            value=85,
-            key=f"acceptance_rate_{model_name_short}",
-            format="%f%%",
-        )
-        acceptance_rate = acceptance_percent / 100
-        # Threshold below which the accepted share of probabilities falls.
-        acceptance_threshold = np.quantile(
-            probability_df["PROB_DEFAULT"], acceptance_rate
-        )
-        st.write(
-            f"An acceptance rate of {acceptance_rate} results in probability threshold of {acceptance_threshold}"
-        )
-        histogram_fig = px.histogram(probability_df["PROB_DEFAULT"])
-        histogram_fig.update_layout(
-            title="Acceptance Rate Threshold vs. Loans Accepted",
-            xaxis_title="Acceptance Rate Threshold",
-            yaxis_title="Loans Accepted",
-        )
-        histogram_fig.update_traces(
-            marker_line_width=1, marker_line_color="white"
-        )
-        # Red vertical line marks the acceptance-rate threshold.
-        histogram_fig.add_vline(
-            x=acceptance_threshold,
-            line_width=3,
-            line_dash="solid",
-            line_color="red",
-        )
-        st.plotly_chart(histogram_fig)
-        acceptance_status = apply_threshold_to_probability_values(
-            probability_df, acceptance_threshold
-        )
-        st.write()
-        st.subheader("Selected Probability Threshold")
-        selected_option = st.radio(
-            label="Selected Probability Threshold",
-            options=[
-                "User Defined",
-                "J Statistic Driven",
-                "Acceptance Rate Driven",
-            ],
-            key=f"{model_name_short}_radio_thresh",
-        )
-        # Any option other than the first two resolves to the
-        # acceptance-rate pair.
-        candidates = {
-            "User Defined": (user_threshold, user_status),
-            "J Statistic Driven": (j_threshold, j_status),
-        }
-        selected_threshold, selected_status = candidates.get(
-            selected_option, (acceptance_threshold, acceptance_status)
-        )
-        st.write(f"Selected probability threshold is {selected_threshold}")
-        return Threshold(
-            probability_threshold_selected=cast(float, selected_threshold),
-            predicted_default_status=selected_status,
-            prediction_probability_df=probability_df,
-        )
-    return view
-# One threshold view per model family; the short names keep widget keys apart.
-decision_tree_threshold_view = make_threshold_view("gbt", "decision tree")
-logistic_threshold_view = make_threshold_view("lg", "logistic")

views/typing.py DELETED Viewed

@@ -1,15 +0,0 @@
-from dataclasses import dataclass
-from typing import Union
-import pandas as pd
-from xgboost.sklearn import XGBClassifier
-from sklearn.linear_model import LogisticRegression
-@dataclass(frozen=True)
-class ModelView:
-    """Immutable bundle of a model and the results of its threshold and evaluation views."""
-    # The classifier the view was rendered for.
-    model: Union[XGBClassifier, LogisticRegression]
-    # Probability threshold chosen in the threshold view.
-    probability_threshold_selected: float
-    # Thresholded default statuses that belong to the chosen threshold.
-    predicted_default_status: pd.Series
-    # DataFrame returned by the evaluation view; read elsewhere through its
-    # "PROB_DEFAULT", "loan_amnt" and "loan_status" columns.
-    trueStatus_probabilityDefault_threshStatus_loanAmount_df: pd.DataFrame
-    # Prediction probabilities from the threshold view ("PROB_DEFAULT" column).
-    prediction_probability_df: pd.DataFrame