Spaces:

pkiage
/

credit_risk_modeling_demo

Build error

File size: 9,930 Bytes

7d861ad

from typing import OrderedDict
import streamlit as st  # works on command prompt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import (
    roc_curve,
)
from sklearn.calibration import calibration_curve
from xgboost import plot_tree
from views.typing import ModelView


def plot_logistic_coeff_barh(coef_dict, x, y):
    """Draw a horizontal bar chart of logistic-regression coefficients.

    Bars are added in ascending order of coefficient value.

    Args:
        coef_dict: Mapping of feature name to coefficient value.
        x: Figure width in inches.
        y: Figure height in inches.

    Returns:
        The matplotlib figure holding the chart. When ``coef_dict`` is
        empty the figure is returned without any bars.
    """
    fig = plt.figure(figsize=(x, y))
    if not coef_dict:
        # Unpacking zip() of an empty sequence yields no arguments, so
        # barh() would be called without its required positionals.
        return fig
    ordered_items = sorted(coef_dict.items(), key=lambda item: item[1])
    feature_names, coefficients = zip(*ordered_items)
    plt.barh(feature_names, coefficients)
    return fig


def print_negative_coefficients_logistic_model(coef_dict):
    """Write the non-positive logistic-regression coefficients to the app.

    Entries of ``coef_dict`` whose value is less than or equal to 0 are
    sorted from most negative to least negative and rendered with
    ``st.markdown``.

    Args:
        coef_dict: Mapping of feature name to coefficient value.
    """
    # Equal to or less than 0, ascending so the most negative comes first.
    negative_coefficients_sorted = sorted(
        (item for item in coef_dict.items() if item[1] <= 0.0),
        key=lambda item: item[1],
    )
    text = (
        "\n\nFeatures the model found to be negatively correlated with probability of default are:"
        "\n{negative_features}:"
    )
    st.markdown(text.format(negative_features=negative_coefficients_sorted))


def print_positive_coefficients_logistic_model(coef_dict):
    """Write the non-negative logistic-regression coefficients to the app.

    Entries of ``coef_dict`` whose value is greater than or equal to 0 are
    sorted from largest to smallest and rendered with ``st.markdown``.

    Args:
        coef_dict: Mapping of feature name to coefficient value.
    """
    template = (
        "\n\nFeatures the model found to be positively correlated with probability of default are:"
        "\n{positive_features}:"
    )
    # Keep coefficients that are zero or above, then order largest first.
    non_negative = [
        (feature, weight)
        for feature, weight in coef_dict.items()
        if weight >= 0.0
    ]
    non_negative.sort(key=lambda pair: pair[1], reverse=True)
    st.markdown(template.format(positive_features=non_negative))


def plot_importance_gbt(clf_gbt_model, barxsize, barysize):
    """Plot feature importance for a gradient boosted tree model.

    Uses ``xgb.plot_importance`` with ``importance_type="weight"``, writes a
    caption to the Streamlit app, and resizes the resulting figure.

    Args:
        clf_gbt_model: Model handed to ``xgb.plot_importance``.
        barxsize: Figure width in inches.
        barysize: Figure height in inches.

    Returns:
        The figure that owns the importance plot's axes.
    """
    importance_axes = xgb.plot_importance(
        clf_gbt_model,
        importance_type="weight",
    )
    importance_fig = importance_axes.figure
    st.write("Feature Importance Plot (Gradient Boosted Tree)")
    importance_fig.set_size_inches(barxsize, barysize)
    return importance_fig


def download_importance_gbt(fig1, barxsize, barysize):
    """Offer a button that saves the feature importance figure as a PNG.

    When the button is pressed, ``fig1`` is resized and written to the
    relative path ``bar.png``. Nothing happens otherwise.

    Args:
        fig1: Figure to save.
        barxsize: Figure width in inches.
        barysize: Figure height in inches.
    """
    if not st.button(
        "Download Feature Importance Plot as png (Gradient Boosted Tree)"
    ):
        return

    # Resize before saving so the requested size is what lands in the file.
    fig1.set_size_inches(barxsize, barysize)
    dpisize = max(barxsize, barysize)
    # Save the figure that was passed in; plt.savefig would save whichever
    # figure happens to be current, which may be a different plot.
    fig1.savefig("bar.png", dpi=dpisize * 96, bbox_inches="tight")


def plot_tree_gbt(treexsize, treeysize, clf_gbt_model):
    """Plot a tree of a gradient boosted model and size the figure.

    Args:
        treexsize: Figure width in inches.
        treeysize: Figure height in inches.
        clf_gbt_model: Model handed to xgboost's ``plot_tree``.

    Returns:
        The current pyplot figure after ``plot_tree`` has drawn on it.
    """
    plot_tree(clf_gbt_model)
    tree_fig = plt.gcf()
    tree_fig.set_size_inches(treexsize, treeysize)
    return tree_fig


def download_tree_gbt(treexsize, treeysize, fig=None):
    """Offer a button that saves the decision tree figure as a PNG.

    When the button is pressed, the figure is written to the relative path
    ``tree.png``. Nothing happens otherwise.

    Args:
        treexsize: Tree plot width; used only to derive the output DPI.
        treeysize: Tree plot height; used only to derive the output DPI.
        fig: Figure to save. Defaults to the current pyplot figure, which
            is what this function always saved before the parameter
            existed. Pass the tree figure explicitly when other plots may
            have been created in between.
    """
    if not st.button(
        "Download Decision Tree Plot as png (Gradient Boosted Tree)"
    ):
        return

    target_fig = plt.gcf() if fig is None else fig
    dpisize = max(treexsize, treeysize)
    target_fig.savefig("tree.png", dpi=dpisize * 96, bbox_inches="tight")


def cross_validation_graph(cv, eval_metric, trees):
    """Plot the test score per iteration from cross-validation results.

    Args:
        cv: Table of cross-validation results; the column at position 2 is
            the one plotted (presumably the mean test score - confirm
            against the caller).
        eval_metric: Metric name shown in the title and the y-axis label.
        trees: Iteration count shown in the title.

    Returns:
        The newly created matplotlib figure.
    """
    title_template = "Test {eval_metric} Score Over {it_numbr} Iterations"
    ylabel_template = "Test {eval_metric} Score"

    fig = plt.figure()
    # Third column of the results table holds the scores to draw.
    score_column = cv.columns[2]
    plt.plot(cv[score_column])
    plt.title(title_template.format(eval_metric=eval_metric, it_numbr=trees))
    plt.xlabel("Iteration Number")
    plt.ylabel(ylabel_template.format(eval_metric=eval_metric))
    return fig


def recall_accuracy_threshold_tradeoff_fig(
    widthsize,
    heightsize,
    threshold_list,
    thresh_def_recalls_list,
    thresh_nondef_recalls_list,
    thresh_accs_list,
):
    """Plot recall and accuracy against the probability threshold.

    Three curves share the same x values (``threshold_list``): default
    recall, non-default recall and model accuracy. Both axes are limited
    to the range 0-1.

    Args:
        widthsize: Figure width in inches.
        heightsize: Figure height in inches.
        threshold_list: Probability thresholds (x values).
        thresh_def_recalls_list: Default recall at each threshold.
        thresh_nondef_recalls_list: Non-default recall at each threshold.
        thresh_accs_list: Model accuracy at each threshold.

    Returns:
        The newly created matplotlib figure.
    """
    fig = plt.figure(figsize=(widthsize, heightsize))

    # Draw order determines legend order, so keep it fixed.
    curves = (
        (thresh_def_recalls_list, "Default Recall"),
        (thresh_nondef_recalls_list, "Non-Default Recall"),
        (thresh_accs_list, "Model Accuracy"),
    )
    for scores, curve_label in curves:
        plt.plot(threshold_list, scores, label=curve_label)

    plt.xlabel("Probability Threshold")
    plt.ylabel("Score")
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.legend()
    plt.title("Recall and Accuracy Score Tradeoff with Probability Threshold")
    plt.grid(False)
    return fig


def roc_auc_compare_n_models(y, model_views: OrderedDict[str, ModelView]):
    """Plot ROC curves for any number of models on one chart.

    Each model's ``prediction_probability_df`` is passed to ``roc_curve``
    together with ``y``. A dashed diagonal labelled "Random Prediction" is
    added for reference.

    Args:
        y: True labels passed to ``roc_curve``.
        model_views: Ordered mapping of model name to its view object.

    Returns:
        The newly created matplotlib figure.
    """
    # The first two colours match the original two-model palette; the list
    # is cycled so a third or later model no longer raises IndexError.
    colors = ["blue", "green", "red", "purple", "orange", "brown"]
    fig = plt.figure()
    for color_idx, (model_name, model_view) in enumerate(model_views.items()):
        fpr, tpr, _thresholds = roc_curve(
            y, model_view.prediction_probability_df
        )
        plt.plot(
            fpr,
            tpr,
            color=colors[color_idx % len(colors)],
            label=f"{model_name}",
        )
    plt.plot([0, 1], [0, 1], linestyle="--", label="Random Prediction")

    # Build "A", "A and B" or "A, B and C" for the title.
    model_names = list(model_views.keys())
    if not model_names:
        model_name_str = "None"
    elif len(model_names) == 1:
        model_name_str = model_names[0]
    else:
        model_name_str = " and ".join(
            [", ".join(model_names[:-1]), model_names[-1]]
        )
    plt.title(f"ROC Chart for {model_name_str} on the Probability of Default")
    plt.xlabel("False Positive Rate (FP Rate)")
    plt.ylabel("True Positive Rate (TP Rate)")
    plt.legend()
    plt.grid(False)
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    return fig


def calibration_curve_report_commented_n(
    y, model_views: OrderedDict[str, ModelView], bins: int
):
    """Plot calibration curves for any number of models on one chart.

    Each model's ``prediction_probability_df`` is passed to
    ``calibration_curve`` together with ``y``. A dotted diagonal labelled
    "Perfectly calibrated" is added as a guideline.

    Args:
        y: True labels passed to ``calibration_curve``.
        model_views: Ordered mapping of model name to its view object.
        bins: Number of bins used by ``calibration_curve``.

    Returns:
        The newly created matplotlib figure.
    """
    fig = plt.figure()
    for model_name, model_view in model_views.items():
        # The ``normalize`` argument was deprecated in scikit-learn 1.1 and
        # removed in 1.3, so passing it fails on current versions.
        # NOTE(review): assumes prediction_probability_df already holds
        # probabilities in [0, 1] - confirm against the model views.
        frac_of_pos, mean_pred_val = calibration_curve(
            y,
            model_view.prediction_probability_df,
            n_bins=bins,
        )
        plt.plot(mean_pred_val, frac_of_pos, "s-", label=f"{model_name}")

    # Create the calibration curve plot with the guideline
    plt.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")

    plt.ylabel("Fraction of positives")
    plt.xlabel("Average Predicted Probability")
    plt.title("Calibration Curve")
    plt.legend()
    plt.grid(False)
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    return fig


def acceptance_rate_threshold_fig(probability_default, acceptancerate, bins):
    """Plot a histogram of default probabilities with the acceptance cut-off.

    Args:
        probability_default: Predicted probabilities of default. Must
            provide ``describe()`` (e.g. a pandas Series - confirm with
            the caller).
        acceptancerate: Quantile passed to ``np.quantile`` to derive the
            threshold.
        bins: Bin specification passed to ``plt.hist``.

    Returns:
        A tuple ``(fig, probability_stat_distribution, acc_rate_thresh)``:
        the figure, the result of ``probability_default.describe()``, and
        the probability value at the requested quantile.
    """
    # Probability distribution
    probability_stat_distribution = probability_default.describe()

    # Acceptance rate threshold
    acc_rate_thresh = np.quantile(probability_default, acceptancerate)
    fig = plt.figure()

    plt.hist(
        probability_default,
        color="blue",
        bins=bins,
        histtype="bar",
        ec="white",
    )

    # Add a reference line to the plot for the threshold
    plt.axvline(x=acc_rate_thresh, color="red")
    plt.title("Acceptance Rate Threshold")

    return (
        fig,
        probability_stat_distribution,
        acc_rate_thresh,
    )


def streamlit_2columns_metrics_pct_df(
    column1name_label: str,
    column2name_label: str,
    df: pd.DataFrame,
):
    """Show the share of 1s and 0s in ``df`` as two side-by-side metrics.

    The first column shows the count of label ``1`` divided by the number
    of rows, the second the same for label ``0``. Both are formatted as
    whole percentages. A label that never occurs, or an input with no
    rows, is shown as ``0%``.

    Args:
        column1name_label: Metric label for the share of 1s.
        column2name_label: Metric label for the share of 0s.
        df: Data whose ``value_counts()`` is looked up for labels 1 and 0.
            NOTE(review): annotated as DataFrame, but the scalar label
            lookups look like they expect Series-style counts - confirm
            with the callers.
    """
    (
        column1name,
        column2name,
    ) = st.columns(2)

    counts = df.value_counts()
    total_rows = df.shape[0]

    def _share(label):
        # .get() without a default returns None for an absent label, which
        # made the division raise TypeError; an absent label counts as 0.
        label_count = counts.get(label, 0)
        return label_count / total_rows if total_rows else 0.0

    with column1name:
        st.metric(
            label=column1name_label,
            value="{:.0%}".format(_share(1)),
            delta=None,
            delta_color="normal",
        )

    with column2name:
        st.metric(
            label=column2name_label,
            value="{:.0%}".format(_share(0)),
            delta=None,
            delta_color="normal",
        )


def streamlit_2columns_metrics_df(
    column1name_label: str,
    column2name_label: str,
    df: pd.DataFrame,
):
    """Show the counts of 1s and 0s in ``df`` as two side-by-side metrics.

    Args:
        column1name_label: Metric label for the count of label 1.
        column2name_label: Metric label for the count of label 0.
        df: Data whose ``value_counts()`` is looked up for labels 1 and 0.
    """
    left_column, right_column = st.columns(2)
    label_counts = df.value_counts()

    # (column, metric label, class label looked up in the counts)
    layout = (
        (left_column, column1name_label, 1),
        (right_column, column2name_label, 0),
    )
    for column, metric_label, class_label in layout:
        with column:
            st.metric(
                label=metric_label,
                value=label_counts.get(class_label),
                delta=None,
                delta_color="normal",
            )


def streamlit_2columns_metrics_df_shape(df: pd.DataFrame):
    """Show the row and column counts of ``df`` as two side-by-side metrics.

    Args:
        df: Data whose ``shape`` supplies the "Rows" and "Columns" values.
    """
    rows_column, cols_column = st.columns(2)

    # (column, metric label, index into df.shape)
    layout = (
        (rows_column, "Rows", 0),
        (cols_column, "Columns", 1),
    )
    for column, metric_label, axis in layout:
        with column:
            st.metric(
                label=metric_label,
                value=df.shape[axis],
                delta=None,
                delta_color="normal",
            )


def streamlit_2columns_metrics_pct_series(
    column1name_label: str,
    column2name_label: str,
    series: pd.Series,
):
    """Show the entries at labels 1 and 0 as shares of the series total.

    Each value is divided by ``series.sum()`` and formatted as a whole
    percentage. A label that is absent, or a total of zero, is shown as
    ``0%``.

    Args:
        column1name_label: Metric label for the share at label 1.
        column2name_label: Metric label for the share at label 0.
        series: Series looked up at labels 1 and 0 (presumably counts per
            class - confirm with the caller).
    """
    (
        column1name,
        column2name,
    ) = st.columns(2)

    total = series.sum()

    def _share(label):
        # .get() without a default returns None for an absent label, which
        # made the division raise TypeError; an absent label counts as 0.
        label_value = series.get(label, 0)
        # A zero total would otherwise render as "nan%" or "inf%".
        return label_value / total if total else 0.0

    with column1name:
        st.metric(
            label=column1name_label,
            value="{:.0%}".format(_share(1)),
            delta=None,
            delta_color="normal",
        )

    with column2name:
        st.metric(
            label=column2name_label,
            value="{:.0%}".format(_share(0)),
            delta=None,
            delta_color="normal",
        )


def streamlit_2columns_metrics_series(
    column1name_label: str,
    column2name_label: str,
    series: pd.Series,
):
    """Show the entries at labels 1 and 0 as two side-by-side metrics.

    Args:
        column1name_label: Metric label for the value at label 1.
        column2name_label: Metric label for the value at label 0.
        series: Series looked up with ``series.get`` at labels 1 and 0.
    """
    left_column, right_column = st.columns(2)

    # (column, metric label, series label to look up)
    layout = (
        (left_column, column1name_label, 1),
        (right_column, column2name_label, 0),
    )
    for column, metric_label, series_label in layout:
        with column:
            st.metric(
                label=metric_label,
                value=series.get(series_label),
                delta=None,
                delta_color="normal",
            )


def streamlit_chart_setting_height_width(
    title: str,
    default_widthvalue: int,
    default_heightvalue: int,
    widthkey: str,
    heightkey: str,
):
    """Render an expander with width and height inputs for a chart.

    Args:
        title: Label of the expander.
        default_widthvalue: Initial value of the width input.
        default_heightvalue: Initial value of the height input.
        widthkey: Streamlit widget key for the width input.
        heightkey: Streamlit widget key for the height input.

    Returns:
        A ``(width, height)`` tuple holding the current input values.
    """
    with st.expander(title):
        width_column, height_column = st.columns(2)

        with width_column:
            chosen_width = st.number_input(
                label="Width in inches:",
                value=default_widthvalue,
                key=widthkey,
            )

        with height_column:
            chosen_height = st.number_input(
                label="Height in inches:",
                value=default_heightvalue,
                key=heightkey,
            )

    return chosen_width, chosen_height