File size: 4,728 Bytes
7f0977b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import streamlit as st

from typing import List, Union, cast

from dataclasses import dataclass

from sklearn.model_selection import train_test_split

import pandas as pd


@dataclass
class SplitDataset:
    X_test: pd.DataFrame
    X_train: pd.DataFrame
    y_test: pd.Series
    y_train: pd.Series

    @property
    def X_y_test(self) -> pd.DataFrame:
        return pd.concat(
            cast(
                List[Union[pd.DataFrame, pd.Series]],
                [
                    self.X_test.reset_index(drop=True),
                    self.y_test.reset_index(drop=True),
                ],
            ),
            axis=1,
        )

    @property
    def X_y_train(self) -> pd.DataFrame:
        return pd.concat(
            cast(
                List[Union[pd.DataFrame, pd.Series]],
                [
                    self.X_train.reset_index(drop=True),
                    self.y_train.reset_index(drop=True),
                ],
            ),
            axis=1,
        )


@dataclass
class Dataset:
    df: pd.DataFrame
    random_state: int
    test_size: int

    @property
    def y_value(self) -> pd.DataFrame:
        return self.df["loan_status"]

    @property
    def x_values(self) -> pd.DataFrame:
        return cast(
            pd.DataFrame,
            drop_columns(
                self.df,
                [
                    "loan_status",
                    "loan_grade_A",
                    "loan_grade_B",
                    "loan_grade_C",
                    "loan_grade_D",
                    "loan_grade_E",
                    "loan_grade_F",
                    "loan_grade_G",
                ],
            ),
        )

    @property
    def x_values_column_names(self):
        return self.x_values.columns.tolist()

    def x_values_filtered_columns(self, columns: List[str]) -> pd.DataFrame:
        return self.df.filter(columns)

    def train_test_split(
        self, selected_x_values: pd.DataFrame
    ) -> SplitDataset:
        X_train, X_test, y_train, y_test = train_test_split(
            selected_x_values,
            self.y_value,
            test_size=self.test_size / 100,  # since up was given as pct
            random_state=self.random_state,
        )

        return SplitDataset(
            X_train=cast(pd.DataFrame, X_train),
            X_test=cast(pd.DataFrame, X_test),
            y_train=cast(pd.Series, y_train),
            y_test=cast(pd.Series, y_test),
        )


def drop_columns(df, columns):
    return df.drop(columns, axis=1)


def remove_less_than_0_columns(df, column):
    df[column].dropna()
    return df.loc[(df[column] != 0).any(1)]


def boolean_int_condition_label(df, label_column_name, condition):
    df[label_column_name] = condition
    y = df[label_column_name].astype(int)
    df = drop_columns(df, label_column_name)
    return y, df


@st.cache(suppress_st_warning=True)
def undersample_training_data(
    df: pd.DataFrame, column_name: str, split_dataset
):
    count_nondefault, count_default = split_dataset.X_y_train[
        column_name
    ].value_counts()

    nondefaults = df[df[column_name] == 0]  # 0

    defaults = df[df[column_name] == 1]

    under_sample = min(count_nondefault, count_default)

    nondefaults_under = nondefaults.sample(under_sample)

    defaults_under = defaults.sample(under_sample)

    X_y_train_under = pd.concat(
        [
            nondefaults_under.reset_index(drop=True),
            defaults_under.reset_index(drop=True),
        ],
        axis=0,
    )

    X_train_under = X_y_train_under.drop([column_name], axis=1)  # remove label

    y_train_under = X_y_train_under[column_name]  # label only

    class_balance_default = X_y_train_under[column_name].value_counts()

    return [
        X_train_under,
        y_train_under,
        X_y_train_under,
        class_balance_default,
    ]


def select_predictors(dataset):
    st.header("Predictors")

    possible_columns = dataset.x_values_column_names

    selected_columns = st.sidebar.multiselect(
        label="Select Predictors",
        options=possible_columns,
        default=possible_columns,
    )
    return dataset.x_values_filtered_columns(selected_columns)


def import_data():
    if "input_data_frame" not in st.session_state:
        st.session_state.input_data_frame = pd.read_csv(
            r"./data/processed/cr_loan_w2.csv"
        )
    if "dataset" not in st.session_state:
        df = cast(pd.DataFrame, st.session_state.input_data_frame)
        dataset = Dataset(
            df=df,
            random_state=123235,
            test_size=40,
        )
        st.session_state.dataset = dataset
    else:
        dataset = st.session_state.dataset

    return dataset