# credit_risk_modeling_demo/src/features/util_build_features.py
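"""Feature-building utilities for the credit risk modeling demo.

Wraps the loan dataset, handles train/test splitting and undersampling,
and provides Streamlit helpers for loading data and selecting predictors.
"""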
from dataclasses import dataclass
from typing import List, Union, cast

import pandas as pd
import streamlit as st
from sklearn.model_selection import train_test_split
@dataclass
class SplitDataset:
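    """Train/test split of predictors (X) and labels (y), with helpers to recombine them."""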
X_test: pd.DataFrame
X_train: pd.DataFrame
y_test: pd.Series
y_train: pd.Series
@property
def X_y_test(self) -> pd.DataFrame:
return pd.concat(
cast(
List[Union[pd.DataFrame, pd.Series]],
[
self.X_test.reset_index(drop=True),
self.y_test.reset_index(drop=True),
],
),
axis=1,
)
@property
def X_y_train(self) -> pd.DataFrame:
return pd.concat(
cast(
List[Union[pd.DataFrame, pd.Series]],
[
self.X_train.reset_index(drop=True),
self.y_train.reset_index(drop=True),
],
),
axis=1,
)
@dataclass
class Dataset:
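    """Input DataFrame plus split configuration (random_state and test_size as a percentage)."""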
df: pd.DataFrame
random_state: int
test_size: int
@property
    def y_value(self) -> pd.Series:
return self.df["loan_status"]
@property
def x_values(self) -> pd.DataFrame:
return cast(
pd.DataFrame,
drop_columns(
self.df,
[
"loan_status",
"loan_grade_A",
"loan_grade_B",
"loan_grade_C",
"loan_grade_D",
"loan_grade_E",
"loan_grade_F",
"loan_grade_G",
],
),
)
@property
def x_values_column_names(self):
return self.x_values.columns.tolist()
def x_values_filtered_columns(self, columns: List[str]) -> pd.DataFrame:
return self.df.filter(columns)
def train_test_split(
self, selected_x_values: pd.DataFrame
) -> SplitDataset:
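        """Split the selected predictors and the loan_status label into train and test sets."""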
X_train, X_test, y_train, y_test = train_test_split(
selected_x_values,
self.y_value,
            test_size=self.test_size / 100,  # test_size is given as a percentage
random_state=self.random_state,
)
return SplitDataset(
X_train=cast(pd.DataFrame, X_train),
X_test=cast(pd.DataFrame, X_test),
y_train=cast(pd.Series, y_train),
y_test=cast(pd.Series, y_test),
)
def drop_columns(df, columns):
return df.drop(columns, axis=1)
def remove_less_than_0_columns(df, column):
    # Drop rows with a missing value in the column, then keep only rows
    # where the column value is non-zero.
    df = df.dropna(subset=[column])
    return df.loc[df[column] != 0]
def boolean_int_condition_label(df, label_column_name, condition):
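    """Attach `condition` as a label column; return it as an int Series plus the DataFrame without that column."""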
df[label_column_name] = condition
y = df[label_column_name].astype(int)
df = drop_columns(df, label_column_name)
return y, df
@st.cache(suppress_st_warning=True)
def undersample_training_data(
df: pd.DataFrame, column_name: str, split_dataset
):
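    """Balance the training data by randomly undersampling the majority class down to the size of the minority class."""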
    # Count the non-default (0) and default (1) rows in the training labels.
    label_counts = split_dataset.X_y_train[column_name].value_counts()
    count_nondefault = label_counts[0]
    count_default = label_counts[1]
    nondefaults = df[df[column_name] == 0]
    defaults = df[df[column_name] == 1]
    # Undersample both classes to the size of the smaller class.
    under_sample = min(count_nondefault, count_default)
    nondefaults_under = nondefaults.sample(under_sample)
    defaults_under = defaults.sample(under_sample)
X_y_train_under = pd.concat(
[
nondefaults_under.reset_index(drop=True),
defaults_under.reset_index(drop=True),
],
axis=0,
)
X_train_under = X_y_train_under.drop([column_name], axis=1) # remove label
y_train_under = X_y_train_under[column_name] # label only
class_balance_default = X_y_train_under[column_name].value_counts()
return [
X_train_under,
y_train_under,
X_y_train_under,
class_balance_default,
]
def select_predictors(dataset):
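    """Show a sidebar multiselect of predictor columns and return the DataFrame filtered to the selection."""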
st.header("Predictors")
possible_columns = dataset.x_values_column_names
selected_columns = st.sidebar.multiselect(
label="Select Predictors",
options=possible_columns,
default=possible_columns,
)
return dataset.x_values_filtered_columns(selected_columns)
def import_data():
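    """Read the processed loan CSV once per session and cache the resulting Dataset in st.session_state."""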
if "input_data_frame" not in st.session_state:
st.session_state.input_data_frame = pd.read_csv(
r"./data/processed/cr_loan_w2.csv"
)
if "dataset" not in st.session_state:
df = cast(pd.DataFrame, st.session_state.input_data_frame)
dataset = Dataset(
df=df,
random_state=123235,
test_size=40,
)
st.session_state.dataset = dataset
else:
dataset = st.session_state.dataset
return dataset
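
# Illustrative usage from a Streamlit page (a sketch; assumes "loan_status"
# is the label column, as in Dataset.y_value):
#
#   dataset = import_data()
#   selected_x = select_predictors(dataset)
#   split = dataset.train_test_split(selected_x)
#   (X_train_under, y_train_under,
#    X_y_train_under, class_balance) = undersample_training_data(
#       split.X_y_train, "loan_status", split)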