Spaces:
Build error
Build error
import streamlit as st | |
from typing import List, Union, cast | |
from dataclasses import dataclass | |
from sklearn.model_selection import train_test_split | |
import pandas as pd | |
class SplitDataset: | |
X_test: pd.DataFrame | |
X_train: pd.DataFrame | |
y_test: pd.Series | |
y_train: pd.Series | |
def X_y_test(self) -> pd.DataFrame: | |
return pd.concat( | |
cast( | |
List[Union[pd.DataFrame, pd.Series]], | |
[ | |
self.X_test.reset_index(drop=True), | |
self.y_test.reset_index(drop=True), | |
], | |
), | |
axis=1, | |
) | |
def X_y_train(self) -> pd.DataFrame: | |
return pd.concat( | |
cast( | |
List[Union[pd.DataFrame, pd.Series]], | |
[ | |
self.X_train.reset_index(drop=True), | |
self.y_train.reset_index(drop=True), | |
], | |
), | |
axis=1, | |
) | |
class Dataset: | |
df: pd.DataFrame | |
random_state: int | |
test_size: int | |
def y_value(self) -> pd.DataFrame: | |
return self.df["loan_status"] | |
def x_values(self) -> pd.DataFrame: | |
return cast( | |
pd.DataFrame, | |
drop_columns( | |
self.df, | |
[ | |
"loan_status", | |
"loan_grade_A", | |
"loan_grade_B", | |
"loan_grade_C", | |
"loan_grade_D", | |
"loan_grade_E", | |
"loan_grade_F", | |
"loan_grade_G", | |
], | |
), | |
) | |
def x_values_column_names(self): | |
return self.x_values.columns.tolist() | |
def x_values_filtered_columns(self, columns: List[str]) -> pd.DataFrame: | |
return self.df.filter(columns) | |
def train_test_split( | |
self, selected_x_values: pd.DataFrame | |
) -> SplitDataset: | |
X_train, X_test, y_train, y_test = train_test_split( | |
selected_x_values, | |
self.y_value, | |
test_size=self.test_size / 100, # since up was given as pct | |
random_state=self.random_state, | |
) | |
return SplitDataset( | |
X_train=cast(pd.DataFrame, X_train), | |
X_test=cast(pd.DataFrame, X_test), | |
y_train=cast(pd.Series, y_train), | |
y_test=cast(pd.Series, y_test), | |
) | |
def drop_columns(df, columns): | |
return df.drop(columns, axis=1) | |
def remove_less_than_0_columns(df, column): | |
df[column].dropna() | |
return df.loc[(df[column] != 0).any(1)] | |
def boolean_int_condition_label(df, label_column_name, condition): | |
df[label_column_name] = condition | |
y = df[label_column_name].astype(int) | |
df = drop_columns(df, label_column_name) | |
return y, df | |
def undersample_training_data( | |
df: pd.DataFrame, column_name: str, split_dataset | |
): | |
count_nondefault, count_default = split_dataset.X_y_train[ | |
column_name | |
].value_counts() | |
nondefaults = df[df[column_name] == 0] # 0 | |
defaults = df[df[column_name] == 1] | |
under_sample = min(count_nondefault, count_default) | |
nondefaults_under = nondefaults.sample(under_sample) | |
defaults_under = defaults.sample(under_sample) | |
X_y_train_under = pd.concat( | |
[ | |
nondefaults_under.reset_index(drop=True), | |
defaults_under.reset_index(drop=True), | |
], | |
axis=0, | |
) | |
X_train_under = X_y_train_under.drop([column_name], axis=1) # remove label | |
y_train_under = X_y_train_under[column_name] # label only | |
class_balance_default = X_y_train_under[column_name].value_counts() | |
return [ | |
X_train_under, | |
y_train_under, | |
X_y_train_under, | |
class_balance_default, | |
] | |
def select_predictors(dataset): | |
st.header("Predictors") | |
possible_columns = dataset.x_values_column_names | |
selected_columns = st.sidebar.multiselect( | |
label="Select Predictors", | |
options=possible_columns, | |
default=possible_columns, | |
) | |
return dataset.x_values_filtered_columns(selected_columns) | |
def import_data(): | |
if "input_data_frame" not in st.session_state: | |
st.session_state.input_data_frame = pd.read_csv( | |
r"./data/processed/cr_loan_w2.csv" | |
) | |
if "dataset" not in st.session_state: | |
df = cast(pd.DataFrame, st.session_state.input_data_frame) | |
dataset = Dataset( | |
df=df, | |
random_state=123235, | |
test_size=40, | |
) | |
st.session_state.dataset = dataset | |
else: | |
dataset = st.session_state.dataset | |
return dataset | |