from typing import List, Union, cast, Tuple
from dataclasses import dataclass

from sklearn.model_selection import train_test_split
import pandas as pd
import streamlit as st

from src.features.util_build_features import (
    Dataset,
    SplitDataset,
    undersample_training_data,
    select_predictors,
    import_data,
)
from src.visualization.metrics import (
    streamlit_2columns_metrics_df_shape,
    streamlit_2columns_metrics_series,
    streamlit_2columns_metrics_pct_series,
    streamlit_2columns_metrics_df,
    streamlit_2columns_metrics_pct_df,
)


def initialise_data() -> Tuple[Dataset, SplitDataset]:
    dataset = import_data()

    st.write(
        "Assuming data is already cleaned and relevant features (predictors) added."
    )

    with st.expander("Input Dataframe (X and y)"):
        st.dataframe(dataset.df)
        streamlit_2columns_metrics_df_shape(dataset.df)

    selected_x_values = select_predictors(dataset)

    with st.expander("Predictors Dataframe (X)"):
        st.dataframe(selected_x_values)
        streamlit_2columns_metrics_df_shape(selected_x_values)

    st.header("Split Testing and Training Data")

    test_size_slider_col, seed_col = st.columns(2)

    with test_size_slider_col:
        # Initialize test size as a percentage of the input dataframe
        dataset.test_size = st.slider(
            label="Test Size Percentage of Input Dataframe:",
            min_value=0,
            max_value=100,
            value=dataset.test_size,
            key="init_test_size",
            format="%f%%",
        )

    with seed_col:
        dataset.random_state = int(
            st.number_input(label="Random State:", value=dataset.random_state)
        )

    split_dataset = dataset.train_test_split(selected_x_values)

    # Actual class counts in the test set (1 = default, 0 = non-default)
    true_status = split_dataset.y_test.to_frame().value_counts()

    st.sidebar.metric(
        label="Testing Data # of Actual Default (=1)",
        value=true_status.get(1),
    )

    st.sidebar.metric(
        label="Testing Data % of Actual Default",
        value="{:.0%}".format(true_status.get(1) / true_status.sum()),
    )

    st.sidebar.metric(
        label="Testing Data # of Actual Non-Default (=0)",
        value=true_status.get(0),
    )

    st.sidebar.metric(
        label="Testing Data % of Actual Non-Default",
        value="{:.0%}".format(true_status.get(0) / true_status.sum()),
    )

    # Concatenated X and y frames for the testing and training sets
    X_y_test = split_dataset.X_y_test
    X_y_train = split_dataset.X_y_train

    with st.expander("Testing Dataframe (X and y)"):
        st.dataframe(X_y_test)
        streamlit_2columns_metrics_df_shape(X_y_test)

    streamlit_2columns_metrics_series(
        "# Defaults(=1) (Testing Data)",
        "# Non-Defaults(=0) (Testing Data)",
        true_status,
    )

    streamlit_2columns_metrics_pct_series(
        "% Defaults (Testing Data)",
        "% Non-Defaults (Testing Data)",
        true_status,
    )

    st.header("Training Data")

    with st.expander("Training Dataframe (X and y)"):
        st.dataframe(X_y_train)
        streamlit_2columns_metrics_df_shape(X_y_train)

    st.subheader("Class Count")

    streamlit_2columns_metrics_df(
        "# Defaults (Training Data Class Balance Check)",
        "# Non-Defaults (Training Data Class Balance Check)",
        split_dataset.y_train,
    )

    streamlit_2columns_metrics_pct_df(
        "% Defaults (Training Data Class Balance Check)",
        "% Non-Defaults (Training Data Class Balance Check)",
        split_dataset.y_train,
    )

    balance_the_classes = st.radio(
        label="Balance the Classes:", options=("Yes", "No")
    )

    if balance_the_classes == "Yes":
        st.subheader("Balanced Classes (by Undersampling)")

        (
            split_dataset.X_train,
            split_dataset.y_train,
            _X_y_train,
            class_balance_default,
        ) = undersample_training_data(X_y_train, "loan_status", split_dataset)

        streamlit_2columns_metrics_series(
            "# Defaults (Training Data with Class Balance)",
            "# Non-Defaults (Training Data with Class Balance)",
            class_balance_default,
        )

        streamlit_2columns_metrics_pct_series(
            "% of Defaults (Training Data with Class Balance)",
            "% of Non-Defaults (Training Data with Class Balance)",
            class_balance_default,
        )

    return dataset, split_dataset