import warnings

import numpy as np
import pandas as pd
import streamlit as st
from scipy import stats
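
# Streamlit page: profiles each variable of the dataset held in st.session_state.df
# (nulls, outliers, summary statistics) and applies the user's chosen per-column
# missing-value and outlier treatments.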
|
|
|
def outlier_per_col(df, col):
    """Return the percentage of outlier values in df[col].

    Uses the 1.5 * IQR rule when normality is rejected (normaltest p <= 0.05),
    otherwise the 3-sigma z-score rule.
    """
    values = df[col].dropna()
    if values.empty:
        return 0.0
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    _, p = stats.normaltest(values)
    if p <= 0.05:
        # Normality rejected: flag points outside the IQR fences.
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        n_outliers = ((values < lower_bound) | (values > upper_bound)).sum()
    else:
        # Approximately normal: flag points more than 3 standard deviations out.
        z_score = np.abs(values - values.mean()) / values.std()
        n_outliers = (z_score > 3).sum()
    return n_outliers / len(df) * 100
|
def summary_stats(df, per_to_drop):
    # Per-variable profile: %null, dtype, outlier%, and basic statistics.
    # Columns whose null percentage exceeds `per_to_drop` are dropped from the summary.
    summary_df = df.isna().sum().reset_index().rename(columns={'index': 'variable', 0: 'null'})
|
summary_df['%null'] = (100 * summary_df['null'] / len(df)).round(2) |
|
summary_df = summary_df.merge(df.dtypes.reset_index().rename(columns={'index': 'variable', 0: 'type'}), on='variable') |
|
summary_df = summary_df.drop(columns=['null']) |
|
summary_df = summary_df.drop(summary_df[summary_df['%null'] > per_to_drop].index) |
|
df_numeric = df.select_dtypes(exclude='object') |
|
df_categorical = df.select_dtypes(include='object') |
|
if not df_numeric.empty: |
|
with warnings.catch_warnings(): |
|
warnings.simplefilter("ignore") |
|
summary_df['outlier%'] = summary_df[summary_df['variable'].isin(df_numeric.columns)].apply(lambda x: outlier_per_col(df_numeric, x['variable']), axis=1) |
|
    else:
        # No numeric columns, so the outlier column is all-NaN.
        summary_df['outlier%'] = np.nan
|
    # Percentage of unique values per numeric column.
    unique_pct = (df_numeric.nunique() / df_numeric.count() * 100).round(2)
    summary_df = summary_df.merge(unique_pct.reset_index().rename(columns={'index': 'variable', 0: 'unique%'}), on='variable', how='left').round(2)
|
summary_df = summary_df.merge(df.mean(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'mean'}).round(2), on='variable', how='left') |
|
summary_df = summary_df.merge(df.std(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'standard deviation'}).round(2), on='variable', how='left') |
|
    # Keep variance numeric; string-formatting it would break the NumberColumn config below.
    summary_df = summary_df.merge(df.var(numeric_only=True).round(2).reset_index().rename(columns={'index': 'variable', 0: 'variance'}), on='variable', how='left')
|
summary_df = summary_df.merge(df.skew(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'skewness'}).round(2), on='variable', how='left') |
|
summary_df = summary_df.merge(df.kurt(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'kurtosis'}).round(2), on='variable', how='left') |
|
summary_df = summary_df.merge(df.min(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'min'}), on='variable', how='left') |
|
summary_df = summary_df.merge(df.max(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'max'}), on='variable', how='left') |
|
summary_df['range'] = summary_df['max'] - summary_df['min'] |
|
    if not df_numeric.empty:
        iqr = df_numeric.quantile(0.75) - df_numeric.quantile(0.25)
        summary_df = summary_df.merge(iqr.reset_index().rename(columns={'index': 'variable', 0: 'iqr'}), on='variable', how='left')
    else:
        summary_df['iqr'] = np.nan
|
summary_df = summary_df.merge(df.median(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'median'}), on='variable', how='left') |
|
if not df_categorical.empty: |
|
summary_df = summary_df.merge(df.select_dtypes(include=['object']).mode().iloc[0].reset_index().rename(columns={'index': 'variable', 0: 'mode'}), on='variable', how='left') |
|
summary_df = summary_df.merge(df.select_dtypes(include=['object']).nunique().reset_index().rename(columns={'index': 'variable', 0: 'distinct count'}), on='variable', how='left') |
|
    else:
        summary_df['mode'] = np.nan
        summary_df['distinct count'] = np.nan
|
return summary_df |
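

# --- Missing-value treatments: each mutates `df` in place ---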
|
|
|
|
|
def mean_imputation(df, col):
    # Column assignment (rather than inplace=True on a Series) mutates df reliably.
    df[col] = df[col].fillna(round(df[col].mean(), 2))


def median_imputation(df, col):
    df[col] = df[col].fillna(round(df[col].median(), 2))


def drop_rows(df, col):
    df.dropna(subset=[col], inplace=True)


def drop_column(df, col):
    df.drop(col, axis=1, inplace=True)


def mode_imputation(df, col):
    df[col] = df[col].fillna(df[col].mode()[0])


def arbitrary_val(df, col, val):
    df[col] = df[col].fillna(val)


def linear_interpolate(df, col):
    df[col] = df[col].interpolate(method='linear')


def polynomial_interpolate(df, col):
    # Requires scipy; fits a second-order polynomial through the known points.
    df[col] = df[col].interpolate(method='polynomial', order=2)


def interpolate_padding_forward(df, col):
    # fillna(method='ffill') is deprecated in recent pandas; use ffill()/bfill().
    df[col] = df[col].ffill()


def interpolate_padding_backward(df, col):
    df[col] = df[col].bfill()


def fill_0(df, col):
    df[col] = df[col].fillna(0)
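

# --- Outlier treatments ---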
|
|
|
def remove_outliers(df, col):
    # Drop rows whose value in `col` is an outlier; returns the filtered frame.
    _, p = stats.normaltest(df[col].dropna())
    if p <= 0.05:
        # Normality rejected: keep rows inside the 1.5 * IQR fences.
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
    else:
        # Approximately normal: keep rows within 3 standard deviations of the mean.
        z_score = np.abs(df[col] - df[col].mean()) / df[col].std()
        df = df[z_score < 3]
    return df
|
def mean_outlier(df, col):
    # Replace outliers in `col` with the column mean.
    _, p = stats.normaltest(df[col].dropna())
    if p <= 0.05:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        # .loc assignment avoids chained indexing, which can silently write to a copy.
        df.loc[(df[col] < lower_bound) | (df[col] > upper_bound), col] = df[col].mean()
    else:
        z_score = np.abs(df[col] - df[col].mean()) / df[col].std()
        df.loc[z_score > 3, col] = df[col].mean()
    return df
|
|
|
def median_outlier(df, col):
    # Replace outliers in `col` with the column median.
    _, p = stats.normaltest(df[col].dropna())
    if p <= 0.05:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        df.loc[(df[col] < lower_bound) | (df[col] > upper_bound), col] = df[col].median()
    else:
        z_score = np.abs(df[col] - df[col].mean()) / df[col].std()
        df.loc[z_score > 3, col] = df[col].median()
    return df
|
|
|
def outlier_capping(df, col):
    # Cap (winsorize) outliers at the fence values instead of removing them.
    _, p = stats.normaltest(df[col].dropna())
    if p <= 0.05:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        df[col] = np.where(df[col] >= upper_bound, upper_bound,
                           np.where(df[col] <= lower_bound, lower_bound, df[col]))
    else:
        upper_limit = df[col].mean() + 3 * df[col].std()
        lower_limit = df[col].mean() - 3 * df[col].std()
        df[col] = np.where(df[col] >= upper_limit, upper_limit,
                           np.where(df[col] <= lower_limit, lower_limit, df[col]))
    return df
|
|
|
def perform_treatment_missing(df, col, treatment):
    # Map each treatment name to its handler; every handler mutates df in place.
    handlers = {
        'mean': mean_imputation,
        'median': median_imputation,
        'drop row': drop_rows,
        'drop column': drop_column,
        'linear interpolation': linear_interpolate,
        'polynomial interpolation': polynomial_interpolate,
        'ffill': interpolate_padding_forward,
        'bfill': interpolate_padding_backward,
        'mode': mode_imputation,
        'fill_0': fill_0,
    }
    handler = handlers.get(treatment)
    if handler is not None:
        handler(df, col)
|
|
|
def perform_treatment_outlier(df, col, treatment):
    # Outlier handlers return a (possibly filtered) DataFrame, so pass the result back.
    if treatment == 'remove':
        df = remove_outliers(df, col)
    elif treatment == 'mean':
        df = mean_outlier(df, col)
    elif treatment == 'median':
        df = median_outlier(df, col)
    elif treatment == 'capping':
        df = outlier_capping(df, col)
    return df
|
|
|
def imputed_df(df, edited_df, identifier, flag, per_to_drop=None):
    # Drop columns whose null percentage exceeds the threshold.
    if per_to_drop is not None:
        null_percentage = df.isnull().sum() / df.shape[0] * 100
        col_to_drop = null_percentage[null_percentage > per_to_drop].index
        df = df.drop(col_to_drop, axis=1)

    # Constant columns carry no information; drop them as well.
    cols_with_one_unique = df.columns[df.nunique() == 1]
    df = df.drop(cols_with_one_unique, axis=1)

    for col in edited_df['variable'].to_list():
        if col not in df.columns:
            # Already dropped (high-null or constant column); nothing to treat.
            continue
        row = edited_df.loc[edited_df['variable'] == col]
        perform_treatment_missing(df, col, row['Imputation method'].iloc[0])
        if col in df.columns:  # 'drop column' removes it entirely
            df = perform_treatment_outlier(df, col, row['Outlier Treatment'].iloc[0])
    return df
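

# --- Streamlit UI ---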
|
st.title("Data Summary") |
|
|
|
with st.expander("Data Inputs"): |
|
st.subheader("Data Inputs") |
|
ui_columns = st.columns((1, 1)) |
|
    # Use a list, not a set: selectbox options need a stable, deterministic order.
    columns = list(st.session_state.df.columns)
|
with ui_columns[0]: |
|
        flag = st.selectbox(
            label="Flag variable",
            options=columns,
            index=columns.index(st.session_state.flag)
            if 'flag' in st.session_state and st.session_state.flag is not None
            else 0,
        )
|
        per_to_drop = st.slider(
            label="Select missing % threshold to drop columns",
            key="per_to_drop",  # the key persists the value across reruns
            min_value=0,
            max_value=100,
            value=80,
        )
|
|
|
with ui_columns[-1]: |
|
        identifier = st.selectbox(
            label="Identifier",
            options=columns,
            index=columns.index(st.session_state.identifier)
            if 'identifier' in st.session_state and st.session_state.identifier is not None
            else 0,
        )
|
st.session_state.flag = flag
st.session_state.identifier = identifier
|
summary_df = summary_stats(st.session_state.df, per_to_drop)
# Default treatments by type: mode imputation for object columns, mean otherwise;
# outlier capping applies only to numeric columns.
summary_df["Imputation method"] = np.where(summary_df["type"] == 'object', 'mode', 'mean')
summary_df["Outlier Treatment"] = np.where(summary_df["type"] == 'object', None, 'capping')
# The flag and identifier variables are left untreated.
summary_df = summary_df[~summary_df['variable'].isin([flag, identifier])]
st.session_state.summary_df = summary_df
|
|
|
st.subheader("Variable Summary") |
|
|
|
IMPUTATION_OPTIONS = ["mean", "median", "linear interpolation", "polynomial interpolation", "ffill", "bfill", "mode", "fill_0"]
OUTLIER_OPTIONS = ["capping", "remove", "mean", "median"]
# Everything except the two treatment selectors is read-only.
NON_EDITABLE_COLUMNS = [c for c in summary_df.columns if c not in ("Imputation method", "Outlier Treatment")]
|
|
|
def highlight_cols(s):
    # Grey background marks the read-only columns in the editor.
    return "background-color: #ccc"
|
|
|
column_config = { |
|
"variable": st.column_config.TextColumn(disabled=True, width="medium"), |
|
"type": st.column_config.TextColumn(disabled=True, width="medium"), |
|
"%null": st.column_config.NumberColumn(disabled=True), |
|
"unique%": st.column_config.NumberColumn(disabled=True), |
|
"outlier%": st.column_config.NumberColumn(disabled=True), |
|
"mean": st.column_config.NumberColumn(disabled=True), |
|
"standard deviation": st.column_config.NumberColumn(disabled=True), |
|
"variance": st.column_config.NumberColumn(disabled=True), |
|
"skewness": st.column_config.NumberColumn(disabled=True), |
|
"kurtosis": st.column_config.NumberColumn(disabled=True), |
|
"min": st.column_config.NumberColumn(disabled=True), |
|
"max": st.column_config.NumberColumn(disabled=True), |
|
"range": st.column_config.NumberColumn(disabled=True), |
|
"iqr": st.column_config.NumberColumn(disabled=True), |
|
"median": st.column_config.NumberColumn(disabled=True), |
|
"IV": st.column_config.NumberColumn(disabled=True), |
|
"mode": st.column_config.TextColumn(disabled=True), |
|
"distinct count": st.column_config.NumberColumn(disabled=True), |
|
"Imputation method": st.column_config.SelectboxColumn( |
|
options=IMPUTATION_OPTIONS, default=0 |
|
), |
|
"Outlier Treatment": st.column_config.SelectboxColumn( |
|
options=OUTLIER_OPTIONS, default=0 |
|
) |
|
} |
|
|
|
|
|
with st.expander("Variables from the data"): |
|
edited_df = st.data_editor( |
|
st.session_state.summary_df |
|
.style.hide(axis="index") |
|
.applymap(highlight_cols, subset=NON_EDITABLE_COLUMNS), |
|
column_config=column_config, |
|
) |
|
if st.button("Submit changes"): |
|
with st.spinner("Applying imputations"): |
|
st.divider() |
|
edited_df = st.session_state.summary_df.copy() |
|
edited_df["Imputation method"] = st.session_state.summary_df["Imputation method"] |
|
edited_df["Outlier Treatment"] = st.session_state.summary_df["Outlier Treatment"] |
|
|
|
imputed_df = imputed_df(st.session_state.df, edited_df, st.session_state.identifier, st.session_state.flag, st.session_state.per_to_drop) |
|
st.session_state.imputed_df = imputed_df |
|
st.markdown("Imputed DataFrame") |
|
st.dataframe(imputed_df.head(10)) |
|
if "imputed_df" in st.session_state: |
|
if st.button("Download Imputed Data"): |
|
imputed_df = st.session_state.imputed_df |
|
imputed_csv = imputed_df.to_csv(index=False) |
|
b64 = base64.b64encode(imputed_csv.encode()).decode() |
|
href = f'<a href="data:file/csv;base64,{b64}" download="imputed_data.csv">Download Imputed Data CSV File</a>' |
|
st.markdown(href, unsafe_allow_html=True) |