|
import numpy as np |
|
import pandas as pd |
|
from sklearn.tree import DecisionTreeClassifier |
|
from sklearn.model_selection import GridSearchCV |
|
import matplotlib.pyplot as plt |
|
from tqdm import tqdm |
|
from matplotlib.ticker import MaxNLocator |
|
import streamlit as st |
|
import ast |
|
from collections import defaultdict |
|
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram |
|
from sklearn.cluster import KMeans, AgglomerativeClustering |
|
from sklearn.preprocessing import LabelEncoder |
|
|
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
|
|
import warnings |
|
import pandas as pd |
|
import numpy as np |
|
from scipy import stats |
|
import scipy.cluster.hierarchy as sch |
|
from scipy.spatial.distance import pdist |
|
import os |
|
import re |
|
import time |
|
from plotly.subplots import make_subplots |
|
import plotly.graph_objects as go |
|
import numpy as np |
|
import plotly.express as px |
|
import base64 |
|
|
|
|
|
def tree_based_bin_data(df, column_name, dep_var, depth_of_tree):
    """Bin a numeric column using decision-tree split points.

    Fits a grid-searched DecisionTreeClassifier of ``column_name`` against
    the target ``dep_var`` and uses the learned split thresholds as bin
    edges for an equal-value cut.

    Parameters
    ----------
    df : pd.DataFrame
        Source data; ``column_name`` may contain NaNs (they stay NaN).
    column_name : str
        Numeric column to bin.
    dep_var : str
        Target column supervising the splits.
    depth_of_tree : int
        Maximum tree depth explored by the grid search (from 2 upward).

    Returns
    -------
    pd.Series of Intervals aligned with ``df.index``.
    """
    df2 = df.copy()
    # The tree cannot handle missing values; fit only on non-null rows.
    df2 = df2.loc[df2[column_name].notnull()]
    x = df2[column_name].values.reshape(-1, 1)
    y = df2[dep_var].values
    # Force each leaf to hold at least 5% of the rows so bins stay sizeable.
    params = {
        'max_depth': range(2, depth_of_tree + 1),
        'min_samples_split': [2, 3, 5, 10],
        'min_samples_leaf': [int(np.ceil(0.05 * len(x)))],
    }
    clf = DecisionTreeClassifier()
    g_search = GridSearchCV(clf, param_grid=params, scoring='accuracy')
    g_search.fit(x, y)
    best_clf = g_search.best_estimator_
    # sklearn marks leaf nodes with threshold == -2; keep only real splits.
    bin_edges = best_clf.tree_.threshold
    bin_edges = sorted(set(bin_edges[bin_edges != -2]))
    # BUG FIX: the thresholds are *interior* split points. Passing them
    # directly to pd.cut treated them as the complete edge set, so values
    # below the first / above the last threshold were silently dropped to
    # NaN, and a tree with a single split crashed (pd.cut needs >= 2
    # edges). Extend with +/-inf so every value lands in a bin.
    bin_edges = [-np.inf] + bin_edges + [np.inf]
    tree_based_binned_data = value_bin_data(df, column_name, bin_edges)
    return tree_based_binned_data
|
|
|
|
|
def decile_bin_data(df, col, no_of_bins):
    """Quantile-bin ``df[col]`` into up to ``no_of_bins`` equal-frequency bins.

    Duplicate quantile edges (heavy ties in the data) are dropped, so the
    result may contain fewer bins than requested.
    """
    series = df[col]
    return pd.qcut(series, no_of_bins, duplicates='drop')
|
|
|
|
|
def value_bin_data(df, col, no_of_bins):
    """Equal-width bin ``df[col]``.

    ``no_of_bins`` is forwarded to ``pd.cut`` and may be either an integer
    (number of equal-width bins) or an explicit list of bin edges.
    Duplicate edges are dropped rather than raising.
    """
    return pd.cut(df[col], no_of_bins, duplicates='drop')
|
|
|
|
|
def col_bin_summary_numerical(bin_df, col, dep_var=None):
    """Summarise a binned numeric column.

    Parameters
    ----------
    bin_df : pd.DataFrame
        Frame whose ``col`` holds bin labels (Intervals) and, when
        ``dep_var`` is given, the target column.
    col : str
        Binned column to summarise.
    dep_var : str, optional
        Binary target; adds Event (target sum), Mean_DV (target mean) and
        Index (100 * bin mean / overall mean) columns.

    Returns
    -------
    pd.DataFrame with one row per bin, sorted by bin_ranges.
    """
    unique_bin_edges = bin_df[col].unique()
    df_new = pd.DataFrame({"bin_ranges": unique_bin_edges})

    # Share of rows per bin, in percent.
    counts_pct = (bin_df[col].value_counts() / len(bin_df) * 100).reset_index()
    try:
        # pandas < 2.0: reset_index() yields columns ['index', col].
        df_new = df_new.merge(
            counts_pct.rename(columns={'index': 'bin_ranges', col: 'count%'})
            .sort_values(by='bin_ranges').reset_index(drop=True),
            on='bin_ranges').round(2)
    except KeyError:
        # pandas >= 2.0: reset_index() yields columns [col, 'count'].
        # (Narrowed from a bare except: only the missing merge key is expected.)
        df_new = df_new.merge(
            counts_pct.rename(columns={col: 'bin_ranges', 'count': 'count%'})
            .sort_values(by='bin_ranges').reset_index(drop=True),
            on='bin_ranges').round(2)

    if dep_var is not None:
        df_new = df_new.merge(
            bin_df.groupby(col)[dep_var].sum().reset_index()
            .rename(columns={col: 'bin_ranges', dep_var: 'Event'}),
            on='bin_ranges', how='left')
        df_new = df_new.merge(
            bin_df.groupby(col)[dep_var].mean().reset_index()
            .rename(columns={col: 'bin_ranges', dep_var: 'Mean_DV'}),
            on='bin_ranges', how='left')
        # BUG FIX: the overall mean previously read the hard-coded column
        # 'Y' instead of `dep_var`, raising KeyError for any other target
        # name (the categorical summary already used dep_var correctly).
        df_new['Index'] = (100 * df_new['Mean_DV'] / bin_df[dep_var].mean()).round()
        df_new = df_new[['bin_ranges', 'count%', 'Event', 'Mean_DV', 'Index']]
    df_new = df_new.sort_values(by='bin_ranges')

    return df_new
|
|
|
|
|
|
|
|
|
|
|
def plot_chart(df, col, dep_var):
    """Build a dual-axis plotly figure for one binned variable.

    Bars show the per-bin population share ('count%') on the left axis and
    a line shows the 'Index' (bin target-mean vs overall mean, x100) on the
    right axis. ``dep_var`` is accepted for interface compatibility but is
    not used inside this function.
    """
    df['bin_ranges_str'] = df['bin_ranges'].astype(str)
    x_labels = df['bin_ranges_str']

    fig = make_subplots(specs=[[{"secondary_y": True}]])

    # Population share per bin (left axis).
    bar_trace = go.Bar(
        x=x_labels,
        y=df['count%'],
        name='Count%',
        marker_color='#053057',
        hovertemplate="Bin: %{x}<br>Count%: %{y}",
    )
    fig.add_trace(bar_trace)

    # Index line (right axis).
    line_trace = go.Scatter(
        x=x_labels,
        y=df['Index'],
        mode='lines+markers',
        name='Index',
        marker=dict(color="#8ac4f8"),
        hovertemplate="Bin: %{x}<br>Index%: %{y}",
    )
    fig.add_trace(line_trace, secondary_y=True)

    fig.update_layout(
        title=f'Distribution of {col}',
        xaxis=dict(title='Bin_ranges'),
        yaxis=dict(title='Count%', color='#053057'),
        yaxis2=dict(title='Index', color="#8ac4f8", overlaying='y', side='right'),
        legend=dict(x=1.02, y=0.98),
        hovermode='x',
    )
    fig.update_xaxes(showgrid=False)
    fig.update_yaxes(showgrid=False)

    return fig
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_numerical_binned_data(df, col, func, no_of_bins=None, dep_var=None, depth=None):
    """Bin one numeric column and return its summary table.

    Parameters
    ----------
    df : pd.DataFrame
    col : str
        Numeric column to bin.
    func : str
        'tree' (supervised, needs dep_var), 'decile' (equal frequency) or
        anything else for equal-width value binning.
    no_of_bins : int, optional
        Bin count for decile/value binning.
    dep_var : str, optional
        Binary target; enables supervised binning and the
        Event/Mean_DV/Index summary columns.
    depth : int, optional
        Maximum tree depth when ``func == 'tree'``.

    Returns
    -------
    pd.DataFrame summary from ``col_bin_summary_numerical``.
    """
    df_org = df.copy()

    if dep_var is not None:
        df_org[dep_var] = df_org[dep_var].astype('int64')
        df_num = df_org.select_dtypes(include=[np.number]).drop(dep_var, axis=1)

        if func == 'tree':
            bin_df = tree_based_bin_data(df, col, dep_var, depth)
        elif func == 'decile':
            # BUG FIX: this branch hard-coded 10 bins and silently ignored
            # `no_of_bins` (the no-target branch below honoured it). Use the
            # caller's value when supplied; the default stays 10.
            bin_df = decile_bin_data(df_num, col, no_of_bins if no_of_bins is not None else 10)
        else:
            bin_df = value_bin_data(df_num, col, no_of_bins)

        # Re-attach the target so the summary can compute event rates.
        bin_df = pd.concat([bin_df, df_org[dep_var]], axis=1)
    else:
        df_num = df_org.select_dtypes(include=[np.number])

        if func == 'decile':
            bin_df = decile_bin_data(df_num, col, no_of_bins)
        else:
            bin_df = value_bin_data(df_num, col, no_of_bins)

    df_summary = col_bin_summary_numerical(bin_df, col, dep_var)

    return df_summary
|
|
|
|
|
def create_numerical_binned_data1(df, col, func, no_of_bins, dep_var, depth=None):
    """Expand one numeric column into per-bin feature columns.

    For each bin produced by the chosen binning function, adds a column
    ``{col}_{bin}`` holding the original value where the row falls into
    that bin and 0 elsewhere. The raw column is kept as the first column.

    Parameters
    ----------
    df : pd.DataFrame
    col : str             numeric column to expand
    func : str            'tree', 'decile', or value binning otherwise
    no_of_bins : int      bin count for decile/value binning
    dep_var : str         binary target (needed for tree binning)
    depth : int, optional max tree depth when ``func == 'tree'``

    Returns
    -------
    pd.DataFrame with the raw column plus one column per bin.
    """
    df_org = df.copy()
    df_org[dep_var] = df_org[dep_var].astype('int64')
    df_num = df_org.select_dtypes(include=[np.number]).drop(dep_var, axis=1)

    if func == 'tree':
        bin_df = tree_based_bin_data(df, col, dep_var, depth)
    elif func == 'decile':
        bin_df = decile_bin_data(df_num, col, no_of_bins)
    else:
        bin_df = value_bin_data(df_num, col, no_of_bins)

    bin_df = pd.concat([bin_df, df_org[dep_var]], axis=1)

    binned_data = pd.DataFrame()
    binned_data[col] = df_org[col]
    unique_bins = bin_df[col].unique()
    for bin_value in unique_bins:
        bin_column_name = f"{col}_{bin_value}"
        # BUG FIX: the membership test previously compared the *raw* values
        # (binned_data[col]) against the Interval bin label, which can never
        # be equal, so every per-bin column came out all zeros. Compare the
        # binned labels instead.
        binned_data[bin_column_name] = np.where(bin_df[col] == bin_value, df_org[col], 0)

    return binned_data
|
|
|
|
|
|
|
|
|
def woe_iv(df, column_name, dep_var, no_of_bins):
    """Bin a categorical column by its Weight of Evidence against `dep_var`.

    Columns with fewer than 10 distinct values are simply label-encoded to
    'binN' labels, one bin per level. Otherwise each category is mapped to
    its WoE score and the scores are quantile-binned; the bin count is then
    reduced until every bin holds at least 5% of the rows (or one bin is
    left).

    Returns a Series of 'binN' labels aligned with df.index.
    Assumes `dep_var` contains both classes 0 and 1 (KeyError otherwise).
    """
    # Class totals used for WoE smoothing below.
    y0 = df[dep_var].value_counts()[0]
    y1 = df[dep_var].value_counts()[1]
    if df[column_name].nunique() < 10:
        # Too few levels to bin meaningfully: one bin per factorized level.
        data = pd.Series(pd.factorize(df[column_name])[0] + 1, index=df.index).rename('{}'.format(column_name)).apply(lambda x: f'bin{x}')
    else:
        # Per-category WoE with 0.5-count smoothing to avoid log(0); the
        # 'iv' column is computed but only 'woe' is used afterwards.
        df_woe_iv = (pd.crosstab(df[column_name], df[dep_var], normalize='columns').assign(woe=lambda dfx: np.log((dfx[1] + (0.5 / y1)) / (dfx[0] + (0.5 / y0)))).assign(iv=lambda dfx: (dfx['woe'] * (dfx[1] - dfx[0]))))
        woe_map = df_woe_iv['woe'].to_dict()
        woe_col = df[column_name].map(woe_map)
        # Quantile-bin the WoE scores, then relabel the intervals bin1..binN.
        data = pd.qcut(woe_col, no_of_bins, duplicates='drop')
        n = data.nunique()
        labels = [f'bin{i}' for i in range(1, n + 1)]
        data = data.cat.rename_categories(labels)
        sizes = data.value_counts(normalize=True)
        min_size = 0.05  # every bin must hold at least 5% of the rows
        # Shrink the bin count until the smallest bin reaches min_size.
        while sizes.min() < min_size and no_of_bins > 1:
            no_of_bins -= 1
            data = pd.qcut(woe_col, q=no_of_bins, duplicates='drop')
            # qcut may drop duplicate edges, leaving observed bins out of
            # sync with the category list; skip relabelling in that case
            # (sizes is left stale, so the loop keeps shrinking).
            if data.nunique() != data.cat.categories.nunique():
                continue
            n = data.nunique()
            labels = [f'bin{i}' for i in range(1, n + 1)]
            data = data.cat.rename_categories(labels)
            sizes = data.value_counts(normalize=True)
    return data
|
|
|
def naive_cat_bin(df, col, max_thre=10, min_thre=5, tolerence=2, flag='ignore'):
    """Frequency-based ("naive") grouping of a categorical column into bins.

    Categories holding at least `min_thre` percent of rows keep a bin of
    their own; the remaining rare categories are greedily packed, rarest
    first, into groups whose combined share approaches a derived target.
    NaN is mapped to a dedicated 'binNA' label.

    Parameters
    ----------
    df : pd.DataFrame
    col : str          categorical column to bin
    max_thre : float   desired maximum bin share in percent; also fixes the
                       total bin budget as int(100 / max_thre)
    min_thre : float   minimum share (percent) for a stand-alone bin
    tolerence : float  slack allowed around the per-group target share
    flag : str         'error' aborts (prints and returns None) when the
                       rare categories cannot fit the budget; any other
                       value proceeds best-effort

    Returns
    -------
    pd.Series of 'binN' labels (or None on the 'error' early-out).
    """
    # Per-category share of rows, in percent, most frequent first.
    value_counts = df[col].value_counts()
    total_values = len(df)
    count_percentages = (value_counts / total_values) * 100
    unique_values_df = pd.DataFrame({'Category': value_counts.index, 'Count Percentage': count_percentages})
    count_per = list(unique_values_df['Count Percentage'])

    # Shares big enough to stand alone as their own bin...
    final_ini = []
    for i in count_per:
        if i >= min_thre:
            final_ini.append(i)
    # ...and the rare remainder that must be grouped together.
    a = [x for x in count_per if x not in final_ini]

    # Bin budget left over after the stand-alone categories.
    total_bins = int(100 / max_thre)
    ava_bins = len(final_ini)
    ava_bin_per = sum(final_ini)
    bin_req = total_bins - ava_bins
    bin_req_per = 100 - ava_bin_per

    # In strict mode, bail out when the leftover mass cannot be split into
    # bins of <= max_thre percent each.
    if flag == 'error' and bin_req > 0 and (bin_req_per / bin_req) > max_thre:
        print(f"Binning for {col} is not possible with given parameters.")
        return

    # Shrink the remaining-bin count until the average group meets min_thre.
    step = False
    while not step:
        if bin_req > 0:
            if (bin_req_per / bin_req) > min_thre:
                step = True
            else:
                bin_req -= 1
        else:
            step = True

    # Each stand-alone share becomes a singleton group.
    final_ini = [[x] for x in final_ini]

    if bin_req > 0:
        target_sum = bin_req_per / bin_req
    else:
        # No bins left in the budget: collect all remaining mass into one
        # group, with no slack.
        target_sum = bin_req_per
        tolerence = 0

    # Greedily pack the rare shares (walking from the rarest end) into
    # groups of roughly target_sum percent each.
    final = []
    current_sum = 0.0
    start_index = len(a) - 1
    values = []
    while start_index >= 0:
        current_sum += a[start_index]
        values.append(a[start_index])
        if current_sum < target_sum - tolerence:
            start_index -= 1
        else:
            final.append(values)
            values = []
            start_index -= 1
            current_sum = 0.0
    final.append(values)
    final = final[::-1]
    final = [sublist for sublist in final if sublist]  # drop empty trailing group
    final_b = final_ini + final

    # Merge any still-undersized group into its predecessor.
    final = [final_b[0]]
    for subarr in final_b[1:]:
        if sum(subarr) < (min_thre - tolerence):
            final[-1].extend(subarr)
        else:
            final.append(subarr)

    # Map the share values back to category names. Shares can repeat across
    # categories, so a reverse multimap is popped to hand out duplicates
    # one at a time.
    table = dict(zip(unique_values_df['Category'], unique_values_df['Count Percentage']))
    new_final = [sublist.copy() for sublist in final]

    table_reverse = defaultdict(list)
    for k, v in table.items():
        table_reverse[v].append(k)

    output = []
    for l in new_final:
        temp = []
        for item in l:
            temp.append(table_reverse[item].pop())
        output.append(temp)
    new_final = output

    # Label the groups bin1..binK; NaN gets its own 'binNA' label.
    k = len(new_final)
    bin_labels = [f'bin{i}' for i in range(1, k + 1)]
    bin_mapping = {value: bin_labels[i] for i, sublist in enumerate(new_final) for value in sublist}
    bin_mapping[np.nan] = 'binNA'
    return df[col].apply(lambda x: bin_mapping.get(x, x))
|
|
|
def col_bin_summary_categorical(df_cat, col, binned_df_1, dep_var=None):
    """Summarise a binned categorical column.

    Parameters
    ----------
    df_cat : pd.DataFrame     original (un-binned) categorical columns
    col : str                 column being summarised
    binned_df_1 : pd.DataFrame  'binN' labels for `col` plus, when
                              `dep_var` is given, the target column
    dep_var : str, optional   binary target; adds Event/Mean_DV/Index

    Returns
    -------
    pd.DataFrame with one row per bin: column_name, bin_ranges, the list
    of original category values grouped into the bin, count%, and the
    optional target statistics.

    NOTE(review): the count% merge relies on pandas >= 2.0 naming the
    value_counts() column 'count' — confirm against the pinned pandas.
    """
    # Original category values that landed in each bin.
    unique_values_in_bins = df_cat.groupby(binned_df_1[col])[col].unique().apply(list)
    unique_values_in_bins = unique_values_in_bins.rename_axis('bin').reset_index()
    unique_bin_ranges = pd.Categorical(binned_df_1[col].unique())
    uni = binned_df_1[col].nunique()
    # Order bins by their numeric suffix; 'binNA' is assigned the bin count
    # so it sorts toward the end.
    numeric_parts = [uni if val == 'binNA' else int(re.findall(r'\d+', val)[0]) for val in unique_bin_ranges]
    unique_bin_ranges = unique_bin_ranges[np.argsort(numeric_parts)]
    df_new_cat = pd.DataFrame({"column_name": [col] * len(unique_bin_ranges), "bin_ranges": unique_bin_ranges})
    df_new_cat = df_new_cat.merge(unique_values_in_bins.rename(columns={'bin': 'bin_ranges', col: 'values in bin'}))
    # Share of rows per bin, in percent.
    df_new_cat = df_new_cat.merge((binned_df_1[col].value_counts() / len(binned_df_1) * 100).reset_index().rename(columns={col: 'bin_ranges', 'count': 'count%'}).sort_values(by='bin_ranges').reset_index(drop=True), on='bin_ranges').round(2)
    if dep_var is not None:
        df_new_cat = df_new_cat.merge(binned_df_1.groupby(col)[dep_var].sum(numeric_only=True).reset_index().rename(columns={col: 'bin_ranges', dep_var: 'Event'}), on='bin_ranges')
        df_new_cat = df_new_cat.merge(binned_df_1.groupby(col)[dep_var].mean(numeric_only=True).reset_index().rename(columns={col: 'bin_ranges', dep_var: 'Mean_DV'}), on='bin_ranges')
        # Index = bin target-mean relative to overall mean, x100.
        df_new_cat['Index'] = (100 * df_new_cat['Mean_DV'] / binned_df_1[dep_var].mean()).round()
    return df_new_cat
|
|
|
def create_categorical_binned_data(imputed_df, col, categorical_binning, dep_var, no_of_bins=None, max_thre=None, min_thre=None, tolerence=2, flag='ignore'):
    """Bin one categorical column and return its summary table.

    Parameters
    ----------
    imputed_df : pd.DataFrame
        Imputed data. NOTE: mutated in place — the target column is cast
        to int64.
    col : str
        Categorical column to bin.
    categorical_binning : str
        'woe_iv' (WoE-based quantile binning) or 'naive' (frequency-based
        grouping); anything else raises ValueError.
    dep_var : str
        Binary target column.
    no_of_bins : int, optional
        Bin count for the WoE method.
    max_thre, min_thre : float, optional
        Percent thresholds for the naive method; default to the historical
        20 / 5 when omitted.
    tolerence : float
        Slack passed to the naive method.
    flag : str
        Error-handling mode for the naive method.

    Returns
    -------
    pd.DataFrame summary from ``col_bin_summary_categorical``.

    Raises
    ------
    ValueError
        If ``categorical_binning`` is not a known method.
    """
    imputed_df[dep_var] = imputed_df[dep_var].astype('int64')
    df_cat = imputed_df.select_dtypes(include=['object'])

    # Drop constant categorical columns: a single level carries no signal.
    unique_counts = df_cat.nunique()
    unique_cols = unique_counts[unique_counts == 1].index.tolist()
    df_cat = df_cat.drop(unique_cols, axis=1)

    if categorical_binning == 'woe_iv':
        df_nominal = pd.concat([imputed_df[col], imputed_df[dep_var]], axis=1)
        tqdm.pandas(dynamic_ncols=True, position=0)
        # woe_iv is also applied to dep_var itself; that column is dropped
        # immediately afterwards.
        binned_df_nominal = df_nominal.progress_apply(lambda x: woe_iv(df_nominal, x.name, dep_var, no_of_bins))
        binned_df_nominal.drop(dep_var, axis=1, inplace=True)
        binned_df_nominal = binned_df_nominal.applymap(lambda x: 'NA' if pd.isnull(x) else x)
        binned_df_nominal = binned_df_nominal.astype('category')

        # Columns collapsing to a single bin are uninformative.
        cols_with_one_unique_bin = binned_df_nominal.columns[binned_df_nominal.nunique() == 1]
        binned_df_nominal.drop(cols_with_one_unique_bin, axis=1, inplace=True)

        binned_df_nominal_1 = pd.concat([binned_df_nominal, imputed_df[dep_var]], axis=1)
    elif categorical_binning == 'naive':
        df_nominal = pd.concat([imputed_df[col], imputed_df[dep_var]], axis=1)
        tqdm.pandas(dynamic_ncols=True, position=0)
        # BUG FIX: the thresholds were hard-coded to 20/5 and `flag` to
        # 'ignore', silently discarding the caller's max_thre/min_thre/
        # tolerence/flag arguments. Honour them when supplied; the
        # fallbacks reproduce the old behaviour for callers passing None.
        binned_df_nominal = df_nominal.progress_apply(lambda x: naive_cat_bin(df_nominal, x.name, max_thre if max_thre is not None else 20, min_thre if min_thre is not None else 5, tolerence, flag=flag))
        binned_df_nominal.drop(dep_var, axis=1, inplace=True)
        binned_df_nominal = binned_df_nominal.dropna(axis=1, how='all')
        binned_df_nominal = binned_df_nominal.astype('category')

        cols_with_one_unique_bin = binned_df_nominal.columns[binned_df_nominal.nunique() == 1]
        binned_df_nominal.drop(cols_with_one_unique_bin, axis=1, inplace=True)

        binned_df_nominal_1 = pd.concat([binned_df_nominal, imputed_df[dep_var]], axis=1)
    else:
        # BUG FIX: unknown methods previously fell through and crashed with
        # a confusing NameError on binned_df_nominal_1; fail fast instead.
        raise ValueError(f"Unknown categorical binning method: {categorical_binning!r}")

    df_summary = col_bin_summary_categorical(df_cat, col, binned_df_nominal_1, dep_var)
    return df_summary
|
|
|
def create_categorical_binned_data1(imputed_df, col, nominal_binning, dependant_target_variable, no_of_bins=10, max_thre=10, min_thre=5, tolerence=2, flag='ignore', min_cluster_size=0.05, max_clusters=10):
    """Bin one categorical column and expand it into 0/1 indicator columns.

    One column ``{col}_{values-in-bin}`` is produced per bin, set to 1
    where the row's original category falls in that bin.

    Parameters
    ----------
    imputed_df : pd.DataFrame
        Imputed data. NOTE: mutated in place — the target column is cast
        to int64.
    col : str
        Categorical column to expand.
    nominal_binning : str
        'woe' / 'woe_iv' for WoE-based binning, 'naive' for frequency
        grouping; anything else raises ValueError.
    dependant_target_variable : str
        Binary target column.
    no_of_bins : int
        Bin count for the WoE method.
    max_thre, min_thre, tolerence, flag : unused by the naive call here,
        which keeps its historical hard-coded 20/5/2/'ignore' parameters.
    min_cluster_size, max_clusters : accepted for interface compatibility
        but currently unused.

    Returns
    -------
    pd.DataFrame of 0/1 indicator columns, one per bin.

    Raises
    ------
    ValueError
        If ``nominal_binning`` is not a known method.
    """
    imputed_df[dependant_target_variable] = imputed_df[dependant_target_variable].astype('int64')
    df_cat = imputed_df.select_dtypes(include=['object'])

    # Drop constant categorical columns: a single level carries no signal.
    unique_counts = df_cat.nunique()
    unique_cols = unique_counts[unique_counts == 1].index.tolist()
    df_cat = df_cat.drop(unique_cols, axis=1)

    if nominal_binning in ('woe', 'woe_iv'):
        # BUG FIX: this branch previously matched only 'woe', but the
        # caller passes 'woe_iv', so every WoE run fell through both
        # branches and crashed with NameError below. Accept both spellings.
        df_nominal = pd.concat([imputed_df[col], imputed_df[dependant_target_variable]], axis=1)
        tqdm.pandas(dynamic_ncols=True, position=0)
        binned_df_nominal = df_nominal.progress_apply(lambda x: woe_iv(df_nominal, x.name, dependant_target_variable, no_of_bins))
        binned_df_nominal.drop(dependant_target_variable, axis=1, inplace=True)
        binned_df_nominal = binned_df_nominal.applymap(lambda x: 'NA' if pd.isnull(x) else x)
        binned_df_nominal = binned_df_nominal.astype('category')

        # Columns collapsing to a single bin are uninformative.
        cols_with_one_unique_bin = binned_df_nominal.columns[binned_df_nominal.nunique() == 1]
        binned_df_nominal.drop(cols_with_one_unique_bin, axis=1, inplace=True)

        binned_df_nominal_1 = pd.concat([binned_df_nominal, imputed_df[dependant_target_variable]], axis=1)
    elif nominal_binning == 'naive':
        df_nominal = pd.concat([imputed_df[col], imputed_df[dependant_target_variable]], axis=1)
        tqdm.pandas(dynamic_ncols=True, position=0)
        # NOTE(review): thresholds are hard-coded (20, 5, 2), ignoring the
        # max_thre/min_thre/tolerence parameters — confirm intent before
        # wiring them through (the parameter defaults differ: 10/5/2).
        binned_df_nominal = df_nominal.progress_apply(lambda x: naive_cat_bin(df_nominal, x.name, 20, 5, 2, flag='ignore'))
        binned_df_nominal.drop(dependant_target_variable, axis=1, inplace=True)
        binned_df_nominal = binned_df_nominal.dropna(axis=1, how='all')
        binned_df_nominal = binned_df_nominal.astype('category')

        cols_with_one_unique_bin = binned_df_nominal.columns[binned_df_nominal.nunique() == 1]
        binned_df_nominal.drop(cols_with_one_unique_bin, axis=1, inplace=True)

        binned_df_nominal_1 = pd.concat([binned_df_nominal, imputed_df[dependant_target_variable]], axis=1)
    else:
        # BUG FIX: unknown methods previously crashed with NameError.
        raise ValueError(f"Unknown nominal binning method: {nominal_binning!r}")

    df_summary = col_bin_summary_categorical(df_cat, col, binned_df_nominal_1, dependant_target_variable)

    # One 0/1 indicator column per bin, keyed by the bin's member values.
    binned_data = pd.DataFrame()
    for bin_value in df_summary['values in bin']:
        bin_column_name = f"{col}_{bin_value}"
        binned_data[bin_column_name] = np.where(df_cat[col].isin(bin_value), 1, 0)

    return binned_data
|
|
|
|
|
|
|
# ---- Page setup: derive the usable column lists from the imputed data ----
# Numeric candidates exclude the target flag; categorical candidates
# exclude the row identifier.
_imputed = st.session_state.imputed_df
numerical_columns = [
    c for c in _imputed.select_dtypes(include=['number']).columns
    if c != st.session_state.flag
]
categorical_columns = [
    c for c in _imputed.select_dtypes(include=['object', 'category']).columns
    if c != st.session_state.identifier
]
st.session_state.numerical_columns = numerical_columns
st.session_state.categorical_columns = categorical_columns

st.title("Variable Profiling")
|
|
|
|
|
# Pull previously-chosen widget values out of the session (with defaults)
# so Streamlit reruns keep the user's selections.
_state = st.session_state
function_num = _state.get("function_num", "value")
depth = _state.get("depth", 3)
num_bins = _state.get("num_bins", 10)
function_cat = _state.get("function_cat", "woe_iv")
max_slider = _state.get("max_slider", 10)
min_slider = _state.get("min_slider", 5)
cat_bins_iv = _state.get("cat_bins_iv", 10)
cat_bins_naive = _state.get("cat_bins_naive", 10)
|
|
|
# ---- Binning parameter inputs ----
with st.expander("Profiling Inputs"):
    st.write("Binning Inputs")
    ui_columns = st.columns((1, 1))
    with ui_columns[0]:
        # Numerical binning method; index restores a previous selection.
        function_num = st.selectbox(
            label="Select Numerical Binning Function",
            options=['value', 'tree'],
            index=['value', 'tree'].index(st.session_state.function_num) if 'function_num' in st.session_state and st.session_state.function_num is not None else None
        )
        st.session_state.function_num = function_num
        params_num = st.empty()

    # Method-specific numeric parameters render in the second column.
    with params_num:
        with ui_columns[-1]:
            if function_num == 'tree':
                depth = st.slider(
                    label="Depth",
                    min_value=1,
                    max_value=10,
                    value=depth,
                    key='depth_slider')
                st.session_state.depth = depth
            elif function_num == 'value':
                num_bins = st.slider(
                    label="Number of Bins",
                    min_value=2,
                    max_value=20,
                    value=num_bins,
                    key='num_bins_slider_num')
                st.session_state.num_bins = num_bins
    left, right = st.columns(2)

    with left:
        # Categorical binning method; index restores a previous selection.
        function_cat = st.selectbox(
            label="Select Categorical Binning Function",
            options=['woe_iv', 'naive'],
            index=['woe_iv', 'naive'].index(st.session_state.function_cat) if 'function_cat' in st.session_state and st.session_state.function_cat is not None else None
        )
        st.session_state.function_cat = function_cat
        params_cat = st.empty()

    with params_cat:
        if function_cat == 'woe_iv':
            with right:
                cat_bins_iv = st.slider(
                    label="Number of Bins",
                    min_value=2,
                    max_value=20,
                    value=cat_bins_iv,
                    key='num_bins_slider_cat_iv')
                st.session_state.cat_bins_iv = cat_bins_iv
            with left:
                # NOTE(review): the widget key 'min_slider' is the same as
                # the session key written just below — Streamlit versions
                # that forbid writing to a widget-backed key raise here;
                # confirm against the pinned streamlit version.
                min_slider = st.slider(
                    label="Min Threshold",
                    min_value=1,
                    max_value=100,
                    value=min_slider,
                    key='min_slider')
                st.session_state.min_slider = min_slider
            with right:
                # NOTE(review): same widget-key / session-key collision as
                # 'min_slider' above.
                max_slider = st.slider(
                    label="Max Threshold",
                    min_value=1,
                    max_value=100,
                    value=max_slider,
                    key='max_slider')
                st.session_state.max_slider = max_slider
        elif function_cat == 'naive':
            with right:
                cat_bins_naive = st.slider(
                    label="Number of Bins",
                    min_value=2,
                    max_value=20,
                    value=cat_bins_naive,
                    key='num_bins_slider_cat_naive')
                st.session_state.cat_bins_naive = cat_bins_naive

    with left:
        st.write("#")  # vertical spacer
        perform_profiling = st.button(
            label="Perform profiling"
        )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---- Run the full profiling pass when the button is pressed ----
if perform_profiling:
    with st.expander("Profiling summary"):
        st.write("Numerical binned data")
        # Bin every numeric column and stack the per-column summaries.
        binned_data_num = pd.DataFrame()
        for col in st.session_state.numerical_columns:
            # Only forward the parameter the chosen method consumes.
            if function_num == 'tree':
                depth = depth  # no-op self-assignment retained from original
            else:
                depth = None
            if function_num == 'value':
                num_bins = num_bins  # no-op self-assignment retained from original
            else:
                num_bins = None
            binned_data_col = create_numerical_binned_data(st.session_state.imputed_df, col, function_num, num_bins, st.session_state.flag, depth)
            # Prefix bin labels with the column name for a flat summary table.
            binned_data_col.insert(0, 'column_bin', col + '_' + binned_data_col['bin_ranges'].astype(str))
            binned_data_num = pd.concat([binned_data_num, binned_data_col], axis=0)
        st.dataframe(binned_data_num, use_container_width=True, hide_index=True)

        st.write("Categorical binned data")
        binned_data_cat = pd.DataFrame()
        for col in st.session_state.categorical_columns:
            # Resolve thresholds / bin counts for the chosen categorical method.
            if function_cat == 'woe_iv':
                max_thre = max_slider
                min_thre = min_slider
                no_of_bins = cat_bins_iv
            else:
                max_thre = None
                min_thre = None
                no_of_bins = None
                if function_cat == 'naive':
                    no_of_bins = cat_bins_naive
                else:
                    no_of_bins = None
            binned_data_col_cat = create_categorical_binned_data(st.session_state.imputed_df, col, function_cat, st.session_state.flag, no_of_bins=no_of_bins, max_thre=max_thre, min_thre=min_thre, tolerence=2, flag='ignore')
            binned_data_col_cat.insert(0, 'column_bin', col + '_' + binned_data_col_cat['values in bin'].astype(str))
            binned_data_col_cat.drop('column_name', axis=1, inplace=True)
            binned_data_cat = pd.concat([binned_data_cat, binned_data_col_cat], axis=0)
        st.dataframe(binned_data_cat, use_container_width=True, hide_index=True)

    with st.expander("Profiling summary: Plots"):
        st.markdown(
            "<p class='plot-header'>Change the selected variable to plot"
            " different charts</p>",
            unsafe_allow_html=True,
        )
        left, right = st.columns(2)
        with left:
            if 'selected_variable' not in st.session_state:
                st.session_state.selected_variable = []

            selected_variable = st.selectbox(
                "Variable",
                st.session_state.numerical_columns + st.session_state.categorical_columns,
            )
            # selectbox returns a single value; normalise to a list so the
            # plotting loop below handles it uniformly.
            if isinstance(selected_variable, str):
                selected_variable = [selected_variable]

            st.session_state.selected_variable = selected_variable

        if st.session_state.selected_variable:
            for col in st.session_state.selected_variable:
                if col in st.session_state.numerical_columns:
                    # Same parameter resolution as the summary loop above.
                    if function_num == 'tree':
                        depth = depth  # no-op retained from original
                    else:
                        depth = None
                    if function_num == 'value':
                        num_bins = num_bins  # no-op retained from original
                    else:
                        num_bins = None
                    binned_data_col = create_numerical_binned_data(st.session_state.imputed_df, col, function_num, num_bins, st.session_state.flag, depth)
                    binned_data_col.insert(0, 'column_bin', col + '_' + binned_data_col['bin_ranges'].astype(str))
                    fig = plot_chart(binned_data_col, col, dep_var=None)
                    st.plotly_chart(fig, use_container_width=True)

                elif col in st.session_state.categorical_columns:
                    if function_cat == 'woe_iv':
                        max_thre = max_slider
                        min_thre = min_slider
                        no_of_bins = cat_bins_iv
                    else:
                        max_thre = None
                        min_thre = None
                        no_of_bins = None
                        if function_cat == 'naive':
                            no_of_bins = cat_bins_naive
                        else:
                            no_of_bins = None
                    binned_data_col_cat = create_categorical_binned_data(st.session_state.imputed_df, col, function_cat, st.session_state.flag, no_of_bins=no_of_bins, max_thre=max_thre, min_thre=min_thre, tolerence=2, flag='ignore')
                    binned_data_col_cat.insert(0, 'column_bin', col + '_' + binned_data_col_cat['values in bin'].astype(str))
                    binned_data_col_cat.drop('column_name', axis=1, inplace=True)
                    fig_cat = plot_chart(binned_data_col_cat, col, dep_var=None)
                    st.plotly_chart(fig_cat, use_container_width=True)

    st.divider()

    # ---- Build the wide, model-ready binned frame ----
    binned_data_combined = pd.DataFrame()

    for col in st.session_state.numerical_columns:
        if function_num == 'tree':
            depth = depth  # no-op retained from original
        else:
            depth = None
        if function_num == 'value':
            num_bins = num_bins  # no-op retained from original
        else:
            num_bins = None

        binned_data_num = create_numerical_binned_data1(st.session_state.imputed_df, col, function_num, num_bins, st.session_state.flag, depth)
        binned_data_combined = pd.concat([binned_data_combined, binned_data_num], axis=1)

    for col in st.session_state.categorical_columns:
        if function_cat == 'woe_iv':
            max_thre = max_slider
            min_thre = min_slider
            no_of_bins = cat_bins_iv
        else:
            max_thre = None
            min_thre = None
            no_of_bins = None
            if function_cat == 'naive':
                no_of_bins = cat_bins_naive
            else:
                no_of_bins = None

        binned_data_cat = create_categorical_binned_data1(st.session_state.imputed_df, col, function_cat, st.session_state.flag, no_of_bins=no_of_bins, max_thre=max_thre, min_thre=min_thre, tolerence=2, flag='ignore')
        binned_data_combined = pd.concat([binned_data_combined, binned_data_cat], axis=1)

    def clean_column_name(column_name):
        # Strip '.N'-style duplicate-column suffixes from column names.
        return re.sub(r'\.(\d+)', '', column_name)
    binned_data_combined.columns = binned_data_combined.columns.map(clean_column_name)
    # Remove characters that downstream consumers may reject in feature
    # names (brackets, commas, quotes, '<').
    valid_feature_names = [name.replace('[', '').replace(']', '').replace('<', '').replace(',', '_').replace('(', '').replace("'", '') for name in binned_data_combined.columns]
    # NOTE(review): the two replace(' ', '') calls look identical — one was
    # presumably meant for a different whitespace character (e.g. NBSP);
    # confirm and adjust.
    valid_feature_names = [name.replace(' ', '').replace(' ', '') for name in valid_feature_names]
    binned_data_combined.columns = valid_feature_names

    # Persist for the download button and later pages; re-attach the
    # target flag and put the identifier first.
    st.session_state.binned_df = binned_data_combined
    st.session_state.binned_df[st.session_state.flag] = st.session_state.imputed_df[st.session_state.flag]
    st.session_state.binned_df.insert(0, st.session_state.identifier, st.session_state.imputed_df[st.session_state.identifier])
    # NOTE(review): leftover debug print with a hard-coded column name —
    # this raises KeyError for any dataset lacking 'individual_id_ov'.
    print(st.session_state.binned_df['individual_id_ov'])

    st.markdown("Binned DataFrame")
    st.dataframe(binned_data_combined.head(10), use_container_width=True, hide_index=True)

    if st.session_state.binned_df is not None:
        download_button = st.download_button(
            label="Download Binned Data as CSV",
            data=st.session_state.binned_df.to_csv(index=False).encode(),
            file_name='binned_data.csv',
            mime='text/csv',
        )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|