|
|
|
|
|
import pandas as pd |
|
import numpy as np |
|
import streamlit as st |
|
import pandas as pd |
|
import numpy as np |
|
import seaborn as sn |
|
import matplotlib.pyplot as plt |
|
from sklearn.linear_model import LogisticRegression |
|
from sklearn.preprocessing import MinMaxScaler, StandardScaler |
|
from sklearn.metrics import confusion_matrix, classification_report |
|
from sklearn.model_selection import train_test_split |
|
import xgboost as xgb |
|
from sklearn.linear_model import LinearRegression |
|
from sklearn.metrics import mean_squared_error, r2_score |
|
from sklearn.decomposition import PCA |
|
from sklearn.preprocessing import StandardScaler |
|
import numpy as np |
|
import plotly.figure_factory as ff |
|
|
|
|
|
# Let the page use the full browser width instead of Streamlit's default layout.
st.set_page_config(layout="wide")
|
|
|
def point_estimates(df, model_type, flag, identifier, control_sample_size, solver=None, max_iter=None, class_weights=None, max_depth=None, subsample=None, eta=None):
    """Fit a propensity model on PCA-reduced features and score every row of ``df``.

    Steps:
      1. Drop ``identifier`` and ``flag``, standardise the remaining columns
         and project them onto principal components, keeping the leading
         components up to and including the first one whose cumulative
         explained variance exceeds 0.995.
      2. Train on every row with ``flag == 1`` plus a random sample
         (``random_state=42``) of rows with ``flag == 0``.
      3. Render the classification report and the normalised confusion matrix
         (computed on that training sample) in the Streamlit page.
      4. Predict the probability of the second model class for every row.

    Args:
        df: Input frame. Every column other than ``identifier`` and ``flag``
            is used as a feature, so they must all be numeric.
        model_type: ``'linear'`` (LogisticRegression) or ``'xgboost'``
            (XGBClassifier).
        flag: Name of the 0/1 column that is the prediction target.
        identifier: Name of the row-id column; excluded from the features.
        control_sample_size: Number of ``flag == 0`` rows to sample for
            training. Capped at the number of such rows available.
        solver, max_iter, class_weights: LogisticRegression settings. ``None``
            for ``solver`` / ``max_iter`` means "use the scikit-learn default".
        max_depth, subsample, eta: XGBClassifier settings, passed through as-is.

    Returns:
        A single-column DataFrame named ``propensity_score`` with one row per
        row of ``df``, carrying ``df``'s index.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values.
    """
    if model_type not in ('linear', 'xgboost'):
        raise ValueError(
            "model_type must be 'linear' or 'xgboost', got {!r}".format(model_type)
        )

    # ---- 1. Standardise and reduce the feature space with PCA ----
    features = df.drop(columns=[identifier, flag])
    scaled = StandardScaler().fit_transform(features)
    # PCA cannot produce more components than min(n_samples, n_features).
    pca = PCA(n_components=min(scaled.shape))
    components = pca.fit_transform(scaled)

    cumulative_variance = pca.explained_variance_ratio_.cumsum()
    # Position of the first component that pushes cumulative variance past
    # 99.5%; keep everything if rounding means the threshold is never crossed.
    last_kept = next(
        (pos for pos, total in enumerate(cumulative_variance) if total > 0.995),
        len(cumulative_variance) - 1,
    )

    df_pca = pd.DataFrame(components[:, :last_kept + 1])
    component_cols = list(df_pca.columns)
    # Assign positionally: df may not have a 0..n-1 index, and index alignment
    # would then silently produce NaN / misplaced flags.
    df_pca[flag] = df[flag].to_numpy()

    # ---- 2. Build the training sample: all treated + sampled controls ----
    treated_rows = df_pca[df_pca[flag] == 1]
    control_rows = df_pca[df_pca[flag] == 0]
    n_controls = min(control_sample_size, len(control_rows))
    sampled_controls = control_rows.sample(n=n_controls, random_state=42)
    training = pd.concat([treated_rows, sampled_controls], ignore_index=True)

    X_train = training[component_cols]
    y_train = training[flag]  # 1-D target, as scikit-learn expects

    # ---- 3. Fit the requested model ----
    if model_type == 'linear':
        # Only forward settings the caller actually supplied; None is not a
        # valid solver / max_iter for LogisticRegression.
        lr_settings = {
            name: value
            for name, value in (('solver', solver), ('max_iter', max_iter))
            if value is not None
        }
        model = LogisticRegression(class_weight=class_weights, **lr_settings)
    else:
        model = xgb.XGBClassifier(max_depth=max_depth, subsample=subsample, eta=eta)
    model.fit(X_train, y_train)

    # ---- 4. In-sample diagnostics ----
    train_pred = model.predict(X_train)
    classes = np.unique(y_train)
    # Each cell is the share of all training rows, not a per-class rate.
    cm = confusion_matrix(y_train, train_pred, labels=classes) / len(y_train)
    df_cm = pd.DataFrame(cm, index=classes, columns=classes)

    # Rows of the matrix are actual classes, columns are predicted classes.
    hover_text = [
        [
            'Actual: {}<br>Predicted: {}<br>Value: {:.2f}'.format(actual, predicted, cm[row, col])
            for col, predicted in enumerate(classes)
        ]
        for row, actual in enumerate(classes)
    ]

    fig = ff.create_annotated_heatmap(
        z=df_cm.values,
        x=list(classes),
        y=list(classes),
        colorscale='blues',
        hoverinfo='text',
        text=hover_text,
    )
    fig.update_layout(
        title='Confusion Matrix',
        xaxis_title='Predicted',
        yaxis_title='Actual',
        font=dict(size=14),
    )

    report = classification_report(y_train, train_pred, output_dict=True)
    report_df = pd.DataFrame(report).transpose()

    # ---- 5. Score every row of the input ----
    probabilities = model.predict_proba(df_pca[component_cols])
    # Column 1 is the probability of the second class in model.classes_.
    scores = pd.DataFrame({'propensity_score': probabilities[:, 1]}, index=df.index)

    st.subheader("Classification Report")
    st.dataframe(report_df, width=600)

    st.subheader("Confusion matrix")
    st.plotly_chart(fig)

    return scores
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
st.title("Algorithms")

# Seed the per-session defaults the first time this page runs.
if 'classification_option' not in st.session_state:
    st.session_state.classification_option = "Classification"
if 'algorithm_option' not in st.session_state:
    st.session_state.algorithm_option = "Logistic Regression"

# key= binds the radio to st.session_state.classification_option, so the
# returned value and the session-state entry are always equal. No manual
# copy-back is needed (and Streamlit does not allow writing to a widget's key
# after the widget has been created).
classification_option = st.radio(
    "Algorithm Type",
    ["Classification", "Regression"],
    key="classification_option",
)
|
|
|
# Both algorithm types render exactly the same controls; only the two checkbox
# labels differ, so pick the labels once and share the rest of the layout.
ALGORITHM_LABELS = {
    "Classification": ("Logistic Regression", "Xgboost Classifier"),
    "Regression": ("Linear Regression", "Xgboost Regression"),
}
lr_label, xgb_label = ALGORITHM_LABELS[st.session_state.classification_option]

# ---- Linear model: selection checkbox + "change defaults" checkbox ----
col1, col2 = st.columns(2)

with col1:
    st.write("#####")
    lr_checkbox = st.checkbox(
        label=lr_label,
        key="algorithm_lr_cb",
        value=(st.session_state.algorithm_option == lr_label),
    )

with col2:
    st.write("#####")
    show_lr_options = st.checkbox(
        label="Change default options",
        key="lr_options_cb",
        disabled=not lr_checkbox,
    )

cols = st.columns((2, 1))
with cols[0]:
    lr_hyp_placeholder = st.empty()
    lr_model_placeholder = st.empty()

# Defaults used when the user does not open the options expander.
solver = 'lbfgs'
class_weights = None
max_iter = 1000

# NOTE(review): opening the expander switches the solver to the first list
# entry ('liblinear') and applies custom class weights, i.e. the widgets do
# not start at the defaults above -- confirm this is intended.
if show_lr_options and lr_checkbox:
    with lr_hyp_placeholder:
        with st.expander("LR parameters"):
            solver = st.selectbox('Solver', ['liblinear', 'lbfgs', 'newton-cg', 'sag'])
            max_iter = st.slider('Max Iterations', min_value=100, max_value=10000, value=1000)
            class_weight_option = st.selectbox(
                'Select class weights option:',
                ('Custom', 'Balanced'),
            )

            if class_weight_option == 'Custom':
                weight_1 = st.number_input('Weight for class 1', min_value=0.0, max_value=1.0, value=0.4, step=0.1)
                weight_0 = st.number_input('Weight for class 0', min_value=0.0, max_value=1.0, value=0.6, step=0.1)
                class_weights = {1: weight_1, 0: weight_0}
            elif class_weight_option == 'Balanced':
                # Let scikit-learn weight classes inversely to their frequency.
                # Equal fixed weights (0.5 / 0.5) would not balance anything.
                class_weights = 'balanced'

# ---- XGBoost: selection checkbox + "change defaults" checkbox ----
col1, col2 = st.columns(2)

with col1:
    st.write("#####")
    xgb_checkbox = st.checkbox(
        label=xgb_label,
        key="algorithm_xgb_cb",
        value=(st.session_state.algorithm_option == xgb_label),
    )

with col2:
    st.write("#####")
    show_xgb_options = st.checkbox(
        label="Change default options",
        key="xgb_options_cb",
        disabled=not xgb_checkbox,
    )

# `cols` is reused further down the page to place the sample-size slider.
cols = st.columns((2, 1))
with cols[0]:
    xgb_hyp_placeholder = st.empty()

# None leaves each setting at the library default.
max_depth = None
subsample = None
eta = None

if show_xgb_options and xgb_checkbox:
    with xgb_hyp_placeholder:
        with st.expander("XGB hyper parameters"):
            max_depth = st.slider("max_depth", min_value=1, max_value=10, value=3, step=1)
            subsample = st.slider("subsample", min_value=0.1, max_value=1.0, value=0.8, step=0.1)
            eta = st.slider("learning rate", min_value=0.01, max_value=0.5, value=0.3, step=0.01)

# Remember the choice; the linear model wins when both boxes are ticked.
st.session_state.algorithm_option = lr_label if lr_checkbox else xgb_label
|
|
|
# Count control (flag == 0) and treated (flag == 1) rows once.
flag_values = st.session_state.imputed_df[st.session_state.flag]
n_control_rows = int((flag_values == 0).sum())
n_treated_rows = int((flag_values == 1).sum())

if n_control_rows < 1:
    # The slider cannot be built with max_value below min_value.
    st.error("No control rows (flag == 0) are available to sample from.")
    st.stop()

with cols[0]:
    control_sample_size = st.slider(
        'Control Sample Size',
        min_value=1,
        max_value=n_control_rows,
        # Default to a 1:1 treated/control ratio, kept inside the slider range
        # (there may be more treated rows than controls, or none at all).
        value=max(1, min(n_treated_rows, n_control_rows)),
    )
|
|
|
|
|
|
|
|
|
if st.button("Run Modeling"):
    flag_column = st.session_state.flag

    # Drop the score from any previous run so it is not fed back into the
    # model as a feature when the button is clicked again.
    model_input = st.session_state.binned_df.drop(columns=['propensity_score'], errors='ignore')

    scores = None
    if lr_checkbox:
        scores = point_estimates(
            model_input,
            model_type='linear',
            flag=flag_column,
            identifier=st.session_state.identifier,
            control_sample_size=control_sample_size,
            solver=solver,
            max_iter=max_iter,
            class_weights=class_weights,
        )
    elif xgb_checkbox:
        scores = point_estimates(
            model_input,
            model_type='xgboost',
            flag=flag_column,
            identifier=st.session_state.identifier,
            control_sample_size=control_sample_size,
            max_depth=max_depth,
            subsample=subsample,
            eta=eta,
        )
    else:
        st.warning("Select an algorithm before running the model.")

    if scores is not None:
        # point_estimates returns one score per input row, in row order;
        # assign by position so differing indexes cannot misalign the values.
        st.session_state.binned_df['propensity_score'] = scores['propensity_score'].to_numpy()

    # Split on the configured flag column (the same one the model was trained
    # on) rather than a hard-coded column name.
    st.session_state.treated_df = st.session_state.binned_df[st.session_state.binned_df[flag_column] == 1]
    st.session_state.non_treated_df = st.session_state.binned_df[st.session_state.binned_df[flag_column] == 0]
|
|
|
|