# SCM / pages / 3_Point estimates.py
# Manoj
# firt
# 6a04ca4
###### SUPER SAFE ######
import pandas as pd
import numpy as np
import streamlit as st
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np
import plotly.figure_factory as ff
# Request Streamlit's "wide" page layout for this page.
st.set_page_config(layout="wide")
def point_estimates(df, model_type, flag, identifier, control_sample_size, solver=None, max_iter=None, class_weights=None, max_depth=None, subsample=None, eta=None, variance_threshold=0.995):
    """Fit a propensity model on PCA-reduced features and score every row of ``df``.

    Pipeline:
      1. Standardise every column except ``identifier`` and ``flag`` and
         project onto principal components, keeping the leading components up
         to the first one whose cumulative explained variance exceeds
         ``variance_threshold``.
      2. Train on all rows with ``flag == 1`` plus a random sample (seed 42) of
         ``control_sample_size`` rows with ``flag == 0``.
      3. Render the training-set classification report and confusion matrix
         in Streamlit.
      4. Score every row of ``df`` with the fitted model.

    Args:
        df: Input frame. Assumes all columns other than ``identifier`` and
            ``flag`` are numeric -- TODO confirm against the upstream pages.
        model_type: ``'linear'`` (logistic regression) or ``'xgboost'``.
        flag: Name of the 0/1 treatment column.
        identifier: Name of the row identifier column (excluded from features).
        control_sample_size: Number of ``flag == 0`` rows sampled for training.
            pandas raises ``ValueError`` if this exceeds the rows available.
        solver, max_iter, class_weights: Logistic-regression options; ``None``
            means "use the scikit-learn default".
        max_depth, subsample, eta: XGBoost options, passed through unchanged.
        variance_threshold: Cumulative explained-variance cut-off for PCA.

    Returns:
        A one-column DataFrame ``'propensity_score'`` (probability of class 1),
        indexed like ``df`` so it can be assigned straight back onto it.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values.
    """
    if model_type not in ('linear', 'xgboost'):
        # Previously this fell through and failed later on an unbound `model`.
        raise ValueError(f"Unsupported model_type {model_type!r}; expected 'linear' or 'xgboost'.")

    # NOTE: an earlier draft also rejected identifiers shared between the
    # flag == 0 and flag == 1 groups; that validation is currently disabled.

    # --- Dimensionality reduction -------------------------------------------
    features = df.drop(columns=[identifier, flag])
    X_scaled = StandardScaler().fit_transform(features)
    # PCA cannot extract more components than min(n_rows, n_features).
    pca = PCA(n_components=min(X_scaled.shape))
    pca.fit(X_scaled)
    components = pca.transform(X_scaled)

    cumulative_var = pca.explained_variance_ratio_.cumsum()
    # First component index at which the cumulative variance passes the
    # threshold; keep every component if round-off never lets it get there.
    last_component = next(
        (i for i, ratio in enumerate(cumulative_var) if ratio > variance_threshold),
        len(cumulative_var) - 1,
    )
    component_cols = list(range(last_component + 1))

    # Keep df's own index so the flag column and the returned scores line up
    # with the caller's rows even when df does not use a default RangeIndex.
    df_pca = pd.DataFrame(components[:, :last_component + 1], index=df.index, columns=component_cols)
    df_pca[flag] = df[flag].to_numpy()

    # --- Training set: all treated rows + a sample of controls --------------
    treated_rows = df_pca[df_pca[flag] == 1]
    control_rows = df_pca[df_pca[flag] == 0].sample(n=control_sample_size, random_state=42)
    train = pd.concat([treated_rows, control_rows], ignore_index=True)
    X_train, y_train = train[component_cols], train[flag]

    # --- Model fit -----------------------------------------------------------
    if model_type == 'linear':
        lr_options = {'solver': solver, 'max_iter': max_iter, 'class_weight': class_weights}
        # Drop unset options so scikit-learn applies its own defaults instead
        # of rejecting solver=None / max_iter=None.
        model = LogisticRegression(**{k: v for k, v in lr_options.items() if v is not None})
    else:
        model = xgb.XGBClassifier(max_depth=max_depth, subsample=subsample, eta=eta)
    model.fit(X_train, y_train)

    # --- Training-set diagnostics -------------------------------------------
    train_pred = model.predict(X_train)
    classes = np.unique(y_train)
    # Rows are actual classes, columns are predicted classes; cells are
    # fractions of the training set.
    cm = confusion_matrix(y_train, train_pred, labels=classes) / len(y_train)

    # One hover label per cell, built from the class labels of that cell
    # (the previous version indexed individual samples instead of classes).
    hover_text = [
        ['Actual: {}<br>Predicted: {}<br>Value: {:.2f}'.format(actual, predicted, cm[i, j])
         for j, predicted in enumerate(classes)]
        for i, actual in enumerate(classes)
    ]
    fig = ff.create_annotated_heatmap(z=cm,
                                      x=list(classes),
                                      y=list(classes),
                                      colorscale='blues',
                                      hoverinfo='text',
                                      text=hover_text)
    fig.update_layout(
        title='Confusion Matrix',
        xaxis_title='Predicted',
        yaxis_title='Actual',
        font=dict(size=14)
    )

    report = classification_report(y_train, train_pred, output_dict=True)
    report_df = pd.DataFrame(report).transpose()

    # --- Inference on every row ---------------------------------------------
    # Column 1 of predict_proba is the probability of the positive class.
    scores = model.predict_proba(df_pca[component_cols])[:, 1]

    st.subheader("Classification Report")
    st.dataframe(report_df, width=600)
    st.subheader("Confusion matrix")
    st.plotly_chart(fig)

    return pd.DataFrame({'propensity_score': scores}, index=df.index)
# if 'df' in st.session_state:
# task_type = st.sidebar.selectbox("Task Type", ["classification", "regression"],key="task_type")
# model_type = st.sidebar.selectbox("Model Type", ["linear", "xgboost"])
# flag = st.sidebar.selectbox("Flag Column", [None] + list(st.session_state.df.columns))
# identifier = st.sidebar.selectbox("Identifier Column", [None] + list(st.session_state.df.columns))
# st.sidebar.write("Applicable only for Regression model type")
# dep_var = st.sidebar.selectbox("Dependent Variable (Regression)", [None] + list(st.session_state.df.columns))
# st.session_state.flag=flag
# st.session_state.identifier=identifier
# # Sidebar for user inputs
# if flag is not None:
# with st.expander("Model Configuration", expanded=True):
# unique_flag_values = st.session_state.df[flag].unique()
# for value in unique_flag_values:
# st.write(f"Y == {value}: {len(st.session_state.df[st.session_state.df[flag] == value])}")
# control_sample_size = st.text_input("Control Sample Size")
# try:
# # Try converting to an integer
# control_sample_size = int(control_sample_size)
# # Check if control_sample_size is within the valid range
# flag_0_size = len(st.session_state.df[st.session_state.df[flag] == 0])
# if control_sample_size < 0 or control_sample_size > flag_0_size:
# st.error(f"Control Sample Size must be between 0 and {flag_0_size}.")
# except ValueError:
# st.error("Please enter a valid integer for Control Sample Size.")
# #st.write("Applicable only for Regression model type")
# #if st.session_state.get("task_type","") == "regression":
# #dep_var = st.sidebar.selectbox("Dependent Variable (Regression)", [None] + list(st.session_state.df.columns))
# point_estimate_variable = st.text_input("Variable of interest")
# st.session_state.point_estimate_variable=point_estimate_variable
# if st.button("Run Modeling"):
# result_df = point_estimates(st.session_state.df, task_type, model_type, point_estimate_variable, control_sample_size, flag, identifier, dep_var)
# st.session_state.modeling_df = result_df
# st.session_state.treated_df=result_df[result_df['Y']==1]
# st.session_state.non_treated_df=result_df[result_df['Y']==0]
st.title("Algorithms")

# Seed session defaults before the widgets that read them are created.
if 'classification_option' not in st.session_state:
    st.session_state.classification_option = "Classification"
if 'algorithm_option' not in st.session_state:
    st.session_state.algorithm_option = "Logistic Regression"

# The radio is bound to st.session_state.classification_option through its
# key, so its return value and the session entry are always the same. The
# former "if they differ, copy it back" step could never run and has been
# removed; writing to a widget-bound key after the widget exists is not
# something Streamlit permits anyway.
classification_option = st.radio("Algorithm Type", ["Classification", "Regression"], key="classification_option")
def _render_algorithm_options(linear_label, xgb_label):
    """Draw the algorithm checkboxes and their optional hyper-parameter panels.

    The classification and regression views are identical apart from the two
    checkbox labels, so both are rendered by this one helper.

    Args:
        linear_label: Label of the linear-model checkbox; also the value
            stored in ``st.session_state.algorithm_option`` when it is ticked.
        xgb_label: Label of the XGBoost checkbox; stored in
            ``st.session_state.algorithm_option`` otherwise.

    Returns:
        dict with the checkbox states (``lr_checkbox``, ``xgb_checkbox``), the
        chosen hyper-parameters (``solver``, ``max_iter``, ``class_weights``,
        ``max_depth``, ``subsample``, ``eta``) and ``cols``, the last column
        pair created, which later widgets are placed into.
    """
    # ---- Linear model row ---------------------------------------------------
    col1, col2 = st.columns(2)
    with col1:
        st.write("#####")
        lr_checkbox = st.checkbox(
            label=linear_label,
            key="algorithm_lr_cb",
            value=(st.session_state.algorithm_option == linear_label),
        )
    with col2:
        st.write("#####")
        show_lr_options = st.checkbox(
            label="Change default options",
            key="lr_options_cb",
            disabled=not lr_checkbox,
        )
    cols = st.columns((2, 1))
    with cols[0]:
        lr_hyp_placeholder = st.empty()
        lr_model_placeholder = st.empty()  # reserved slot; nothing is written to it here

    # Defaults used unless the user opens the options panel.
    solver = 'lbfgs'
    class_weights = None
    max_iter = 1000
    if show_lr_options and lr_checkbox:
        with lr_hyp_placeholder, st.expander("LR parameters"):
            solver = st.selectbox('Solver', ['liblinear', 'lbfgs', 'newton-cg', 'sag'])
            max_iter = st.slider('Max Iterations', min_value=100, max_value=10000, value=1000)
            class_weight_option = st.selectbox(
                'Select class weights option:',
                ('Custom', 'Balanced')
            )
            if class_weight_option == 'Custom':
                weight_1 = st.number_input('Weight for class 1', min_value=0.0, max_value=1.0, value=0.4, step=0.1)
                weight_0 = st.number_input('Weight for class 0', min_value=0.0, max_value=1.0, value=0.6, step=0.1)
                class_weights = {1: weight_1, 0: weight_0}
            elif class_weight_option == 'Balanced':
                # NOTE(review): this is an equal 0.5/0.5 weighting, not
                # scikit-learn's frequency-based class_weight='balanced'.
                # Confirm which one is intended.
                class_weights = {1: 0.5, 0: 0.5}

    # ---- XGBoost row --------------------------------------------------------
    col1, col2 = st.columns(2)
    with col1:
        st.write("#####")
        xgb_checkbox = st.checkbox(
            label=xgb_label, key="algorithm_xgb_cb",
            value=(st.session_state.algorithm_option == xgb_label),
        )
    with col2:
        st.write("#####")
        show_xgb_options = st.checkbox(
            label="Change default options",
            key="xgb_options_cb",
            disabled=not xgb_checkbox,
        )
    cols = st.columns((2, 1))
    with cols[0]:
        xgb_hyp_placeholder = st.empty()

    # None leaves each setting at the library default.
    max_depth = None
    subsample = None
    eta = None
    if show_xgb_options and xgb_checkbox:
        with xgb_hyp_placeholder, st.expander("XGB hyper parameters"):
            max_depth = st.slider("max_depth", min_value=1, max_value=10, value=3, step=1)
            subsample = st.slider("subsample", min_value=0.1, max_value=1.0, value=0.8, step=0.1)
            eta = st.slider("learning rate", min_value=0.01, max_value=0.5, value=0.3, step=0.01)

    # Remember the active algorithm so the checkboxes keep their state on rerun.
    st.session_state.algorithm_option = linear_label if lr_checkbox else xgb_label

    return {
        "cols": cols,
        "lr_checkbox": lr_checkbox,
        "xgb_checkbox": xgb_checkbox,
        "solver": solver,
        "max_iter": max_iter,
        "class_weights": class_weights,
        "max_depth": max_depth,
        "subsample": subsample,
        "eta": eta,
    }


# Checkbox labels per algorithm type: (linear model, XGBoost model).
# NOTE(review): both types expose the same logistic-style options (solver,
# class weights); confirm whether "Regression" needs settings of its own.
_ALGORITHM_LABELS = {
    "Classification": ("Logistic Regression", "Xgboost Classifier"),
    "Regression": ("Linear Regression", "Xgboost Regression"),
}

_algorithm_choice = _render_algorithm_options(*_ALGORITHM_LABELS[classification_option])

# Re-expose the selections under the module-level names the rest of the page uses.
cols = _algorithm_choice["cols"]
lr_checkbox = _algorithm_choice["lr_checkbox"]
xgb_checkbox = _algorithm_choice["xgb_checkbox"]
solver = _algorithm_choice["solver"]
max_iter = _algorithm_choice["max_iter"]
class_weights = _algorithm_choice["class_weights"]
max_depth = _algorithm_choice["max_depth"]
subsample = _algorithm_choice["subsample"]
eta = _algorithm_choice["eta"]
# This page reads data prepared elsewhere in the session; stop with a message
# instead of an AttributeError when any of it is missing.
_missing_state = [
    key for key in ('imputed_df', 'binned_df', 'flag', 'identifier')
    if key not in st.session_state
]
if _missing_state:
    st.warning("Missing session data: " + ", ".join(_missing_state) + ". Complete the previous steps first.")
    st.stop()

with cols[0]:
    _flag_values = st.session_state.imputed_df[st.session_state.flag]
    # Plain ints: the counts feed the slider bounds.
    _n_control = int((_flag_values == 0).sum())
    _n_treated = int((_flag_values == 1).sum())
    # Default to one control per treated row, capped at the controls that
    # exist; a larger request cannot be sampled without replacement.
    # NOTE(review): bounds come from imputed_df while the model runs on
    # binned_df -- presumably the same rows; confirm.
    control_sample_size = st.slider(
        'Control Sample Size',
        min_value=1,
        max_value=_n_control,
        value=min(_n_treated, _n_control),
    )

if st.button("Run Modeling"):
    # Drop any score left by a previous run so it is not fed back into the
    # model as a feature.
    _model_input = st.session_state.binned_df.drop(columns=['propensity_score'], errors='ignore')
    if lr_checkbox:
        st.session_state.binned_df['propensity_score'] = point_estimates(
            _model_input,
            model_type='linear',
            flag=st.session_state.flag,
            identifier=st.session_state.identifier,
            control_sample_size=control_sample_size,
            solver=solver,
            max_iter=max_iter,
            class_weights=class_weights,
        )
    elif xgb_checkbox:
        st.session_state.binned_df['propensity_score'] = point_estimates(
            _model_input,
            model_type='xgboost',
            flag=st.session_state.flag,
            identifier=st.session_state.identifier,
            control_sample_size=control_sample_size,
            max_depth=max_depth,
            subsample=subsample,
            eta=eta,
        )
    else:
        # Previously the button did nothing, with no feedback, in this case.
        st.warning("Select an algorithm before running the model.")

    # Split on the same flag column the model was trained with, not a
    # hard-coded 'Y'.
    _binned = st.session_state.binned_df
    _flag_col = st.session_state.flag
    st.session_state.treated_df = _binned[_binned[_flag_col] == 1]
    st.session_state.non_treated_df = _binned[_binned[_flag_col] == 0]