Spaces:

BlendMMM
/

SCM

Sleeping

SCM

File size: 16,035 Bytes

a9415a6

###### SUPER SAFE ######

import pandas as pd
import numpy as np
import streamlit as st
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np
import plotly.figure_factory as ff


# Use the full browser width for this page.
st.set_page_config(layout="wide")

def point_estimates(df, model_type, flag, identifier, control_sample_size, solver=None, max_iter=None, class_weights=None, max_depth=None, subsample=None, eta=None):
    """Fit a propensity model on PCA-reduced features and score every row.

    Steps performed:

    1. Every column except ``identifier`` and ``flag`` is standardised and
       projected with PCA; the leading components whose cumulative explained
       variance first exceeds 0.995 are kept.
    2. A training set is built from all rows with ``flag == 1`` plus a random
       sample (``random_state=42``) of ``control_sample_size`` rows with
       ``flag == 0``.
    3. A classifier is fitted on that training set, and its classification
       report and normalised confusion matrix (computed on the same training
       rows) are rendered with Streamlit.
    4. The fitted model scores every row of ``df``.

    Args:
        df: Input data containing ``flag``, ``identifier`` and feature columns.
        model_type: ``'linear'`` (LogisticRegression) or ``'xgboost'``
            (XGBClassifier).
        flag: Name of the treatment-indicator column (rows are split on the
            values 1 and 0).
        identifier: Name of the identifier column; excluded from the features.
        control_sample_size: Number of ``flag == 0`` rows sampled for training.
        solver, max_iter, class_weights: LogisticRegression settings. ``solver``
            and ``max_iter`` fall back to the estimator's defaults when None.
        max_depth, subsample, eta: XGBClassifier settings.

    Returns:
        A single-column DataFrame ``'propensity_score'`` holding the second
        column of ``predict_proba`` for every row of ``df``, indexed like
        ``df`` so it aligns when assigned back onto the caller's frame.

    Raises:
        ValueError: If ``model_type`` is not ``'linear'`` or ``'xgboost'``.
    """
    if model_type not in ('linear', 'xgboost'):
        # Previously this fell through and failed later on an unbound `model`.
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected 'linear' or 'xgboost'."
        )

    # --- Dimensionality reduction -------------------------------------------
    features = df.drop(columns=[identifier, flag])
    scaled = StandardScaler().fit_transform(features)
    pca = PCA(n_components=features.shape[1])
    pca.fit(scaled)
    components = pca.transform(scaled)

    variance_threshold = 0.995
    cumulative_variance = pca.explained_variance_ratio_.cumsum()
    above = np.flatnonzero(cumulative_variance > variance_threshold)
    # Fall back to all components if rounding keeps the total below threshold.
    last_component = int(above[0]) if above.size else len(cumulative_variance) - 1

    # .copy() so the column assignment below writes to an independent frame.
    df_pca = pd.DataFrame(components).iloc[:, :last_component + 1].copy()
    # Assign positionally: df_pca has a fresh RangeIndex, so label alignment
    # against a non-default df index would fill the flag with NaN.
    df_pca[flag] = df[flag].to_numpy()

    # --- Training sample: all treated rows + sampled controls ----------------
    treated = df_pca[df_pca[flag] == 1]
    control = df_pca[df_pca[flag] == 0]
    control_sample = control.sample(n=control_sample_size, random_state=42)
    train_df = pd.concat([treated, control_sample], ignore_index=True)

    feature_cols = df_pca.columns[~df_pca.columns.isin([flag])]
    X_train = train_df[feature_cols]
    y_train = train_df[flag]  # 1-D target, as the estimators expect

    # --- Model fit -----------------------------------------------------------
    if model_type == 'linear':
        lr_kwargs = {'class_weight': class_weights}
        # Only forward values that were actually supplied; None is not a
        # valid solver / max_iter for LogisticRegression.
        if solver is not None:
            lr_kwargs['solver'] = solver
        if max_iter is not None:
            lr_kwargs['max_iter'] = max_iter
        model = LogisticRegression(**lr_kwargs)
    else:
        model = xgb.XGBClassifier(max_depth=max_depth, subsample=subsample, eta=eta)
    model.fit(X_train, y_train)

    # --- Diagnostics on the training sample ----------------------------------
    train_pred = model.predict(X_train)
    classes = np.unique(y_train)
    # Each cell is the share of all training rows falling in that cell.
    cm = confusion_matrix(y_train, train_pred, labels=classes) / len(y_train)

    # Hover text describes the cell itself: row = actual, column = predicted.
    hover_text = [
        [
            'Actual: {}<br>Predicted: {}<br>Value: {:.2f}'.format(actual, predicted, cm[i, j])
            for j, predicted in enumerate(classes)
        ]
        for i, actual in enumerate(classes)
    ]

    fig = ff.create_annotated_heatmap(
        z=cm,
        x=list(classes),
        y=list(classes),
        colorscale='blues',
        hoverinfo='text',
        text=hover_text,
    )
    fig.update_layout(
        title='Confusion Matrix',
        xaxis_title='Predicted',
        yaxis_title='Actual',
        font=dict(size=14),
    )

    report = classification_report(y_train, train_pred, output_dict=True)
    report_df = pd.DataFrame(report).transpose()

    # --- Score every row of the input ----------------------------------------
    probabilities = model.predict_proba(df_pca[feature_cols])
    scores = pd.DataFrame({'propensity_score': probabilities[:, 1]}, index=df.index)

    # --- Render results ------------------------------------------------------
    st.subheader("Classification Report")
    st.dataframe(report_df, width=600)

    st.subheader("Confusion matrix")
    st.plotly_chart(fig)
    return scores



# if 'df' in st.session_state:
#     task_type = st.sidebar.selectbox("Task Type", ["classification", "regression"],key="task_type")
#     model_type = st.sidebar.selectbox("Model Type", ["linear", "xgboost"])
#     flag = st.sidebar.selectbox("Flag Column", [None] + list(st.session_state.df.columns))
#     identifier = st.sidebar.selectbox("Identifier Column", [None] + list(st.session_state.df.columns))
#     st.sidebar.write("Applicable only for Regression model type")
#     dep_var = st.sidebar.selectbox("Dependent Variable (Regression)", [None] + list(st.session_state.df.columns))
#     st.session_state.flag=flag
#     st.session_state.identifier=identifier
#     # Sidebar for user inputs
#     if flag is not None:
#         with st.expander("Model Configuration", expanded=True):
#                 unique_flag_values = st.session_state.df[flag].unique()
#                 for value in unique_flag_values:
#                     st.write(f"Y == {value}: {len(st.session_state.df[st.session_state.df[flag] == value])}")
#                 control_sample_size = st.text_input("Control Sample Size")

#                 try:
#                     # Try converting to an integer
#                     control_sample_size = int(control_sample_size)
                    
#                     # Check if control_sample_size is within the valid range
#                     flag_0_size = len(st.session_state.df[st.session_state.df[flag] == 0])
#                     if control_sample_size < 0 or control_sample_size > flag_0_size:
#                         st.error(f"Control Sample Size must be between 0 and {flag_0_size}.")
                    
#                 except ValueError:
#                     st.error("Please enter a valid integer for Control Sample Size.")
                    
                
#                 #st.write("Applicable only for Regression model type")
#                 #if st.session_state.get("task_type","") == "regression":
#                     #dep_var = st.sidebar.selectbox("Dependent Variable (Regression)", [None] + list(st.session_state.df.columns))
#                 point_estimate_variable = st.text_input("Variable of interest")
#                 st.session_state.point_estimate_variable=point_estimate_variable

#                 if st.button("Run Modeling"):
#                     result_df = point_estimates(st.session_state.df, task_type, model_type, point_estimate_variable, control_sample_size, flag, identifier, dep_var)

#                     st.session_state.modeling_df = result_df
#                     st.session_state.treated_df=result_df[result_df['Y']==1]
#                     st.session_state.non_treated_df=result_df[result_df['Y']==0]




st.title("Algorithms")

# Seed session defaults once so the widgets below have an initial state.
if 'classification_option' not in st.session_state:
    st.session_state.classification_option = "Classification"
if 'algorithm_option' not in st.session_state:
    st.session_state.algorithm_option = "Logistic Regression"

# The radio is keyed on "classification_option", so its return value and
# st.session_state.classification_option are the same value; no manual
# re-sync is needed afterwards.
classification_option = st.radio("Algorithm Type", ["Classification", "Regression"], key="classification_option")

# Both algorithm types expose the same controls; only the labels differ.
if classification_option == "Classification":
    lr_label, xgb_label = "Logistic Regression", "Xgboost Classifier"
else:
    lr_label, xgb_label = "Linear Regression", "Xgboost Regression"

# ---- Linear model selection -------------------------------------------------
col1, col2 = st.columns(2)

with col1:
    st.write("#####")
    lr_checkbox = st.checkbox(
        label=lr_label,
        key="algorithm_lr_cb",
        value=(st.session_state.algorithm_option == lr_label),
    )

with col2:
    st.write("#####")
    show_lr_options = st.checkbox(
        label="Change default options",
        key="lr_options_cb",
        disabled=not lr_checkbox,
    )

cols = st.columns((2, 1))
with cols[0]:
    lr_hyp_placeholder = st.empty()
    lr_model_placeholder = st.empty()

# Defaults used when the options expander is not shown.
solver = 'lbfgs'
class_weights = None
max_iter = 1000
if show_lr_options and lr_checkbox:
    with lr_hyp_placeholder:
        with st.expander("LR parameters"):
            solver = st.selectbox('Solver', ['liblinear', 'lbfgs', 'newton-cg', 'sag'])
            max_iter = st.slider('Max Iterations', min_value=100, max_value=10000, value=1000)
            class_weight_option = st.selectbox(
                'Select class weights option:',
                ('Custom', 'Balanced')
            )

            if class_weight_option == 'Custom':
                weight_1 = st.number_input('Weight for class 1', min_value=0.0, max_value=1.0, value=0.4, step=0.1)
                weight_0 = st.number_input('Weight for class 0', min_value=0.0, max_value=1.0, value=0.6, step=0.1)
                class_weights = {1: weight_1, 0: weight_0}
            elif class_weight_option == 'Balanced':
                # scikit-learn's 'balanced' mode weights classes inversely to
                # their frequency; a fixed {1: 0.5, 0: 0.5} would weight both
                # classes equally and not rebalance anything.
                class_weights = 'balanced'

# ---- XGBoost selection ------------------------------------------------------
col1, col2 = st.columns(2)

with col1:
    st.write("#####")
    xgb_checkbox = st.checkbox(
        label=xgb_label, key="algorithm_xgb_cb",
        value=(st.session_state.algorithm_option == xgb_label),
    )

with col2:
    st.write("#####")
    show_xgb_options = st.checkbox(
        label="Change default options",
        key="xgb_options_cb",
        disabled=not xgb_checkbox,
    )

# NOTE: `cols` is deliberately rebound here; the code following this section
# places further widgets in cols[0] of this second column pair.
cols = st.columns((2, 1))
with cols[0]:
    xgb_hyp_placeholder = st.empty()

# None lets XGBoost fall back to its own defaults.
max_depth = None
subsample = None
eta = None

if show_xgb_options and xgb_checkbox:
    with xgb_hyp_placeholder:
        with st.expander("XGB hyper parameters"):
            max_depth = st.slider("max_depth", min_value=1, max_value=10, value=3, step=1)
            subsample = st.slider("subsample", min_value=0.1, max_value=1.0, value=0.8, step=0.1)
            eta = st.slider("learning rate", min_value=0.01, max_value=0.5, value=0.3, step=0.01)

# Remember the selection so the checkboxes default to it on the next run.
st.session_state.algorithm_option = lr_label if lr_checkbox else xgb_label

# This page reads data prepared elsewhere in the session; stop with a readable
# message instead of an AttributeError when it is not there yet.
_missing_state = [key for key in ('imputed_df', 'flag') if key not in st.session_state]
if _missing_state:
    st.error("Missing session data: " + ", ".join(_missing_state) + ". Please complete the earlier steps first.")
    st.stop()

flag_col = st.session_state.flag
_flag_values = st.session_state.imputed_df[flag_col]
n_control = int((_flag_values == 0).sum())
n_treated = int((_flag_values == 1).sum())

# NOTE(review): the bounds come from imputed_df while the model below samples
# from binned_df; this assumes both frames hold the same rows. Confirm.
with cols[0]:
    control_sample_size = st.slider(
        'Control Sample Size',
        min_value=1,
        max_value=n_control,
        # Default to a 1:1 treated/control ratio, but never ask for more
        # control rows than exist.
        value=min(n_treated, n_control),
    )

if st.button("Run Modeling"):
    # Drop any score from a previous run so it is not fed back in as a feature.
    model_input = st.session_state.binned_df.drop(columns=['propensity_score'], errors='ignore')
    shared_args = dict(
        flag=flag_col,
        identifier=st.session_state.identifier,
        control_sample_size=control_sample_size,
    )

    # If both algorithms are ticked, the linear model takes precedence.
    if lr_checkbox:
        scores = point_estimates(model_input, model_type='linear', solver=solver, max_iter=max_iter, class_weights=class_weights, **shared_args)
        st.session_state.binned_df['propensity_score'] = scores['propensity_score']
    elif xgb_checkbox:
        scores = point_estimates(model_input, model_type='xgboost', max_depth=max_depth, subsample=subsample, eta=eta, **shared_args)
        st.session_state.binned_df['propensity_score'] = scores['propensity_score']
    else:
        st.warning("No algorithm selected; select one above and run again.")

    # Split on the configured flag column rather than a hard-coded name.
    binned_df = st.session_state.binned_df
    st.session_state.treated_df = binned_df[binned_df[flag_col] == 1]
    st.session_state.non_treated_df = binned_df[binned_df[flag_col] == 0]