File size: 16,035 Bytes
a9415a6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
###### SUPER SAFE ######

import pandas as pd
import numpy as np
import streamlit as st
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np
import plotly.figure_factory as ff


# Render this page using the full browser width.
st.set_page_config(layout="wide")

def point_estimates(df, model_type, flag, identifier, control_sample_size, solver=None, max_iter=None, class_weights=None, max_depth=None, subsample=None, eta=None):
    """Fit a propensity model on PCA-reduced features and score every row of ``df``.

    Every column except ``identifier`` and ``flag`` (and a ``propensity_score``
    column left over from an earlier run, if present) is standardised and
    projected onto the leading principal components whose cumulative explained
    variance first exceeds 99.5%.  A classifier is trained on all rows with
    ``flag == 1`` plus a random sample of rows with ``flag == 0``.  Its
    classification report and confusion matrix are rendered with Streamlit;
    both are computed on the training rows themselves (there is no holdout
    set).  Finally the probability in column 1 of ``predict_proba`` is
    computed for every row of ``df``.

    Args:
        df: Input frame containing the feature columns, ``flag`` and ``identifier``.
        model_type: ``'linear'`` (LogisticRegression) or ``'xgboost'`` (XGBClassifier).
        flag: Name of the 0/1 treatment column.
        identifier: Name of the row-identifier column; excluded from the features.
        control_sample_size: Number of ``flag == 0`` rows sampled for training.
            Values above the number of available control rows are clamped.
        solver, max_iter, class_weights: LogisticRegression settings; ``None``
            leaves the scikit-learn default in place.
        max_depth, subsample, eta: XGBClassifier settings, passed through as-is.

    Returns:
        A single-column DataFrame ``propensity_score`` that shares ``df``'s index
        and row order.

    Raises:
        ValueError: If ``model_type`` is neither ``'linear'`` nor ``'xgboost'``.
    """
    if model_type not in ('linear', 'xgboost'):
        raise ValueError(f"model_type must be 'linear' or 'xgboost', got {model_type!r}")

    # A score written back by a previous run must not leak in as a feature.
    Xs = df.drop(columns=[identifier, flag])
    Xs = Xs.drop(columns=['propensity_score'], errors='ignore')

    # Standardise, then keep the smallest prefix of principal components whose
    # cumulative explained variance exceeds 99.5%.
    X_scaled = StandardScaler().fit_transform(Xs)
    # PCA cannot extract more components than min(n_samples, n_features).
    n_comp = min(Xs.shape)
    pca = PCA(n_components=n_comp)
    pca.fit(X_scaled)
    # Keep df's index so the labels and the returned scores line up with df.
    PCA_DF = pd.DataFrame(pca.transform(X_scaled), index=df.index)
    pca_var = pca.explained_variance_ratio_[0:n_comp].cumsum()
    above_threshold = np.flatnonzero(pca_var > 0.995)
    # Fall back to all components if rounding keeps the sum below the threshold.
    idx = int(above_threshold[0]) if above_threshold.size else len(pca_var) - 1
    # .copy() so the column assignments below act on an independent frame.
    df_pca = PCA_DF.loc[:, 0:idx].copy()
    df_pca[flag] = df[flag].to_numpy()

    # Training set: every treated row plus a reproducible sample of controls.
    df_train = df_pca[df_pca[flag] == 1]
    df_control = df_pca[df_pca[flag] == 0]
    n_control = len(df_control)
    if control_sample_size > n_control:
        st.warning(
            f"Control sample size {control_sample_size} exceeds the {n_control} "
            "available control rows; using all of them."
        )
        control_sample_size = n_control
    df_control_sample = df_control.sample(n=control_sample_size, random_state=42)
    final_df_sample = pd.concat([df_train, df_control_sample], ignore_index=True)

    req_cols = df_pca.columns[~df_pca.columns.isin([flag])]
    X_train = final_df_sample[req_cols]
    # 1-D target: avoids scikit-learn's column-vector conversion warning.
    y_train = final_df_sample[flag]

    if model_type == 'linear':
        # Forward only the settings that were supplied so that the None
        # defaults of this function fall back to scikit-learn's own defaults.
        lr_kwargs = {
            name: value
            for name, value in (
                ('solver', solver),
                ('max_iter', max_iter),
                ('class_weight', class_weights),
            )
            if value is not None
        }
        model = LogisticRegression(**lr_kwargs)
    else:
        model = xgb.XGBClassifier(max_depth=max_depth, subsample=subsample, eta=eta)
    model.fit(X_train, y_train)

    # Metrics on the training rows, expressed as a share of those rows.
    train_pred = model.predict(X_train)
    classes = np.unique(y_train)
    cm = confusion_matrix(y_train, train_pred, labels=classes) / len(y_train)

    # Hover text describes the cell itself: row = actual class, column = predicted class.
    hover_text = [
        [
            'Actual: {}<br>Predicted: {}<br>Value: {:.2f}'.format(actual, predicted, cm[i, j])
            for j, predicted in enumerate(classes)
        ]
        for i, actual in enumerate(classes)
    ]

    fig = ff.create_annotated_heatmap(z=cm,
                                    x=list(classes),
                                    y=list(classes),
                                    colorscale='blues',
                                    hoverinfo='text',
                                    text=hover_text)
    fig.update_layout(
        title='Confusion Matrix',
        xaxis_title='Predicted',
        yaxis_title='Actual',
        font=dict(size=14)
    )

    report = classification_report(y_train, train_pred, output_dict=True)
    report_df = pd.DataFrame(report).transpose()

    # Score every row of the input, not only the sampled training rows.
    y_pred_proba = model.predict_proba(df_pca[req_cols])
    # Column 1 is the second entry of model.classes_, i.e. class 1 for a 0/1 flag.
    df_pca.insert(0, 'propensity_score', y_pred_proba[:, 1])

    st.subheader("Classification Report")
    st.dataframe(report_df, width=600)

    st.subheader("Confusion matrix")
    st.plotly_chart(fig)
    return df_pca[['propensity_score']]



# if 'df' in st.session_state:
#     task_type = st.sidebar.selectbox("Task Type", ["classification", "regression"],key="task_type")
#     model_type = st.sidebar.selectbox("Model Type", ["linear", "xgboost"])
#     flag = st.sidebar.selectbox("Flag Column", [None] + list(st.session_state.df.columns))
#     identifier = st.sidebar.selectbox("Identifier Column", [None] + list(st.session_state.df.columns))
#     st.sidebar.write("Applicable only for Regression model type")
#     dep_var = st.sidebar.selectbox("Dependent Variable (Regression)", [None] + list(st.session_state.df.columns))
#     st.session_state.flag=flag
#     st.session_state.identifier=identifier
#     # Sidebar for user inputs
#     if flag is not None:
#         with st.expander("Model Configuration", expanded=True):
#                 unique_flag_values = st.session_state.df[flag].unique()
#                 for value in unique_flag_values:
#                     st.write(f"Y == {value}: {len(st.session_state.df[st.session_state.df[flag] == value])}")
#                 control_sample_size = st.text_input("Control Sample Size")

#                 try:
#                     # Try converting to an integer
#                     control_sample_size = int(control_sample_size)
                    
#                     # Check if control_sample_size is within the valid range
#                     flag_0_size = len(st.session_state.df[st.session_state.df[flag] == 0])
#                     if control_sample_size < 0 or control_sample_size > flag_0_size:
#                         st.error(f"Control Sample Size must be between 0 and {flag_0_size}.")
                    
#                 except ValueError:
#                     st.error("Please enter a valid integer for Control Sample Size.")
                    
                
#                 #st.write("Applicable only for Regression model type")
#                 #if st.session_state.get("task_type","") == "regression":
#                     #dep_var = st.sidebar.selectbox("Dependent Variable (Regression)", [None] + list(st.session_state.df.columns))
#                 point_estimate_variable = st.text_input("Variable of interest")
#                 st.session_state.point_estimate_variable=point_estimate_variable

#                 if st.button("Run Modeling"):
#                     result_df = point_estimates(st.session_state.df, task_type, model_type, point_estimate_variable, control_sample_size, flag, identifier, dep_var)

#                     st.session_state.modeling_df = result_df
#                     st.session_state.treated_df=result_df[result_df['Y']==1]
#                     st.session_state.non_treated_df=result_df[result_df['Y']==0]




st.title("Algorithms")

# Seed the defaults once per session.  The radio below is bound to the
# "classification_option" key, so Streamlit keeps that entry up to date itself.
if 'classification_option' not in st.session_state:
    st.session_state.classification_option = "Classification"
if 'algorithm_option' not in st.session_state:
    st.session_state.algorithm_option = "Logistic Regression"

# Checkbox labels per algorithm type: (linear-model label, XGBoost label).
_ALGORITHM_LABELS = {
    "Classification": ("Logistic Regression", "Xgboost Classifier"),
    "Regression": ("Linear Regression", "Xgboost Regression"),
}


def _render_algorithm_toggle(label, select_key, options_key):
    """Draw an algorithm checkbox next to its "Change default options" checkbox.

    Returns ``(selected, show_options, placeholder, left_col)`` where
    ``placeholder`` is an empty slot for the hyper-parameter expander and
    ``left_col`` is the wide left column that holds it.
    """
    name_col, options_col = st.columns(2)
    with name_col:
        st.write("#####")
        selected = st.checkbox(
            label=label,
            key=select_key,
            value=(st.session_state.algorithm_option == label),
        )
    with options_col:
        st.write("#####")
        show_options = st.checkbox(
            label="Change default options",
            key=options_key,
            disabled=not selected,
        )
    left_col, _ = st.columns((2, 1))
    with left_col:
        placeholder = st.empty()
    return selected, show_options, placeholder, left_col


def _render_linear_options(label):
    """Render the linear-model controls; return ``(selected, solver, max_iter, class_weights)``."""
    selected, show_options, placeholder, _ = _render_algorithm_toggle(
        label, "algorithm_lr_cb", "lr_options_cb"
    )
    solver, max_iter, class_weights = 'lbfgs', 1000, None
    # A disabled options checkbox can still hold True, hence the check on both.
    if show_options and selected:
        with placeholder:
            with st.expander("LR parameters"):
                solver = st.selectbox('Solver', ['liblinear', 'lbfgs', 'newton-cg', 'sag'])
                max_iter = st.slider('Max Iterations', min_value=100, max_value=10000, value=1000)
                class_weight_option = st.selectbox(
                    'Select class weights option:',
                    ('Custom', 'Balanced')
                )
                if class_weight_option == 'Custom':
                    weight_1 = st.number_input('Weight for class 1', min_value=0.0, max_value=1.0, value=0.4, step=0.1)
                    weight_0 = st.number_input('Weight for class 0', min_value=0.0, max_value=1.0, value=0.6, step=0.1)
                    class_weights = {1: weight_1, 0: weight_0}
                else:
                    # scikit-learn's 'balanced' mode weights classes inversely
                    # to their frequency; equal 0.5/0.5 weights would not.
                    class_weights = 'balanced'
    return selected, solver, max_iter, class_weights


def _render_xgb_options(label):
    """Render the XGBoost controls; return ``(selected, max_depth, subsample, eta, left_col)``."""
    selected, show_options, placeholder, left_col = _render_algorithm_toggle(
        label, "algorithm_xgb_cb", "xgb_options_cb"
    )
    max_depth = subsample = eta = None
    if show_options and selected:
        with placeholder:
            with st.expander("XGB hyper parameters"):
                max_depth = st.slider("max_depth", min_value=1, max_value=10, value=3, step=1)
                subsample = st.slider("subsample", min_value=0.1, max_value=1.0, value=0.8, step=0.1)
                eta = st.slider("learning rate", min_value=0.01, max_value=0.5, value=0.3, step=0.01)
    return selected, max_depth, subsample, eta, left_col


classification_option = st.radio("Algorithm Type", list(_ALGORITHM_LABELS), key="classification_option")

# NOTE(review): the Regression choice only changes the checkbox labels; "Run
# Modeling" below still calls point_estimates with model_type 'linear' or
# 'xgboost' either way. Confirm whether a separate regression path is intended.
linear_label, xgb_label = _ALGORITHM_LABELS[classification_option]
lr_checkbox, solver, max_iter, class_weights = _render_linear_options(linear_label)
xgb_checkbox, max_depth, subsample, eta, slider_col = _render_xgb_options(xgb_label)
st.session_state.algorithm_option = linear_label if lr_checkbox else xgb_label

# The sample-size slider needs the imputed data and the flag column name.
if 'imputed_df' not in st.session_state or st.session_state.get('flag') is None:
    st.warning("Required data is missing from the session (imputed_df / flag). Complete the earlier steps first.")
    st.stop()

flag_col = st.session_state.flag
flag_values = st.session_state.imputed_df[flag_col]
n_control = int((flag_values == 0).sum())
n_treated = int((flag_values == 1).sum())

if n_control < 1:
    st.error("There are no control rows (flag == 0) to sample from.")
    st.stop()

with slider_col:
    if n_control > 1:
        control_sample_size = st.slider(
            'Control Sample Size',
            min_value=1,
            max_value=n_control,
            # Default to the treated-group size, kept inside the slider range.
            value=min(max(n_treated, 1), n_control),
        )
    else:
        # A single control row leaves nothing to choose.
        control_sample_size = 1

if st.button("Run Modeling"):
    if 'binned_df' not in st.session_state or 'identifier' not in st.session_state:
        st.error("Required data is missing from the session (binned_df / identifier). Complete the earlier steps first.")
        st.stop()

    binned_df = st.session_state.binned_df
    # Do not feed a score from a previous run back into the model as a feature.
    model_input = binned_df.drop(columns=['propensity_score'], errors='ignore')
    common_args = dict(
        flag=flag_col,
        identifier=st.session_state.identifier,
        control_sample_size=control_sample_size,
    )

    if lr_checkbox:
        scores = point_estimates(model_input, model_type='linear', solver=solver, max_iter=max_iter, class_weights=class_weights, **common_args)
    elif xgb_checkbox:
        scores = point_estimates(model_input, model_type='xgboost', max_depth=max_depth, subsample=subsample, eta=eta, **common_args)
    else:
        scores = None
        st.warning("Select an algorithm before running the model.")

    if scores is not None:
        # Rows come back in input order, so assign by position rather than by index.
        binned_df['propensity_score'] = scores['propensity_score'].to_numpy()
        # Split on the selected flag column rather than a hard-coded 'Y'.
        st.session_state.treated_df = binned_df[binned_df[flag_col] == 1]
        st.session_state.non_treated_df = binned_df[binned_df[flag_col] == 0]