|
import streamlit as st |
|
import pandas as pd |
|
import numpy as np |
|
from sklearn.neighbors import NearestNeighbors |
|
from sklearn.preprocessing import StandardScaler |
|
import xgboost as xgb |
|
import base64 |
|
import streamlit as st |
|
import pandas as pd |
|
import numpy as np |
|
import matplotlib.pyplot as plt |
|
from sklearn.preprocessing import StandardScaler |
|
from sklearn.neighbors import NearestNeighbors |
|
from math import sqrt |
|
from statistics import mean, variance |
|
import seaborn as sns |
|
|
|
import plotly.graph_objects as go |
|
|
|
def _selected_cohend_points(table, selected_attributes):
    """Return ``(values, labels)`` for the selected metrics of *table*.

    Rows are filtered on the "Metrics" column and both outputs are taken in
    reversed row order, so values and labels always come from the same rows.
    """
    rows = table[table["Metrics"].isin(selected_attributes)]
    return rows["Cohend Value"][::-1], list(rows["Metrics"][::-1])


def cohend_plot_function(std_mean_diff_df2, std_mean_diff_df, selected_attributes):
    """Draw a Cohen's d dot plot for the selected metrics in Streamlit.

    Parameters
    ----------
    std_mean_diff_df2 : pandas.DataFrame
        Table with "Metrics" and "Cohend Value" columns; plotted as the
        'general_control_cohend' series (blue markers).
    std_mean_diff_df : pandas.DataFrame
        Table with the same two columns; plotted as the
        'synthetic_control_cohend' series (orange open diamonds).
    selected_attributes : list-like
        Metric names to plot; rows whose "Metrics" value is not listed are
        left out.

    Returns
    -------
    None
        The figure is rendered with ``st.plotly_chart``.
    """
    fig = go.Figure()

    series = (
        (std_mean_diff_df2, dict(color='blue'), 'general_control_cohend'),
        (
            std_mean_diff_df,
            dict(color='orange', symbol='diamond-open'),
            'synthetic_control_cohend',
        ),
    )
    for table, marker, name in series:
        # Values and labels are read from the SAME table, so a marker can
        # never be attached to another table's metric label.
        values, labels = _selected_cohend_points(table, selected_attributes)
        fig.add_trace(go.Scatter(
            x=values,
            y=labels,
            mode='markers',
            marker=marker,
            name=name,
        ))

    # Dashed gray guides at the reference thresholds plus a solid black line
    # at zero.
    guides = [
        (val, dict(color="gray", width=1, dash="dash"))
        for val in [-0.1, 0.1, -0.75, -0.5, -0.25, 0.25, 0.5, 0.75]
    ]
    guides.append((0, dict(color="black", width=1)))

    for position, line in guides:
        # yref="paper" makes each guide span the full plot height however
        # many metrics are shown (a fixed 0..10 data range stops short once
        # there are more rows than that).
        fig.add_shape(
            type="line",
            xref="x",
            yref="paper",
            x0=position,
            y0=0,
            x1=position,
            y1=1,
            line=line,
        )

    fig.update_layout(
        xaxis=dict(
            title='cohend',
            range=[-1, 1]
        ),
        yaxis=dict(
            title='Metrics',
            autorange="reversed"
        ),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        )
    )

    st.plotly_chart(fig, use_container_width=True)
|
|
|
|
|
def plot_comparison(comparison_df):
    """Render a grouped bar chart of treatment vs. control values.

    Parameters
    ----------
    comparison_df : pandas.DataFrame
        The first column is plotted as 'Treatment' and the second as
        'Control'; the index supplies the x positions. The first column is
        expected to be named ``treatment_<variable>``.

    Returns
    -------
    None
        The figure is rendered with ``st.plotly_chart``.
    """
    treatment_column = comparison_df.columns[0]
    control_column = comparison_df.columns[1]

    # Derive the chart title from the treatment column name. Splitting only on
    # the FIRST "treatment_" keeps variables whose own name contains the word
    # "treatment" intact, and a column without the prefix falls back to its
    # full name instead of raising IndexError.
    column_label = str(treatment_column)
    _, marker, variable_name = column_label.partition('treatment_')
    chart_title = variable_name if marker else column_label

    fig = go.Figure()

    fig.add_trace(go.Bar(
        x=comparison_df.index,
        y=comparison_df[treatment_column],
        name='Treatment',
        marker=dict(color='#053057'),
    ))

    fig.add_trace(go.Bar(
        x=comparison_df.index,
        y=comparison_df[control_column],
        name='Control',
        marker=dict(color='#8ac4f8'),
    ))

    fig.update_layout(
        xaxis=dict(
            title='quartiles'
        ),
        yaxis=dict(
            title='values'
        ),
        barmode='group',
        title=chart_title
    )

    st.plotly_chart(fig, use_container_width=True)
|
|
|
|
|
def plot_propensity_distribution(treatment_data, control_data):
    """Overlay histograms of two sets of propensity scores in Streamlit.

    Parameters
    ----------
    treatment_data : array-like
        Values drawn as the dark 'Treatment' histogram.
    control_data : array-like
        Values drawn as the light 'Control' histogram.

    Returns
    -------
    None
        The figure is rendered with ``st.plotly_chart``.
    """
    histogram_specs = (
        (treatment_data, 'Treatment', '#053057'),
        (control_data, 'Control', '#8ac4f8'),
    )

    fig = go.Figure()
    for scores, label, colour in histogram_specs:
        # Partial opacity keeps both distributions visible where they overlap.
        histogram = go.Histogram(
            x=scores,
            name=label,
            marker=dict(color=colour),
            opacity=0.6,
        )
        fig.add_trace(histogram)

    layout_options = dict(
        xaxis=dict(title='propensity_score'),
        yaxis=dict(title='count'),
        barmode='overlay',
        title='Propensity Distribution',
    )
    fig.update_layout(**layout_options)

    st.plotly_chart(fig, use_container_width=True)
|
|
|
def comparison(df, variable):
    """Compare per-quartile means of *variable* between the two groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Y`` (1 = treatment, 0 = control),
        ``quartiles`` and *variable*.
    variable : str
        Name of the column to average within each quartile.

    Returns
    -------
    pandas.DataFrame
        Indexed by quartile (only quartiles present in both groups), with the
        columns ``treatment_<variable>``, ``control_<variable>``,
        ``difference`` (absolute gap) and ``percent_difference`` (absolute gap
        relative to the treatment mean).
    """
    treated_col = f'treatment_{variable}'
    control_col = f'control_{variable}'

    # Mean of the variable per quartile, computed separately for each group.
    quartile_means = {
        label: df[df['Y'] == label].groupby('quartiles')[variable].mean()
        for label in (1, 0)
    }

    # Both series carry the same name, so the merge disambiguates them with
    # the default "_x" / "_y" suffixes, which are then given readable names.
    table = pd.merge(
        quartile_means[1],
        quartile_means[0],
        left_index=True,
        right_index=True,
    )
    table = table.rename(
        columns={f'{variable}_x': treated_col, f'{variable}_y': control_col}
    )

    gap = table[treated_col] - table[control_col]
    table['difference'] = np.abs(gap)
    table['percent_difference'] = np.abs(gap / table[treated_col])
    return table
|
|
|
|
|
|
|
|
|
def cohend(d1, d2):
    """Return Cohen's d (standardised mean difference) between two samples.

    The difference of the sample means is divided by the pooled standard
    deviation, computed from the unbiased (``ddof=1``) sample variances.

    Parameters
    ----------
    d1, d2 : array-like of numbers
        The two samples (lists, NumPy arrays or pandas Series).

    Returns
    -------
    float
        ``(mean(d1) - mean(d2)) / pooled_sd``, or ``0.0`` when the pooled
        standard deviation is exactly zero.
    """
    n1, n2 = len(d1), len(d2)
    var1, var2 = np.var(d1, ddof=1), np.var(d2, ddof=1)
    pooled_sd = sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))

    # Identical constant samples have no spread; report "no effect" instead of
    # dividing by zero.
    if pooled_sd == 0:
        return 0.0

    # np.mean keeps the whole computation vectorised; the pure-Python
    # statistics.mean is far slower on large pandas Series.
    return float((np.mean(d1) - np.mean(d2)) / pooled_sd)
|
|
|
|
|
def std_mean_diff(group_A_df, group_B_df):
    """Compute Cohen's d for every column of *group_A_df* against *group_B_df*.

    Parameters
    ----------
    group_A_df, group_B_df : pandas.DataFrame
        *group_B_df* must contain every column of *group_A_df*.

    Returns
    -------
    numpy.ndarray
        One row per column of *group_A_df*, laid out as
        ``[column name, Cohen's d]``.
    """
    metric_names = group_A_df.columns
    effect_sizes = [
        cohend(group_A_df[name], group_B_df[name]) for name in metric_names
    ]
    # Stack names over values, then flip so each row pairs one name with its
    # value.
    return np.array([metric_names, effect_sizes]).T
|
|
|
|
|
def _cohend_table(group_a, group_b):
    """Return a "Metrics" / "Cohend Value" table for two aligned feature frames."""
    pairs = std_mean_diff(group_a, group_b)
    # Build the frame in one go instead of appending row by row with .loc.
    return pd.DataFrame(
        {
            "Metrics": [pair[0] for pair in pairs],
            "Cohend Value": [round(float(pair[1]), 2) for pair in pairs],
        },
        columns=["Metrics", "Cohend Value"],
    )


def cohend_code_function(binned_df, matching_df, identifier="individual_id_ov"):
    """Compute Cohen's d tables before and after matching and report them.

    Parameters
    ----------
    binned_df : pandas.DataFrame
        Full data set; must contain the column ``Y`` (1 = treatment,
        0 = control) and the *identifier* column. It is not modified.
    matching_df : pandas.DataFrame
        Matched pairs with the columns ``Id`` (treatment id) and
        ``matched_Id`` (control id).
    identifier : str, optional
        Name of the id column in *binned_df*. Defaults to the previously
        hard-coded ``"individual_id_ov"``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(std_mean_diff_df2, std_mean_diff_df)``: Cohen's d of the matched
        treatment rows against ALL control rows, and against the MATCHED
        control rows. The second table also carries a ``flag`` column
        (1 when the value lies outside [-0.1, 0.1]).

    Side effects
    ------------
    Writes the out-of-range count and the matched table to the Streamlit page.
    """
    # Non-inplace drops: dropping in place on a filtered slice of the caller's
    # frame triggers pandas' SettingWithCopy warning.
    treat_df_complete = binned_df[binned_df['Y'] == 1].drop('Y', axis=1)
    control_df_complete = binned_df[binned_df['Y'] == 0].drop('Y', axis=1)

    treatment_ids = pd.DataFrame({identifier: matching_df["Id"]})
    control_ids = pd.DataFrame({identifier: matching_df["matched_Id"]})

    # Feature rows of the matched treatment / control units, one row per
    # matched pair, with the id column removed so only metrics remain.
    matched_treatment = treatment_ids.merge(
        treat_df_complete, how='left', on=identifier
    ).drop(identifier, axis=1)
    matched_control = control_ids.merge(
        control_df_complete, how='left', on=identifier
    ).drop(identifier, axis=1)

    # --- after matching: matched treatment vs. matched control -------------
    std_mean_diff_df = _cohend_table(matched_treatment, matched_control)
    std_mean_diff_df["flag"] = (
        std_mean_diff_df["Cohend Value"].abs() > 0.1
    ).astype(int)

    st.write('Number of variables with standard mean difference between treatment and control is out of desired range (-0.1, 0.1): ', std_mean_diff_df["flag"].sum())
    st.write(std_mean_diff_df)

    # --- before matching: matched treatment vs. every control row ----------
    # The original merged the control frame with its own id column, which is
    # a no-op for unique ids; the frame is used directly here.
    all_controls = control_df_complete.drop(identifier, axis=1)
    std_mean_diff_df2 = _cohend_table(matched_treatment, all_controls)

    return std_mean_diff_df2, std_mean_diff_df
|
|
|
def calculate_iv(df, flag, identifier):
    """Compute the Information Value (IV) of every feature against *flag*.

    Each feature is cut into up to 10 quantile bins. For every bin the share
    of all events (``flag == 1``) and the share of all non-events falling in
    that bin are compared::

        WOE_bin = ln(event_share / non_event_share)
        IV      = sum((event_share - non_event_share) * WOE_bin)

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain *flag* (0/1 values), *identifier* and
        ``propensity_score``; every remaining column is treated as a numeric
        feature.
    flag : str
        Name of the binary target column.
    identifier : str
        Name of the id column (excluded from the features).

    Returns
    -------
    pandas.DataFrame
        One row per feature with the columns ``Feature`` and ``IV``.
    """
    features = df.drop([flag, identifier, 'propensity_score'], axis=1)
    target = df[flag]

    rows = []
    for column in features.columns:
        bins = pd.qcut(features[column], q=10, duplicates='drop')
        counts = target.groupby(bins, observed=True).agg(['count', 'sum'])

        events = counts['sum'].astype(float)
        non_events = (counts['count'] - counts['sum']).astype(float)

        # A bin without events (or without non-events) would give log(0) and
        # an infinite IV. Apply a 0.5 continuity correction to such features
        # so the result stays finite; other features keep the exact formula.
        if (events == 0).any() or (non_events == 0).any():
            events = events + 0.5
            non_events = non_events + 0.5

        # Shares are taken over ALL events / non-events (distribution across
        # bins), not the event rate inside each bin.
        event_share = events / events.sum()
        non_event_share = non_events / non_events.sum()

        woe = np.log(event_share / non_event_share)
        iv = float(((event_share - non_event_share) * woe).sum())
        rows.append({'Feature': column, 'IV': iv})

    # Build the result once instead of concatenating inside the loop.
    return pd.DataFrame(rows, columns=['Feature', 'IV'])
|
|
|
def xgboost_feature_importance(df, flag, identifier):
    """Rank features by the importance an XGBoost classifier assigns to them.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain *flag*, *identifier* and ``propensity_score``; every
        remaining column is used as a model input.
    flag : str
        Name of the target column.
    identifier : str
        Name of the id column (excluded from the model inputs).

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` and ``Importance``, sorted by decreasing
        importance.
    """
    excluded = [flag, identifier, 'propensity_score']
    features = df.drop(excluded, axis=1)
    target = df[[flag]]

    classifier = xgb.XGBClassifier()
    classifier.fit(features, target)

    ranking = pd.DataFrame({
        'Feature': features.columns,
        'Importance': classifier.feature_importances_,
    })
    return ranking.sort_values(by='Importance', ascending=False)
|
|
|
|
|
|
|
|
|
|
|
def get_matching_pairs(identifier,treated_df, non_treated_df, sample_size_A, sample_size_B,matching_columns,flag):
    """Match every sampled treated unit to its nearest control unit.

    Parameters
    ----------
    identifier : str
        Name of the id column; used as the index of both groups.
    treated_df, non_treated_df : pandas.DataFrame
        Treatment and control rows. Both must contain every column named in
        *matching_columns*.
    sample_size_A, sample_size_B : number
        Percentage (0-100) of the treated / control rows to sample at random
        before matching.
    matching_columns : list of str
        Columns to keep; must include *identifier* and *flag*. The remaining
        columns are the features the distance is computed on.
    flag : str
        Name of the treatment-indicator column (dropped before matching).

    Returns
    -------
    pandas.DataFrame
        One row per sampled treated unit with the columns ``Id`` (treated id)
        and ``matched_Id`` (id of the nearest control). A control may be
        matched to several treated units.
    """
    # Use the `identifier` argument rather than reaching into
    # st.session_state, so the function honours its own signature.
    treated = (
        treated_df[matching_columns]
        .sample(frac=sample_size_A / 100)
        .set_index(identifier)
        .drop(flag, axis=1)
    )
    controls = (
        non_treated_df[matching_columns]
        .sample(frac=sample_size_B / 100)
        .set_index(identifier)
        .drop(flag, axis=1)
    )

    # The scaler is fitted on the treated rows only and then applied to both
    # groups, so distances are measured in treated-group standard deviations.
    scaler = StandardScaler()
    scaler.fit(treated.values)
    treated_x = scaler.transform(treated.values)
    non_treated_x = scaler.transform(controls.values)

    print("data transformaion completed")

    nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(non_treated_x)

    print("model fitting completed")

    _, indices = nbrs.kneighbors(treated_x)

    print("matching completed")

    # `indices` has shape (n_treated, 1); column 0 holds the row position of
    # the single nearest control for each treated row.
    nearest_positions = indices[:, 0]

    return pd.DataFrame({
        "Id": treated.index.to_numpy(),
        "matched_Id": controls.index[nearest_positions].to_numpy(),
    })
|
|
|
|
|
# Streamlit page script: rank features, run nearest-neighbour matching and
# show matching diagnostics.
# NOTE(review): the original indentation of this section was lost; the nesting
# below is a reconstruction from the statement order - confirm against the
# intended page layout.
st.title("Matching")

# --- Feature ranking ---------------------------------------------------------
# Two relevance scores per feature are joined on the feature name and averaged
# into a single ranking.
iv_df = calculate_iv(st.session_state.binned_df, st.session_state.flag, st.session_state.identifier)

importance_df = xgboost_feature_importance(st.session_state.binned_df, st.session_state.flag, st.session_state.identifier)

combined_df = pd.merge(iv_df, importance_df, on='Feature', suffixes=('_iv', '_importance'))
combined_df['Avg_IV_Importance'] = (combined_df['IV'] + combined_df['Importance']) / 2
combined_df.sort_values('Avg_IV_Importance', inplace=True, ascending=False)

combined_df.insert(0, 'Select', False)
combined_df.reset_index(drop=True, inplace=True)

st.subheader("Feature importances")
# Only the "Select" checkbox column is editable; every other column is locked.
st.session_state["edited_df_combined"] = st.data_editor(
    combined_df.style.hide(axis="index"),
    column_config={
        "Select": st.column_config.CheckboxColumn(required=True)
    },
    disabled=combined_df.drop("Select", axis=1).columns, use_container_width=True
)

top_features_input = st.number_input("Enter the number of top features", min_value=1, max_value=len(combined_df), value=None)

if top_features_input is not None:
    # A "top N" value takes precedence over the manual checkbox selection.
    selected_df = combined_df.head(top_features_input)
    selected_features = selected_df['Feature'].tolist()
else:
    selected_features = st.session_state.edited_df_combined[st.session_state.edited_df_combined['Select']]['Feature'].tolist()

# The id and flag columns always travel with the selected features because
# get_matching_pairs needs them to index and split the data.
selected_features.append(st.session_state.identifier)
selected_features.append(st.session_state.flag)

st.session_state.selected_features = selected_features

# --- Matching ----------------------------------------------------------------
with st.expander("Matching Inputs", expanded=True):
    st.write("Matching Inputs")
    ui_columns = st.columns((1, 1))
    with ui_columns[0]:
        sample_size_A = st.slider("Sample Size for treatment Group", 1, 100, 100)
    with ui_columns[1]:
        sample_size_B = st.slider("Sample Size for Control Group", 1, 100, 100)
    with ui_columns[0]:
        st.write("#")
        run_matching = st.button(
            label="Run Matching"
        )
st.divider()

if run_matching:
    st.session_state.matching_df = get_matching_pairs(st.session_state.identifier, st.session_state.treated_df, st.session_state.non_treated_df, sample_size_A, sample_size_B, st.session_state.selected_features, st.session_state.flag)

# `.get` avoids an AttributeError when the page is opened before any matching
# has been run and nothing has initialised `matching_df` yet.
matching_df = st.session_state.get("matching_df")

if matching_df is not None:
    st.dataframe(matching_df)

    st.download_button(
        label="Download Matched Data as CSV",
        data=matching_df.to_csv(index=False).encode(),
        file_name='matching_data.csv',
        mime='text/csv',
    )

    # --- Diagnostics ---------------------------------------------------------
    st.subheader("Matching diagnostics")
    control_group = st.session_state.binned_df[st.session_state.binned_df[st.session_state.identifier].isin(matching_df['matched_Id'])]
    treatment_group = st.session_state.binned_df[st.session_state.binned_df.Y == 1]

    combined_group = pd.concat([control_group, treatment_group])
    # duplicates='drop' keeps qcut from raising when tied propensity scores
    # produce identical bin edges.
    combined_group['quartiles'] = pd.qcut(combined_group['propensity_score'], 4, labels=False, duplicates='drop')

    combined_group.drop(st.session_state.identifier, axis=1, inplace=True)
    st.session_state.combined_group = combined_group

    if 'perform_diagnostics' not in st.session_state:
        st.session_state.perform_diagnostics = False

    perform_diagnostics = st.button(label="Run Diagnostics")

    # The sticky session flag keeps the diagnostics visible on the reruns
    # triggered by later widget interactions.
    if perform_diagnostics or st.session_state.perform_diagnostics:
        st.session_state.perform_diagnostics = True

        with st.expander("Matching Diagnostics", expanded=True):
            std_mean_diff_df2, std_mean_diff_df = cohend_code_function(st.session_state.binned_df, matching_df)
            st.subheader("Cohen's d Plot")
            cohend_plot_function(std_mean_diff_df2, std_mean_diff_df, selected_features)

            st.subheader("Pre-matching Propensity Distributions")
            plot_propensity_distribution(st.session_state.binned_df[st.session_state.binned_df.Y == 1]['propensity_score'], st.session_state.binned_df[st.session_state.binned_df.Y == 0]['propensity_score'])

            st.subheader("Post-matching Propensity Distributions")
            # Attach the propensity score of each side of every matched pair:
            # first the treated id, then the matched control id.
            temp = pd.merge(left=matching_df, right=st.session_state.binned_df[[st.session_state.identifier, 'propensity_score']], left_on='Id', right_on=st.session_state.identifier, how='left')
            temp.drop(st.session_state.identifier, axis=1, inplace=True)
            temp.rename({'Id': 'treatment_id', 'matched_Id': 'control_id', 'propensity_score': 'treatment_propensity'}, axis=1, inplace=True)
            temp = pd.merge(left=temp, right=st.session_state.binned_df[[st.session_state.identifier, 'propensity_score']], left_on='control_id', right_on=st.session_state.identifier, how='left')
            temp.drop(st.session_state.identifier, axis=1, inplace=True)
            temp.rename({'propensity_score': 'control_propensity'}, axis=1, inplace=True)

            plot_propensity_distribution(temp['treatment_propensity'], temp['control_propensity'])

        with st.expander("Comparison Plots", expanded=True):
            st.markdown(
                "<p class='plot-header'>Change the selected variable to plot"
                " different charts</p>",
                unsafe_allow_html=True,
            )
            left, right = st.columns(2)
            with left:
                if 'selected_variable_comp' not in st.session_state:
                    st.session_state.selected_variable_comp = []

                selected_variable_comp = st.multiselect(
                    "Variable",
                    st.session_state.combined_group.columns,
                    st.session_state.selected_variable_comp
                )

                st.session_state.selected_variable_comp = selected_variable_comp

            # One grouped bar chart per selected variable.
            for var in st.session_state.selected_variable_comp:
                plot_comparison(comparison(combined_group, var))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|