File size: 12,629 Bytes
91e0a48
d89f303
 
 
 
 
 
e8ef7ba
d89f303
91e0a48
3430dd0
 
d89f303
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
import streamlit as st
import os
from streamlit_option_menu import option_menu
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from streamlit_ace import st_ace
from streamlit_pandas_profiling import st_profile_report



def set_data_files_session_object(file_name, file_path):
    """Register a file path under ``st.session_state['data_files']``.

    The session entry is a dict keyed by file name. It is created the first
    time a file is registered; an existing entry with the same name is
    overwritten.

    Args:
        file_name: Key under which the path is stored.
        file_path: Path value stored for ``file_name``.
    """
    # Reuse the dict already held by the session, or start a new one.
    registry = st.session_state['data_files'] if 'data_files' in st.session_state else {}
    registry[file_name] = file_path
    st.session_state['data_files'] = registry

def set_filtered_data_session_object(df, file_name):
    """Store a filtered frame under ``st.session_state['filtered_data']``.

    The session entry is a dict keyed by file name. It is created on first
    use; a previously stored frame for the same name is replaced.

    Args:
        df: The filtered data to store.
        file_name: Key under which ``df`` is stored.
    """
    if 'filtered_data' in st.session_state:
        filtered_by_file = st.session_state['filtered_data']
    else:
        filtered_by_file = {}

    filtered_by_file[file_name] = df
    st.session_state['filtered_data'] = filtered_by_file

def set_dataframe_session_object(file_name, file_path):
    """Load a CSV file and cache it under ``st.session_state['data_frames']``.

    The session entry is a dict keyed by file name. It is created on first
    use; a previously cached frame for the same name is replaced.

    Args:
        file_name: Key under which the loaded frame is stored.
        file_path: Path passed to ``pd.read_csv``.
    """
    has_cache = 'data_frames' in st.session_state
    frames_by_file = st.session_state['data_frames'] if has_cache else {}

    # The file is always parsed as CSV, whatever its extension.
    frames_by_file[file_name] = pd.read_csv(file_path)
    st.session_state['data_frames'] = frames_by_file
        
def save_file(file_object):
    """Write an uploaded file to ``uploaded_files/`` and register it.

    The file is written below ``<cwd>/uploaded_files`` and then registered in
    the session through ``set_data_files_session_object`` and
    ``set_dataframe_session_object``, both keyed by ``file_object.name``.

    Args:
        file_object: Uploaded file exposing ``name`` and ``getbuffer()``.
    """
    upload_dir = os.path.join(os.getcwd(), "uploaded_files")
    # Ensure the target directory exists so this function does not depend on
    # the caller having created it beforehand.
    os.makedirs(upload_dir, exist_ok=True)

    # Use only the final path component: the name comes from the client and
    # must not be able to point outside of upload_dir.
    safe_name = os.path.basename(file_object.name)
    file_path = os.path.join(upload_dir, safe_name)
    with open(file_path, "wb") as f:
        f.write(file_object.getbuffer())

    set_data_files_session_object(file_object.name, file_path)
    set_dataframe_session_object(file_object.name, file_path)
    



def create_upload_file_component():
    """Render the file uploader and persist every uploaded file.

    Each uploaded file is handed to ``save_file``, which writes it below
    ``<cwd>/uploaded_files`` and loads it with ``pd.read_csv``. Only CSV
    uploads are therefore offered: the previously listed spreadsheet, pickle
    and PDF types could be selected but failed as soon as they were parsed.
    """
    uploaded_files = st.file_uploader("Upload one file at a time.", type=['csv'],
                                      accept_multiple_files=True)

    if uploaded_files:
        # Make sure the destination directory exists before any file is written.
        os.makedirs(os.path.join(os.getcwd(), "uploaded_files"), mode=0o777, exist_ok=True)
        for uploaded_file in uploaded_files:
            save_file(uploaded_file)

def create_component_to_add_target_func(selected_files, dfs, i):
    """Render the placeholder UI for defining a target variable.

    Shows a text input for the target variable name, then executes a
    hard-coded sample function definition and writes its result for the
    input ``3``.

    Args:
        selected_files: Unused by the current placeholder implementation.
        dfs: Unused by the current placeholder implementation.
        i: Index used to make the widget key unique.
    """
    st.text_input("Name of the target variable", key="target_var" + str(i))

    code = "def f1(x): return str(x * 3)"
    # exec() inside a function cannot create a local name that the function
    # body can then call: a bare ``f1(3)`` is compiled as a global lookup and
    # raises NameError. Execute into an explicit namespace and fetch the
    # function from it instead.
    namespace = {}
    exec(code, namespace)
    st.write(namespace["f1"](3))

#         st.write(len(content.splitlines()))
#         exec(content)
#         code= "def f1(x): return str(x * 3)"
   
#         exec(code)
#         st.text(content)
#         st.write(f1(3))


def _coerce_filter_value(series, raw_text):
    """Convert user-typed text into a value comparable with ``series``.

    Raises:
        ValueError: If ``series`` is numeric and ``raw_text`` is not a number.
    """
    if pd.api.types.is_numeric_dtype(series):
        return float(raw_text)
    return raw_text


def _build_filtered_frame(df, i):
    """Render the filter widgets for slot ``i`` and return the filtered frame.

    Returns ``None`` when the user has not (yet) supplied a usable filter
    value, so the caller can fall back to the unfiltered data.
    """
    action = "data_filter"
    col_to_filter = st.selectbox("Select the field to Filter on ", df.columns.values,
                                 key=action + "_col_filter_" + str(i))
    filter_operation = st.selectbox("Operation ",
                                    ['Greater Than', 'Equals', 'Less Than', "In", "In Between"],
                                    key=action + "_col_filter_op_" + str(i))
    value_key = action + "_col_filter_val_" + str(i)
    column = df[col_to_filter]

    if filter_operation == 'In':
        chosen_values = st.multiselect("Select Values to Filter on ", column.unique(), key=value_key)
        return df[column.isin(chosen_values)] if chosen_values else None

    if filter_operation == 'In Between':
        # Offer every distinct non-null value as a slider stop and start with
        # the full range selected; a two-element ``value`` makes the widget
        # return a (low, high) pair.
        options = sorted(column.dropna().unique())
        if len(options) < 2:
            st.info("The selected column needs at least two distinct values to pick a range.")
            return None
        low, high = st.select_slider("Select range", options=options,
                                     value=(options[0], options[-1]), key=value_key)
        return df[column.between(low, high)]

    # Remaining operations ('Equals', 'Greater Than', 'Less Than') take a
    # single value typed by the user.
    raw_text = st.text_input("Enter a numeric value", key=value_key)
    if not raw_text:
        return None
    try:
        # Text input always yields a string; numeric columns must be compared
        # against a number, not against that string.
        threshold = _coerce_filter_value(column, raw_text)
    except ValueError:
        st.warning(f"'{raw_text}' is not a valid number for column '{col_to_filter}'.")
        return None

    if filter_operation == 'Equals':
        return df[column == threshold]
    if filter_operation == 'Greater Than':
        return df[column > threshold]
    return df[column < threshold]


def _render_univariate_analysis(df_for_analysis, i):
    """Render one chart per column chosen by the user for slot ``i``."""
    cols_for_analysis = st.multiselect("Select Columns for Univariate Analysis",
                                       options=df_for_analysis.columns.values,
                                       # Explicit key: this widget is rendered once per
                                       # selected file and would otherwise collide.
                                       key="univariate_cols_" + str(i))
    for col in cols_for_analysis:
        series = df_for_analysis[col]
        distinct_count = series.nunique()

        if pd.api.types.is_numeric_dtype(series) and distinct_count > 10:
            # Numeric column with many distinct values: plot value against index.
            fig = px.scatter(x=df_for_analysis.index, y=series, labels=dict(x="Index", y=col))
            st.plotly_chart(fig, use_container_width=True)

        elif str(series.dtype) in ['object', 'category'] or distinct_count <= 10:
            # Categorical-like column: counts as bars, share as a line on a
            # secondary axis, limited to the 20 most frequent values.
            value_dist_df = series.value_counts(normalize=True)[:20].reset_index()
            value_dist_df.columns = [col, '% Distribution']

            value_dist_df_counts = series.value_counts()[:20].reset_index()
            value_dist_df_counts.columns = [col, 'Count']
            value_dist_df = value_dist_df.merge(value_dist_df_counts, on=col)

            trace1 = go.Bar(x=value_dist_df[col], y=value_dist_df['Count'], name='Count',
                            marker=dict(color='rgb(34,163,192)'))
            trace2 = go.Scatter(x=value_dist_df[col], y=value_dist_df['% Distribution'],
                                name='% Distribution', yaxis='y2')

            fig = make_subplots(specs=[[{"secondary_y": True}]])
            fig.add_trace(trace1)
            fig.add_trace(trace2, secondary_y=True)
            fig['layout'].update(height=600, width=800, title=f"{col} data distribution",
                                 xaxis=dict(tickangle=-90))

            st.plotly_chart(fig, use_container_width=True)


def create_component_for_analysis_for_single_df(selected_files, dfs, i):
    """Render the analysis UI for the ``i``-th selected file.

    Shows an optional filter section followed by a multiselect of analysis
    actions, and renders the output of every chosen action. When filtering is
    enabled and a usable filter value has been supplied, the filtered frame
    is stored via ``set_filtered_data_session_object`` and used for the
    analysis; otherwise the unfiltered frame is used.

    Args:
        selected_files: Sequence of selected file names.
        dfs: Mapping of file name to DataFrame.
        i: Index into ``selected_files``; also used to make widget keys unique.
    """
    st.subheader(selected_files[i])
    df = dfs[selected_files[i]]

    filter_data = st.checkbox("Analyse on Filtered Data", key="filter_data_check" + str(i))

    filtered_df = None
    if filter_data:
        filtered_df = _build_filtered_frame(df, i)
        if filtered_df is not None:
            set_filtered_data_session_object(filtered_df, selected_files[i])

    analysis_actions = st.multiselect("What analysis do you wish to do?",
                                      ['Summary of Data', 'Sample Data', 'Get Profile', 'Univariate Analysis',
                                       'Bivariate Analysis', 'Add a Target Column'],
                                      key='analysis_action_' + str(i))
    if not analysis_actions:
        return

    # Use the frame produced in this run rather than reading the session
    # entry back: that entry is missing (or stale) whenever the filter box is
    # ticked but no filter value has been supplied.
    df_for_analysis = filtered_df if filtered_df is not None else df

    for action in analysis_actions:
        if action == 'Sample Data':
            # sample() raises when asked for more rows than the frame holds.
            st.write(df_for_analysis.sample(min(10, len(df_for_analysis))))
        elif action == 'Get Profile':
            pr = df_for_analysis.profile_report()
            st_profile_report(pr)
        elif action == 'Summary of Data':
            st.write(df_for_analysis.describe())
        elif action == 'Univariate Analysis':
            _render_univariate_analysis(df_for_analysis, i)
        elif action == "Add a Target Column":
            code = "def f1(x): return str(x * 3)"
            # exec() inside a function cannot bind a name the function body
            # can call directly, so run it in an explicit namespace.
            namespace = {}
            exec(code, namespace)
            st.write(namespace["f1"](3))
                        
            
def create_component_for_data_analysis():
    """Render the "Analyze Data" page.

    Prompts for one or more registered files, lays out one Streamlit column
    per selected file and renders the single-file analysis component in each.
    Frames missing from ``st.session_state['data_frames']`` are loaded with
    ``pd.read_csv`` from the path registered in ``st.session_state['data_files']``.
    A hint is written instead when no file has been registered yet.
    """
    if 'data_files' not in st.session_state:
        st.write("Upload a file to start analysis")
        return

    selected_files = st.multiselect("Select the File(S) to analyze", st.session_state['data_files'].keys())
    if not selected_files:
        return

    cols = st.columns(len(selected_files))

    dfs = {}
    for selected_file in selected_files:
        frame_cache = st.session_state['data_frames']
        if selected_file not in frame_cache:
            # Not cached yet: load it from the registered path and cache it.
            frame_cache[selected_file] = pd.read_csv(st.session_state['data_files'][selected_file])
        dfs[selected_file] = frame_cache[selected_file]

    for i, col in enumerate(cols):
        with col:
            create_component_for_analysis_for_single_df(selected_files, dfs, i)


# def build_interface_for_model_analysis():
# Page entry point: title, sidebar navigation, then the view for the chosen menu item.
st.title("Model Results Analyzer")
with st.sidebar:
    selected_menu = option_menu(None, ["Home", "Upload Data", "Add Features", "Analyze Data"],
                                icons=['house', 'cloud-upload', "list-task", 'gear'],
                                menu_icon="cast", default_index=0, orientation="vertical",
                                styles={
                                    "container": {"padding": "0!important", "background-color": "#fafafa"},
                                    "icon": {"color": "orange", "font-size": "15px"},
                                    "nav-link": {"font-size": "15px", "text-align": "left", "margin": "0px",
                                                 "--hover-color": "#eee"},
                                    "nav-link-selected": {"background-color": "green"},
                                })

if selected_menu == "Home":
    st.markdown('**This is to analyse models performance.**')

elif selected_menu == "Upload Data":

    create_upload_file_component()

    # List the names of every file registered so far.
    if 'data_files' in st.session_state:
        st.write(pd.DataFrame(
            data={"File Name": pd.DataFrame.from_dict(st.session_state['data_files'], orient='index').index}))

elif selected_menu == "Analyze Data":
    create_component_for_data_analysis()

elif selected_menu == "Add Features":
    if 'data_files' in st.session_state:
        selected_file = st.selectbox("Select the File(S) to analyze", st.session_state['data_files'].keys())

        if selected_file:
            df = st.session_state['data_frames'][selected_file]
            st.header("Enter the function definiton to create a new feature")
            feature_name = st.text_input("Enter the New Feature Name")
            st.warning("please retain the function signature as 'add_feature(row)'")

            default_stub = "def add_feature(row):"
            content = st_ace(language="python", value=default_stub)

            # Only act once the editor content differs from the untouched stub.
            if content != default_stub:
                if not feature_name.strip():
                    # Without a name the result would be stored in a column called ''.
                    st.warning("Enter a name for the new feature before applying the function.")
                else:
                    # SECURITY: exec() runs arbitrary code typed into the editor with the
                    # privileges of the app process. Only expose this page to trusted users.
                    # The code runs in a copy of the module globals so it can still use the
                    # names this script imports without leaking definitions back into it.
                    namespace = dict(globals())
                    try:
                        exec(content, namespace)
                        add_feature = namespace.get("add_feature")
                        if not callable(add_feature):
                            raise NameError("the code must define a function named 'add_feature(row)'")
                        df[feature_name] = df.apply(add_feature, axis=1)
                    except Exception as exc:
                        # UI boundary: report errors in the user's code instead of crashing the page.
                        st.error(f"Could not create feature '{feature_name}': {exc}")
                    else:
                        st.session_state['data_frames'][selected_file] = df
                        st.write(df.columns.values)