File size: 12,629 Bytes
91e0a48 d89f303 e8ef7ba d89f303 91e0a48 3430dd0 d89f303 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 |
import streamlit as st
import os
from streamlit_option_menu import option_menu
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from streamlit_ace import st_ace
from streamlit_pandas_profiling import st_profile_report
def set_data_files_session_object(file_name, file_path):
    """Record ``file_path`` under ``file_name`` in ``st.session_state['data_files']``.

    The mapping is created on first use; an existing entry for the same
    ``file_name`` is overwritten.
    """
    if 'data_files' in st.session_state:
        known_files = st.session_state['data_files']
    else:
        known_files = {}
    known_files[file_name] = file_path
    # Re-assign so the session state holds the updated mapping.
    st.session_state['data_files'] = known_files
def set_filtered_data_session_object(df, file_name):
    """Store ``df`` under ``file_name`` in ``st.session_state['filtered_data']``.

    The mapping is created on first use; an existing entry for the same
    ``file_name`` is overwritten.
    """
    if 'filtered_data' in st.session_state:
        filtered_frames = st.session_state['filtered_data']
    else:
        filtered_frames = {}
    filtered_frames[file_name] = df
    # Re-assign so the session state holds the updated mapping.
    st.session_state['filtered_data'] = filtered_frames
def set_dataframe_session_object(file_name, file_path):
    """Read ``file_path`` with ``pd.read_csv`` and cache the resulting frame.

    The frame is stored under ``file_name`` in
    ``st.session_state['data_frames']``; the mapping is created on first use.
    """
    if 'data_frames' in st.session_state:
        loaded_frames = st.session_state['data_frames']
    else:
        loaded_frames = {}
    # The read happens before anything is written back, so a failed parse
    # leaves the session state untouched.
    loaded_frames[file_name] = pd.read_csv(file_path)
    st.session_state['data_frames'] = loaded_frames
def save_file(file_object):
    """Write an uploaded file into ``<cwd>/uploaded_files`` and register it.

    Args:
        file_object: Uploaded file exposing ``.name`` and ``.getbuffer()``.

    Raises:
        ValueError: If the upload's name has no usable file-name component.

    After the bytes are written, the file is registered through
    ``set_data_files_session_object`` and ``set_dataframe_session_object``.
    """
    upload_dir = os.path.join(os.getcwd(), "uploaded_files")
    # Create the directory here as well so this function does not depend on
    # the caller having prepared it.
    os.makedirs(upload_dir, exist_ok=True)

    # The name comes from the client: keep only its final path component so
    # values such as "../../x.csv" cannot escape the upload directory.
    safe_name = os.path.basename(file_object.name.replace("\\", "/"))
    if not safe_name:
        raise ValueError(f"Invalid upload file name: {file_object.name!r}")

    file_path = os.path.join(upload_dir, safe_name)
    with open(file_path, "wb") as f:
        f.write(file_object.getbuffer())

    set_data_files_session_object(safe_name, file_path)
    set_dataframe_session_object(safe_name, file_path)
def create_upload_file_component():
    """Render the file uploader and persist every uploaded file via ``save_file``."""
    accepted_types = ['csv', 'xls', 'xlsx', 'pkl', 'pdf']
    uploaded_files = st.file_uploader(
        "Upload one file at a time.",
        type=accepted_types,
        accept_multiple_files=True,
    )
    if not uploaded_files:
        return

    # Make sure the destination directory exists before saving anything.
    target_dir = os.path.join(os.getcwd(), "uploaded_files")
    os.makedirs(target_dir, mode=0o777, exist_ok=True)
    for upload in uploaded_files:
        save_file(upload)
def create_component_to_add_target_func(selected_files, dfs, i):
    """Render the (placeholder) UI for defining a target variable.

    Args:
        selected_files: Unused here; kept for interface compatibility.
        dfs: Unused here; kept for interface compatibility.
        i: Slot index, used to make the widget key unique.

    Currently this shows a text input for the target name and writes the
    result of a hard-coded demo function to the page.
    """
    # The entered name is not consumed yet; the widget is still rendered.
    st.text_input("Name of the target variable", key="target_var" + str(i))

    code = "def f1(x): return str(x * 3)"
    # exec() inside a function cannot create a local name that later code can
    # call directly, so run the snippet in an explicit namespace and look the
    # function up from there.
    namespace = {}
    exec(code, namespace)
    st.write(namespace["f1"](3))
def _filter_frame_from_widgets(df, i):
    """Render the filter widgets for slot ``i`` and return the filtered rows.

    Returns ``None`` when no usable filter value has been supplied, so the
    caller can fall back to the unfiltered frame.
    """
    action = "data_filter"
    col_to_filter = st.selectbox("Select the field to Filter on ", df.columns.values,
                                 key=action + "_col_filter_" + str(i))
    filter_operation = st.selectbox("Operation ",
                                    ['Greater Than', 'Equals', 'Less Than', "In", "In Between"],
                                    key=action + "_col_filter_op_" + str(i))
    if col_to_filter is None or filter_operation is None:
        return None

    value_key = action + "_col_filter_val_" + str(i)
    column = df[col_to_filter]

    if filter_operation == 'In':
        chosen = st.multiselect("Select Values to Filter on ", column.unique(), key=value_key)
        return df[column.isin(chosen)] if chosen else None

    if filter_operation == 'In Between':
        # A range needs two endpoints: offer every distinct value and start
        # with the full span selected.
        choices = sorted(column.dropna().unique())
        if len(choices) < 2:
            return None
        low, high = st.select_slider("Select range", options=choices,
                                     value=(choices[0], choices[-1]), key=value_key)
        return df[column.between(low, high)]

    raw_value = st.text_input("Enter a numeric value", key=value_key)
    if not raw_value:
        return None
    compare = {
        'Greater Than': column.gt,
        'Equals': column.eq,
        'Less Than': column.lt,
    }[filter_operation]
    try:
        # text_input always yields a string; numeric columns must be compared
        # against a number, otherwise the comparison fails or never matches.
        if pd.api.types.is_numeric_dtype(column):
            threshold = float(raw_value)
        else:
            threshold = raw_value
        return df[compare(threshold)]
    except (TypeError, ValueError):
        st.error(f"Cannot compare column '{col_to_filter}' with value '{raw_value}'.")
        return None


def _render_univariate_analysis(df_for_analysis, i):
    """Let the user pick columns and draw one chart per selected column."""
    cols_for_analysis = st.multiselect("Select Columns for Univariate Analysis",
                                       options=df_for_analysis.columns.values,
                                       # Keyed per slot so several files can be analysed side by side.
                                       key="univariate_cols_" + str(i))
    for col in cols_for_analysis:
        series = df_for_analysis[col]
        if pd.api.types.is_numeric_dtype(series) and series.nunique() > 10:
            # Many distinct numeric values: plot value against row index.
            fig = px.scatter(x=df_for_analysis.index, y=series, labels=dict(x="Index", y=col))
        else:
            # Few distinct values or non-numeric data: show the 20 most
            # frequent values as counts (bars) and share (line, second axis).
            value_dist_df = series.value_counts(normalize=True).head(20).reset_index()
            value_dist_df.columns = [col, '% Distribution']
            value_dist_df_counts = series.value_counts().head(20).reset_index()
            value_dist_df_counts.columns = [col, 'Count']
            value_dist_df = value_dist_df.merge(value_dist_df_counts, on=col)
            trace1 = go.Bar(x=value_dist_df[col], y=value_dist_df['Count'], name='Count',
                            marker=dict(color='rgb(34,163,192)'))
            trace2 = go.Scatter(x=value_dist_df[col], y=value_dist_df['% Distribution'],
                                name='% Distribution', yaxis='y2')
            fig = make_subplots(specs=[[{"secondary_y": True}]])
            fig.add_trace(trace1)
            fig.add_trace(trace2, secondary_y=True)
            fig['layout'].update(height=600, width=800, title=f"{col} data distribution",
                                 xaxis=dict(tickangle=-90))
        st.plotly_chart(fig, use_container_width=True)


def _render_target_column_demo():
    """Run the hard-coded demo snippet and write its result to the page."""
    code = "def f1(x): return str(x * 3)"
    # exec() inside a function cannot bind a callable local name, so execute
    # in an explicit namespace and fetch the function from it.
    namespace = {}
    exec(code, namespace)
    st.write(namespace["f1"](3))


def create_component_for_analysis_for_single_df(selected_files, dfs, i):
    """Render the analysis UI for the ``i``-th selected file.

    Args:
        selected_files: Sequence of selected file names.
        dfs: Mapping of file name to its loaded DataFrame.
        i: Index into ``selected_files``; also used to keep widget keys unique.

    When "Analyse on Filtered Data" is ticked and a usable filter is set, the
    filtered frame is stored via ``set_filtered_data_session_object`` and used
    for the analyses; otherwise the unfiltered frame is used.
    """
    file_name = selected_files[i]
    st.subheader(file_name)
    df = dfs[file_name]

    filter_data = st.checkbox("Analyse on Filtered Data", key="filter_data_check" + str(i))
    df_for_analysis = df
    if filter_data:
        filtered_df = _filter_frame_from_widgets(df, i)
        if filtered_df is not None:
            set_filtered_data_session_object(filtered_df, file_name)
            df_for_analysis = filtered_df
        # With no filter value yet, keep analysing the full frame instead of
        # reading a session entry that may not exist.

    analysis_actions = st.multiselect("What analysis do you wish to do?",
                                      ['Summary of Data', 'Sample Data', 'Get Profile', 'Univariate Analysis',
                                       'Bivariate Analysis', 'Add a Target Column'],
                                      key='analysis_action_' + str(i))
    # NOTE: 'Bivariate Analysis' is offered above but has no handler yet.
    for action in analysis_actions:
        if action == 'Sample Data':
            # sample() raises if asked for more rows than exist.
            st.write(df_for_analysis.sample(min(10, len(df_for_analysis))))
        elif action == 'Get Profile':
            pr = df_for_analysis.profile_report()
            st_profile_report(pr)
        elif action == 'Summary of Data':
            st.write(df_for_analysis.describe())
        elif action == 'Univariate Analysis':
            _render_univariate_analysis(df_for_analysis, i)
        elif action == "Add a Target Column":
            _render_target_column_demo()
def create_component_for_data_analysis():
    """Render the "Analyze Data" page.

    Lets the user choose among the registered files, makes sure each chosen
    file has a cached DataFrame, and draws one analysis column per file.
    """
    if 'data_files' not in st.session_state:
        st.write("Upload a file to start analysis")
        return

    selected_files = st.multiselect("Select the File(S) to analyze", st.session_state['data_files'].keys())
    if not selected_files:
        return

    layout_columns = st.columns(len(selected_files))
    frame_cache = st.session_state['data_frames']
    dfs = {}
    for name in selected_files:
        if name not in frame_cache:
            # Not cached yet: load it from the stored path.
            frame_cache[name] = pd.read_csv(st.session_state['data_files'][name])
        dfs[name] = frame_cache[name]

    for position, container in enumerate(layout_columns):
        with container:
            create_component_for_analysis_for_single_df(selected_files, dfs, position)
# Page entry point: title, sidebar navigation, then the selected page.
st.title("Model Results Analyzer")
with st.sidebar:
    selected_menu = option_menu(None, ["Home", "Upload Data", "Add Features", "Analyze Data"],
                                icons=['house', 'cloud-upload', "list-task", 'gear'],
                                menu_icon="cast", default_index=0, orientation="vertical",
                                styles={
                                    "container": {"padding": "0!important", "background-color": "#fafafa"},
                                    "icon": {"color": "orange", "font-size": "15px"},
                                    "nav-link": {"font-size": "15px", "text-align": "left", "margin": "0px",
                                                 "--hover-color": "#eee"},
                                    "nav-link-selected": {"background-color": "green"},
                                })

if selected_menu == "Home":
    st.markdown('**This is to analyse models performance.**')
elif selected_menu == "Upload Data":
    create_upload_file_component()
    if 'data_files' in st.session_state:
        # List the names of every file registered so far.
        st.write(pd.DataFrame(data={"File Name": list(st.session_state['data_files'])}))
elif selected_menu == "Analyze Data":
    create_component_for_data_analysis()
elif selected_menu == "Add Features":
    if 'data_files' in st.session_state:
        selected_file = st.selectbox("Select the File(S) to analyze", st.session_state['data_files'].keys())
        if selected_file:
            df = st.session_state['data_frames'][selected_file]
            st.header("Enter the function definiton to create a new feature")
            feature_name = st.text_input("Enter the New Feature Name")
            st.warning("please retain the function signature as 'add_feature(row)'")
            content = st_ace(language="python", value="def add_feature(row):")
            # Only act once the editor holds something other than the stub.
            if content and content != 'def add_feature(row):':
                if not feature_name:
                    # Without a name the result would land in a column called ''.
                    st.error("Enter a name for the new feature before defining it.")
                else:
                    try:
                        # SECURITY: this executes arbitrary code typed into the
                        # editor. Only expose this page to trusted users.
                        exec(content)
                        df[feature_name] = df.apply(lambda x: add_feature(x), axis=1)
                    except Exception as err:
                        # User code can fail in any way (syntax error, missing
                        # add_feature, runtime error); report it instead of
                        # crashing the whole page.
                        st.error(f"Could not create feature '{feature_name}': {err}")
                    else:
                        st.session_state['data_frames'][selected_file] = df
                st.write(df.columns.values)
|