File size: 14,724 Bytes
91e0a48 d89f303 e8ef7ba d89f303 6d3f583 91e0a48 3430dd0 d89f303 17ef366 fbe48a0 53b8179 fbe48a0 d89f303 17ef366 d89f303 e8c4461 d89f303 76275be d89f303 76275be a0d26a9 becf373 a0d26a9 becf373 a0d26a9 becf373 a0d26a9 e842846 a0d26a9 d89f303 76275be d89f303 76275be d89f303 76275be d89f303 e8c4461 76275be e8c4461 76275be e8c4461 76275be e8c4461 9975064 7aaadbd 76275be 7aaadbd 76275be 7aaadbd e8c4461 d89f303 76275be d89f303 76275be d89f303 4f86c96 76275be d89f303 76275be d89f303 76275be d89f303 76275be d89f303 76275be d89f303 76275be d89f303 76275be 4f86c96 76275be |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 |
import streamlit as st
import os
from streamlit_option_menu import option_menu
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from streamlit_ace import st_ace
from streamlit_pandas_profiling import st_profile_report
import pandas_profiling
def set_data_files_session_object(file_name, file_path):
    """Record ``file_path`` for ``file_name`` in the ``'data_files'`` session entry.

    The session entry is a dict keyed by file name. It is created when
    missing, and written back to ``st.session_state`` after the update.
    """
    state = st.session_state
    known_files = state['data_files'] if 'data_files' in state else {}
    known_files[file_name] = file_path
    state['data_files'] = known_files
def set_filtered_data_session_object(df, file_name):
    """Store the filtered frame ``df`` under ``file_name`` in the session.

    The frames live in the ``'filtered_data'`` session entry, a dict keyed
    by file name that is created on first use.
    """
    if 'filtered_data' in st.session_state:
        per_file_frames = st.session_state['filtered_data']
    else:
        per_file_frames = {}
    per_file_frames[file_name] = df
    st.session_state['filtered_data'] = per_file_frames
def set_dataframe_session_object(file_name, file_path):
    """Read ``file_path`` with ``pd.read_csv`` and keep the frame in the session.

    The frame is stored under ``file_name`` inside the ``'data_frames'``
    session entry (a dict created on first use).
    """
    loaded_frames = (
        st.session_state['data_frames'] if 'data_frames' in st.session_state else {}
    )
    loaded_frames[file_name] = pd.read_csv(file_path)
    st.session_state['data_frames'] = loaded_frames
def save_file(file_object):
    """Write an uploaded file to ``<cwd>/uploaded_files`` and register it.

    Args:
        file_object: Uploaded file object exposing ``name`` and
            ``getbuffer()``.

    After writing, the file path and its parsed data frame are registered
    in the session under the original ``file_object.name``.
    """
    upload_dir = os.path.join(os.getcwd(), "uploaded_files")
    # Create the folder here as well so the function does not depend on the
    # caller having prepared it.
    os.makedirs(upload_dir, exist_ok=True)
    # The name comes from the client: keep only its last path component so a
    # name containing directory parts cannot write outside ``upload_dir``.
    safe_name = os.path.basename(file_object.name)
    file_path = os.path.join(upload_dir, safe_name)
    with open(file_path, "wb") as f:
        f.write(file_object.getbuffer())
    set_data_files_session_object(file_object.name, file_path)
    set_dataframe_session_object(file_object.name, file_path)
def create_upload_file_component():
    """Render the file uploader and save every file it returns.

    The ``uploaded_files`` folder under the current working directory is
    created (mode ``0o777``) before the files are saved.
    """
    accepted_types = ['csv', 'xls', 'xlsx', 'pkl', 'pdf']
    uploads = st.file_uploader(
        "Upload one file at a time.",
        type=accepted_types,
        accept_multiple_files=True,
    )
    if not uploads:
        return
    target_dir = os.path.join(os.getcwd(), "uploaded_files")
    os.makedirs(target_dir, mode=0o777, exist_ok=True)
    for upload in uploads:
        save_file(upload)
def create_component_to_add_target_func(selected_files, dfs, i):
    """Render the target-variable name input and show a sample function result.

    Args:
        selected_files: Not used by this function.
        dfs: Not used by this function.
        i: Index used to build a unique widget key.
    """
    st.text_input("Name of the target variable", key="target_var" + str(i))
    code = "def f1(x): return str(x * 3)"
    # ``exec`` inside a function cannot create a local name that later
    # statements can call, so run the code in an explicit namespace and look
    # the function up there.
    namespace = {}
    exec(code, namespace)
    st.write(namespace["f1"](3))
def set_filtered_data(df, selected_files, i):
    """Render filter widgets for ``df`` and store the filtered frame in the session.

    Args:
        df: Data frame to filter.
        selected_files: Names of the selected files; ``selected_files[i]`` is
            the key the filtered frame is stored under.
        i: Index of the file, also used to build unique widget keys.

    Nothing is stored until the user has supplied a usable filter value.
    """
    action = "data_filter"
    value_key = action + "_col_filter_val_" + str(i)
    col_to_filter = st.selectbox("Select the field to Filter on ", df.columns.values,
                                 key=action + "_col_filter_" + str(i))
    filter_operation = st.selectbox("Operation ",
                                    ['Greater Than', 'Equals', 'Less Than', "In", "In Between"],
                                    key=action + "_col_filter_op_" + str(i))
    if not filter_operation:
        return

    column = df[col_to_filter]
    filtered_df = None

    if filter_operation == 'In':
        chosen_values = st.multiselect("Select Values to Filter on ", column.unique(),
                                       key=value_key)
        if chosen_values:
            filtered_df = df[column.isin(chosen_values)]
    elif filter_operation == 'In Between':
        # Offer the sorted distinct values and pass a two-element ``value`` so
        # the slider returns a (low, high) pair instead of a single value.
        options = column.dropna().sort_values().unique().tolist()
        if options:
            low, high = st.select_slider("Select range", options=options,
                                         value=(options[0], options[-1]),
                                         key=value_key)
            filtered_df = df[column.between(low, high)]
    else:
        raw_value = st.text_input("Enter a numeric value", key=value_key)
        if raw_value:
            threshold = raw_value
            if pd.api.types.is_numeric_dtype(column):
                # The text box returns a string; comparing it with a numeric
                # column would fail or never match, so convert it first.
                try:
                    threshold = float(raw_value)
                except ValueError:
                    st.error(f"'{raw_value}' is not a valid number")
                    return
            comparisons = {
                'Greater Than': column.gt,
                'Equals': column.eq,
                'Less Than': column.lt,
            }
            filtered_df = df[comparisons[filter_operation](threshold)]

    if filtered_df is not None:
        set_filtered_data_session_object(filtered_df, selected_files[i])
        st.write('data filtered', st.session_state['filtered_data'][selected_files[i]].shape)
def _render_profile_report(df_for_analysis, i):
    """Render the profiling-report controls and, when requested, the report."""
    # This key must differ from the "Analyse on Filtered Data" checkbox key.
    full_data_check = st.checkbox("Report on all columns",
                                  key="profile_full_data_check_" + str(i))
    if full_data_check:
        st.warning("This might take a lot of time to generate the report depending on the size of the data.Select a subset of columns")
        if st.button("Run on full data", key="profile_full_run_" + str(i)):
            st_profile_report(df_for_analysis.profile_report())
    else:
        col_subset = st.multiselect("Select subset of columns", df_for_analysis.columns.values,
                                    key='filter_subset_' + str(i))
        if col_subset:
            st_profile_report(df_for_analysis[col_subset].profile_report())


def _plot_univariate_column(df_for_analysis, col):
    """Plot one column: a scatter for varied numerics, else a count/share chart."""
    series = df_for_analysis[col]
    dtype_name = str(series.dtype)
    distinct_count = series.nunique()
    if dtype_name in ['int64', 'float64'] and distinct_count > 10:
        fig = px.scatter(x=df_for_analysis.index, y=series, labels=dict(x="Index", y=col))
        st.plotly_chart(fig, use_container_width=True)
    elif dtype_name in ['object', 'category'] or distinct_count <= 10:
        # Only the 20 most frequent values are charted.
        value_dist_df = series.value_counts(normalize=True)[:20].reset_index()
        value_dist_df.columns = [col, '% Distribution']
        value_dist_df_counts = series.value_counts()[:20].reset_index()
        value_dist_df_counts.columns = [col, 'Count']
        value_dist_df = value_dist_df.merge(value_dist_df_counts, on=col)
        trace1 = go.Bar(x=value_dist_df[col], y=value_dist_df['Count'], name='Count',
                        marker=dict(color='rgb(34,163,192)'))
        trace2 = go.Scatter(x=value_dist_df[col], y=value_dist_df['% Distribution'],
                            name='% Distribution', yaxis='y2')
        fig = make_subplots(specs=[[{"secondary_y": True}]])
        fig.add_trace(trace1)
        fig.add_trace(trace2, secondary_y=True)
        fig['layout'].update(height=600, width=800, title=f"{col} data distribution",
                             xaxis=dict(tickangle=-90))
        st.plotly_chart(fig, use_container_width=True)


def create_component_for_analysis_for_single_df(selected_files, dfs, i):
    """Render the analysis controls and results for one selected file.

    Args:
        selected_files: Names of the selected files.
        dfs: Mapping of file name to data frame.
        i: Index of the file to render; also used to build widget keys so
            that several files can be shown at the same time.
    """
    file_name = selected_files[i]
    st.subheader(file_name)
    df = dfs[file_name]
    filter_data = st.checkbox("Analyse on Filtered Data", key="filter_data_check" + str(i))
    if filter_data:
        set_filtered_data(df, selected_files, i)
    analysis_actions = st.multiselect("What analysis do you wish to do?",
                                      ['Summary of Data', 'Sample Data', 'Get Profile',
                                       'Univariate Analysis', 'Bivariate Analysis'],
                                      key='analysis_action_' + str(i))
    if not analysis_actions:
        return

    # Fall back to the full frame until a filter has actually been stored,
    # instead of failing on a missing session entry.
    df_for_analysis = df
    if filter_data and 'filtered_data' in st.session_state:
        df_for_analysis = st.session_state['filtered_data'].get(file_name, df)

    for action in analysis_actions:
        if action == 'Sample Data':
            clear_chart_type_session_var()
            # ``sample`` raises when asked for more rows than exist.
            st.write(df_for_analysis.sample(min(10, len(df_for_analysis))))
        elif action == 'Get Profile':
            clear_chart_type_session_var()
            _render_profile_report(df_for_analysis, i)
        elif action == 'Summary of Data':
            clear_chart_type_session_var()
            st.write(df_for_analysis.describe())
        elif action == 'Univariate Analysis':
            clear_chart_type_session_var()
            cols_for_analysis = st.multiselect("Select Columns for Univariate Analysis",
                                               options=df_for_analysis.columns.values,
                                               key='univariate_cols_' + str(i))
            for col in cols_for_analysis:
                _plot_univariate_column(df_for_analysis, col)
        elif action == "Bivariate Analysis":
            add_chart_options_to_sidebar()
            # Use the same (possibly filtered) frame as the other analyses.
            create_for_bivariate_analysis(selected_files, df_for_analysis, i)
def clear_chart_type_session_var():
    """Remove the ``'chart_type'`` entry from the session state if present."""
    if 'chart_type' in st.session_state:
        # The key is the string 'chart_type'; the bare name ``chart_type`` is
        # not defined anywhere and raised NameError.
        del st.session_state['chart_type']
def add_chart_options_to_sidebar():
    """Show the graph-type radio in the sidebar and remember the selection.

    The radio is only rendered while no ``'chart_type'`` is stored in the
    session state. Choosing anything other than ``'None'`` stores the choice
    under ``'chart_type'``.
    """
    if 'chart_type' not in st.session_state:
        with st.sidebar:
            viz_type = st.radio("Graph Type", ('None', 'Cross Tab', 'Pivot Table', 'Box Plot'))
        if viz_type and viz_type != 'None':
            # Assignment, not comparison: ``==`` here read a missing key and
            # never stored the selection.
            st.session_state['chart_type'] = viz_type
def create_for_bivariate_analysis(selected_files, df, i):
    """Render bivariate-analysis controls and the cross tab when selected.

    Args:
        selected_files: Not used by this function.
        df: Data frame to analyse.
        i: Index used to build unique widget keys.

    Only the ``'Cross Tab'`` chart type produces output here.
    """
    target_column = st.selectbox("Select the target column ", df.columns.values,
                                 key="bivariate_target_column_" + str(i))
    bivariate_columns = st.multiselect("Select the columns to analyse ", df.columns.values,
                                       key="bivariate_analysis_columns_" + str(i))
    if not bivariate_columns:
        return

    col_vals = [df[col] for col in bivariate_columns]
    # The chart type may not have been chosen yet; treat a missing entry as
    # "nothing to draw" instead of failing on the lookup.
    chart_type = st.session_state['chart_type'] if 'chart_type' in st.session_state else None
    if chart_type == 'Cross Tab':
        if len(col_vals) > 3:
            st.warning("Too many columns to split on. Please consider reducing the no of columns")
        crosstab_df = pd.crosstab(df[target_column], col_vals, margins=True)
        st.write(crosstab_df.to_html(), unsafe_allow_html=True)
def create_component_for_data_analysis():
    """Render the analysis page: a file picker and one column per chosen file.

    Frames are taken from the ``'data_frames'`` session entry; a file missing
    from it is read with ``pd.read_csv`` from its stored path and cached.
    """
    if 'data_files' not in st.session_state:
        st.write("Upload a file to start analysis")
        return

    selected_files = st.multiselect("Select the File(S) to analyze",
                                    st.session_state['data_files'].keys())
    if not selected_files:
        return

    cols = st.columns(len(selected_files))
    dfs = {}
    for name in selected_files:
        if name not in st.session_state['data_frames']:
            source_path = st.session_state['data_files'][name]
            st.session_state['data_frames'][name] = pd.read_csv(source_path)
        dfs[name] = st.session_state['data_frames'][name]

    for i, col in enumerate(cols):
        with col:
            create_component_for_analysis_for_single_df(selected_files, dfs, i)
def _render_add_features_page():
    """Render the 'Add Features' page: pick a file and add a computed column."""
    if 'data_files' not in st.session_state:
        return
    selected_file = st.selectbox("Select the File(S) to analyze",
                                 st.session_state['data_files'].keys())
    if not selected_file:
        return

    df = st.session_state['data_frames'][selected_file]
    st.header("Enter the function definiton to create a new feature")
    feature_name = st.text_input("Enter the New Feature Name")
    st.warning("please retain the function signature as 'add_feature(row)'")
    content = st_ace(language="python", value="def add_feature(row):")
    if content != 'def add_feature(row):':
        if not feature_name:
            st.error("Enter a name for the new feature before applying the function.")
        else:
            # SECURITY NOTE: this executes code typed into the editor. Only
            # expose this page to trusted users.
            # ``exec`` inside a function cannot bind ``add_feature`` as a
            # usable name, so run the code in an explicit namespace (seeded
            # with the module globals so names like ``pd`` stay available)
            # and fetch the function from it.
            namespace = dict(globals())
            try:
                exec(content, namespace)
                add_feature = namespace.get('add_feature')
                if not callable(add_feature):
                    st.error("The code must define a function named 'add_feature(row)'.")
                else:
                    df[feature_name] = df.apply(add_feature, axis=1)
                    st.session_state['data_frames'][selected_file] = df
            except Exception as err:
                # Report problems in the user's code instead of crashing the page.
                st.error(f"Could not create the feature: {err}")
    st.write(df.columns.values)


def main():
    """Render the app: the title, the sidebar menu, and the selected page."""
    st.title("Model Results Analyzer")
    with st.sidebar:
        selected_menu = option_menu(None, ["Home", "Upload Data", "Add Features", "Analyze Data", "Iframe"],
                                    icons=['house', 'cloud-upload', "list-task", 'gear'],
                                    menu_icon="cast", default_index=0, orientation="vertical",
                                    styles={
                                        "container": {"padding": "0!important", "background-color": "#fafafa"},
                                        "icon": {"color": "orange", "font-size": "15px"},
                                        "nav-link": {"font-size": "15px", "text-align": "left", "margin": "0px",
                                                     "--hover-color": "#eee"},
                                        "nav-link-selected": {"background-color": "green"},
                                    })
    if selected_menu == "Home":
        st.markdown('**This is to analyse models performance.**')
    elif selected_menu == "Upload Data":
        create_upload_file_component()
        if 'data_files' in st.session_state:
            # List the names of the files registered so far.
            st.write(pd.DataFrame(data={"File Name": list(st.session_state['data_files'])}))
    elif selected_menu == "Analyze Data":
        create_component_for_data_analysis()
    elif selected_menu == "Add Features":
        _render_add_features_page()
    elif selected_menu == "Iframe":
        st.components.v1.iframe("https://huggingface.co/spaces/Sasidhar/information-extraction-demo",
                                width=None, height=None, scrolling=False)
# Script entry point: build the page on every run of the script.
main()