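# Streamlit app for analysing model results: upload CSV files, derive new features
# from user-supplied functions, and explore the data through profiling reports,
# univariate charts and bivariate cross tabs.
# Launch with:  streamlit run <this_script>.py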
import os

import pandas as pd
import pandas_profiling  # registers DataFrame.profile_report()
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
import streamlit.components.v1 as components
from plotly.subplots import make_subplots
from streamlit_ace import st_ace
from streamlit_option_menu import option_menu
from streamlit_pandas_profiling import st_profile_report


def set_data_files_session_object(file_name, file_path):
    if 'data_files' not in st.session_state:
        files_dictionary = {}
        files_dictionary[file_name] = file_path
        st.session_state['data_files'] = files_dictionary
    else:
        files_dictionary = st.session_state['data_files']
        files_dictionary[file_name] = file_path
        st.session_state['data_files'] = files_dictionary


def set_filtered_data_session_object(df, file_name):
    if 'filtered_data' not in st.session_state:
        filtered_data_dictionary = {}
        filtered_data_dictionary[file_name] = df
        st.session_state['filtered_data'] = filtered_data_dictionary
    else:
        filtered_data_dictionary = st.session_state['filtered_data']
        filtered_data_dictionary[file_name] = df
        st.session_state['filtered_data'] = filtered_data_dictionary


def set_dataframe_session_object(file_name, file_path):
    # Note: files are always parsed with read_csv, so only CSV uploads end up as dataframes.
    if 'data_frames' not in st.session_state:
        data_frame_dictionary = {}
        data_frame_dictionary[file_name] = pd.read_csv(file_path)
        st.session_state['data_frames'] = data_frame_dictionary
    else:
        data_frame_dictionary = st.session_state['data_frames']
        data_frame_dictionary[file_name] = pd.read_csv(file_path)
        st.session_state['data_frames'] = data_frame_dictionary


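# A minimal sketch (hypothetical helper, not referenced below) of how the three
# session-state setters above could share one generic update routine:
def update_session_dict(state_key, item_key, item_value):
    # Fetch the existing dictionary for this key, or start a new one, then write it back.
    dictionary = st.session_state.get(state_key, {})
    dictionary[item_key] = item_value
    st.session_state[state_key] = dictionary

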
def save_file(file_object):
    """Persist an uploaded file to disk and register it in the session state."""
    file_path = os.path.join(os.getcwd(), "uploaded_files", file_object.name)
    with open(file_path, "wb") as f:
        f.write(file_object.getbuffer())

    set_data_files_session_object(file_object.name, file_path)
    set_dataframe_session_object(file_object.name, file_path)


def create_upload_file_component():
    uploaded_files = st.file_uploader("Upload one or more data files.",
                                      type=['csv', 'xls', 'xlsx', 'pkl', 'pdf'],
                                      accept_multiple_files=True)

    if uploaded_files:
        os.makedirs(os.path.join(os.getcwd(), "uploaded_files"), mode=0o777, exist_ok=True)
        for uploaded_file in uploaded_files:
            save_file(uploaded_file)


def create_component_to_add_target_func(selected_files, dfs, i):
    target_var_name = st.text_input("Name of the target variable", key="target_var" + str(i))

    # Demo of defining a function from a code string. exec() inside a function does not
    # create reliable local names, so collect the definitions in an explicit namespace
    # and look the function up there.
    code = "def f1(x): return str(x * 3)"
    namespace = {}
    exec(code, namespace)
    st.write(namespace['f1'](3))


def set_filtered_data(df, selected_files, i):
    """Render filter widgets for a single dataframe and store the filtered result."""
    action = "data_filter"
    col_to_filter = st.selectbox("Select the field to filter on", df.columns.values,
                                 key=action + "_col_filter_" + str(i))
    filter_operation = st.selectbox("Operation",
                                    ['Greater Than', 'Equals', 'Less Than', "In", "In Between"],
                                    key=action + "_col_filter_op_" + str(i))
    selected_filter_vals = None

    if filter_operation:
        if filter_operation == 'In':
            selected_filter_vals = st.multiselect("Select values to filter on", df[col_to_filter].unique(),
                                                  key=action + "_col_filter_val_" + str(i))
            if selected_filter_vals:
                filtered_df = df[df[col_to_filter].isin(selected_filter_vals)]
        elif filter_operation == 'Equals':
            selected_filter_vals = st.text_input("Enter a numeric value",
                                                 key=action + "_col_filter_val_" + str(i))
            if selected_filter_vals:
                # text_input returns a string, so cast before comparing with numeric columns
                filtered_df = df[df[col_to_filter] == float(selected_filter_vals)]
        elif filter_operation == 'Greater Than':
            selected_filter_vals = st.text_input("Enter a numeric value",
                                                 key=action + "_col_filter_val_" + str(i))
            if selected_filter_vals:
                filtered_df = df[df[col_to_filter] > float(selected_filter_vals)]
        elif filter_operation == 'Less Than':
            selected_filter_vals = st.text_input("Enter a numeric value",
                                                 key=action + "_col_filter_val_" + str(i))
            if selected_filter_vals:
                filtered_df = df[df[col_to_filter] < float(selected_filter_vals)]
        elif filter_operation == 'In Between':
            # Offer the column's values as a range slider and keep the rows inside the range.
            selected_filter_vals = st.select_slider("Select range",
                                                    options=sorted(df[col_to_filter].unique()),
                                                    value=(df[col_to_filter].min(), df[col_to_filter].max()),
                                                    key=action + "_col_filter_val_" + str(i))
            if selected_filter_vals:
                lower, upper = selected_filter_vals
                filtered_df = df[df[col_to_filter].between(lower, upper)]

    if selected_filter_vals:
        set_filtered_data_session_object(filtered_df, selected_files[i])
        st.write('data filtered', st.session_state['filtered_data'][selected_files[i]].shape)


def create_component_for_analysis_for_single_df(selected_files, dfs, i):
    st.subheader(selected_files[i])
    df = dfs[selected_files[i]]

    filter_data = st.checkbox("Analyse on Filtered Data", key="filter_data_check" + str(i))

    if filter_data:
        set_filtered_data(df, selected_files, i)

    analysis_actions = st.multiselect("What analysis do you wish to do?",
                                      ['Summary of Data', 'Sample Data', 'Get Profile', 'Univariate Analysis',
                                       'Bivariate Analysis'], key='analysis_action_' + str(i))
    if analysis_actions:

        # Fall back to the full dataframe until a filter has actually been applied.
        use_filtered = (filter_data and 'filtered_data' in st.session_state
                        and selected_files[i] in st.session_state['filtered_data'])
        df_for_analysis = st.session_state['filtered_data'][selected_files[i]] if use_filtered else df

        for action in analysis_actions:

            if action == 'Sample Data':
                clear_chart_type_session_var()
                st.write(df_for_analysis.sample(10))

            elif action == 'Get Profile':
                clear_chart_type_session_var()
                # Key must differ from the filter checkbox above to avoid a duplicate widget ID.
                full_data_check = st.checkbox("Report on all columns", key="full_profile_check" + str(i))

                if full_data_check:
                    st.warning("Generating the report on all columns might take a long time, "
                               "depending on the size of the data. Consider selecting a subset of columns.")
                    confirm_full_run = st.button("Run on full data", key="full_profile_run_" + str(i))
                    if confirm_full_run:
                        pr = df_for_analysis.profile_report()
                        st_profile_report(pr)
                else:
                    col_subset = st.multiselect("Select subset of columns", df.columns.values,
                                                key='filter_subset_' + str(i))

                    if col_subset:
                        pr = df_for_analysis[col_subset].profile_report()
                        st_profile_report(pr)

            elif action == 'Summary of Data':
                clear_chart_type_session_var()
                st.write(df_for_analysis.describe())

            elif action == 'Univariate Analysis':
                clear_chart_type_session_var()
                cols_for_analysis = st.multiselect("Select Columns for Univariate Analysis",
                                                   options=df_for_analysis.columns.values,
                                                   key='univariate_cols_' + str(i))
                for col in cols_for_analysis:
                    if str(df_for_analysis[col].dtype) in ['int64', 'float64'] and df_for_analysis[col].nunique() > 10:
                        # Continuous column: plot the raw values against the index.
                        fig = px.scatter(x=df_for_analysis.index, y=df_for_analysis[col],
                                         labels=dict(x="Index", y=col))
                        st.plotly_chart(fig, use_container_width=True)

                    elif str(df_for_analysis[col].dtype) in ['object', 'category'] or df_for_analysis[col].nunique() <= 10:
                        # Categorical (or low-cardinality) column: top 20 values as counts,
                        # with their share of the data on a secondary axis.
                        value_dist_df = df_for_analysis[col].value_counts(normalize=True)[:20].reset_index()
                        value_dist_df.columns = [col, '% Distribution']

                        value_dist_df_counts = df_for_analysis[col].value_counts()[:20].reset_index()
                        value_dist_df_counts.columns = [col, 'Count']
                        value_dist_df = value_dist_df.merge(value_dist_df_counts, on=col)

                        trace1 = go.Bar(x=value_dist_df[col], y=value_dist_df['Count'], name='Count',
                                        marker=dict(color='rgb(34,163,192)'))
                        trace2 = go.Scatter(x=value_dist_df[col], y=value_dist_df['% Distribution'],
                                            name='% Distribution', yaxis='y2')

                        fig = make_subplots(specs=[[{"secondary_y": True}]])
                        fig.add_trace(trace1)
                        fig.add_trace(trace2, secondary_y=True)

                        fig['layout'].update(height=600, width=800, title=f"{col} data distribution",
                                             xaxis=dict(tickangle=-90))

                        st.plotly_chart(fig, use_container_width=True)

            elif action == "Bivariate Analysis":
                add_chart_options_to_sidebar()
                create_for_bivariate_analysis(selected_files, df, i)


def clear_chart_type_session_var():
    if 'chart_type' in st.session_state:
        del st.session_state['chart_type']


def add_chart_options_to_sidebar():
    if 'chart_type' not in st.session_state:
        with st.sidebar:
            viz_type = st.radio("Graph Type", ('None', 'Cross Tab', 'Pivot Table', 'Box Plot'))
            if viz_type and viz_type != 'None':
                # Assignment, not comparison: store the chosen chart type in the session.
                st.session_state['chart_type'] = viz_type


def create_for_bivariate_analysis(selected_files, df, i):
    target_column = st.selectbox("Select the target column", df.columns.values,
                                 key="bivariate_target_column_" + str(i))
    bivariate_columns = st.multiselect("Select the columns to analyse", df.columns.values,
                                       key="bivariate_analysis_columns_" + str(i))

    col_vals = []

    if bivariate_columns:
        for col in bivariate_columns:
            col_vals.append(df[col])

        # 'chart_type' may not be set yet, so read it defensively.
        if st.session_state.get('chart_type') == 'Cross Tab':
            if len(col_vals) > 3:
                st.warning("Too many columns to split on. Please consider reducing the number of columns.")
            crosstab_df = pd.crosstab(df[target_column], col_vals, margins=True)
            st.write(crosstab_df.to_html(), unsafe_allow_html=True)


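# The sidebar radio in add_chart_options_to_sidebar() also offers 'Pivot Table' and
# 'Box Plot', which create_for_bivariate_analysis() does not handle yet. A minimal
# sketch of a box-plot handler (hypothetical function name, not wired into the flow):
def create_box_plot(df, target_column, analysis_column):
    # One box per target value, showing the distribution of the selected analysis column.
    fig = px.box(df, x=target_column, y=analysis_column)
    st.plotly_chart(fig, use_container_width=True)

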
def create_component_for_data_analysis():
    if 'data_files' in st.session_state:

        selected_files = st.multiselect("Select the file(s) to analyze", st.session_state['data_files'].keys())

        if selected_files:
            cols = st.columns(len(selected_files))

            dfs = {}

            for selected_file in selected_files:
                if selected_file in st.session_state['data_frames']:
                    dfs[selected_file] = st.session_state['data_frames'][selected_file]
                else:
                    st.session_state['data_frames'][selected_file] = pd.read_csv(
                        st.session_state['data_files'][selected_file])
                    dfs[selected_file] = st.session_state['data_frames'][selected_file]

            # Render one analysis column per selected file, side by side.
            for i, col in enumerate(cols):
                with col:
                    create_component_for_analysis_for_single_df(selected_files, dfs, i)

    else:
        st.write("Upload a file to start analysis")


def main():
    st.title("Model Results Analyzer")

    with st.sidebar:
        selected_menu = option_menu(None, ["Home", "Upload Data", "Add Features", "Analyze Data", "Iframe"],
                                    icons=['house', 'cloud-upload', 'list-task', 'gear', 'globe'],  # one icon per entry
                                    menu_icon="cast", default_index=0, orientation="vertical",
                                    styles={
                                        "container": {"padding": "0!important", "background-color": "#fafafa"},
                                        "icon": {"color": "orange", "font-size": "15px"},
                                        "nav-link": {"font-size": "15px", "text-align": "left", "margin": "0px",
                                                     "--hover-color": "#eee"},
                                        "nav-link-selected": {"background-color": "green"},
                                    })

    if selected_menu == "Home":
        st.markdown('**This app helps you analyse model performance.**')

    elif selected_menu == "Upload Data":
        create_upload_file_component()

        if 'data_files' in st.session_state:
            st.write(pd.DataFrame({"File Name": list(st.session_state['data_files'].keys())}))

    elif selected_menu == "Analyze Data":
        create_component_for_data_analysis()

    elif selected_menu == "Add Features":
        if 'data_files' in st.session_state:
            selected_file = st.selectbox("Select the file to add features to", st.session_state['data_files'].keys())

            if selected_file:
                df = st.session_state['data_frames'][selected_file]
                st.header("Enter the function definition to create a new feature")
                feature_name = st.text_input("Enter the New Feature Name")
                st.warning("Please retain the function signature as 'add_feature(row)'")

                content = st_ace(language="python", value="def add_feature(row):")

                if content and content.strip() != 'def add_feature(row):' and feature_name:
                    # exec() inside a function does not create reliable local names, so run the
                    # editor code in an explicit namespace and pull add_feature out of it.
                    namespace = {}
                    exec(content, namespace)
                    df[feature_name] = df.apply(namespace['add_feature'], axis=1)

                    st.session_state['data_frames'][selected_file] = df
                    st.write(df.columns.values)

    elif selected_menu == "Iframe":
        components.iframe("https://huggingface.co/spaces/Sasidhar/information-extraction-demo",
                          width=None, height=None, scrolling=False)


if __name__ == "__main__":
    main()