Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -616,23 +616,16 @@ elif app_mode == "Data Cleaning":
|
|
616 |
st.dataframe(st.session_state.cleaned_data.head(), use_container_width=True)
|
617 |
|
618 |
|
|
|
|
|
|
|
|
|
|
|
619 |
elif app_mode == "EDA":
|
620 |
st.title("🔍 Interactive Data Explorer")
|
621 |
|
622 |
if st.session_state.cleaned_data is None:
|
623 |
st.warning("Please clean your data first")
|
624 |
-
# Show Upload Clean Data button
|
625 |
-
uploaded_clean_file = st.file_uploader("Upload your cleaned dataset (CSV/XLSX)", type=["csv", "xlsx"])
|
626 |
-
if uploaded_clean_file:
|
627 |
-
try:
|
628 |
-
if uploaded_clean_file.name.endswith('.csv'):
|
629 |
-
df = pd.read_csv(uploaded_clean_file)
|
630 |
-
else:
|
631 |
-
df = pd.read_excel(uploaded_clean_file)
|
632 |
-
st.session_state.cleaned_data = df
|
633 |
-
st.success("Cleaned data uploaded successfully!")
|
634 |
-
except Exception as e:
|
635 |
-
st.error(f"Error loading file: {str(e)}")
|
636 |
st.stop()
|
637 |
|
638 |
df = st.session_state.cleaned_data
|
@@ -685,8 +678,15 @@ elif app_mode == "EDA":
|
|
685 |
options=[
|
686 |
"Scatter Plot", "Histogram", "Box Plot",
|
687 |
"Violin Plot", "Line Chart", "Bar Chart",
|
688 |
-
"Correlation Matrix", "Pair Plot
|
689 |
-
"3D Scatter", "Parallel Categories"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
690 |
],
|
691 |
index=0,
|
692 |
help="Automatically filtered based on data types"
|
@@ -698,17 +698,19 @@ elif app_mode == "EDA":
|
|
698 |
z_axis = None
|
699 |
color_by = "None" # Default color to None
|
700 |
|
701 |
-
if plot_type not in ["Correlation Matrix", "Pair Plot"]:
|
702 |
x_axis = st.selectbox("X-axis", df.columns, help="Primary dimension for analysis")
|
703 |
|
704 |
-
if plot_type in ["Scatter Plot", "Box Plot", "Violin Plot", "Line Chart", "Heatmap"]:
|
705 |
-
|
|
|
706 |
|
707 |
if plot_type == "3D Scatter":
|
708 |
z_axis = st.selectbox("Z-axis", df.columns, help="Third dimension for analysis")
|
709 |
|
710 |
# Color encoding
|
711 |
-
if plot_type not in ["Correlation Matrix", "Pair Plot"]:
|
|
|
712 |
color_options = ["None"] + df.columns.tolist()
|
713 |
color_by = st.selectbox("Color encoding", color_options,
|
714 |
format_func=lambda x: "No color" if x == "None" else x)
|
@@ -717,10 +719,17 @@ elif app_mode == "EDA":
|
|
717 |
dimensions = None
|
718 |
if plot_type == "Parallel Categories":
|
719 |
dimensions = st.multiselect("Dimensions", df.columns.tolist(), default=df.columns[:3])
|
|
|
|
|
|
|
|
|
720 |
|
721 |
with col2:
|
722 |
try:
|
723 |
fig = None # Initialize fig to None
|
|
|
|
|
|
|
724 |
|
725 |
# Generate appropriate visualization with input validation
|
726 |
if plot_type == "Scatter Plot":
|
@@ -768,10 +777,85 @@ elif app_mode == "EDA":
|
|
768 |
if dimensions:
|
769 |
fig = px.parallel_categories(df, dimensions=dimensions,
|
770 |
color=color_by if color_by != "None" else None)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
771 |
else:
|
772 |
st.error("Please choose the specific plot")
|
773 |
|
774 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
775 |
if fig: #Only display customization options when we have a plot
|
776 |
with st.expander("⚙️ Chart Settings", expanded=False):
|
777 |
col1, col2 = st.columns(2)
|
@@ -810,6 +894,22 @@ elif app_mode == "EDA":
|
|
810 |
st.write(f"F-value: {f_val:.2f}, p-value: {p_val:.4f}")
|
811 |
else:
|
812 |
st.warning("Please select both a Test variable and a Grouping variable for ANOVA.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
813 |
|
814 |
with tab2:
|
815 |
st.subheader("Pattern Discovery")
|
|
|
616 |
st.dataframe(st.session_state.cleaned_data.head(), use_container_width=True)
|
617 |
|
618 |
|
619 |
+
|
620 |
+
|
621 |
+
# --------------------------
|
622 |
+
# EDA
|
623 |
+
# --------------------------
|
624 |
elif app_mode == "EDA":
|
625 |
st.title("🔍 Interactive Data Explorer")
|
626 |
|
627 |
if st.session_state.cleaned_data is None:
|
628 |
st.warning("Please clean your data first")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
629 |
st.stop()
|
630 |
|
631 |
df = st.session_state.cleaned_data
|
|
|
678 |
options=[
|
679 |
"Scatter Plot", "Histogram", "Box Plot",
|
680 |
"Violin Plot", "Line Chart", "Bar Chart",
|
681 |
+
"Correlation Matrix", "Heatmap", # Removed standard Pair Plot to use customized plots,
|
682 |
+
"3D Scatter", "Parallel Categories", "Segmented Bar Chart", #NEW CHART
|
683 |
+
"Swarm Plot", # YData Library Plots,
|
684 |
+
"Ridge Plot",
|
685 |
+
"Bubble Plot",
|
686 |
+
"Barh Plot",
|
687 |
+
"Density Plot",
|
688 |
+
"Count Plot",
|
689 |
+
"Lollipop Chart",
|
690 |
],
|
691 |
index=0,
|
692 |
help="Automatically filtered based on data types"
|
|
|
698 |
z_axis = None
|
699 |
color_by = "None" # Default color to None
|
700 |
|
701 |
+
if plot_type not in ["Correlation Matrix", "Segmented Bar Chart","Pair Plot"]: #Removed normal
|
702 |
x_axis = st.selectbox("X-axis", df.columns, help="Primary dimension for analysis")
|
703 |
|
704 |
+
if plot_type in ["Scatter Plot", "Box Plot", "Violin Plot", "Line Chart", "Heatmap","Swarm Plot", "Ridge Plot","Bubble Plot", "Density Plot","Lollipop Chart"]: #Allow Categoricals for Box and Violin
|
705 |
+
|
706 |
+
y_axis = st.selectbox("Y-axis", df.columns, help="Secondary dimension for analysis") # Allow more types.
|
707 |
|
708 |
if plot_type == "3D Scatter":
|
709 |
z_axis = st.selectbox("Z-axis", df.columns, help="Third dimension for analysis")
|
710 |
|
711 |
# Color encoding
|
712 |
+
if plot_type not in ["Correlation Matrix", "Pair Plot"]: # Keep Pair plot for simplicity
|
713 |
+
|
714 |
color_options = ["None"] + df.columns.tolist()
|
715 |
color_by = st.selectbox("Color encoding", color_options,
|
716 |
format_func=lambda x: "No color" if x == "None" else x)
|
|
|
719 |
dimensions = None
|
720 |
if plot_type == "Parallel Categories":
|
721 |
dimensions = st.multiselect("Dimensions", df.columns.tolist(), default=df.columns[:3])
|
722 |
+
# Context Aware Control for Segmented Bar Chart
|
723 |
+
segment_col = None
|
724 |
+
if plot_type == "Segmented Bar Chart":
|
725 |
+
segment_col = st.selectbox("Segment Column (Categorical)", categorical_cols, help="Column to segment the bar chart")
|
726 |
|
727 |
with col2:
|
728 |
try:
|
729 |
fig = None # Initialize fig to None
|
730 |
+
if st.session_state.cleaned_data is None:
|
731 |
+
st.warning("Please upload data first")
|
732 |
+
st.stop()
|
733 |
|
734 |
# Generate appropriate visualization with input validation
|
735 |
if plot_type == "Scatter Plot":
|
|
|
777 |
if dimensions:
|
778 |
fig = px.parallel_categories(df, dimensions=dimensions,
|
779 |
color=color_by if color_by != "None" else None)
|
780 |
+
# New: segmented bar chart (row counts per x_axis value, split by segment_col)
|
781 |
+
elif plot_type == "Segmented Bar Chart":
|
782 |
+
if x_axis and segment_col:
|
783 |
+
# Data preparation for segmented bar chart
|
784 |
+
segment_counts = df.groupby([x_axis, segment_col]).size().reset_index(name='counts')
|
785 |
+
fig = px.bar(segment_counts, x=x_axis, y='counts', color=segment_col,
|
786 |
+
title=f"Segmented Bar Chart of {x_axis} by {segment_col}")
|
787 |
+
fig.update_layout(yaxis_title="Count") #Clear Title
|
788 |
+
|
789 |
+
elif plot_type == "Swarm Plot": # Handle Swarm Plot
|
790 |
+
if x_axis and y_axis:
|
791 |
+
fig = px.strip(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None)
|
792 |
+
elif plot_type == "Ridge Plot": #Handling Ridge Plot type
|
793 |
+
|
794 |
+
if x_axis and y_axis:
|
795 |
+
# Approximates a ridgeline plot with a histogram of x_axis colored by y_axis; x_axis is expected to be continuous.
|
796 |
+
fig = px.histogram(df, x=x_axis, color=y_axis, marginal="rug", #Adds distribution for visual,
|
797 |
+
title=f"Ridgeline Plot of {x_axis} Distribution by {y_axis}")
|
798 |
+
elif plot_type == "Bubble Plot":
|
799 |
+
|
800 |
+
if x_axis and y_axis:
|
801 |
+
|
802 |
+
size_col = st.selectbox("Size Column", df.columns, help = "What is bubble Sizes Based From?")# What bubbles will be based from? - have size select
|
803 |
+
fig = px.scatter(df, x=x_axis, y=y_axis,
|
804 |
+
size=size_col, color=color_by if color_by != "None" else None,
|
805 |
+
hover_name = size_col,#Hover Name, to show value
|
806 |
+
title=f"Bubble Plot of {x_axis} vs. {y_axis} Colored by{size_col}"
|
807 |
+
)
|
808 |
+
elif plot_type == "Barh Plot":
|
809 |
+
|
810 |
+
if x_axis and y_axis:
|
811 |
+
fig = px.bar(df, y=x_axis, x=y_axis,
|
812 |
+
color=color_by if color_by != "None" else None, orientation = 'h', # set x on y-axis side.
|
813 |
+
title=f"Horizontal Bar Plot of {y_axis} vs {x_axis}"# added chart titles
|
814 |
+
) #Set as Vertical as Base, and
|
815 |
+
|
816 |
+
elif plot_type == "Density Plot": #Kernel Estimations with px
|
817 |
+
|
818 |
+
if x_axis and y_axis:
|
819 |
+
fig = px.density_heatmap(df, x=x_axis, y=y_axis,
|
820 |
+
color_continuous_scale="Viridis", title="Density Plot" # Added Names
|
821 |
+
) #Kernel Heat Maps
|
822 |
+
elif plot_type == "Count Plot":
|
823 |
+
|
824 |
+
if x_axis:
|
825 |
+
fig = px.bar(df, x=x_axis, color=color_by if color_by != "None" else None, title ="Counts graph")#Bar
|
826 |
+
fig.update_layout(yaxis_title="Count")
|
827 |
+
|
828 |
+
elif plot_type == "Lollipop Chart":#Creates
|
829 |
+
|
830 |
+
if x_axis and y_axis:# Has to take X AND Y
|
831 |
+
|
832 |
+
fig = px.scatter(df, x=x_axis, y=y_axis, size = "Bubble", title="Lollipop char type.")
|
833 |
+
fig.add_segments(x=df[x_axis], y=df[y_axis], xend=df[x_axis], yend=0, color = color_by if color_by != "None" else None)# Adds Line
|
834 |
+
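# Intent: draw a stem from each point down to y=0 so the markers read as lollipops. NOTE(review): confirm plotly figures provide add_segments and that the data has a "Bubble" column for size.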
|
835 |
+
|
836 |
else:
|
837 |
st.error("Please choose the specific plot")
|
838 |
|
839 |
+
# Custom inferential addition: annotate box plots with a statistical test for group comparison
|
840 |
+
if fig and plot_type == "Box Plot" and color_by != "None":
|
841 |
+
try: # Add statistical annotations for Box Plots
|
842 |
+
from statannotations.Annotator import Annotator
|
843 |
+
|
844 |
+
x = df[x_axis].values
|
845 |
+
y = df[y_axis].values
|
846 |
+
order = df[color_by].unique().tolist()
|
847 |
+
|
848 |
+
pairs=[(order[0],order[1])] #Only take 2 for visual
|
849 |
+
annotator = Annotator(x=x_axis, y=y_axis, data=df, order=order, pairs=pairs, color=color_by)
|
850 |
+
annotator.configure(test='Mann-Whitney', text_format='star', loc='inside', verbose=2)
|
851 |
+
annotator.apply_test().annotate()
|
852 |
+
|
853 |
+
st.pyplot(go.Figure(annotator.fig_).show(config={'displayModeBar': False})) #Turn off default graph functions
|
854 |
+
fig = go.Figure(annotator.fig_)# Show Annotation
|
855 |
+
|
856 |
+
except ImportError:
|
857 |
+
st.warning("Install statannotations for p-value annotation on box plots (pip install statannotations)")
|
858 |
+
|
859 |
if fig: #Only display customization options when we have a plot
|
860 |
with st.expander("⚙️ Chart Settings", expanded=False):
|
861 |
col1, col2 = st.columns(2)
|
|
|
894 |
st.write(f"F-value: {f_val:.2f}, p-value: {p_val:.4f}")
|
895 |
else:
|
896 |
st.warning("Please select both a Test variable and a Grouping variable for ANOVA.")
|
897 |
+
# Chi-squared test of independence between two categorical columns
|
898 |
+
st.subheader("Chi-Squared Test")
|
899 |
+
col1, col2 = st.columns(2)
|
900 |
+
with col1:
|
901 |
+
cat1 = st.selectbox("Categorical Variable 1", categorical_cols)
|
902 |
+
with col2:
|
903 |
+
cat2 = st.selectbox("Categorical Variable 2", categorical_cols)
|
904 |
+
|
905 |
+
if cat1 and cat2 and st.button("Run Chi-Squared Test"):
|
906 |
+
observed = pd.crosstab(df[cat1], df[cat2])
|
907 |
+
chi2, p, dof, expected = stats.chi2_contingency(observed)
|
908 |
+
st.write(f"Chi-Squared Value: {chi2:.2f}, p-value: {p:.4f}, Degrees of Freedom: {dof}")
|
909 |
+
if p < 0.05:
|
910 |
+
st.write("There is a statistically significant association between the two categorical variables.")
|
911 |
+
else:
|
912 |
+
st.write("There is no statistically significant association between the two categorical variables.")
|
913 |
|
914 |
with tab2:
|
915 |
st.subheader("Pattern Discovery")
|