Spaces:

CosmickVisions
/

Data-Vision

Running

App Files Community

CosmickVisions committed on Mar 2

Commit

0cf55dc

verified ·

1 Parent(s): 58d0cb5

Update app.py

Browse files

Files changed (1) hide show

app.py +119 -19

app.py CHANGED Viewed

@@ -616,23 +616,16 @@ elif app_mode == "Data Cleaning":
             st.dataframe(st.session_state.cleaned_data.head(), use_container_width=True)
 elif app_mode == "EDA":
     st.title("🔍 Interactive Data Explorer")
     if st.session_state.cleaned_data is None:
         st.warning("Please clean your data first")
-            # Show Upload Clean Data button
-        uploaded_clean_file = st.file_uploader("Upload your cleaned dataset (CSV/XLSX)", type=["csv", "xlsx"])
-        if uploaded_clean_file:
-            try:
-                if uploaded_clean_file.name.endswith('.csv'):
-                    df = pd.read_csv(uploaded_clean_file)
-                else:
-                    df = pd.read_excel(uploaded_clean_file)
-                st.session_state.cleaned_data = df
-                st.success("Cleaned data uploaded successfully!")
-            except Exception as e:
-                st.error(f"Error loading file: {str(e)}")
         st.stop()
     df = st.session_state.cleaned_data
@@ -685,8 +678,15 @@ elif app_mode == "EDA":
             options=[
                 "Scatter Plot", "Histogram", "Box Plot",
                 "Violin Plot", "Line Chart", "Bar Chart",
-                "Correlation Matrix", "Pair Plot", "Heatmap",
-                "3D Scatter", "Parallel Categories"
             ],
             index=0,
             help="Automatically filtered based on data types"
@@ -698,17 +698,19 @@ elif app_mode == "EDA":
         z_axis = None
         color_by = "None" # Default color to None
-        if plot_type not in ["Correlation Matrix", "Pair Plot"]:
             x_axis = st.selectbox("X-axis", df.columns, help="Primary dimension for analysis")
-        if plot_type in ["Scatter Plot", "Box Plot", "Violin Plot", "Line Chart", "Heatmap"]:
-            y_axis = st.selectbox("Y-axis", df.columns, help="Secondary dimension for analysis")
         if plot_type == "3D Scatter":
             z_axis = st.selectbox("Z-axis", df.columns, help="Third dimension for analysis")
         # Color encoding
-        if plot_type not in ["Correlation Matrix", "Pair Plot"]:
             color_options = ["None"] + df.columns.tolist()
             color_by = st.selectbox("Color encoding", color_options,
                                   format_func=lambda x: "No color" if x == "None" else x)
@@ -717,10 +719,17 @@ elif app_mode == "EDA":
         dimensions = None
         if plot_type == "Parallel Categories":
             dimensions = st.multiselect("Dimensions", df.columns.tolist(), default=df.columns[:3])
     with col2:
         try:
             fig = None  # Initialize fig to None
             # Generate appropriate visualization with input validation
             if plot_type == "Scatter Plot":
@@ -768,10 +777,85 @@ elif app_mode == "EDA":
                 if dimensions:
                     fig = px.parallel_categories(df, dimensions=dimensions,
                                                color=color_by if color_by != "None" else None)
             else:
                 st.error("Please choose the specific plot")
-            # Interactive plot customization
             if fig: #Only display customization options when we have a plot
                 with st.expander("⚙️ Chart Settings", expanded=False):
                     col1, col2 = st.columns(2)
@@ -810,6 +894,22 @@ elif app_mode == "EDA":
                     st.write(f"F-value: {f_val:.2f}, p-value: {p_val:.4f}")
                 else:
                     st.warning("Please select both a Test variable and a Grouping variable for ANOVA.")
         with tab2:
             st.subheader("Pattern Discovery")

             st.dataframe(st.session_state.cleaned_data.head(), use_container_width=True)
+# --------------------------
+# EDA
+# --------------------------
 elif app_mode == "EDA":
     st.title("🔍 Interactive Data Explorer")
     if st.session_state.cleaned_data is None:
         st.warning("Please clean your data first")
         st.stop()
     df = st.session_state.cleaned_data
             options=[
                 "Scatter Plot", "Histogram", "Box Plot",
                 "Violin Plot", "Line Chart", "Bar Chart",
+                "Correlation Matrix", "Heatmap",  # Removed standard Pair Plot to use customized plots,
+                "3D Scatter", "Parallel Categories", "Segmented Bar Chart", #NEW CHART
+                "Swarm Plot",  # YData Library Plots,
+                "Ridge Plot",
+                "Bubble Plot",
+                "Barh Plot",
+                "Density Plot",
+                "Count Plot",
+                "Lollipop Chart",
             ],
             index=0,
             help="Automatically filtered based on data types"
         z_axis = None
         color_by = "None" # Default color to None
+        if plot_type not in ["Correlation Matrix",  "Segmented Bar Chart","Pair Plot"]: #Removed normal
             x_axis = st.selectbox("X-axis", df.columns, help="Primary dimension for analysis")
+        if plot_type in ["Scatter Plot", "Box Plot", "Violin Plot", "Line Chart", "Heatmap","Swarm Plot", "Ridge Plot","Bubble Plot", "Density Plot","Lollipop Chart"]: #Allow Categoricals for Box and Violin
+            y_axis = st.selectbox("Y-axis", df.columns, help="Secondary dimension for analysis") # Allow more types.
         if plot_type == "3D Scatter":
             z_axis = st.selectbox("Z-axis", df.columns, help="Third dimension for analysis")
         # Color encoding
+        if plot_type not in ["Correlation Matrix", "Pair Plot"]: # Keep Pair plot for simplicity
             color_options = ["None"] + df.columns.tolist()
             color_by = st.selectbox("Color encoding", color_options,
                                   format_func=lambda x: "No color" if x == "None" else x)
         dimensions = None
         if plot_type == "Parallel Categories":
             dimensions = st.multiselect("Dimensions", df.columns.tolist(), default=df.columns[:3])
+        # Context Aware Control for Segmented Bar Chart
+        segment_col = None
+        if plot_type == "Segmented Bar Chart":
+            segment_col = st.selectbox("Segment Column (Categorical)", categorical_cols, help="Column to segment the bar chart")
     with col2:
         try:
             fig = None  # Initialize fig to None
+            if st.session_state.cleaned_data is None:
+                st.warning("Please upload data first")
+                st.stop()
             # Generate appropriate visualization with input validation
             if plot_type == "Scatter Plot":
                 if dimensions:
                     fig = px.parallel_categories(df, dimensions=dimensions,
                                                color=color_by if color_by != "None" else None)
+            #NEW SEGMENTED BAR CHART
+            elif plot_type == "Segmented Bar Chart":
+                if x_axis and segment_col:
+                    # Data preparation for segmented bar chart
+                    segment_counts = df.groupby([x_axis, segment_col]).size().reset_index(name='counts')
+                    fig = px.bar(segment_counts, x=x_axis, y='counts', color=segment_col,
+                                 title=f"Segmented Bar Chart of {x_axis} by {segment_col}")
+                    fig.update_layout(yaxis_title="Count") #Clear Title
+            elif plot_type == "Swarm Plot":  # Handle Swarm Plot
+                 if x_axis and y_axis:
+                      fig = px.strip(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None)
+            elif plot_type == "Ridge Plot": #Handling Ridge Plot type
+                if x_axis and y_axis:
+                     #The type of data has to have continuous variables, then show.
+                     fig = px.histogram(df, x=x_axis, color=y_axis, marginal="rug", #Adds distribution for visual,
+                              title=f"Ridgeline Plot of {x_axis} Distribution by {y_axis}")
+            elif plot_type == "Bubble Plot":
+                if x_axis and y_axis:
+                      size_col = st.selectbox("Size Column", df.columns, help = "What is bubble Sizes Based From?")# What bubbles will be based from? - have size select
+                      fig = px.scatter(df, x=x_axis, y=y_axis,
+                                   size=size_col, color=color_by if color_by != "None" else None,
+                                   hover_name = size_col,#Hover Name, to show value
+                                    title=f"Bubble Plot of {x_axis} vs. {y_axis} Colored by{size_col}"
+                        )
+            elif plot_type == "Barh Plot":
+                 if x_axis and y_axis:
+                    fig = px.bar(df, y=x_axis, x=y_axis,
+                                 color=color_by if color_by != "None" else None, orientation = 'h', # set x on y-axis side.
+                                title=f"Horizontal Bar Plot of {y_axis} vs {x_axis}"# added chart titles
+                                ) #Set as Vertical as Base, and
+            elif plot_type == "Density Plot": #Kernel Estimations with px
+                 if x_axis and y_axis:
+                    fig = px.density_heatmap(df, x=x_axis, y=y_axis,
+                              color_continuous_scale="Viridis", title="Density Plot" # Added Names
+                              ) #Kernel Heat Maps
+            elif plot_type == "Count Plot":
+                 if x_axis:
+                    fig = px.bar(df, x=x_axis, color=color_by if color_by != "None" else None, title ="Counts graph")#Bar
+                    fig.update_layout(yaxis_title="Count")
+            elif plot_type == "Lollipop Chart":#Creates
+                  if x_axis and y_axis:# Has to take X AND Y
+                     fig = px.scatter(df, x=x_axis, y=y_axis, size = "Bubble", title="Lollipop char type.")
+                     fig.add_segments(x=df[x_axis], y=df[y_axis], xend=df[x_axis], yend=0, color = color_by if color_by != "None" else None)# Adds Line
+                     # add lines as Lollipop. and size.
             else:
                 st.error("Please choose the specific plot")
+            #CUSTOM INFERENTIAL ADDS - BOX PLOT FOR GROUP COMPARISON
+            if fig and plot_type == "Box Plot" and color_by != "None":
+                    try:  # Add statistical annotations for Box Plots
+                         from statannotations.Annotator import Annotator
+                         x = df[x_axis].values
+                         y = df[y_axis].values
+                         order = df[color_by].unique().tolist()
+                         pairs=[(order[0],order[1])] #Only take 2 for visual
+                         annotator = Annotator(x=x_axis, y=y_axis, data=df, order=order, pairs=pairs, color=color_by)
+                         annotator.configure(test='Mann-Whitney', text_format='star', loc='inside', verbose=2)
+                         annotator.apply_test().annotate()
+                         st.pyplot(go.Figure(annotator.fig_).show(config={'displayModeBar': False})) #Turn off default graph functions
+                         fig = go.Figure(annotator.fig_)# Show Annotation
+                    except ImportError:
+                         st.warning("Install statannotations for p-value annotation on box plots (pip install statannotations)")
             if fig: #Only display customization options when we have a plot
                 with st.expander("⚙️ Chart Settings", expanded=False):
                     col1, col2 = st.columns(2)
                     st.write(f"F-value: {f_val:.2f}, p-value: {p_val:.4f}")
                 else:
                     st.warning("Please select both a Test variable and a Grouping variable for ANOVA.")
+            #Chi Squared test
+            st.subheader("Chi-Squared Test")
+            col1, col2 = st.columns(2)
+            with col1:
+                cat1 = st.selectbox("Categorical Variable 1", categorical_cols)
+            with col2:
+                cat2 = st.selectbox("Categorical Variable 2", categorical_cols)
+            if cat1 and cat2 and st.button("Run Chi-Squared Test"):
+                    observed = pd.crosstab(df[cat1], df[cat2])
+                    chi2, p, dof, expected = stats.chi2_contingency(observed)
+                    st.write(f"Chi-Squared Value: {chi2:.2f}, p-value: {p:.4f}, Degrees of Freedom: {dof}")
+                    if p < 0.05:
+                        st.write("There is a statistically significant association between the two categorical variables.")
+                    else:
+                        st.write("There is no statistically significant association between the two categorical variables.")
         with tab2:
             st.subheader("Pattern Discovery")