CosmickVisions commited on
Commit
0cf55dc
·
verified ·
1 Parent(s): 58d0cb5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +119 -19
app.py CHANGED
@@ -616,23 +616,16 @@ elif app_mode == "Data Cleaning":
616
  st.dataframe(st.session_state.cleaned_data.head(), use_container_width=True)
617
 
618
 
 
 
 
 
 
619
  elif app_mode == "EDA":
620
  st.title("🔍 Interactive Data Explorer")
621
 
622
  if st.session_state.cleaned_data is None:
623
  st.warning("Please clean your data first")
624
- # Show Upload Clean Data button
625
- uploaded_clean_file = st.file_uploader("Upload your cleaned dataset (CSV/XLSX)", type=["csv", "xlsx"])
626
- if uploaded_clean_file:
627
- try:
628
- if uploaded_clean_file.name.endswith('.csv'):
629
- df = pd.read_csv(uploaded_clean_file)
630
- else:
631
- df = pd.read_excel(uploaded_clean_file)
632
- st.session_state.cleaned_data = df
633
- st.success("Cleaned data uploaded successfully!")
634
- except Exception as e:
635
- st.error(f"Error loading file: {str(e)}")
636
  st.stop()
637
 
638
  df = st.session_state.cleaned_data
@@ -685,8 +678,15 @@ elif app_mode == "EDA":
685
  options=[
686
  "Scatter Plot", "Histogram", "Box Plot",
687
  "Violin Plot", "Line Chart", "Bar Chart",
688
- "Correlation Matrix", "Pair Plot", "Heatmap",
689
- "3D Scatter", "Parallel Categories"
 
 
 
 
 
 
 
690
  ],
691
  index=0,
692
  help="Automatically filtered based on data types"
@@ -698,17 +698,19 @@ elif app_mode == "EDA":
698
  z_axis = None
699
  color_by = "None" # Default color to None
700
 
701
- if plot_type not in ["Correlation Matrix", "Pair Plot"]:
702
  x_axis = st.selectbox("X-axis", df.columns, help="Primary dimension for analysis")
703
 
704
- if plot_type in ["Scatter Plot", "Box Plot", "Violin Plot", "Line Chart", "Heatmap"]:
705
- y_axis = st.selectbox("Y-axis", df.columns, help="Secondary dimension for analysis")
 
706
 
707
  if plot_type == "3D Scatter":
708
  z_axis = st.selectbox("Z-axis", df.columns, help="Third dimension for analysis")
709
 
710
  # Color encoding
711
- if plot_type not in ["Correlation Matrix", "Pair Plot"]:
 
712
  color_options = ["None"] + df.columns.tolist()
713
  color_by = st.selectbox("Color encoding", color_options,
714
  format_func=lambda x: "No color" if x == "None" else x)
@@ -717,10 +719,17 @@ elif app_mode == "EDA":
717
  dimensions = None
718
  if plot_type == "Parallel Categories":
719
  dimensions = st.multiselect("Dimensions", df.columns.tolist(), default=df.columns[:3])
 
 
 
 
720
 
721
  with col2:
722
  try:
723
  fig = None # Initialize fig to None
 
 
 
724
 
725
  # Generate appropriate visualization with input validation
726
  if plot_type == "Scatter Plot":
@@ -768,10 +777,85 @@ elif app_mode == "EDA":
768
  if dimensions:
769
  fig = px.parallel_categories(df, dimensions=dimensions,
770
  color=color_by if color_by != "None" else None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
771
  else:
772
  st.error("Please choose the specific plot")
773
 
774
- # Interactive plot customization
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
775
  if fig: #Only display customization options when we have a plot
776
  with st.expander("⚙️ Chart Settings", expanded=False):
777
  col1, col2 = st.columns(2)
@@ -810,6 +894,22 @@ elif app_mode == "EDA":
810
  st.write(f"F-value: {f_val:.2f}, p-value: {p_val:.4f}")
811
  else:
812
  st.warning("Please select both a Test variable and a Grouping variable for ANOVA.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
813
 
814
  with tab2:
815
  st.subheader("Pattern Discovery")
 
616
  st.dataframe(st.session_state.cleaned_data.head(), use_container_width=True)
617
 
618
 
619
+
620
+
621
+ # --------------------------
622
+ # EDA
623
+ # --------------------------
624
  elif app_mode == "EDA":
625
  st.title("🔍 Interactive Data Explorer")
626
 
627
  if st.session_state.cleaned_data is None:
628
  st.warning("Please clean your data first")
 
 
 
 
 
 
 
 
 
 
 
 
629
  st.stop()
630
 
631
  df = st.session_state.cleaned_data
 
678
  options=[
679
  "Scatter Plot", "Histogram", "Box Plot",
680
  "Violin Plot", "Line Chart", "Bar Chart",
681
+ "Correlation Matrix", "Heatmap", # Removed standard Pair Plot to use customized plots,
682
+ "3D Scatter", "Parallel Categories", "Segmented Bar Chart", #NEW CHART
683
+ "Swarm Plot", # YData Library Plots,
684
+ "Ridge Plot",
685
+ "Bubble Plot",
686
+ "Barh Plot",
687
+ "Density Plot",
688
+ "Count Plot",
689
+ "Lollipop Chart",
690
  ],
691
  index=0,
692
  help="Automatically filtered based on data types"
 
698
  z_axis = None
699
  color_by = "None" # Default color to None
700
 
701
+ if plot_type not in ["Correlation Matrix", "Segmented Bar Chart","Pair Plot"]: #Removed normal
702
  x_axis = st.selectbox("X-axis", df.columns, help="Primary dimension for analysis")
703
 
704
+ if plot_type in ["Scatter Plot", "Box Plot", "Violin Plot", "Line Chart", "Heatmap","Swarm Plot", "Ridge Plot","Bubble Plot", "Density Plot","Lollipop Chart"]: #Allow Categoricals for Box and Violin
705
+
706
+ y_axis = st.selectbox("Y-axis", df.columns, help="Secondary dimension for analysis") # Allow more types.
707
 
708
  if plot_type == "3D Scatter":
709
  z_axis = st.selectbox("Z-axis", df.columns, help="Third dimension for analysis")
710
 
711
  # Color encoding
712
+ if plot_type not in ["Correlation Matrix", "Pair Plot"]: # Keep Pair plot for simplicity
713
+
714
  color_options = ["None"] + df.columns.tolist()
715
  color_by = st.selectbox("Color encoding", color_options,
716
  format_func=lambda x: "No color" if x == "None" else x)
 
719
  dimensions = None
720
  if plot_type == "Parallel Categories":
721
  dimensions = st.multiselect("Dimensions", df.columns.tolist(), default=df.columns[:3])
722
+ # Context Aware Control for Segmented Bar Chart
723
+ segment_col = None
724
+ if plot_type == "Segmented Bar Chart":
725
+ segment_col = st.selectbox("Segment Column (Categorical)", categorical_cols, help="Column to segment the bar chart")
726
 
727
  with col2:
728
  try:
729
  fig = None # Initialize fig to None
730
+ if st.session_state.cleaned_data is None:
731
+ st.warning("Please upload data first")
732
+ st.stop()
733
 
734
  # Generate appropriate visualization with input validation
735
  if plot_type == "Scatter Plot":
 
777
  if dimensions:
778
  fig = px.parallel_categories(df, dimensions=dimensions,
779
  color=color_by if color_by != "None" else None)
780
+ #NEW SEGMENTED BAR CHART
781
+ elif plot_type == "Segmented Bar Chart":
782
+ if x_axis and segment_col:
783
+ # Data preparation for segmented bar chart
784
+ segment_counts = df.groupby([x_axis, segment_col]).size().reset_index(name='counts')
785
+ fig = px.bar(segment_counts, x=x_axis, y='counts', color=segment_col,
786
+ title=f"Segmented Bar Chart of {x_axis} by {segment_col}")
787
+ fig.update_layout(yaxis_title="Count") #Clear Title
788
+
789
+ elif plot_type == "Swarm Plot": # Handle Swarm Plot
790
+ if x_axis and y_axis:
791
+ fig = px.strip(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None)
792
+ elif plot_type == "Ridge Plot": #Handling Ridge Plot type
793
+
794
+ if x_axis and y_axis:
795
+ #The type of data has to have continuous variables, then show.
796
+ fig = px.histogram(df, x=x_axis, color=y_axis, marginal="rug", #Adds distribution for visual,
797
+ title=f"Ridgeline Plot of {x_axis} Distribution by {y_axis}")
798
+ elif plot_type == "Bubble Plot":
799
+
800
+ if x_axis and y_axis:
801
+
802
+ size_col = st.selectbox("Size Column", df.columns, help = "What is bubble Sizes Based From?")# What bubbles will be based from? - have size select
803
+ fig = px.scatter(df, x=x_axis, y=y_axis,
804
+ size=size_col, color=color_by if color_by != "None" else None,
805
+ hover_name = size_col,#Hover Name, to show value
806
+ title=f"Bubble Plot of {x_axis} vs. {y_axis} Colored by{size_col}"
807
+ )
808
+ elif plot_type == "Barh Plot":
809
+
810
+ if x_axis and y_axis:
811
+ fig = px.bar(df, y=x_axis, x=y_axis,
812
+ color=color_by if color_by != "None" else None, orientation = 'h', # set x on y-axis side.
813
+ title=f"Horizontal Bar Plot of {y_axis} vs {x_axis}"# added chart titles
814
+ ) #Set as Vertical as Base, and
815
+
816
+ elif plot_type == "Density Plot": #Kernel Estimations with px
817
+
818
+ if x_axis and y_axis:
819
+ fig = px.density_heatmap(df, x=x_axis, y=y_axis,
820
+ color_continuous_scale="Viridis", title="Density Plot" # Added Names
821
+ ) #Kernel Heat Maps
822
+ elif plot_type == "Count Plot":
823
+
824
+ if x_axis:
825
+ fig = px.bar(df, x=x_axis, color=color_by if color_by != "None" else None, title ="Counts graph")#Bar
826
+ fig.update_layout(yaxis_title="Count")
827
+
828
+ elif plot_type == "Lollipop Chart":#Creates
829
+
830
+ if x_axis and y_axis:# Has to take X AND Y
831
+
832
+ fig = px.scatter(df, x=x_axis, y=y_axis, size = "Bubble", title="Lollipop char type.")
833
+ fig.add_segments(x=df[x_axis], y=df[y_axis], xend=df[x_axis], yend=0, color = color_by if color_by != "None" else None)# Adds Line
834
+ # add lines as Lollipop. and size.
835
+
836
  else:
837
  st.error("Please choose the specific plot")
838
 
839
+ #CUSTOM INFERENTIAL ADDS - BOX PLOT FOR GROUP COMPARISON
840
+ if fig and plot_type == "Box Plot" and color_by != "None":
841
+ try: # Add statistical annotations for Box Plots
842
+ from statannotations.Annotator import Annotator
843
+
844
+ x = df[x_axis].values
845
+ y = df[y_axis].values
846
+ order = df[color_by].unique().tolist()
847
+
848
+ pairs=[(order[0],order[1])] #Only take 2 for visual
849
+ annotator = Annotator(x=x_axis, y=y_axis, data=df, order=order, pairs=pairs, color=color_by)
850
+ annotator.configure(test='Mann-Whitney', text_format='star', loc='inside', verbose=2)
851
+ annotator.apply_test().annotate()
852
+
853
+ st.pyplot(go.Figure(annotator.fig_).show(config={'displayModeBar': False})) #Turn off default graph functions
854
+ fig = go.Figure(annotator.fig_)# Show Annotation
855
+
856
+ except ImportError:
857
+ st.warning("Install statannotations for p-value annotation on box plots (pip install statannotations)")
858
+
859
  if fig: #Only display customization options when we have a plot
860
  with st.expander("⚙️ Chart Settings", expanded=False):
861
  col1, col2 = st.columns(2)
 
894
  st.write(f"F-value: {f_val:.2f}, p-value: {p_val:.4f}")
895
  else:
896
  st.warning("Please select both a Test variable and a Grouping variable for ANOVA.")
897
+ #Chi Squared test
898
+ st.subheader("Chi-Squared Test")
899
+ col1, col2 = st.columns(2)
900
+ with col1:
901
+ cat1 = st.selectbox("Categorical Variable 1", categorical_cols)
902
+ with col2:
903
+ cat2 = st.selectbox("Categorical Variable 2", categorical_cols)
904
+
905
+ if cat1 and cat2 and st.button("Run Chi-Squared Test"):
906
+ observed = pd.crosstab(df[cat1], df[cat2])
907
+ chi2, p, dof, expected = stats.chi2_contingency(observed)
908
+ st.write(f"Chi-Squared Value: {chi2:.2f}, p-value: {p:.4f}, Degrees of Freedom: {dof}")
909
+ if p < 0.05:
910
+ st.write("There is a statistically significant association between the two categorical variables.")
911
+ else:
912
+ st.write("There is no statistically significant association between the two categorical variables.")
913
 
914
  with tab2:
915
  st.subheader("Pattern Discovery")