CosmickVisions committed on
Commit
4138c2a
·
verified ·
1 Parent(s): 6699046

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +141 -106
app.py CHANGED
@@ -632,9 +632,8 @@ if app_mode == "Data Cleaning":
632
  st.dataframe(st.session_state.cleaned_data.head(), use_container_width=True)
633
 
634
 
635
- # Main function for EDA
636
- def eda():
637
- st.title("🔍 Exploratory Data Analysis")
638
 
639
  if st.session_state.cleaned_data is None:
640
  st.warning("Please clean your data first")
@@ -643,143 +642,179 @@ def eda():
643
  df = st.session_state.cleaned_data
644
 
645
  # --------------------------
646
- # Data Overview
647
  # --------------------------
648
- with st.expander("📊 Data Overview", expanded=True):
649
- col1, col2, col3 = st.columns(3)
650
  with col1:
651
- st.metric("Total Rows", df.shape[0])
652
  with col2:
653
- st.metric("Total Columns", df.shape[1])
654
  with col3:
655
- st.metric("Missing Values", df.isna().sum().sum())
 
 
 
 
656
 
657
- if st.checkbox("Show Data Preview"):
 
 
658
  st.dataframe(df.head(), use_container_width=True)
 
 
 
 
 
 
 
659
 
660
  # --------------------------
661
- # Visualization Selector
662
  # --------------------------
663
- st.subheader("📈 Visualization Setup")
 
 
 
 
 
664
  col1, col2 = st.columns([1, 3])
665
  with col1:
666
- plot_type = st.selectbox("Choose plot type", [
667
- "Scatter Plot", "Histogram",
668
- "Box Plot", "Correlation Matrix",
669
- "Line Chart", "Heatmap", "Violin Plot",
670
- "3D Scatter Plot", "Parallel Coordinates",
671
- "Pair Plot", "Density Contour"
672
- ])
 
 
 
 
 
 
 
 
 
 
 
 
 
673
 
674
- x_axis = st.selectbox("X-Axis", df.columns)
675
- y_axis = st.selectbox("Y-Axis", df.columns) if plot_type in ["Scatter Plot", "Box Plot", "Line Chart", "Violin Plot", "3D Scatter Plot", "Density Contour"] else None
676
- z_axis = st.selectbox("Z-Axis", df.columns) if plot_type == "3D Scatter Plot" else None
677
- color_by = st.selectbox("Color By", [None] + df.columns.tolist())
678
- facet_col = st.selectbox("Facet By", [None] + df.columns.tolist())
 
 
 
 
 
679
 
680
  with col2:
681
- st.subheader("📊 Visualization")
682
  try:
 
683
  if plot_type == "Scatter Plot":
684
- fig = px.scatter(df, x=x_axis, y=y_axis, color=color_by, facet_col=facet_col)
 
685
  elif plot_type == "Histogram":
686
- fig = px.histogram(df, x=x_axis, color=color_by, facet_col=facet_col)
 
687
  elif plot_type == "Box Plot":
688
- fig = px.box(df, x=x_axis, y=y_axis, color=color_by, facet_col=facet_col)
 
 
 
 
 
 
 
689
  elif plot_type == "Correlation Matrix":
690
  corr = df.select_dtypes(include=np.number).corr()
691
- fig = px.imshow(corr, text_auto=True, color_continuous_scale='Viridis')
692
- elif plot_type == "Line Chart":
693
- fig = px.line(df, x=x_axis, y=y_axis, color=color_by, facet_col=facet_col)
694
- elif plot_type == "Heatmap":
695
- fig = go.Figure(data=go.Heatmap(
696
- z=df.corr().values,
697
- x=df.columns,
698
- y=df.columns,
699
- colorscale='Viridis'))
700
- elif plot_type == "Violin Plot":
701
- fig = px.violin(df, x=x_axis, y=y_axis, color=color_by, facet_col=facet_col)
702
- elif plot_type == "3D Scatter Plot":
703
- fig = px.scatter_3d(df, x=x_axis, y=y_axis, z=z_axis, color=color_by)
704
- elif plot_type == "Parallel Coordinates":
705
- fig = px.parallel_coordinates(df, color=color_by)
706
  elif plot_type == "Pair Plot":
707
- fig = px.scatter_matrix(df, color=color_by)
708
- elif plot_type == "Density Contour":
709
- fig = px.density_contour(df, x=x_axis, y=y_axis, color=color_by)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
710
 
711
  st.plotly_chart(fig, use_container_width=True)
 
712
  except Exception as e:
713
- st.error(f"Visualization error: {str(e)}")
 
714
 
715
  # --------------------------
716
- # Relationship Diagnostics
717
  # --------------------------
718
- st.subheader("🔗 Relationship Diagnostics")
719
- selected_columns = st.multiselect("Select columns to analyze relationships", df.columns)
720
- if selected_columns:
721
- if len(selected_columns) == 2:
 
722
  col1, col2 = st.columns(2)
723
  with col1:
724
- st.write(f"**Scatter Plot: {selected_columns[0]} vs {selected_columns[1]}**")
725
- fig = px.scatter(df, x=selected_columns[0], y=selected_columns[1], trendline="ols")
726
- st.plotly_chart(fig, use_container_width=True)
727
-
728
  with col2:
729
- st.write("**Statistical Summary**")
730
- st.write(df[selected_columns].describe())
731
-
732
- # Correlation Analysis
733
- pearson_corr, _ = pearsonr(df[selected_columns[0]], df[selected_columns[1]])
734
- spearman_corr, _ = spearmanr(df[selected_columns[0]], df[selected_columns[1]])
735
-
736
- st.metric("Pearson Correlation", f"{pearson_corr:.2f}")
737
- st.metric("Spearman Correlation", f"{spearman_corr:.2f}")
738
-
739
- st.write("**Regression Line**")
740
- st.write(f"Equation: y = {fig.data[1].line.color} * x + {fig.data[1].line.dash}")
741
- elif len(selected_columns) > 2:
742
- st.warning("Please select only two columns for relationship analysis.")
743
- else:
744
- st.warning("Please select at least two columns for relationship analysis.")
745
-
746
- # --------------------------
747
- # Advanced Statistics
748
- # --------------------------
749
- with st.expander("📊 Advanced Statistics", expanded=False):
750
- st.write("**Column-wise Statistics**")
751
- selected_col = st.selectbox("Select a column for detailed analysis", df.columns)
752
- if selected_col:
753
- if pd.api.types.is_numeric_dtype(df[selected_col]):
754
- st.write(f"**Distribution of {selected_col}**")
755
- fig = px.histogram(df, x=selected_col, nbins=30)
756
- st.plotly_chart(fig, use_container_width=True)
757
-
758
- st.write("**Outlier Detection**")
759
- Q1 = df[selected_col].quantile(0.25)
760
- Q3 = df[selected_col].quantile(0.75)
761
- IQR = Q3 - Q1
762
- outliers = df[(df[selected_col] < (Q1 - 1.5 * IQR)) | (df[selected_col] > (Q3 + 1.5 * IQR))]
763
- st.write(f"Number of outliers: {len(outliers)}")
764
- st.dataframe(outliers.head(), use_container_width=True)
765
- else:
766
- st.write(f"**Value Counts for {selected_col}**")
767
- value_counts = df[selected_col].value_counts()
768
- st.bar_chart(value_counts)
769
 
770
  # --------------------------
771
- # Save Visualizations
772
  # --------------------------
773
- st.subheader("💾 Save Visualizations")
774
- if st.button("Export Current Visualization as PNG"):
775
- try:
 
776
  fig.write_image("visualization.png")
777
- st.success("Visualization saved as PNG!")
778
- except Exception as e:
779
- st.error(f"Error saving visualization: {str(e)}")
780
-
781
- # Call the EDA function
782
- eda()
783
 
784
  # Streamlit App
785
  elif app_mode == "Model Training":
 
632
  st.dataframe(st.session_state.cleaned_data.head(), use_container_width=True)
633
 
634
 
635
+ elif app_mode = "EDA":
636
+ st.title("🔍 Interactive Data Explorer")
 
637
 
638
  if st.session_state.cleaned_data is None:
639
  st.warning("Please clean your data first")
 
642
  df = st.session_state.cleaned_data
643
 
644
  # --------------------------
645
+ # Enhanced Data Overview
646
  # --------------------------
647
+ with st.expander("📁 Dataset Overview", expanded=True):
648
+ col1, col2, col3, col4 = st.columns(4)
649
  with col1:
650
+ st.metric("Total Rows", df.shape[0], help="Number of observations in the dataset")
651
  with col2:
652
+ st.metric("Total Columns", df.shape[1], help="Number of features in the dataset")
653
  with col3:
654
+ missing = df.isna().sum().sum()
655
+ st.metric("Missing Values", f"{missing} ({missing/(df.size)*100:.1f}%)")
656
+ with col4:
657
+ dupes = df.duplicated().sum()
658
+ st.metric("Duplicates", dupes, help="Fully duplicated rows")
659
 
660
+ # Data Preview Tabs
661
+ tab1, tab2, tab3 = st.tabs(["Quick Preview", "Column Types", "Missing Matrix"])
662
+ with tab1:
663
  st.dataframe(df.head(), use_container_width=True)
664
+ with tab2:
665
+ types = df.dtypes.value_counts().reset_index()
666
+ types.columns = ['Type', 'Count']
667
+ st.dataframe(types, use_container_width=True)
668
+ with tab3:
669
+ fig = px.imshow(df.isna(), color_continuous_scale='gray')
670
+ st.plotly_chart(fig, use_container_width=True)
671
 
672
  # --------------------------
673
+ # Smart Visualization Builder
674
  # --------------------------
675
+ st.subheader("📊 Visualization Builder")
676
+
677
+ # Automatic plot type suggestions
678
+ numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
679
+ categorical_cols = df.select_dtypes(exclude=np.number).columns.tolist()
680
+
681
  col1, col2 = st.columns([1, 3])
682
  with col1:
683
+ # Dynamic plot type filtering
684
+ default_plot = "Histogram" if len(numeric_cols) > 0 else "Bar Chart"
685
+ plot_type = st.selectbox(
686
+ "Choose visualization type",
687
+ options=[
688
+ "Scatter Plot", "Histogram", "Box Plot",
689
+ "Violin Plot", "Line Chart", "Bar Chart",
690
+ "Correlation Matrix", "Pair Plot", "Heatmap",
691
+ "3D Scatter", "Parallel Categories"
692
+ ],
693
+ index=0,
694
+ help="Automatically filtered based on data types"
695
+ )
696
+
697
+ # Dynamic axis selection
698
+ x_axis = st.selectbox("X-axis", df.columns,
699
+ help="Primary dimension for analysis")
700
+ y_axis = st.selectbox("Y-axis", [None] + df.columns.tolist(),
701
+ disabled=plot_type in ["Histogram", "Bar Chart"],
702
+ help="Secondary dimension for analysis")
703
 
704
+ # Smart color encoding
705
+ color_options = ["None"] + df.columns.tolist()
706
+ color_by = st.selectbox("Color encoding", color_options,
707
+ format_func=lambda x: "No color" if x == "None" else x)
708
+
709
+ # Context-aware controls
710
+ if plot_type in ["3D Scatter", "Parallel Categories"]:
711
+ z_axis = st.selectbox("Z-axis", [None] + df.columns.tolist())
712
+ if plot_type == "Parallel Categories":
713
+ dimensions = st.multiselect("Dimensions", df.columns.tolist(), default=df.columns[:3])
714
 
715
  with col2:
 
716
  try:
717
+ # Generate appropriate visualization
718
  if plot_type == "Scatter Plot":
719
+ fig = px.scatter(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None,
720
+ hover_data=df.columns, trendline="lowess")
721
  elif plot_type == "Histogram":
722
+ fig = px.histogram(df, x=x_axis, color=color_by if color_by != "None" else None,
723
+ nbins=30, marginal="box")
724
  elif plot_type == "Box Plot":
725
+ fig = px.box(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None)
726
+ elif plot_type == "Violin Plot":
727
+ fig = px.violin(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None,
728
+ box=True)
729
+ elif plot_type == "Line Chart":
730
+ fig = px.line(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None)
731
+ elif plot_type == "Bar Chart":
732
+ fig = px.bar(df, x=x_axis, color=color_by if color_by != "None" else None)
733
  elif plot_type == "Correlation Matrix":
734
  corr = df.select_dtypes(include=np.number).corr()
735
+ fig = px.imshow(corr, text_auto=True, color_continuous_scale='RdBu_r',
736
+ zmin=-1, zmax=1)
 
 
 
 
 
 
 
 
 
 
 
 
 
737
  elif plot_type == "Pair Plot":
738
+ fig = px.scatter_matrix(df, dimensions=numeric_cols[:4],
739
+ color=color_by if color_by != "None" else None)
740
+ elif plot_type == "Heatmap":
741
+ fig = px.density_heatmap(df, x=x_axis, y=y_axis, facet_col=color_by if color_by != "None" else None)
742
+ elif plot_type == "3D Scatter":
743
+ fig = px.scatter_3d(df, x=x_axis, y=y_axis, z=z_axis,
744
+ color=color_by if color_by != "None" else None)
745
+ elif plot_type == "Parallel Categories":
746
+ fig = px.parallel_categories(df, dimensions=dimensions,
747
+ color=color_by if color_by != "None" else None)
748
+
749
+ # Interactive plot customization
750
+ with st.expander("⚙️ Chart Settings", expanded=False):
751
+ col1, col2 = st.columns(2)
752
+ with col1:
753
+ chart_title = st.text_input("Chart title", f"{plot_type} of {x_axis} vs {y_axis}")
754
+ fig.update_layout(title=chart_title)
755
+ with col2:
756
+ theme = st.selectbox("Color theme", px.colors.named_colorscales())
757
+ fig.update_layout(colorway=px.colors.qualitative.Plotly)
758
 
759
  st.plotly_chart(fig, use_container_width=True)
760
+
761
  except Exception as e:
762
+ st.error(f"Couldn't create visualization: {str(e)}")
763
+ st.info("Try selecting different columns or changing the visualization type")
764
 
765
  # --------------------------
766
+ # Advanced Analysis
767
  # --------------------------
768
+ with st.expander("🔬 Deep Analysis Tools", expanded=False):
769
+ tab1, tab2, tab3 = st.tabs(["Statistical Tests", "Pattern Explorer", "Data Transformation"])
770
+
771
+ with tab1:
772
+ st.subheader("Hypothesis Testing")
773
  col1, col2 = st.columns(2)
774
  with col1:
775
+ test_var = st.selectbox("Test variable", numeric_cols)
 
 
 
776
  with col2:
777
+ group_var = st.selectbox("Grouping variable", [None] + categorical_cols)
778
+
779
+ if group_var and st.button("Run ANOVA"):
780
+ groups = df.groupby(group_var)[test_var].apply(list)
781
+ f_val, p_val = stats.f_oneway(*groups)
782
+ st.write(f"F-value: {f_val:.2f}, p-value: {p_val:.4f}")
783
+
784
+ with tab2:
785
+ st.subheader("Pattern Discovery")
786
+ explore_col = st.selectbox("Column to analyze", df.columns)
787
+ if pd.api.types.is_string_dtype(df[explore_col]):
788
+ pattern = st.text_input("Regex pattern")
789
+ if pattern:
790
+ matches = df[explore_col].str.contains(pattern).sum()
791
+ st.write(f"Found {matches} matches")
792
+
793
+ with tab3:
794
+ st.subheader("Data Transformation")
795
+ transform_col = st.selectbox("Column to transform", numeric_cols)
796
+ transform_type = st.selectbox("Transformation", ["Log", "Square Root", "Z-score"])
797
+ if transform_type == "Log":
798
+ df[transform_col] = np.log1p(df[transform_col])
799
+ elif transform_type == "Square Root":
800
+ df[transform_col] = np.sqrt(df[transform_col])
801
+ elif transform_type == "Z-score":
802
+ df[transform_col] = (df[transform_col] - df[transform_col].mean())/df[transform_col].std()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
803
 
804
  # --------------------------
805
+ # Export & Save
806
  # --------------------------
807
+ st.subheader("💾 Export Options")
808
+ col1, col2 = st.columns(2)
809
+ with col1:
810
+ if st.button("📥 Download Current Visualization"):
811
  fig.write_image("visualization.png")
812
+ st.success("Image saved!")
813
+ with col2:
814
+ if st.button("📊 Export Analysis Report"):
815
+ profile = ProfileReport(df, minimal=True)
816
+ profile.to_file("analysis_report.html")
817
+ st.success("Report generated!")
818
 
819
  # Streamlit App
820
  elif app_mode == "Model Training":