Spaces:

CosmickVisions
/

Data-Vision

Sleeping

App Files Files Community

CosmickVisions committed on Mar 2

Commit

4138c2a

verified ·

1 Parent(s): 6699046

Update app.py

Browse files

Files changed (1) hide show

app.py +141 -106

app.py CHANGED Viewed

@@ -632,9 +632,8 @@ if app_mode == "Data Cleaning":
             st.dataframe(st.session_state.cleaned_data.head(), use_container_width=True)
-# Main function for EDA
-def eda():
-    st.title("🔍 Exploratory Data Analysis")
     if st.session_state.cleaned_data is None:
         st.warning("Please clean your data first")
@@ -643,143 +642,179 @@ def eda():
     df = st.session_state.cleaned_data
     # --------------------------
-    # Data Overview
     # --------------------------
-    with st.expander("📊 Data Overview", expanded=True):
-        col1, col2, col3 = st.columns(3)
         with col1:
-            st.metric("Total Rows", df.shape[0])
         with col2:
-            st.metric("Total Columns", df.shape[1])
         with col3:
-            st.metric("Missing Values", df.isna().sum().sum())
-        if st.checkbox("Show Data Preview"):
             st.dataframe(df.head(), use_container_width=True)
     # --------------------------
-    # Visualization Selector
     # --------------------------
-    st.subheader("📈 Visualization Setup")
     col1, col2 = st.columns([1, 3])
     with col1:
-        plot_type = st.selectbox("Choose plot type", [
-            "Scatter Plot", "Histogram",
-            "Box Plot", "Correlation Matrix",
-            "Line Chart", "Heatmap", "Violin Plot",
-            "3D Scatter Plot", "Parallel Coordinates",
-            "Pair Plot", "Density Contour"
-        ])
-        x_axis = st.selectbox("X-Axis", df.columns)
-        y_axis = st.selectbox("Y-Axis", df.columns) if plot_type in ["Scatter Plot", "Box Plot", "Line Chart", "Violin Plot", "3D Scatter Plot", "Density Contour"] else None
-        z_axis = st.selectbox("Z-Axis", df.columns) if plot_type == "3D Scatter Plot" else None
-        color_by = st.selectbox("Color By", [None] + df.columns.tolist())
-        facet_col = st.selectbox("Facet By", [None] + df.columns.tolist())
     with col2:
-        st.subheader("📊 Visualization")
         try:
             if plot_type == "Scatter Plot":
-                fig = px.scatter(df, x=x_axis, y=y_axis, color=color_by, facet_col=facet_col)
             elif plot_type == "Histogram":
-                fig = px.histogram(df, x=x_axis, color=color_by, facet_col=facet_col)
             elif plot_type == "Box Plot":
-                fig = px.box(df, x=x_axis, y=y_axis, color=color_by, facet_col=facet_col)
             elif plot_type == "Correlation Matrix":
                 corr = df.select_dtypes(include=np.number).corr()
-                fig = px.imshow(corr, text_auto=True, color_continuous_scale='Viridis')
-            elif plot_type == "Line Chart":
-                fig = px.line(df, x=x_axis, y=y_axis, color=color_by, facet_col=facet_col)
-            elif plot_type == "Heatmap":
-                fig = go.Figure(data=go.Heatmap(
-                    z=df.corr().values,
-                    x=df.columns,
-                    y=df.columns,
-                    colorscale='Viridis'))
-            elif plot_type == "Violin Plot":
-                fig = px.violin(df, x=x_axis, y=y_axis, color=color_by, facet_col=facet_col)
-            elif plot_type == "3D Scatter Plot":
-                fig = px.scatter_3d(df, x=x_axis, y=y_axis, z=z_axis, color=color_by)
-            elif plot_type == "Parallel Coordinates":
-                fig = px.parallel_coordinates(df, color=color_by)
             elif plot_type == "Pair Plot":
-                fig = px.scatter_matrix(df, color=color_by)
-            elif plot_type == "Density Contour":
-                fig = px.density_contour(df, x=x_axis, y=y_axis, color=color_by)
             st.plotly_chart(fig, use_container_width=True)
         except Exception as e:
-            st.error(f"Visualization error: {str(e)}")
     # --------------------------
-    # Relationship Diagnostics
     # --------------------------
-    st.subheader("🔗 Relationship Diagnostics")
-    selected_columns = st.multiselect("Select columns to analyze relationships", df.columns)
-    if selected_columns:
-        if len(selected_columns) == 2:
             col1, col2 = st.columns(2)
             with col1:
-                st.write(f"**Scatter Plot: {selected_columns[0]} vs {selected_columns[1]}**")
-                fig = px.scatter(df, x=selected_columns[0], y=selected_columns[1], trendline="ols")
-                st.plotly_chart(fig, use_container_width=True)
             with col2:
-                st.write("**Statistical Summary**")
-                st.write(df[selected_columns].describe())
-                # Correlation Analysis
-                pearson_corr, _ = pearsonr(df[selected_columns[0]], df[selected_columns[1]])
-                spearman_corr, _ = spearmanr(df[selected_columns[0]], df[selected_columns[1]])
-                st.metric("Pearson Correlation", f"{pearson_corr:.2f}")
-                st.metric("Spearman Correlation", f"{spearman_corr:.2f}")
-                st.write("**Regression Line**")
-                st.write(f"Equation: y = {fig.data[1].line.color} * x + {fig.data[1].line.dash}")
-        elif len(selected_columns) > 2:
-            st.warning("Please select only two columns for relationship analysis.")
-        else:
-            st.warning("Please select at least two columns for relationship analysis.")
-    # --------------------------
-    # Advanced Statistics
-    # --------------------------
-    with st.expander("📊 Advanced Statistics", expanded=False):
-        st.write("**Column-wise Statistics**")
-        selected_col = st.selectbox("Select a column for detailed analysis", df.columns)
-        if selected_col:
-            if pd.api.types.is_numeric_dtype(df[selected_col]):
-                st.write(f"**Distribution of {selected_col}**")
-                fig = px.histogram(df, x=selected_col, nbins=30)
-                st.plotly_chart(fig, use_container_width=True)
-                st.write("**Outlier Detection**")
-                Q1 = df[selected_col].quantile(0.25)
-                Q3 = df[selected_col].quantile(0.75)
-                IQR = Q3 - Q1
-                outliers = df[(df[selected_col] < (Q1 - 1.5 * IQR)) | (df[selected_col] > (Q3 + 1.5 * IQR))]
-                st.write(f"Number of outliers: {len(outliers)}")
-                st.dataframe(outliers.head(), use_container_width=True)
-            else:
-                st.write(f"**Value Counts for {selected_col}**")
-                value_counts = df[selected_col].value_counts()
-                st.bar_chart(value_counts)
     # --------------------------
-    # Save Visualizations
     # --------------------------
-    st.subheader("💾 Save Visualizations")
-    if st.button("Export Current Visualization as PNG"):
-        try:
             fig.write_image("visualization.png")
-            st.success("Visualization saved as PNG!")
-        except Exception as e:
-            st.error(f"Error saving visualization: {str(e)}")
-# Call the EDA function
-eda()
 # Streamlit App
 elif app_mode == "Model Training":

             st.dataframe(st.session_state.cleaned_data.head(), use_container_width=True)
+elif app_mode = "EDA":
+    st.title("🔍 Interactive Data Explorer")
     if st.session_state.cleaned_data is None:
         st.warning("Please clean your data first")
     df = st.session_state.cleaned_data
     # --------------------------
+    # Enhanced Data Overview
     # --------------------------
+    with st.expander("📁 Dataset Overview", expanded=True):
+        col1, col2, col3, col4 = st.columns(4)
         with col1:
+            st.metric("Total Rows", df.shape[0], help="Number of observations in the dataset")
         with col2:
+            st.metric("Total Columns", df.shape[1], help="Number of features in the dataset")
         with col3:
+            missing = df.isna().sum().sum()
+            st.metric("Missing Values", f"{missing} ({missing/(df.size)*100:.1f}%)")
+        with col4:
+            dupes = df.duplicated().sum()
+            st.metric("Duplicates", dupes, help="Fully duplicated rows")
+        # Data Preview Tabs
+        tab1, tab2, tab3 = st.tabs(["Quick Preview", "Column Types", "Missing Matrix"])
+        with tab1:
             st.dataframe(df.head(), use_container_width=True)
+        with tab2:
+            types = df.dtypes.value_counts().reset_index()
+            types.columns = ['Type', 'Count']
+            st.dataframe(types, use_container_width=True)
+        with tab3:
+            fig = px.imshow(df.isna(), color_continuous_scale='gray')
+            st.plotly_chart(fig, use_container_width=True)
     # --------------------------
+    # Smart Visualization Builder
     # --------------------------
+    st.subheader("📊 Visualization Builder")
+    # Automatic plot type suggestions
+    numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
+    categorical_cols = df.select_dtypes(exclude=np.number).columns.tolist()
     col1, col2 = st.columns([1, 3])
     with col1:
+        # Dynamic plot type filtering
+        default_plot = "Histogram" if len(numeric_cols) > 0 else "Bar Chart"
+        plot_type = st.selectbox(
+            "Choose visualization type",
+            options=[
+                "Scatter Plot", "Histogram", "Box Plot",
+                "Violin Plot", "Line Chart", "Bar Chart",
+                "Correlation Matrix", "Pair Plot", "Heatmap",
+                "3D Scatter", "Parallel Categories"
+            ],
+            index=0,
+            help="Automatically filtered based on data types"
+        )
+        # Dynamic axis selection
+        x_axis = st.selectbox("X-axis", df.columns,
+                            help="Primary dimension for analysis")
+        y_axis = st.selectbox("Y-axis", [None] + df.columns.tolist(),
+                            disabled=plot_type in ["Histogram", "Bar Chart"],
+                            help="Secondary dimension for analysis")
+        # Smart color encoding
+        color_options = ["None"] + df.columns.tolist()
+        color_by = st.selectbox("Color encoding", color_options,
+                              format_func=lambda x: "No color" if x == "None" else x)
+        # Context-aware controls
+        if plot_type in ["3D Scatter", "Parallel Categories"]:
+            z_axis = st.selectbox("Z-axis", [None] + df.columns.tolist())
+        if plot_type == "Parallel Categories":
+            dimensions = st.multiselect("Dimensions", df.columns.tolist(), default=df.columns[:3])
     with col2:
         try:
+            # Generate appropriate visualization
             if plot_type == "Scatter Plot":
+                fig = px.scatter(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None,
+                               hover_data=df.columns, trendline="lowess")
             elif plot_type == "Histogram":
+                fig = px.histogram(df, x=x_axis, color=color_by if color_by != "None" else None,
+                                 nbins=30, marginal="box")
             elif plot_type == "Box Plot":
+                fig = px.box(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None)
+            elif plot_type == "Violin Plot":
+                fig = px.violin(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None,
+                              box=True)
+            elif plot_type == "Line Chart":
+                fig = px.line(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None)
+            elif plot_type == "Bar Chart":
+                fig = px.bar(df, x=x_axis, color=color_by if color_by != "None" else None)
             elif plot_type == "Correlation Matrix":
                 corr = df.select_dtypes(include=np.number).corr()
+                fig = px.imshow(corr, text_auto=True, color_continuous_scale='RdBu_r',
+                              zmin=-1, zmax=1)
             elif plot_type == "Pair Plot":
+                fig = px.scatter_matrix(df, dimensions=numeric_cols[:4],
+                                      color=color_by if color_by != "None" else None)
+            elif plot_type == "Heatmap":
+                fig = px.density_heatmap(df, x=x_axis, y=y_axis, facet_col=color_by if color_by != "None" else None)
+            elif plot_type == "3D Scatter":
+                fig = px.scatter_3d(df, x=x_axis, y=y_axis, z=z_axis,
+                                  color=color_by if color_by != "None" else None)
+            elif plot_type == "Parallel Categories":
+                fig = px.parallel_categories(df, dimensions=dimensions,
+                                           color=color_by if color_by != "None" else None)
+            # Interactive plot customization
+            with st.expander("⚙️ Chart Settings", expanded=False):
+                col1, col2 = st.columns(2)
+                with col1:
+                    chart_title = st.text_input("Chart title", f"{plot_type} of {x_axis} vs {y_axis}")
+                    fig.update_layout(title=chart_title)
+                with col2:
+                    theme = st.selectbox("Color theme", px.colors.named_colorscales())
+                    fig.update_layout(colorway=px.colors.qualitative.Plotly)
             st.plotly_chart(fig, use_container_width=True)
         except Exception as e:
+            st.error(f"Couldn't create visualization: {str(e)}")
+            st.info("Try selecting different columns or changing the visualization type")
     # --------------------------
+    # Advanced Analysis
     # --------------------------
+    with st.expander("🔬 Deep Analysis Tools", expanded=False):
+        tab1, tab2, tab3 = st.tabs(["Statistical Tests", "Pattern Explorer", "Data Transformation"])
+        with tab1:
+            st.subheader("Hypothesis Testing")
             col1, col2 = st.columns(2)
             with col1:
+                test_var = st.selectbox("Test variable", numeric_cols)
             with col2:
+                group_var = st.selectbox("Grouping variable", [None] + categorical_cols)
+            if group_var and st.button("Run ANOVA"):
+                groups = df.groupby(group_var)[test_var].apply(list)
+                f_val, p_val = stats.f_oneway(*groups)
+                st.write(f"F-value: {f_val:.2f}, p-value: {p_val:.4f}")
+        with tab2:
+            st.subheader("Pattern Discovery")
+            explore_col = st.selectbox("Column to analyze", df.columns)
+            if pd.api.types.is_string_dtype(df[explore_col]):
+                pattern = st.text_input("Regex pattern")
+                if pattern:
+                    matches = df[explore_col].str.contains(pattern).sum()
+                    st.write(f"Found {matches} matches")
+        with tab3:
+            st.subheader("Data Transformation")
+            transform_col = st.selectbox("Column to transform", numeric_cols)
+            transform_type = st.selectbox("Transformation", ["Log", "Square Root", "Z-score"])
+            if transform_type == "Log":
+                df[transform_col] = np.log1p(df[transform_col])
+            elif transform_type == "Square Root":
+                df[transform_col] = np.sqrt(df[transform_col])
+            elif transform_type == "Z-score":
+                df[transform_col] = (df[transform_col] - df[transform_col].mean())/df[transform_col].std()
     # --------------------------
+    # Export & Save
     # --------------------------
+    st.subheader("💾 Export Options")
+    col1, col2 = st.columns(2)
+    with col1:
+        if st.button("📥 Download Current Visualization"):
             fig.write_image("visualization.png")
+            st.success("Image saved!")
+    with col2:
+        if st.button("📊 Export Analysis Report"):
+            profile = ProfileReport(df, minimal=True)
+            profile.to_file("analysis_report.html")
+            st.success("Report generated!")
 # Streamlit App
 elif app_mode == "Model Training":