Spaces:

CosmickVisions
/

Data-Vision

Running

App Files Community

CosmickVisions committed on Feb 28

Commit

b065a25

verified ·

1 Parent(s): caa2b7a

Update app.py

Browse files

Files changed (1) hide show

app.py +212 -212

app.py CHANGED Viewed

@@ -519,241 +519,241 @@ elif app_mode == "Advanced EDA":
         Uncover hidden patterns and relationships in your data.
     """)
-    if 'cleaned_data' not in st.session_state or st.session_state.cleaned_data is None:
-        st.warning("Please clean your data in the Smart Cleaning section first.")
-        st.stop()
-    df = st.session_state.cleaned_data.copy()
-    # Initialize session state for EDA configuration
-    if 'eda_config' not in st.session_state:
-        st.session_state.eda_config = {
-            'plot_type': "Histogram",
-            'x_col': df.columns[0] if len(df.columns) > 0 else None,
-            'y_col': df.columns[1] if len(df.columns) > 1 else None,
-            'z_col': df.columns[2] if len(df.columns) > 2 else None,
-            'color_col': None,
-            'size_col': None,
-            'time_col': None,
-            'value_col': None,
-            'scatter_matrix_cols': df.select_dtypes(include=np.number).columns.tolist()[:5],
-            'color_palette': "Viridis",
-            'hover_data_cols': [],
-            'filter_col': None,
-            'filter_options': []
-        }
-    # Data Filtering Section
-    with st.expander("🔎 Data Filtering", expanded=True):
-        st.session_state.eda_config['filter_col'] = st.selectbox(
-            "Filter Column",
-            [None] + list(df.columns),
-            help="Choose a column to filter the data."
         )
-        if st.session_state.eda_config['filter_col']:
-            unique_values = df[st.session_state.eda_config['filter_col']].unique()
-            st.session_state.eda_config['filter_options'] = st.multiselect(
-                "Filter Values",
-                unique_values,
-                default=unique_values,
-                help=f"Select values from '{st.session_state.eda_config['filter_col']}'"
-            )
-            df = df[df[st.session_state.eda_config['filter_col']].isin(
-                st.session_state.eda_config['filter_options']
-            )]
-    # Visualization Type Selection
-    st.sidebar.header("📊 Visualization Configuration")
-    plot_types = [
-        "Histogram", "Scatter Plot", "Box Plot", "Violin Plot",
-        "Correlation Heatmap", "Parallel Coordinates", "Andrews Curves",
-        "Pair Plot", "Density Contour", "3D Scatter", "Time Series",
-        "Sunburst Chart", "Funnel Chart", "Clustering Analysis"
-    ]
-    st.session_state.eda_config['plot_type'] = st.sidebar.selectbox(
-        "Choose Visualization",
-        plot_types,
-        index=0
     )
-    # Dynamic Controls Based on Plot Type
-    if st.session_state.eda_config['plot_type'] != "Correlation Heatmap":
-        st.session_state.eda_config['x_col'] = st.sidebar.selectbox(
-            "X Axis",
-            df.columns,
-            index=df.columns.get_loc(st.session_state.eda_config['x_col'])
-            if st.session_state.eda_config['x_col'] in df.columns else 0
-        )
-    if st.session_state.eda_config['plot_type'] in ["Scatter Plot", "Box Plot", "Violin Plot", "Time Series", "3D Scatter", "Histogram"]:
-        st.session_state.eda_config['y_col'] = st.sidebar.selectbox(
-            "Y Axis",
-            df.columns,
-            index=df.columns.get_loc(st.session_state.eda_config['y_col'])
-            if st.session_state.eda_config['y_col'] in df.columns else 0
-        )
-    if st.session_state.eda_config['plot_type'] == "3D Scatter":
-        st.session_state.eda_config['z_col'] = st.sidebar.selectbox(
-            "Z Axis",
-            df.columns,
-            index=df.columns.get_loc(st.session_state.eda_config['z_col'])
-            if st.session_state.eda_config['z_col'] in df.columns else 0
-        )
-        st.session_state.eda_config['color_col'] = st.sidebar.selectbox(
-            "Color by",
-            [None] + list(df.columns)
         )
-    # Advanced Plot Customization
-    with st.expander("🎨 Advanced Customization", expanded=False):
-        st.session_state.eda_config['color_palette'] = st.selectbox(
-            "Color Palette",
-            ["Viridis", "Plasma", "Magma", "Cividis", "RdBu", "Rainbow"]
         )
-        st.session_state.eda_config['hover_data_cols'] = st.multiselect(
-            "Hover Data",
-            df.columns
-        )
-    # Plot Generation
-    try:
-        fig = None
-        config = st.session_state.eda_config
-        if config['plot_type'] == "Histogram":
-            color_palette = config['color_palette']
-            colors = getattr(pc.sequential, color_palette)
-            fig = px.histogram(
-                df, x=config['x_col'], y=config['y_col'],
-                nbins=30, template="plotly_dark",
-                color=config['x_col'],
-                color_discrete_sequence = [colors[0]]
-            )
-        elif config['plot_type'] == "Scatter Plot":
-            fig = px.scatter(
-                df, x=config['x_col'], y=config['y_col'],
-                color=config['color_col'],
-                size=config['size_col'],
-                hover_data=config['hover_data_cols']
-            )
-        elif config['plot_type'] == "3D Scatter":
-            fig = px.scatter_3d(
-                df, x=config['x_col'], y=config['y_col'], z=config['z_col'],
-                color=config['color_col'],
-                color_discrete_sequence=[config['color_palette']]
             )
-        elif config['plot_type'] == "Correlation Heatmap":
-            numeric_df = df.select_dtypes(include=np.number)
-            if not numeric_df.empty:
-                corr = numeric_df.corr()
-                fig = px.imshow(
-                    corr, text_auto=True,
-                    color_continuous_scale=config['color_palette']
-                )
-            else:
-                st.warning("No numerical columns found for correlation heatmap.")
-        elif config['plot_type'] == "Box Plot":
-            fig = px.box(
-                df, x=config['x_col'], y=config['y_col'],
-                color=config['color_col']
-            )
-        elif config['plot_type'] == "Violin Plot":
-            fig = px.violin(
-                df, x=config['x_col'], y=config['y_col'],
-                box=True, points="all",
-                color=config['color_col']
-            )
-        elif config['plot_type'] == "Time Series":
-            df = df.sort_values(by=config['time_col'])
-            fig = px.line(
-                df, x=config['time_col'], y=config['value_col'],
-                color=config['color_col']
-            )
-        elif config['plot_type'] == "Scatter Matrix":
-            fig = px.scatter_matrix(
-                df, dimensions=config['scatter_matrix_cols'],
-                color=config['color_col']
-            )
-        if fig:
-            st.plotly_chart(fig, use_container_width=True)
-    except Exception as e:
-        st.error(f"An error occurred while generating the plot: {e}")
-    # Statistical Analysis Section
-    with st.expander("📊 Statistical Analysis", expanded=True):
-        analysis_type = st.selectbox("Select Analysis Type", [
-            "Descriptive Statistics",
-            "Correlation Analysis",
-            "Hypothesis Testing",
-            "Distribution Fitting"
         ])
-        if analysis_type == "Descriptive Statistics":
-            st.write(df.describe(include='all'))
-        elif analysis_type == "Correlation Analysis":
-            numeric_cols = df.select_dtypes(include=np.number).columns
-            if len(numeric_cols) >= 2:
-                corr_method = st.selectbox("Correlation Method", [
-                    "Pearson", "Kendall", "Spearman"
-                ])
-                corr_matrix = df[numeric_cols].corr(method=corr_method.lower())
-                st.write(corr_matrix)
-                st.heatmap(corr_matrix, annot=True, cmap=config['color_palette'])
-            else:
-                st.warning("Need at least 2 numeric columns for correlation analysis")
-        elif analysis_type == "Hypothesis Testing":
-            test_type = st.selectbox("Select Test Type", [
-                "T-test", "Chi-Squared Test", "ANOVA", "Mann-Whitney U"
-            ])
-            if test_type == "T-test":
-                col1 = st.selectbox("Column 1 (Numeric)", df.select_dtypes(include=np.number).columns)
-                col2 = st.selectbox("Column 2 (Categorical)", df.select_dtypes(include='object').columns)
-                if st.button("Run T-test"):
-                    groups = df.groupby(col2)[col1].apply(list)
-                    if len(groups) == 2:
-                        t_stat, p_value = stats.ttest_ind(groups.iloc[0], groups.iloc[1])
-                        st.write(f"T-statistic: {t_stat:.4f}")
-                        st.write(f"P-value: {p_value:.4f}")
-                        if p_value < 0.05:
-                            st.write("Reject the null hypothesis.")
-                        else:
-                            st.write("Fail to reject the null hypothesis.")
                     else:
-                        st.write("Select a categorical column with exactly two categories.")
-        elif analysis_type == "Distribution Fitting":
-            numeric_col = st.selectbox("Select Numeric Column", df.select_dtypes(include=np.number).columns)
-            dist_types = ["Normal", "Log-Normal", "Exponential", "Gamma"]
-            selected_dist = st.selectbox("Select Distribution Type", dist_types)
-            if st.button("Fit Distribution"):
-                from scipy.stats import norm, lognorm, expon, gamma
-                dist_functions = {
-                    "Normal": norm,
-                    "Log-Normal": lognorm,
-                    "Exponential": expon,
-                    "Gamma": gamma
-                }
-                params = dist_functions[selected_dist].fit(df[numeric_col].dropna())
-                st.write(f"Fitted Parameters: {params}")
-    # Data Profiling Section
-    with st.expander("📝 Generate Full Data Profile", expanded=False):
-        if st.button("🚀 Generate Comprehensive Report"):
-            with st.spinner("Generating report..."):
-                pr = ProfileReport(df, explorative=True)
-                st_profile_report(pr)
 # Model Training Section
 elif app_mode == "Model Training":

         Uncover hidden patterns and relationships in your data.
     """)
+   if 'cleaned_data' not in st.session_state or st.session_state.cleaned_data is None:
+    st.warning("Please clean your data in the Smart Cleaning section first.")
+    st.stop()
+df = st.session_state.cleaned_data.copy()
+# Initialize session state for EDA configuration
+if 'eda_config' not in st.session_state:
+    st.session_state.eda_config = {
+        'plot_type': "Histogram",
+        'x_col': df.columns[0] if len(df.columns) > 0 else None,
+        'y_col': df.columns[1] if len(df.columns) > 1 else None,
+        'z_col': df.columns[2] if len(df.columns) > 2 else None,
+        'color_col': None,
+        'size_col': None,
+        'time_col': None,
+        'value_col': None,
+        'scatter_matrix_cols': df.select_dtypes(include=np.number).columns.tolist()[:5],
+        'color_palette': "Viridis",
+        'hover_data_cols': [],
+        'filter_col': None,
+        'filter_options': []
+    }
+# Data Filtering Section
+with st.expander("🔎 Data Filtering", expanded=True):
+    st.session_state.eda_config['filter_col'] = st.selectbox(
+        "Filter Column",
+        [None] + list(df.columns),
+        help="Choose a column to filter the data."
+    )
+    if st.session_state.eda_config['filter_col']:
+        unique_values = df[st.session_state.eda_config['filter_col']].unique()
+        st.session_state.eda_config['filter_options'] = st.multiselect(
+            "Filter Values",
+            unique_values,
+            default=unique_values,
+            help=f"Select values from '{st.session_state.eda_config['filter_col']}'"
         )
+        df = df[df[st.session_state.eda_config['filter_col']].isin(
+            st.session_state.eda_config['filter_options']
+        )]
+# Visualization Type Selection
+st.sidebar.header("📊 Visualization Configuration")
+plot_types = [
+    "Histogram", "Scatter Plot", "Box Plot", "Violin Plot",
+    "Correlation Heatmap", "Parallel Coordinates", "Andrews Curves",
+    "Pair Plot", "Density Contour", "3D Scatter", "Time Series",
+    "Sunburst Chart", "Funnel Chart", "Clustering Analysis"
+]
+st.session_state.eda_config['plot_type'] = st.sidebar.selectbox(
+    "Choose Visualization",
+    plot_types,
+    index=0
+)
+# Dynamic Controls Based on Plot Type
+if st.session_state.eda_config['plot_type'] != "Correlation Heatmap":
+    st.session_state.eda_config['x_col'] = st.sidebar.selectbox(
+        "X Axis",
+        df.columns,
+        index=df.columns.get_loc(st.session_state.eda_config['x_col'])
+        if st.session_state.eda_config['x_col'] in df.columns else 0
+    )
+if st.session_state.eda_config['plot_type'] in ["Scatter Plot", "Box Plot", "Violin Plot", "Time Series", "3D Scatter", "Histogram"]:
+    st.session_state.eda_config['y_col'] = st.sidebar.selectbox(
+        "Y Axis",
+        df.columns,
+        index=df.columns.get_loc(st.session_state.eda_config['y_col'])
+        if st.session_state.eda_config['y_col'] in df.columns else 0
     )
+if st.session_state.eda_config['plot_type'] == "3D Scatter":
+    st.session_state.eda_config['z_col'] = st.sidebar.selectbox(
+        "Z Axis",
+        df.columns,
+        index=df.columns.get_loc(st.session_state.eda_config['z_col'])
+        if st.session_state.eda_config['z_col'] in df.columns else 0
+    )
+    st.session_state.eda_config['color_col'] = st.sidebar.selectbox(
+        "Color by",
+        [None] + list(df.columns)
+    )
+# Advanced Plot Customization
+with st.expander("🎨 Advanced Customization", expanded=False):
+    st.session_state.eda_config['color_palette'] = st.selectbox(
+        "Color Palette",
+        ["Viridis", "Plasma", "Magma", "Cividis", "RdBu", "Rainbow"]
+    )
+    st.session_state.eda_config['hover_data_cols'] = st.multiselect(
+        "Hover Data",
+        df.columns
+    )
+# Plot Generation
+try:
+    fig = None
+    config = st.session_state.eda_config
+    if config['plot_type'] == "Histogram":
+        color_palette = config['color_palette']
+        colors = getattr(pc.sequential, color_palette)
+        fig = px.histogram(
+            df, x=config['x_col'], y=config['y_col'],
+            nbins=30, template="plotly_dark",
+            color=config['x_col'],
+            color_discrete_sequence = [colors[0]]
         )
+    elif config['plot_type'] == "Scatter Plot":
+        fig = px.scatter(
+            df, x=config['x_col'], y=config['y_col'],
+            color=config['color_col'],
+            size=config['size_col'],
+            hover_data=config['hover_data_cols']
         )
+    elif config['plot_type'] == "3D Scatter":
+        fig = px.scatter_3d(
+            df, x=config['x_col'], y=config['y_col'], z=config['z_col'],
+            color=config['color_col'],
+            color_discrete_sequence=[config['color_palette']]
+        )
+    elif config['plot_type'] == "Correlation Heatmap":
+        numeric_df = df.select_dtypes(include=np.number)
+        if not numeric_df.empty:
+            corr = numeric_df.corr()
+            fig = px.imshow(
+                corr, text_auto=True,
+                color_continuous_scale=config['color_palette']
             )
+        else:
+            st.warning("No numerical columns found for correlation heatmap.")
+    elif config['plot_type'] == "Box Plot":
+        fig = px.box(
+            df, x=config['x_col'], y=config['y_col'],
+            color=config['color_col']
+        )
+    elif config['plot_type'] == "Violin Plot":
+        fig = px.violin(
+            df, x=config['x_col'], y=config['y_col'],
+            box=True, points="all",
+            color=config['color_col']
+        )
+    elif config['plot_type'] == "Time Series":
+        df = df.sort_values(by=config['time_col'])
+        fig = px.line(
+            df, x=config['time_col'], y=config['value_col'],
+            color=config['color_col']
+        )
+    elif config['plot_type'] == "Scatter Matrix":
+        fig = px.scatter_matrix(
+            df, dimensions=config['scatter_matrix_cols'],
+            color=config['color_col']
+        )
+    if fig:
+        st.plotly_chart(fig, use_container_width=True)
+except Exception as e:
+    st.error(f"An error occurred while generating the plot: {e}")
+# Statistical Analysis Section
+with st.expander("📊 Statistical Analysis", expanded=True):
+    analysis_type = st.selectbox("Select Analysis Type", [
+        "Descriptive Statistics",
+        "Correlation Analysis",
+        "Hypothesis Testing",
+        "Distribution Fitting"
+    ])
+    if analysis_type == "Descriptive Statistics":
+        st.write(df.describe(include='all'))
+    elif analysis_type == "Correlation Analysis":
+        numeric_cols = df.select_dtypes(include=np.number).columns
+        if len(numeric_cols) >= 2:
+            corr_method = st.selectbox("Correlation Method", [
+                "Pearson", "Kendall", "Spearman"
+            ])
+            corr_matrix = df[numeric_cols].corr(method=corr_method.lower())
+            st.write(corr_matrix)
+            st.heatmap(corr_matrix, annot=True, cmap=config['color_palette'])
+        else:
+            st.warning("Need at least 2 numeric columns for correlation analysis")
+    elif analysis_type == "Hypothesis Testing":
+        test_type = st.selectbox("Select Test Type", [
+            "T-test", "Chi-Squared Test", "ANOVA", "Mann-Whitney U"
         ])
+        if test_type == "T-test":
+            col1 = st.selectbox("Column 1 (Numeric)", df.select_dtypes(include=np.number).columns)
+            col2 = st.selectbox("Column 2 (Categorical)", df.select_dtypes(include='object').columns)
+            if st.button("Run T-test"):
+                groups = df.groupby(col2)[col1].apply(list)
+                if len(groups) == 2:
+                    t_stat, p_value = stats.ttest_ind(groups.iloc[0], groups.iloc[1])
+                    st.write(f"T-statistic: {t_stat:.4f}")
+                    st.write(f"P-value: {p_value:.4f}")
+                    if p_value < 0.05:
+                        st.write("Reject the null hypothesis.")
                     else:
+                        st.write("Fail to reject the null hypothesis.")
+                else:
+                    st.write("Select a categorical column with exactly two categories.")
+    elif analysis_type == "Distribution Fitting":
+        numeric_col = st.selectbox("Select Numeric Column", df.select_dtypes(include=np.number).columns)
+        dist_types = ["Normal", "Log-Normal", "Exponential", "Gamma"]
+        selected_dist = st.selectbox("Select Distribution Type", dist_types)
+        if st.button("Fit Distribution"):
+            from scipy.stats import norm, lognorm, expon, gamma
+            dist_functions = {
+                "Normal": norm,
+                "Log-Normal": lognorm,
+                "Exponential": expon,
+                "Gamma": gamma
+            }
+            params = dist_functions[selected_dist].fit(df[numeric_col].dropna())
+            st.write(f"Fitted Parameters: {params}")
+# Data Profiling Section
+with st.expander("📝 Generate Full Data Profile", expanded=False):
+    if st.button("🚀 Generate Comprehensive Report"):
+        with st.spinner("Generating report..."):
+            pr = ProfileReport(df, explorative=True)
+            st_profile_report(pr)
 # Model Training Section
 elif app_mode == "Model Training":