Update app.py
app.py CHANGED
@@ -22,6 +22,8 @@ from io import BytesIO
 import base64
 import time
 from sklearn.cluster import KMeans
+import keras
+
 
 # Configurations
 st.set_page_config(page_title="Executive Insights Pro", layout="wide", page_icon="📈")
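The bare `import keras` makes the Space depend on a Keras installation. If the environment does not already provide one, a line along these lines in requirements.txt would be needed (an assumption: the dependency file is not part of this commit, and with Keras 3 a backend must also be present):

tensorflow>=2.16  # hypothetical pin; ships keras and serves as its backend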
@@ -243,209 +245,239 @@ if app_mode == "Data Upload":
     show_loader("Generating EDA Report")
     pr = generate_profile(df)
     st_profile_report(pr)
+
 elif app_mode == "Smart Cleaning":
-    st.
-    if st.
-    st.experimental_rerun()
-    with col1b:
-        if len(st.session_state.data_history) > 1:
-            if st.button("Undo Last Action", help="Revert to the previous state."):
-                st.session_state.data_history.pop()
-                st.session_state.cleaned_data = st.session_state.data_history[-1].copy()
-                st.experimental_rerun()
-
-    clean_action = st.selectbox("Choose Operation", [
-        "Handle Missing Values",
-        "Remove Duplicates",
-        "Remove Column",
-        "Normalize Data",
-        "Encode Categories",
-        "Outlier Removal",
-        "Auto Clean",
-        "Neural Network Prep"
-    ], help="Select the data cleaning operation to perform.")
-
-    # Initialize Auto Clean Variables
-    auto_missing = False
-    auto_normalize = False
-    auto_encode = False
-    missing_strategy_num = "Median"
-    missing_strategy_cat = "Most Frequent"
-
-    if clean_action == "Handle Missing Values":  # Corrected indentation
-        st.markdown("**Configure how missing values will be handled.**", unsafe_allow_html=True)
-        all_impute_cols = ["All Columns"] + df.columns.tolist()
-        impute_cols = st.multiselect("Columns to Impute", all_impute_cols, default=["All Columns"], help="Select the columns with missing values to impute. Choose 'All Columns' to apply to all columns with missing values.")
-        if "All Columns" in impute_cols:
-            impute_cols = df.columns.tolist()
-
-        method = st.selectbox("Imputation Method", [
-            "KNN Imputation",
-            "Median Fill",
-            "Mean Fill",
-            "Drop Missing"
-        ], help="Choose the method to use for imputing missing values.")
-    elif clean_action == "Neural Network Prep":
-        st.markdown("**Neural Network Specific Preparation**", unsafe_allow_html=True)
-
-        # Make dynamic to check if the models can allow it
-        validModels = ["RNN", "CNN"]
-    st.
-            current_df[impute_cols] = imputer.fit_transform(current_df[impute_cols])
-        elif method == "Median Fill":
-            current_df[impute_cols] = current_df[impute_cols].fillna(current_df[impute_cols].median())
-        elif method == "Mean Fill":
-            current_df[impute_cols] = current_df[impute_cols].fillna(current_df[impute_cols].mean())
-        else:
-            current_df = current_df.dropna(subset=impute_cols)
-    st.
+    st.subheader("Data Cleaning and Preprocessing")
+
+    if st.checkbox("Clean Data using Neural Network (Imputation)"):
+        numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
+        for col in numerical_cols:
+            if df[col].isnull().sum() > 0:
+                st.write(f"Imputing missing values in {col} using a Neural Network...")
+                train_df = df.dropna(subset=[col]).copy()
+                test_df = df[df[col].isnull()].drop(col, axis=1).copy()
+                train_X = train_df.drop(col, axis=1).select_dtypes(include=np.number)
+                train_y = train_df[col]
+
+                if not train_X.empty:
+                    # Enhanced Model Selection (Simple Additions)
+                    model_type = st.selectbox(f"Model for {col}", ["Simple Feedforward", "Slightly Deeper"])
+                    if model_type == "Simple Feedforward":
+                        model = keras.Sequential([
+                            keras.layers.Dense(64, activation='relu', input_shape=(train_X.shape[1],)),
+                            keras.layers.Dense(32, activation='relu'),
+                            keras.layers.Dense(1)
+                        ])
+                    else:
+                        model = keras.Sequential([
+                            keras.layers.Dense(128, activation='relu', input_shape=(train_X.shape[1],)),
+                            keras.layers.Dense(64, activation='relu'),
+                            keras.layers.Dense(32, activation='relu'),
+                            keras.layers.Dense(1)
+                        ])
+
+                    model.compile(optimizer='adam', loss='mse')
+                    model.fit(train_X, train_y, epochs=50, verbose=0)
+                    imputed_values = model.predict(test_df.select_dtypes(include=np.number))
+                    df.loc[df[col].isnull(), col] = imputed_values.flatten()
+                    st.success(f"Imputation in {col} completed.")
+                else:
+                    st.warning(f"Skipping imputation for {col} due to insufficient data.")
+
+    if st.checkbox("Standardize Numerical Columns"):
+        numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
+        scaler = StandardScaler()
+        df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
+        st.success("Numerical columns standardized.")
+
+    if st.checkbox("Encode Categorical Columns"):
+        categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
+        for col in categorical_cols:
+            le = LabelEncoder()
+            df[col] = le.fit_transform(df[col])
+        st.success("Categorical columns encoded.")
+
+    st.session_state.cleaned_data = df  # Update cleaned data after cleaning operations.
+
+    # Cleaning Toolkit
+    col1, col2 = st.columns([1, 3])
+    with col1:
+        st.subheader("Cleaning Actions")
+
+        # Add Reset and Undo buttons
+        col1a, col1b = st.columns(2)
+        with col1a:
+            if st.button("Reset to Original", help="Revert all changes to the uploaded data."):
+                st.session_state.cleaned_data = st.session_state.raw_data.copy()
+                st.session_state.data_history = [st.session_state.raw_data.copy()]
+                st.experimental_rerun()
+        with col1b:
+            if len(st.session_state.data_history) > 1:
+                if st.button("Undo Last Action", help="Revert to the previous state."):
+                    st.session_state.data_history.pop()
+                    st.session_state.cleaned_data = st.session_state.data_history[-1].copy()
+                    st.experimental_rerun()
+
+        clean_action = st.selectbox("Choose Operation", [
+            "Handle Missing Values",
+            "Remove Duplicates",
+            "Remove Column",
+            "Normalize Data",
+            "Encode Categories",
+            "Outlier Removal",
+            "Auto Clean",
+            "Neural Network Prep"
+        ], help="Select the data cleaning operation to perform.")
+
+        # Initialize Auto Clean Variables
+        auto_missing = False
+        auto_normalize = False
+        auto_encode = False
+        missing_strategy_num = "Median"
+        missing_strategy_cat = "Most Frequent"
+
+        if clean_action == "Handle Missing Values":
+            st.markdown("**Configure how missing values will be handled.**", unsafe_allow_html=True)
+            all_impute_cols = ["All Columns"] + df.columns.tolist()
+            impute_cols = st.multiselect("Columns to Impute", all_impute_cols, default=["All Columns"], help="Select the columns with missing values to impute. Choose 'All Columns' to apply to all columns with missing values.")
+            if "All Columns" in impute_cols:
+                impute_cols = df.columns.tolist()
+
+            method = st.selectbox("Imputation Method", [
+                "KNN Imputation",
+                "Median Fill",
+                "Mean Fill",
+                "Drop Missing"
+            ], help="Choose the method to use for imputing missing values.")
+            if method == "KNN Imputation":
+                knn_neighbors = st.slider("KNN Neighbors", 2, 10, 5, help="Number of neighbors for KNN Imputation.")  # Parameter
+
+        elif clean_action == "Neural Network Prep":
+            st.markdown("**Neural Network Specific Preparation**", unsafe_allow_html=True)
+            validModels = ["RNN", "CNN"]
+            model_Choice_text = st.radio("Which model is this preparation for?", options=validModels)
+            st.info('Select a machine learning task below!')
+            validColumnNumerical_cols = df.select_dtypes(include=['int', 'float']).columns.tolist()
+            numcol_cols = st.multiselect("Numerical columns to use as the model's input sequence", options=validColumnNumerical_cols)
+            st.code('Code example is generated.')
+            # TODO: wire each configuration to a function or callback.
+            seq_length = st.number_input("Sequence Length (for RNN)", 10, 100, 30, help="Length of each input sequence.")
+            method = st.selectbox("Imputation Method", ["KNN Imputation", "Median Fill", "Mean Fill", "Drop Missing"])
+
+        elif clean_action == "Normalize Data":
+            st.markdown("**Choose a scaling method and columns to normalize.**")
+            scaler_type = st.selectbox("Scaler Type", ["RobustScaler", "StandardScaler"], help="Select the type of scaler to use.")
+            all_normalize_cols = ["All Numerical"] + df.select_dtypes(include=np.number).columns.tolist()
+            normalize_cols = st.multiselect("Columns to Normalize", all_normalize_cols, default=["All Numerical"], help="Select the numerical columns to normalize. Choose 'All Numerical' to apply to all numerical columns.")
+            if "All Numerical" in normalize_cols:
+                normalize_cols = df.select_dtypes(include=np.number).columns.tolist()
+
+        elif clean_action == "Encode Categories":
+            st.markdown("**Select categorical columns to encode.**")
+            all_encode_cols = ["All Categorical"] + df.select_dtypes(include='object').columns.tolist()
+            encode_cols = st.multiselect("Columns to Encode", all_encode_cols, default=["All Categorical"], help="Select the categorical columns to encode. Choose 'All Categorical' to apply to all object type columns.")
+            if "All Categorical" in encode_cols:
+                encode_cols = df.select_dtypes(include='object').columns.tolist()
+            encoding_method = st.selectbox("Encoding Method", ["OneHotEncoder"], help="Choose the encoding method.")
+
+        elif clean_action == "Outlier Removal":
+            st.markdown("**Configure outlier removal settings.**")
+            all_outlier_cols = ["All Numerical"] + df.select_dtypes(include=np.number).columns.tolist()
+            outlier_cols = st.multiselect("Columns to Remove Outliers From", all_outlier_cols, default=["All Numerical"], help="Select the columns to remove outliers from. Choose 'All Numerical' to apply to all numerical columns.")
+            if "All Numerical" in outlier_cols:
+                outlier_cols = df.select_dtypes(include=np.number).columns.tolist()
+            outlier_method = st.selectbox("Outlier Removal Method", ["IQR", "Z-score"], help="Choose the outlier removal method.")
+            if outlier_method == "IQR":
+                iqr_threshold = st.slider("IQR Threshold", 1.0, 3.0, 1.5, help="Adjust the IQR threshold.")
+            else:
+                zscore_threshold = st.slider("Z-score Threshold", 2.0, 4.0, 3.0, help="Adjust the Z-score threshold.")
+
+        elif clean_action == "Remove Column":
+            st.markdown("**Choose Columns to Remove**")
+            all_cols = df.columns.tolist()
+            remove_cols = st.multiselect("Columns to Remove", all_cols, help="Select the columns to remove.")
+
+        elif clean_action == "Auto Clean":
+            st.markdown("**Automatically Impute Missing Values, Encode Categorical Variables, and Normalize Numeric Variables**", help="These actions happen automatically when selected.")
+            with st.expander("⚙️ Auto Processing Settings"):
+                st.markdown("**Check to enable automatic data cleaning.**", help="You can adjust the configurations in the settings below.")
+                auto_missing = st.checkbox("Auto Handle Missing Values", True, help="Automatically handle all missing values with the selected configuration.")
+                auto_normalize = st.checkbox("Auto Normalize Numerical Features", True, help="Check to automatically normalize all numerical features.")
+                auto_encode = st.checkbox("Auto Encode Categorical Features", True, help="Check to automatically encode all categorical columns.")
+
+                if auto_missing:
+                    missing_strategy_num = st.selectbox("Numerical Imputation", ["Median", "Mean"], help="Choose the numeric strategy for Auto Clean.")
+                    missing_strategy_cat = st.selectbox("Categorical Imputation", ["Most Frequent", "Constant"], help="Choose the strategy for auto cleaning of categorical attributes.")
+
+    with col2:
+        if st.button("Apply Transformation"):
+            with st.spinner("Applying changes..."):
+                current_df = df.copy()  # important
+                if 'data_history' not in st.session_state:
+                    st.session_state.data_history = [df.copy()]
+                # Store the current state in history BEFORE processing
+                st.session_state.data_history.append(current_df)
+
+                # Auto Processing
+                if auto_missing and clean_action == "Auto Clean":
+                    num_cols = current_df.select_dtypes(include=np.number).columns
+                    cat_cols = current_df.select_dtypes(include='object').columns
+
+                    if missing_strategy_num == "Median":
+                        current_df[num_cols] = current_df[num_cols].fillna(current_df[num_cols].median())
+                    else:
+                        current_df[num_cols] = current_df[num_cols].fillna(current_df[num_cols].mean())
+
+                    if missing_strategy_cat == "Most Frequent":
+                        current_df[cat_cols] = current_df[cat_cols].fillna(current_df[cat_cols].mode().iloc[0])
+                    else:
+                        current_df[cat_cols] = current_df[cat_cols].fillna("Missing")
+
+                if auto_normalize and clean_action == "Auto Clean":
+                    num_cols = current_df.select_dtypes(include=np.number).columns
+                    scaler = StandardScaler()
+                    current_df[num_cols] = scaler.fit_transform(current_df[num_cols])
+
+                if auto_encode and clean_action == "Auto Clean":
+                    cat_cols = current_df.select_dtypes(include='object').columns
+                    if len(cat_cols) > 0:
+                        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
+                        encoded_data = encoder.fit_transform(current_df[cat_cols])
+                        encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(cat_cols))
+                        current_df = pd.concat([current_df.drop(columns=cat_cols), encoded_df], axis=1)
+
+                # Manual Processing
+                if clean_action == "Handle Missing Values":
+                    if method == "KNN Imputation":
+                        imputer = KNNImputer(n_neighbors=knn_neighbors)
+                        current_df[impute_cols] = imputer.fit_transform(current_df[impute_cols])
+                    elif method == "Median Fill":
+                        current_df[impute_cols] = current_df[impute_cols].fillna(current_df[impute_cols].median())
+                    elif method == "Mean Fill":
+                        current_df[impute_cols] = current_df[impute_cols].fillna(current_df[impute_cols].mean())
+                    else:
+                        current_df = current_df.dropna(subset=impute_cols)
+
+                elif clean_action == "Remove Column":
+                    if remove_cols:
+                        current_df = current_df.drop(columns=remove_cols)
+
+                st.session_state.cleaned_data = current_df
+                st.success("Transformation applied!")
+
+    # Data Comparison
+    st.subheader("Data Version Comparison")
+    col_orig, col_clean = st.columns(2)
+
+    with col_orig:
+        st.markdown("**Original Data**")
+        if st.session_state.raw_data is not None:
+            st.dataframe(st.session_state.raw_data.head(5), use_container_width=True)
+        else:
+            st.write("No original data uploaded yet.")
+    with col_clean:
+        st.markdown("**Cleaned Data**")
+        st.dataframe(df.head(5), use_container_width=True)
 
 elif app_mode == "Advanced EDA":
     st.title("🔍 Advanced Exploratory Analysis")
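For reviewers who want to sanity-check the new per-column neural-network imputation outside Streamlit, here is a minimal standalone sketch of the same pattern. The toy DataFrame and column names are hypothetical; it assumes keras (with a TensorFlow backend), pandas, and numpy are installed:

import numpy as np
import pandas as pd
import keras

# Hypothetical toy data: 'target' depends on 'a' and 'b'; 10% of its values are missing.
rng = np.random.default_rng(0)
df = pd.DataFrame({"a": rng.normal(size=200), "b": rng.normal(size=200)})
df["target"] = 2 * df["a"] - df["b"] + rng.normal(scale=0.1, size=200)
df.loc[df.sample(frac=0.1, random_state=0).index, "target"] = np.nan

col = "target"
train_df = df.dropna(subset=[col])          # rows where the target is observed
train_X = train_df.drop(columns=[col])
train_y = train_df[col]
test_X = df.loc[df[col].isna()].drop(columns=[col])  # rows to impute

# Same "Simple Feedforward" shape as the diff: 64 -> 32 -> 1, MSE loss.
model = keras.Sequential([
    keras.layers.Dense(64, activation="relu", input_shape=(train_X.shape[1],)),
    keras.layers.Dense(32, activation="relu"),
    keras.layers.Dense(1),
])
model.compile(optimizer="adam", loss="mse")
model.fit(train_X, train_y, epochs=50, verbose=0)

# Write predictions back into the missing slots, as the app does.
df.loc[df[col].isna(), col] = model.predict(test_X, verbose=0).flatten()
print(df[col].isna().sum())  # 0

One caveat the sketch sidesteps: in the app, rows whose predictor columns are themselves missing feed NaNs into fit and predict, so this path behaves best after the other imputation options have run.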
@@ -789,106 +821,115 @@ elif app_mode == "Predictions":
     else:
         st.write("Please train a model first in the 'Model Training' section.")
 
-st.title("🔬 Advanced Visualizations")
-
-if st.session_state.cleaned_data is not None:
-    df = st.session_state.cleaned_data.copy()
-
-    # Visualization Type Selection
-    visualization_type = st.selectbox("Select Visualization Type", [
-        "Pair Plot", "Parallel Coordinates Plot", "Andrews Curves", "Pie Chart",
-        "Area Chart", "Density Contour", "Sunburst Chart", "Funnel Chart"
-    ])
-            fig = px.scatter_matrix(df, dimensions=cols_for_pairplot)
-            st.plotly_chart(fig, use_container_width=True)
-        if cols_for_andrews:
-            fig = px.andrews_curves(df[cols_for_andrews + [df.columns[0]]], class_column=df.columns[0])
-            st.plotly_chart(fig, use_container_width=True)
-        st.plotly_chart(fig, use_container_width=True)
-    elif visualization_type == "Density Contour":
-        x_col = st.selectbox("Select X Column for Density Contour", df.select_dtypes(include=np.number).columns.tolist())
-        y_col = st.selectbox("Select Y Column for Density Contour", df.select_dtypes(include=np.number).columns.tolist())
-        fig = px.density_contour(df, x=x_col, y=y_col)
-        st.plotly_chart(fig, use_container_width=True)
-    elif
-    st.
-    cluster_cols = st.multiselect("Select Columns for Clustering", numerical_cols, default=numerical_cols[:2] if len(numerical_cols) >= 2 else numerical_cols)
-    if
-    scaled_data = scaler.fit_transform(df[cluster_cols])
-
-    # Number of clusters
-    n_clusters = st.slider("Number of Clusters", 2, 10, 3, help="Number of clusters to form.")
-
-    # Apply K-Means clustering
-    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
-    clusters = kmeans.fit_predict(scaled_data)
-
-    # Add cluster labels to the DataFrame
-    df['Cluster'] = clusters
-
-    # Visualize clusters
-    if len(cluster_cols) == 2:
-        fig = px.scatter(df, x=cluster_cols[0], y=cluster_cols[1], color='Cluster', title="K-Means Clustering")
-        st.plotly_chart(fig, use_container_width=True)
-    elif len(cluster_cols) == 3:
-        fig = px.scatter_3d(df, x=cluster_cols[0], y=cluster_cols[1], z=cluster_cols[2], color='Cluster', title="K-Means Clustering (3D)")
-        st.plotly_chart(fig, use_container_width=True)
-    else:
-        st.write("Clustering visualization is only supported for 2 or 3 selected columns.")
+st.title("🔬 Advanced Data Visualization and Clustering Lab")
+
+# Initialize session state for cleaned data
+if 'cleaned_data' not in st.session_state:
+    st.session_state.cleaned_data = None
+
+# Sample data upload (replace with your data loading logic)
+uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
+if uploaded_file is not None:
+    try:
+        df = pd.read_csv(uploaded_file)
+        st.session_state.cleaned_data = df
+        st.success("Data loaded successfully!")
+    except Exception as e:
+        st.error(f"Error loading data: {e}")
+
+if st.session_state.cleaned_data is not None:
+    df = st.session_state.cleaned_data.copy()
+
+    # Visualization Type Selection
+    visualization_type = st.selectbox("Select Visualization Type", [
+        "Pair Plot", "Parallel Coordinates Plot", "Andrews Curves", "Pie Chart",
+        "Area Chart", "Density Contour", "Sunburst Chart", "Funnel Chart", "Clustering Analysis"
+    ])
+
+    if visualization_type == "Pair Plot":
+        st.subheader("Pair Plot")
+        cols_for_pairplot = st.multiselect("Select Columns for Pair Plot", df.select_dtypes(include=np.number).columns.tolist(), default=df.select_dtypes(include=np.number).columns.tolist()[:3])
+        if cols_for_pairplot:
+            fig = px.scatter_matrix(df, dimensions=cols_for_pairplot)
+            st.plotly_chart(fig, use_container_width=True)
+
+    elif visualization_type == "Parallel Coordinates Plot":
+        st.subheader("Parallel Coordinates Plot")
+        cols_for_parallel = st.multiselect("Select Columns for Parallel Coordinates", df.select_dtypes(include=np.number).columns.tolist(), default=df.select_dtypes(include=np.number).columns.tolist()[:5])
+        if cols_for_parallel:
+            fig = px.parallel_coordinates(df[cols_for_parallel], color=df[cols_for_parallel[0]] if cols_for_parallel else None)
+            st.plotly_chart(fig, use_container_width=True)
+
+    elif visualization_type == "Andrews Curves":
+        st.subheader("Andrews Curves")
+        cols_for_andrews = st.multiselect("Select Columns for Andrews Curves", df.select_dtypes(include=np.number).columns.tolist(), default=df.select_dtypes(include=np.number).columns.tolist()[:5])
+        if cols_for_andrews:
+            fig = px.andrews_curves(df[cols_for_andrews + [df.columns[0]]], class_column=df.columns[0])
+            st.plotly_chart(fig, use_container_width=True)
+
+    elif visualization_type == "Pie Chart":
+        st.subheader("Pie Chart")
+        col_for_pie = st.selectbox("Select Column for Pie Chart", df.columns)
+        fig = px.pie(df, names=col_for_pie)
+        st.plotly_chart(fig, use_container_width=True)
+
+    elif visualization_type == "Area Chart":
+        st.subheader("Area Chart")
+        cols_for_area = st.multiselect("Select Columns for Area Chart", df.select_dtypes(include=np.number).columns.tolist(), default=df.select_dtypes(include=np.number).columns.tolist()[:3])
+        if cols_for_area:
+            fig = px.area(df[cols_for_area])
+            st.plotly_chart(fig, use_container_width=True)
+
+    elif visualization_type == "Density Contour":
+        st.subheader("Density Contour")
+        x_col = st.selectbox("Select X Column for Density Contour", df.select_dtypes(include=np.number).columns.tolist())
+        y_col = st.selectbox("Select Y Column for Density Contour", df.select_dtypes(include=np.number).columns.tolist())
+        fig = px.density_contour(df, x=x_col, y=y_col)
+        st.plotly_chart(fig, use_container_width=True)
+
+    elif visualization_type == "Sunburst Chart":
+        st.subheader("Sunburst Chart")
+        path_cols = st.multiselect("Select Path Columns for Sunburst Chart", df.columns)
+        if path_cols:
+            fig = px.sunburst(df, path=path_cols)
+            st.plotly_chart(fig, use_container_width=True)
+
+    elif visualization_type == "Funnel Chart":
+        st.subheader("Funnel Chart")
+        x_col = st.selectbox("Select X Column for Funnel Chart (Values)", df.select_dtypes(include=np.number).columns.tolist())
+        y_col = st.selectbox("Select Y Column for Funnel Chart (Categories)", df.columns)
+        fig = px.funnel(df, x=x_col, y=y_col)
+        st.plotly_chart(fig, use_container_width=True)
+
+    elif visualization_type == "Clustering Analysis":
+        st.subheader("Clustering Analysis")
+        numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
+
+        if not numerical_cols:
+            st.warning("No numerical columns found for clustering.")
+        else:
+            cluster_cols = st.multiselect("Select Columns for Clustering", numerical_cols, default=numerical_cols[:2] if len(numerical_cols) >= 2 else numerical_cols)
+
+            if cluster_cols:
+                try:
+                    scaler = StandardScaler()
+                    scaled_data = scaler.fit_transform(df[cluster_cols])
+                    n_clusters = st.slider("Number of Clusters", 2, 10, 3, help="Number of clusters to form.")
+                    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
+                    clusters = kmeans.fit_predict(scaled_data)
+                    df['Cluster'] = clusters
+
+                    if len(cluster_cols) == 2:
+                        fig = px.scatter(df, x=cluster_cols[0], y=cluster_cols[1], color='Cluster', title="K-Means Clustering")
+                        st.plotly_chart(fig, use_container_width=True)
+                    elif len(cluster_cols) == 3:
+                        fig = px.scatter_3d(df, x=cluster_cols[0], y=cluster_cols[1], z=cluster_cols[2], color='Cluster', title="K-Means Clustering (3D)")
+                        st.plotly_chart(fig, use_container_width=True)
+                    else:
+                        st.write("Clustering visualization is only supported for 2 or 3 selected columns.")
+                    st.success("Clustering applied successfully!")
+                except Exception as e:
+                    st.error(f"An error occurred during clustering: {e}")
 
 elif app_mode == "Neural Network Studio":
     st.title("🧠 Neural Network Studio")
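The new clustering branch is likewise easy to exercise in isolation. A minimal sketch with hypothetical two-blob toy data, assuming scikit-learn and plotly are installed (n_init is pinned explicitly here only to avoid the default-change warning some scikit-learn versions emit; the app relies on the default):

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Hypothetical toy data: two loose blobs in 2-D.
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.normal(size=(300, 2)), columns=["x", "y"])
df.iloc[:150] += 4.0

# Scale, cluster, and label: the same steps the app performs.
scaled = StandardScaler().fit_transform(df[["x", "y"]])
df["Cluster"] = KMeans(n_clusters=2, random_state=42, n_init=10).fit_predict(scaled)

fig = px.scatter(df, x="x", y="y", color="Cluster", title="K-Means Clustering")
fig.show()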
|