Update app.py
app.py CHANGED
@@ -337,7 +337,7 @@ elif app_mode == "Smart Cleaning":
     st.subheader("🧹 Cleaning Operations")
     tab1, tab2, tab3, tab4 = st.tabs(["Missing Values", "Duplicates", "Data Types", "Outliers"])
 
-
+    # 1. Missing Value Handling
     with tab1:
         st.markdown("### 🕳️ Handle Missing Values")
         missing_cols = df.columns[df.isna().any()].tolist()
@@ -346,6 +346,7 @@ elif app_mode == "Smart Cleaning":
         cols = st.multiselect("Select columns to clean", missing_cols, default=missing_cols)
 
         method = st.radio("Imputation Method", [
+            "Keep Missing",
             "Drop Missing",
             "Mean/Median/Mode",
             "KNN Imputation",
@@ -355,11 +356,47 @@ elif app_mode == "Smart Cleaning":
 
         if st.button(f"Apply {method}"):
             try:
-                original_df = df.copy()
-
-
-
-
+                original_df = df.copy()  # Store the original df before applying any change
+                if method == "Drop Missing":
+                    df = df.dropna(subset=cols)  # Drop rows with missing values in the selected columns
+                    cleaning_actions.append("Dropped missing values in selected columns")
+                elif method == "Mean/Median/Mode":
+                    # Impute each selected column: mean for numeric columns, mode for everything else
+                    for col in cols:
+                        if df[col].isnull().any():  # Check if missing values exist before imputing
+                            if pd.api.types.is_numeric_dtype(df[col]):
+                                df[col] = df[col].fillna(df[col].mean())
+                            else:  # Impute strings with the mode
+                                df[col] = df[col].fillna(df[col].mode()[0])
+
+                    cleaning_actions.append(f"Applied Mean/Median/Mode imputation on {cols}")
+
+                elif method == "KNN Imputation":
+                    from sklearn.impute import KNNImputer
+                    imputer = KNNImputer(n_neighbors=5)
+                    # Ensure numeric data for KNN: select only numeric columns to impute
+                    numeric_cols = df[cols].select_dtypes(include=np.number).columns
+                    if not numeric_cols.empty:  # Check if there are numeric columns to impute
+                        df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
+                        cleaning_actions.append(f"Applied KNN Imputation on {cols}")
+                    else:
+                        st.warning("No numeric columns to apply KNN imputation")
+                elif method == "MICE Imputation":
+                    from sklearn.experimental import enable_iterative_imputer  # noqa: F401 -- IterativeImputer is experimental and must be enabled first
+                    from sklearn.impute import IterativeImputer
+                    # Select numeric columns for MICE
+                    numeric_cols = df[cols].select_dtypes(include=np.number).columns
+                    if not numeric_cols.empty:  # Check if there are numeric columns to impute
+                        imputer = IterativeImputer()
+                        df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
+                        cleaning_actions.append(f"Applied MICE Imputation on {cols}")
+                    else:
+                        st.warning("No numeric columns to apply MICE imputation")
+
+                elif method == "Deep Learning Imputation":
+                    st.warning("Deep Learning Imputation is not implemented in this example. Please use other methods.")
+                # "Keep Missing" matches none of the branches above, so the data is left untouched.
+
+                update_version(df)  # Update the version after cleaning
+                st.success(f"{method} applied successfully! ✅")
             except Exception as e:
                 st.error(f"Error: {str(e)}")
         else:
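
The KNN and MICE branches above rely on scikit-learn's KNNImputer and IterativeImputer. As a reference, here is a minimal standalone sketch of both (assuming scikit-learn is installed; IterativeImputer is still experimental and has to be enabled before it can be imported):

import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer  # noqa: F401  (required before the next import)
from sklearn.impute import IterativeImputer

toy = pd.DataFrame({"a": [1.0, np.nan, 3.0, 4.0], "b": [10.0, 20.0, np.nan, 40.0]})

# KNN: fill each gap from the most similar rows (here 2 neighbours)
knn_filled = pd.DataFrame(KNNImputer(n_neighbors=2).fit_transform(toy), columns=toy.columns)

# MICE-style: model each column from the others and iterate until the estimates stabilise
mice_filled = pd.DataFrame(IterativeImputer(random_state=0).fit_transform(toy), columns=toy.columns)

print(knn_filled)
print(mice_filled)

Both imputers return plain NumPy arrays, which is why the hunk assigns the result back into df[numeric_cols] rather than replacing df wholesale.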
@@ -403,7 +440,19 @@ elif app_mode == "Smart Cleaning":
         ])
         if st.button("Convert Data Type"):
             try:
-
+                if new_type == "String":
+                    df[col_to_convert] = df[col_to_convert].astype(str)
+                elif new_type == "Integer":
+                    df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce').astype('Int64')
+                elif new_type == "Float":
+                    df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce')
+                elif new_type == "Boolean":
+                    df[col_to_convert] = df[col_to_convert].astype(bool)
+                elif new_type == "Datetime":
+                    df[col_to_convert] = pd.to_datetime(df[col_to_convert], errors='coerce')
+                elif new_type == "Category":
+                    df[col_to_convert] = df[col_to_convert].astype('category')
+
                 cleaning_actions.append(f"Converted {col_to_convert} to {new_type}")
                 update_version(df)
                 st.success("Data type converted successfully! ✅")
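
In the conversion hunk above, errors='coerce' is what keeps one bad cell from aborting the whole conversion: unparseable values become missing instead of raising. A small pandas-only sketch of that behaviour:

import pandas as pd

s = pd.Series(["1", "2", "oops", None])

as_int = pd.to_numeric(s, errors="coerce").astype("Int64")  # 1, 2, <NA>, <NA> (nullable integer dtype)
as_dt = pd.to_datetime(pd.Series(["2024-01-05", "not a date"]), errors="coerce")  # second value becomes NaT

print(as_int)
print(as_dt)

One caveat in the Boolean branch: .astype(bool) treats every non-empty string as True, including the string "False".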
@@ -417,14 +466,64 @@ elif app_mode == "Smart Cleaning":
         if numeric_cols:
             outlier_col = st.selectbox("Select numeric column", numeric_cols)
             st.plotly_chart(px.box(df, y=outlier_col, title="Outlier Distribution"))
+            outlier_method = st.radio("Outlier Handling Method", ["Z-score", "IQR", "Manual"], horizontal=True)
             if st.button("Remove Outliers"):
-
-
-
-
+                try:
+                    original_df = df.copy()
+                    if outlier_method == "Z-score":
+                        from scipy import stats
+                        z_scores = np.abs(stats.zscore(df[outlier_col]))
+                        df = df[z_scores < 3]  # Keep only rows with a z-score below 3
+                        cleaning_actions.append(f"Removed outliers from {outlier_col} using Z-score (threshold 3)")
+                    elif outlier_method == "IQR":
+                        Q1 = df[outlier_col].quantile(0.25)
+                        Q3 = df[outlier_col].quantile(0.75)
+                        IQR = Q3 - Q1
+                        df = df[~((df[outlier_col] < (Q1 - 1.5 * IQR)) | (df[outlier_col] > (Q3 + 1.5 * IQR)))]
+                        cleaning_actions.append(f"Removed outliers from {outlier_col} using IQR")
+                    elif outlier_method == "Manual":
+                        # Note: widgets created inside this button block only render on the run in which the button was clicked.
+                        lower_bound = st.number_input("Lower Bound", value=df[outlier_col].min(), step=1.0)
+                        upper_bound = st.number_input("Upper Bound", value=df[outlier_col].max(), step=1.0)
+                        df = df[(df[outlier_col] >= lower_bound) & (df[outlier_col] <= upper_bound)]
+                        cleaning_actions.append(f"Removed outliers from {outlier_col} using manual bounds")
+                    update_version(df)
+                    st.success("Outliers removed successfully! ✅")
+                except Exception as e:
+                    st.error(f"Outlier removal failed: {str(e)}")
         else:
             st.info("ℹ️ No numeric columns found for outlier detection")
 
+    # Drop Column Functionality with Interface
+    st.subheader("🗑️ Drop Specific Columns")
+    cols_to_drop = st.multiselect("Select Columns to Drop", df.columns)
+    if st.button("Drop Selected Columns"):
+        try:
+            df = df.drop(columns=cols_to_drop)  # Drop the selected columns here
+            cleaning_actions.append(f"Dropped columns: {', '.join(cols_to_drop)}")
+            update_version(df)
+            st.success("Columns dropped successfully! ✅")
+        except KeyError:
+            st.error("Invalid column(s) selected.")
+        except Exception as e:
+            st.error(f"An unexpected error occurred: {e}")
+    # Label Encoding (Categorical to Numeric)
+    st.subheader("🔢 Label Encoding")
+    if st.button("Encode Categorical Columns"):
+        try:
+            le = LabelEncoder()  # assumes LabelEncoder is imported at the top of app.py
+            categorical_cols = df.select_dtypes(include=['object', 'category']).columns
+            for col in categorical_cols:
+                df[col] = df[col].astype(str)  # Ensure all cols are string
+                df[col] = le.fit_transform(df[col])
+            cleaning_actions.append("Applied Label Encoding to categorical columns")
+            update_version(df)
+            st.success("Label encoding applied successfully! ✅")
+        except Exception as e:
+            st.error(f"Label encoding failed: {str(e)}")
+
+    # Live Data Preview after every cleaning action
+    st.subheader("✨ Live Data Preview")
+    st.dataframe(df.head(10))  # show the first 10 rows
     # Save Cleaned Data with Enhanced Feedback
     if st.button("💾 Save Cleaned Data"):
         st.session_state.cleaned_data = df
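
The IQR branch above applies the usual 1.5 × IQR fences. If the same rule is needed elsewhere in app.py, it could be factored into a small helper along these lines (a sketch only; the helper name is not part of the diff):

import pandas as pd

def iqr_filter(df: pd.DataFrame, column: str, k: float = 1.5) -> pd.DataFrame:
    """Drop rows whose `column` value lies outside the Q1 - k*IQR / Q3 + k*IQR fences."""
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    mask = (df[column] < q1 - k * iqr) | (df[column] > q3 + k * iqr)
    return df[~mask]  # rows where `column` is NaN are kept, matching the hunk above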
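
The hunks call cleaning_actions.append(...), update_version(df) and LabelEncoder() without defining them, so those names are assumed to be set up earlier in app.py. A hypothetical sketch of what that setup could look like (illustrative only; everything beyond the names visible in the diff is a guess):

import streamlit as st
from sklearn.preprocessing import LabelEncoder  # used by the Label Encoding hunk

# Per-session log of the cleaning steps that have been applied.
if "cleaning_actions" not in st.session_state:
    st.session_state.cleaning_actions = []
cleaning_actions = st.session_state.cleaning_actions

def update_version(df):
    """Hypothetical helper: keep a per-session history of dataframe versions."""
    if "versions" not in st.session_state:
        st.session_state.versions = []
    st.session_state.versions.append(df.copy())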
|