Update app.py

app.py CHANGED
@@ -259,85 +259,7 @@ if app_mode == "Data Upload":
        pr = ProfileReport(df, explorative=True, title="Data Upload Report")  # Added title to pandas profiling
        st_profile_report(pr)

-
-    st.title("🧼 Intelligent Data Cleaning")
-    st.markdown("""
-    **Automated Data Cleaning** with smart suggestions and advanced transformations.
-    Clean your data with confidence using AI-powered recommendations.
-    """)
-
-    if 'raw_data' not in st.session_state or st.session_state.raw_data is None:
-        st.warning("Please upload your data in the Data Upload section first.")
-        st.stop()
-
-    # Initialize versioning
-    if 'data_versions' not in st.session_state:
-        st.session_state.data_versions = [st.session_state.raw_data.copy()]
-        st.session_state.current_version = 0
-
-    def update_version(new_df):
-        st.session_state.data_versions = st.session_state.data_versions[:st.session_state.current_version+1]
-        st.session_state.data_versions.append(new_df.copy())
-        st.session_state.current_version += 1
-
-    df = st.session_state.data_versions[st.session_state.current_version].copy()
-    cleaning_actions = st.session_state.get('cleaning_actions', [])
-
-    # Version Control with Progress Bar
-    with st.expander("⏪ Version Control", expanded=True):
-        st.caption(f"Current Version: {st.session_state.current_version+1}/{len(st.session_state.data_versions)}")
-        progress = (st.session_state.current_version + 1) / len(st.session_state.data_versions)
-        st.progress(progress)
-
-        col1, col2 = st.columns(2)
-        with col1:
-            if st.button("⏮️ Undo Last Action", disabled=st.session_state.current_version == 0):
-                st.session_state.current_version -= 1
-                st.experimental_rerun()
-        with col2:
-            if st.button("⏭️ Redo Next Action", disabled=st.session_state.current_version == len(st.session_state.data_versions)-1):
-                st.session_state.current_version += 1
-                st.experimental_rerun()
-    dtype_counts = df.dtypes.astype(str).value_counts()
-
-    # Data Health Dashboard with Cards
-    st.subheader("📊 Data Health Dashboard")
-    with st.expander("Show Comprehensive Data Report", expanded=True):
-        try:  # Add a try/except around pandas profiling
-            pr = ProfileReport(df, title="Cleaned Data Report")  # Add title to pandas profiling report
-            st_profile_report(pr)
-        except ValueError as e:
-            st.error(f"Error generating data report: {e}. This can often be caused by an empty or inappropriate dataset. Try checking the dataset or the cleaning steps.")
-            st.stop()  # stop so the user can fix the data
-    # Enhanced Health Summary with Cards
-    col1, col2, col3, col4 = st.columns(4)
-    with col1:
-        st.metric("Total Rows", len(df), help="Number of rows in the dataset")
-    with col2:
-        st.metric("Total Columns", len(df.columns), help="Number of columns in the dataset")
-    with col3:
-        missing_pct = df.isna().mean().mean()
-        st.metric("Missing Values", f"{missing_pct:.1%}", help="Percentage of missing values in the dataset")
-    with col4:
-        duplicates = df.duplicated().sum()
-        st.metric("Duplicates", duplicates, help="Number of duplicate rows in the dataset")
-
-    # Visualizations for Data Health
-    st.markdown("### 📈 Data Health Visualizations")
-    col1, col2 = st.columns(2)
-    with col1:
-        st.plotly_chart(px.bar(df.isna().sum(), title="Missing Values per Column",
-                               labels={'index': 'Column', 'value': 'Missing Count'},
-                               color=df.isna().sum(), color_continuous_scale="Bluered"))
-    with col2:
-        st.plotly_chart(px.pie(values=df.dtypes.value_counts().tolist(), names=df.dtypes.value_counts().index.astype(str).tolist(),
-                               title="Data Type Distribution", hole=0.3))
-
-    # Cleaning Operations with Tabs
-    st.subheader("🔧 Cleaning Operations")
-    tab1, tab2, tab3, tab4 = st.tabs(["Missing Values", "Duplicates", "Data Types", "Outliers"])
-
-    # 1. Missing Value Handling
+    # 1. Missing Value Handling
    with tab1:
        st.markdown("### 🕳️ Handle Missing Values")
        missing_cols = df.columns[df.isna().any()].tolist()
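Aside: the deleted `update_version` helper (still called by the code added below) implements a standard list-plus-cursor undo/redo stack: pushing a new snapshot first truncates any redo history beyond the current cursor. A minimal standalone sketch of the same pattern outside Streamlit (`VersionStack` is an illustrative name, not from the app):

```python
from copy import deepcopy

class VersionStack:
    """List-plus-cursor undo/redo, mirroring update_version in the diff."""
    def __init__(self, initial):
        self.versions = [deepcopy(initial)]
        self.cursor = 0

    def push(self, state):
        # Drop any redo tail beyond the cursor, then append the new snapshot.
        self.versions = self.versions[:self.cursor + 1]
        self.versions.append(deepcopy(state))
        self.cursor += 1

    def undo(self):
        self.cursor = max(0, self.cursor - 1)
        return self.versions[self.cursor]

    def redo(self):
        self.cursor = min(len(self.versions) - 1, self.cursor + 1)
        return self.versions[self.cursor]
```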
@@ -360,10 +282,10 @@ elif app_mode == "Smart Cleaning":
        if missing_value_method == "Drop Missing":
            df = df.dropna(subset=cols)  # Drop rows with missing values in selected columns
            cleaning_actions.append(f"Dropped missing values in selected columns")
-
+        elif missing_value_method == "Mean/Median/Mode":
            # Allow the user to select the specific imputation method
            imputation_choice = st.radio("Select Imputation Method", ["Mean", "Median", "Mode"], horizontal=True)
-
+
            # Imputation logic here, added to perform the imputation in multiple columns
            for col in cols:
                if df[col].isnull().any():  # Check if missing values exist before imputing
@@ -376,7 +298,7 @@ elif app_mode == "Smart Cleaning":
                        df[col] = df[col].fillna(df[col].mode()[0])
                    else:  # Impute strings with mode
                        df[col] = df[col].fillna(df[col].mode()[0])
-
+            cleaning_actions.append(f"Applied Mean/Median/Mode imputation on {cols}")

        elif missing_value_method == "KNN Imputation":
            from sklearn.impute import KNNImputer
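The interior of the imputation loop (new lines 292-297) is elided by the diff; from the surrounding context it dispatches on `imputation_choice`, using mean or median for numeric columns and falling back to mode otherwise. A plausible standalone sketch of that logic, assuming ordinary pandas calls (`impute_columns` is an illustrative name, not from the app):

```python
import pandas as pd

def impute_columns(df: pd.DataFrame, cols, choice: str) -> pd.DataFrame:
    """Fill missing values per column: mean/median for numerics, mode otherwise."""
    out = df.copy()
    for col in cols:
        if not out[col].isnull().any():
            continue  # nothing to impute in this column
        if pd.api.types.is_numeric_dtype(out[col]) and choice == "Mean":
            out[col] = out[col].fillna(out[col].mean())
        elif pd.api.types.is_numeric_dtype(out[col]) and choice == "Median":
            out[col] = out[col].fillna(out[col].median())
        else:  # "Mode", or any non-numeric column
            out[col] = out[col].fillna(out[col].mode()[0])
    return out
```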
@@ -409,6 +331,129 @@ elif app_mode == "Smart Cleaning":
        else:
            st.success("✨ No missing values found!")

+    # 2. Duplicate Handling
+    with tab2:
+        st.markdown("### 🔍 Handle Duplicates")
+        duplicates = df.duplicated().sum()
+        if duplicates > 0:
+            st.plotly_chart(px.histogram(df, x=df.duplicated(), title="Duplicate Distribution"))
+            dup_strategy = st.radio("Duplicate Strategy", [
+                "Remove All Duplicates",
+                "Keep First Occurrence",
+                "Keep Last Occurrence"
+            ])
+            if st.button("Handle Duplicates"):
+                original_count = len(df)
+                df = df.drop_duplicates(keep={
+                    "Remove All Duplicates": False,
+                    "Keep First Occurrence": 'first',
+                    "Keep Last Occurrence": 'last'
+                }[dup_strategy])
+                cleaning_actions.append(f"Removed {original_count - len(df)} duplicates")
+                update_version(df)
+                st.success(f"Removed {original_count - len(df)} duplicates! ✅")
+        else:
+            st.success("✨ No duplicates found!")
+
+    # 3. Data Type Conversion
+    with tab3:
+        st.markdown("### 🔄 Convert Data Types")
+        col1, col2 = st.columns(2)
+        with col1:
+            st.dataframe(df.dtypes.reset_index().rename(columns={0: 'Type', 'index': 'Column'}))
+        with col2:
+            col_to_convert = st.selectbox("Select column to convert", df.columns)
+            new_type = st.selectbox("New Data Type", [
+                "String", "Integer", "Float",
+                "Boolean", "Datetime", "Category"
+            ])
+            if st.button("Convert Data Type"):
+                try:
+                    if new_type == "String":
+                        df[col_to_convert] = df[col_to_convert].astype(str)
+                    elif new_type == "Integer":
+                        df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce').astype('Int64')
+                    elif new_type == "Float":
+                        df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce')
+                    elif new_type == "Boolean":
+                        df[col_to_convert] = df[col_to_convert].astype(bool)
+                    elif new_type == "Datetime":
+                        df[col_to_convert] = pd.to_datetime(df[col_to_convert], errors='coerce')
+                    elif new_type == "Category":
+                        df[col_to_convert] = df[col_to_convert].astype('category')
+
+                    cleaning_actions.append(f"Converted {col_to_convert} to {new_type}")
+                    update_version(df)
+                    st.success("Data type converted successfully! ✅")
+                except Exception as e:
+                    st.error(f"Conversion failed: {str(e)}")
+
+    # 4. Outlier Handling
+    with tab4:
+        st.markdown("### 📉 Handle Outliers")
+        numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
+        if numeric_cols:
+            outlier_col = st.selectbox("Select numeric column", numeric_cols)
+            st.plotly_chart(px.box(df, y=outlier_col, title="Outlier Distribution"))
+            outlier_method = st.radio("Outlier Handling Method", ["Z-score", "IQR", "Manual"], horizontal=True)
+            if st.button("Remove Outliers"):
+                try:
+                    original_df = df.copy()
+                    if outlier_method == "Z-score":
+                        from scipy import stats
+                        z_scores = np.abs(stats.zscore(df[outlier_col]))
+                        df = df[(z_scores < 3)]  # Keep only values with z-score less than 3
+                        cleaning_actions.append(f"Removed outliers from {outlier_col} using Z-score (threshold 3)")
+                    elif outlier_method == "IQR":
+                        Q1 = df[outlier_col].quantile(0.25)
+                        Q3 = df[outlier_col].quantile(0.75)
+                        IQR = Q3 - Q1
+                        df = df[~((df[outlier_col] < (Q1 - 1.5 * IQR)) | (df[outlier_col] > (Q3 + 1.5 * IQR)))]
+                        cleaning_actions.append(f"Removed outliers from {outlier_col} using IQR")
+                    elif outlier_method == "Manual":
+                        lower_bound = st.number_input("Lower Bound", value=df[outlier_col].min(), step=1.0)
+                        upper_bound = st.number_input("Upper Bound", value=df[outlier_col].max(), step=1.0)
+                        df = df[(df[outlier_col] >= lower_bound) & (df[outlier_col] <= upper_bound)]
+                        cleaning_actions.append(f"Removed outliers from {outlier_col} using manual bounds")
+                    update_version(df)
+                    st.success("Outliers removed successfully! ✅")
+                except Exception as e:
+                    st.error(f"Outlier removal failed: {str(e)}")
+        else:
+            st.info("ℹ️ No numeric columns found for outlier detection")
+
+    # Drop Column Functionality with Interface
+    st.subheader("🗑️ Drop Specific Columns")
+    cols_to_drop = st.multiselect("Select Columns to Drop", df.columns)
+    if st.button("Drop Selected Columns"):
+        try:
+            df = df.drop(columns=cols_to_drop)  # Drop the cols here.
+            cleaning_actions.append(f"Dropped columns: {', '.join(cols_to_drop)}")
+            update_version(df)
+            st.success(f"Columns dropped successfully! ✅")
+        except (KeyError, ValueError) as e:
+            st.error(f"Invalid column(s) selected or other error: {e}")  # Handle ValueErrors
+        except Exception as e:
+            st.error(f"An unexpected error occurred: {e}")
+    # Label Encoding (Categorical to Numeric)
+    st.subheader("🔢 Label Encoding")
+    if st.button("Encode Categorical Columns"):
+        try:
+            le = LabelEncoder()
+            categorical_cols = df.select_dtypes(include=['object', 'category']).columns
+            for col in categorical_cols:
+                df[col] = df[col].astype(str)  # Ensure all cols are string
+                df[col] = le.fit_transform(df[col])
+            cleaning_actions.append("Applied Label Encoding to categorical columns")
+            update_version(df)
+            st.success("Label encoding applied successfully! ✅")
+        except Exception as e:
+            st.error(f"Label encoding failed: {str(e)}")
+
+    # Live Data Preview after every cleaning action
+    st.subheader("✨ Live Data Preview")
+    st.dataframe(df.head(10))  # show 10 rows
+
    # 2. Duplicate Handling
    with tab2:
        st.markdown("### 🔍 Handle Duplicates")
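The added duplicate handler maps the radio label straight onto `drop_duplicates`' `keep` argument via a dict lookup; note that `keep=False` discards every member of a duplicate group rather than keeping one representative. (Note also that the hunk's trailing context retains the file's earlier `# 2. Duplicate Handling` block, so the new file ends up with two `with tab2:` sections.) A small self-contained check of the `keep` mapping:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 1, 2]})
print(df.drop_duplicates(keep="first").index.tolist())  # [0, 2]
print(df.drop_duplicates(keep="last").index.tolist())   # [1, 2]
print(df.drop_duplicates(keep=False).index.tolist())    # [2]: both copies of 1 are dropped
```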
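In the added type-conversion tab, the Integer branch pairs `pd.to_numeric(..., errors='coerce')` with pandas' nullable `Int64` dtype, so unparseable values become `<NA>` instead of raising. A quick illustration:

```python
import pandas as pd

s = pd.Series(["1", "2", "oops"])
print(pd.to_numeric(s, errors="coerce").astype("Int64").tolist())
# [1, 2, <NA>]
```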
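The IQR branch keeps rows inside [Q1 - 1.5·IQR, Q3 + 1.5·IQR], the usual Tukey fence. A worked check of the filter on a toy series:

```python
import pandas as pd

s = pd.Series([1, 2, 3, 4, 100])
q1, q3 = s.quantile(0.25), s.quantile(0.75)   # 2.0 and 4.0
iqr = q3 - q1                                  # 2.0, so fences at -1.0 and 7.0
kept = s[(s >= q1 - 1.5 * iqr) & (s <= q3 + 1.5 * iqr)]
print(kept.tolist())  # [1, 2, 3, 4]; 100 falls outside the upper fence
```

One caveat worth noting: the Manual branch creates its `st.number_input` widgets inside the `if st.button(...)` block, so under Streamlit's rerun model they only exist on the run triggered by the click and effectively filter with their default (min/max) values.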
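In the added Label Encoding section, a single `LabelEncoder` is re-fit on each column, so every categorical column receives its own independent integer mapping, and the prior `astype(str)` guards against mixed-type comparison errors during fitting. A compact equivalent:

```python
from sklearn.preprocessing import LabelEncoder
import pandas as pd

df = pd.DataFrame({"color": ["red", "blue", "red"], "size": ["S", "M", "S"]})
le = LabelEncoder()
for col in df.columns:
    df[col] = le.fit_transform(df[col].astype(str))
print(df)
#    color  size
# 0      1     1
# 1      0     0
# 2      1     1
```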
|