Spaces:

CosmickVisions
/

Data-Vision

Running

App Files Community

CosmickVisions committed on Mar 2

Commit

b9d21cf

verified ·

1 Parent(s): 5cb75ad

Update app.py

Browse files

Files changed (1) hide show

app.py +125 -174

app.py CHANGED Viewed

@@ -153,20 +153,20 @@ app_mode = st.sidebar.selectbox(
     help="Choose the section to navigate to."
 )
 # --- Data Upload Page ---
 if app_mode == "Data Upload":
-    st.title("📤 Smart Data Hub")
     st.markdown("""
-        **Upload your dataset** (CSV, Excel, Parquet) for comprehensive analysis.
-        Get instant data health insights and quality assessment.
     """)
-    # File upload with enhanced UI
-    uploaded_file = st.file_uploader(
-        "Drag & drop or browse files",
-        type=list(ALLOWED_EXTENSIONS),
-        help=f"Max file size: {MAX_FILE_SIZE_MB}MB. Supported formats: {', '.join(ALLOWED_EXTENSIONS)}"
-    )
     if uploaded_file:
         # Validate file
@@ -174,9 +174,9 @@ if app_mode == "Data Upload":
         if not is_valid:
             st.error(f"Upload error: {message}")
             st.stop()
         # Load data with progress
-        with st.spinner(f"Loading {uploaded_file.name}..."):
             try:
                 if uploaded_file.name.endswith('.csv'):
                     df = pd.read_csv(uploaded_file, low_memory=False)
@@ -186,10 +186,8 @@ if app_mode == "Data Upload":
                     df = pd.read_parquet(uploaded_file)
                 elif uploaded_file.name.endswith('.feather'):
                     df = pd.read_feather(uploaded_file)
                 st.session_state.raw_data = df
                 st.success("Dataset loaded successfully!")
             except Exception as e:
                 st.error(f"Error loading file: {str(e)}")
                 st.stop()
@@ -260,171 +258,124 @@ if app_mode == "Data Upload":
                 st_profile_report(pr)
-    tab1, tab2, tab3, tab4 = st.tabs(["Missing Values", "Duplicates", "Data Types", "Outliers"])
-    # 1. Missing Value Handling
-    with tab1:
-        st.markdown("### 🕳️ Handle Missing Values")
-        missing_cols = df.columns[df.isna().any()].tolist()
-        if missing_cols:
-            st.write("Columns with missing values:")
-            cols = st.multiselect("Select columns to clean", missing_cols, default=missing_cols)
-            method = st.radio("Imputation Method", [
-                "Keep Missing",
-                "Drop Missing",
-                "Mean/Median/Mode",
-                "KNN Imputation",
-                "MICE Imputation",
-                "Deep Learning Imputation"
-            ], horizontal=True)
-            if st.button(f"Apply {method}"):
-                try:
-                    original_df = df.copy() # Store the original df before applying any change
-                    if missing_value_method == "Drop Missing":
-                        df = df.dropna(subset=cols) # Drop rows with missing values in selected columns
-                        cleaning_actions.append(f"Dropped missing values in selected columns")
-                    elif missing_value_method == "Mean/Median/Mode":
-                        # Allow the user to select the specific imputation method
-                        imputation_choice = st.radio("Select Imputation Method", ["Mean", "Median", "Mode"], horizontal=True)
-                        # Imputation logic here, added to perform the imputation in multiple columns
-                        for col in cols:
-                            if df[col].isnull().any():  # Check if missing values exist before imputing
-                                if pd.api.types.is_numeric_dtype(df[col]):
-                                    if imputation_choice == "Mean":
-                                        df[col] = df[col].fillna(df[col].mean())
-                                    elif imputation_choice == "Median":
-                                        df[col] = df[col].fillna(df[col].median())
-                                    elif imputation_choice == "Mode":
-                                        df[col] = df[col].fillna(df[col].mode()[0])
-                                else:  # Impute strings with mode
                                     df[col] = df[col].fillna(df[col].mode()[0])
-                        cleaning_actions.append(f"Applied Mean/Median/Mode imputation on {cols}")
-                    elif missing_value_method == "KNN Imputation":
-                        from sklearn.impute import KNNImputer
-                        imputer = KNNImputer(n_neighbors=5)
-                        # Ensure numeric data for KNN, select only numeric columns to impute
-                        numeric_cols = df[cols].select_dtypes(include=np.number).columns
-                        if not numeric_cols.empty:  # Check if there are numeric columns to impute
-                            df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
-                            cleaning_actions.append(f"Applied KNN Imputation on {cols}")
-                        else:
-                            st.warning("No numeric columns to apply KNN imputation")
-                    elif missing_value_method == "MICE Imputation":
-                         from sklearn.impute import IterativeImputer
-                         # Select numeric columns for MICE
-                         numeric_cols = df[cols].select_dtypes(include=np.number).columns
-                         if not numeric_cols.empty:  # Check if there are numeric columns to impute
-                             imputer = IterativeImputer()
-                             df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
-                             cleaning_actions.append(f"Applied MICE Imputation on {cols}")
-                         else:
-                             st.warning("No numeric columns to apply MICE imputation")
-                    elif missing_value_method == "Deep Learning Imputation":
-                        st.warning("Deep Learning Imputation is not implemented in this example.  Please use other methods.")
-                    update_version(df) # Update the version after cleaning
-                    st.success(f"{missing_value_method} applied successfully! ✅")
-                except Exception as e:
-                    st.error(f"Error: {str(e)}")
-        else:
-            st.success("✨ No missing values found!")
-    # 2. Duplicate Handling
-    with tab2:
-        st.markdown("### 🔄 Handle Duplicates")
-        duplicates = df.duplicated().sum()
-        if duplicates > 0:
-            st.plotly_chart(px.histogram(df, x=df.duplicated(), title="Duplicate Distribution"))
-            dup_strategy = st.radio("Duplicate Strategy", [
-                "Remove All Duplicates",
-                "Keep First Occurrence",
-                "Keep Last Occurrence"
-            ])
-            if st.button("Handle Duplicates"):
-                original_count = len(df)
-                df = df.drop_duplicates(keep={
-                    "Remove All Duplicates": False,
-                    "Keep First Occurrence": 'first',
-                    "Keep Last Occurrence": 'last'
-                }[dup_strategy])
-                cleaning_actions.append(f"Removed {original_count - len(df)} duplicates")
                 update_version(df)
-                st.success(f"Removed {original_count - len(df)} duplicates! ✅")
-        else:
-            st.success("✨ No duplicates found!")
-    # 3. Data Type Conversion
-    with tab3:
-        st.markdown("### 🔄 Convert Data Types")
-        col1, col2 = st.columns(2)
-        with col1:
-            st.dataframe(df.dtypes.reset_index().rename(columns={0: 'Type', 'index': 'Column'}))
-        with col2:
-            col_to_convert = st.selectbox("Select column to convert", df.columns)
-            new_type = st.selectbox("New Data Type", [
-                "String", "Integer", "Float",
-                "Boolean", "Datetime", "Category"
-            ])
-            if st.button("Convert Data Type"):
-                try:
-                    if new_type == "String":
-                        df[col_to_convert] = df[col_to_convert].astype(str)
-                    elif new_type == "Integer":
-                        df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce').astype('Int64')
-                    elif new_type == "Float":
-                        df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce')
-                    elif new_type == "Boolean":
-                        df[col_to_convert] = df[col_to_convert].astype(bool)
-                    elif new_type == "Datetime":
-                        df[col_to_convert] = pd.to_datetime(df[col_to_convert], errors='coerce')
-                    elif new_type == "Category":
-                        df[col_to_convert] = df[col_to_convert].astype('category')
-                    cleaning_actions.append(f"Converted {col_to_convert} to {new_type}")
-                    update_version(df)
-                    st.success("Data type converted successfully! ✅")
-                except Exception as e:
-                    st.error(f"Conversion failed: {str(e)}")
-    # 4. Outlier Handling
-    with tab4:
-        st.markdown("### 📈 Handle Outliers")
-        numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
-        if numeric_cols:
-            outlier_col = st.selectbox("Select numeric column", numeric_cols)
-            st.plotly_chart(px.box(df, y=outlier_col, title="Outlier Distribution"))
-            outlier_method = st.radio("Outlier Handling Method", ["Z-score", "IQR", "Manual"], horizontal=True)
-            if st.button("Remove Outliers"):
-                try:
-                    original_df = df.copy()
-                    if outlier_method == "Z-score":
-                        from scipy import stats
-                        z_scores = np.abs(stats.zscore(df[outlier_col]))
-                        df = df[(z_scores < 3)] # Keep only values with zscore less than 3
-                        cleaning_actions.append(f"Removed outliers from {outlier_col} using Z-score (threshold 3)")
-                    elif outlier_method == "IQR":
-                        Q1 = df[outlier_col].quantile(0.25)
-                        Q3 = df[outlier_col].quantile(0.75)
-                        IQR = Q3 - Q1
-                        df = df[~((df[outlier_col] < (Q1 - 1.5 * IQR)) |(df[outlier_col] > (Q3 + 1.5 * IQR)))]
-                        cleaning_actions.append(f"Removed outliers from {outlier_col} using IQR")
-                    elif outlier_method == "Manual":
-                        lower_bound = st.number_input("Lower Bound", value=df[outlier_col].min(), step=1.0)
-                        upper_bound = st.number_input("Upper Bound", value=df[outlier_col].max(), step=1.0)
-                        df = df[(df[outlier_col] >= lower_bound) & (df[outlier_col] <= upper_bound)]
-                        cleaning_actions.append(f"Removed outliers from {outlier_col} using manual bounds")
-                    update_version(df)
-                    st.success("Outliers removed successfully! ✅")
-                except Exception as e:
-                    st.error(f"Outlier removal failed: {str(e)}")
-        else:
-            st.info("ℹ️ No numeric columns found for outlier detection")
     # Drop Column Functionality with Interface
     st.subheader("🗑️ Drop Specific Columns")

     help="Choose the section to navigate to."
 )
+# Initialize df globally
+df = pd.DataFrame()
 # --- Data Upload Page ---
+# Data Upload Page
 if app_mode == "Data Upload":
+    st.title("📥 Smart Data Hub")
     st.markdown("""
+        **Upload your dataset** (CSV, Excel, Parquet) for comprehensive analysis. Get instant data health insights and quality assessment.
     """)
+    # File upload
+    uploaded_file = st.file_uploader("Drag & drop or browse files", type=list(ALLOWED_EXTENSIONS))
     if uploaded_file:
         # Validate file
         if not is_valid:
             st.error(f"Upload error: {message}")
             st.stop()
         # Load data with progress
+        with st.spinner(f"Loading {uploaded_file.name} ..."):
             try:
                 if uploaded_file.name.endswith('.csv'):
                     df = pd.read_csv(uploaded_file, low_memory=False)
                     df = pd.read_parquet(uploaded_file)
                 elif uploaded_file.name.endswith('.feather'):
                     df = pd.read_feather(uploaded_file)
                 st.session_state.raw_data = df
                 st.success("Dataset loaded successfully!")
             except Exception as e:
                 st.error(f"Error loading file: {str(e)}")
                 st.stop()
                 st_profile_report(pr)
+    # Cleaning Operations with Tabs
+st.subheader("🔧 Cleaning Operations")
+tab1, tab2, tab3, tab4 = st.tabs(["Missing Values", "Duplicates", "Data Types", "Outliers"])
+# 1. Missing Value Handling
+with tab1:
+    st.markdown("### 🕳️ Handle Missing Values")
+    missing_cols = df.columns[df.isna().any()].tolist()
+    if missing_cols:
+        st.write("Columns with missing values:")
+        cols = st.multiselect("Select columns to clean", missing_cols, default=missing_cols)
+        method = st.radio("Imputation Method", [
+            "Drop Missing",
+            "Mean/Median/Mode",
+            "KNN Imputation",
+            "MICE Imputation",
+            "Deep Learning Imputation"
+        ], horizontal=True)
+        if method == "Mean/Median/Mode":
+            imputation_choice = st.radio("Select Imputation Method", ["Mean", "Median", "Mode"], horizontal=True)
+        if st.button(f"Apply {method}"):
+            try:
+                original_df = df.copy()
+                if method == "Mean/Median/Mode":
+                    for col in cols:
+                        if df[col].isnull().any():  # Check if missing values exist before imputing
+                            if pd.api.types.is_numeric_dtype(df[col]):
+                                if imputation_choice == "Mean":
+                                    df[col] = df[col].fillna(df[col].mean())
+                                elif imputation_choice == "Median":
+                                    df[col] = df[col].fillna(df[col].median())
+                                elif imputation_choice == "Mode":
                                     df[col] = df[col].fillna(df[col].mode()[0])
+                            else:  # Impute strings with mode
+                                df[col] = df[col].fillna(df[col].mode()[0])
+                # Add logic for other methods here...
+                cleaning_actions.append(f"Applied {method} on {cols}")
                 update_version(df)
+                st.success(f"{method} applied successfully! ✅")
+            except Exception as e:
+                st.error(f"Error: {str(e)}")
+    else:
+        st.success("✨ No missing values found!")
+# 2. Duplicate Handling
+with tab2:
+    st.markdown("### 🔄 Handle Duplicates")
+    duplicates = df.duplicated().sum()
+    if duplicates > 0:
+        st.plotly_chart(px.histogram(df, x=df.duplicated(), title="Duplicate Distribution"))
+        dup_strategy = st.radio("Duplicate Strategy", [
+            "Remove All Duplicates",
+            "Keep First Occurrence",
+            "Keep Last Occurrence"
+        ])
+        if st.button("Handle Duplicates"):
+            original_count = len(df)
+            df = df.drop_duplicates(keep={
+                "Remove All Duplicates": False,
+                "Keep First Occurrence": 'first',
+                "Keep Last Occurrence": 'last'
+            }[dup_strategy])
+            cleaning_actions.append(f"Removed {original_count - len(df)} duplicates")
+            update_version(df)
+            st.success(f"Removed {original_count - len(df)} duplicates! ✅")
+    else:
+        st.success("✨ No duplicates found!")
+# 3. Data Type Conversion
+with tab3:
+    st.markdown("### 🔄 Convert Data Types")
+    col1, col2 = st.columns(2)
+    with col1:
+        st.dataframe(df.dtypes.reset_index().rename(columns={0: 'Type', 'index': 'Column'}))
+    with col2:
+        col_to_convert = st.selectbox("Select column to convert", df.columns)
+        new_type = st.selectbox("New Data Type", [
+            "String", "Integer", "Float",
+            "Boolean", "Datetime", "Category"
+        ])
+        if st.button("Convert Data Type"):
+            try:
+                if new_type == "String":
+                    df[col_to_convert] = df[col_to_convert].astype(str)
+                elif new_type == "Integer":
+                    df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce').astype('Int64')
+                elif new_type == "Float":
+                    df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce')
+                elif new_type == "Boolean":
+                    df[col_to_convert] = df[col_to_convert].astype(bool)
+                elif new_type == "Datetime":
+                    df[col_to_convert] = pd.to_datetime(df[col_to_convert], errors='coerce')
+                elif new_type == "Category":
+                    df[col_to_convert] = df[col_to_convert].astype('category')
+                cleaning_actions.append(f"Converted {col_to_convert} to {new_type}")
+                update_version(df)
+                st.success("Data type converted successfully! ✅")
+            except Exception as e:
+                st.error(f"Conversion failed: {str(e)}")
+# 4. Outlier Handling
+with tab4:
+    st.markdown("### 📈 Handle Outliers")
+    numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
+    if numeric_cols:
+        outlier_col = st.selectbox("Select numeric column", numeric_cols)
+        st.plotly_chart(px.box(df, y=outlier_col, title="Outlier Distribution"))
+        if st.button("Remove Outliers"):
+            # Outlier removal logic here...
+            cleaning_actions.append(f"Removed outliers from {outlier_col}")
+            update_version(df)
+            st.success("Outliers removed successfully! ✅")
+    else:
+        st.info("ℹ️ No numeric columns found for outlier detection")
     # Drop Column Functionality with Interface
     st.subheader("🗑️ Drop Specific Columns")