CosmickVisions committed
Commit 977f130 · verified · 1 Parent(s): 77d87df

Update app.py

Files changed (1)
  1. app.py +167 -194
app.py CHANGED
@@ -247,26 +247,55 @@ elif app_mode == "Smart Cleaning":
         st.warning("Please upload your data in the Data Upload section first.")
         st.stop()
 
-    df = st.session_state.raw_data.copy()
-    cleaning_actions = []
 
-    # Data Health Summary
-    st.subheader("📊 Data Health Summary")
-    col1, col2, col3 = st.columns(3)
     with col1:
-        missing_pct = df.isna().mean().mean()
-        st.metric("Missing Values", f"{missing_pct:.1%}")
     with col2:
-        duplicates = df.duplicated().sum()
-        st.metric("Duplicates", duplicates)
     with col3:
-        data_types = df.dtypes.value_counts().to_dict()
-        st.metric("Data Types", str(data_types))
 
     # Cleaning Operations
     st.subheader("🔧 Cleaning Operations")
 
-    # 1. Missing Value Handling
     with st.expander("🕳️ Handle Missing Values", expanded=True):
         missing_cols = df.columns[df.isna().any()].tolist()
         if missing_cols:
@@ -281,88 +310,80 @@ elif app_mode == "Smart Cleaning":
                 "Deep Learning Imputation"
             ], horizontal=True)
 
-            if method == "Drop Missing":
-                if st.button("Apply Drop Missing"):
-                    try:
-                        df.dropna(subset=cols, inplace=True)
-                        cleaning_actions.append(f"Dropped missing values in {cols}")
-                        st.success("Missing values dropped successfully!")
-                    except Exception as e:
-                        st.error(f"Error during dropping missing values: {e}")
 
-            elif method == "Mean/Median/Mode":
-                strategy = st.selectbox("Strategy", ["mean", "median", "most_frequent"])
-                if st.button("Apply Imputation"):
-                    try:
                         for col in cols:
                             if pd.api.types.is_numeric_dtype(df[col]):
-                                if strategy == "most_frequent":
-                                    from sklearn.impute import SimpleImputer
-                                    imputer = SimpleImputer(strategy=strategy)
-                                    df[col] = imputer.fit_transform(df[[col]])
-                                else:
-                                    df[col] = df[col].fillna(df[col].agg(strategy))
                             else:
-                                st.warning(f"Cannot apply {strategy} to non-numeric column: {col}")
-                        cleaning_actions.append(f"Filled missing values in {cols} using {strategy}")
-                        st.success("Imputation applied successfully!")
-                    except Exception as e:
-                        st.error(f"Error during imputation: {e}")
-
-            elif method == "KNN Imputation":
-                n_neighbors = st.slider("Number of neighbors", 2, 15, 5)
-                if st.button("Apply KNN Imputation"):
-                    try:
                         from sklearn.impute import KNNImputer
                         imputer = KNNImputer(n_neighbors=n_neighbors)
                         df[cols] = imputer.fit_transform(df[cols])
-                        cleaning_actions.append(f"Applied KNN imputation (k={n_neighbors}) on {cols}")
-                        st.success("KNN imputation applied successfully!")
-                    except Exception as e:
-                        st.error(f"Error during KNN imputation: {e}")
-
-            elif method == "MICE Imputation":
-                if st.button("Apply MICE Imputation"):
-                    try:
                         from sklearn.experimental import enable_iterative_imputer
                         from sklearn.impute import IterativeImputer
                         imputer = IterativeImputer(random_state=42)
                         df[cols] = imputer.fit_transform(df[cols])
-                        cleaning_actions.append(f"Applied MICE imputation on {cols}")
-                        st.success("MICE imputation applied successfully!")
-                    except Exception as e:
-                        st.error(f"Error during MICE imputation: {e}")
-
-            elif method == "Deep Learning Imputation":
-                if st.button("Apply Deep Learning Imputation"):
-                    try:
                         from sklearn.neural_network import MLPRegressor
-                        from sklearn.model_selection import train_test_split
-
                         for col in cols:
-                            if pd.api.types.is_numeric_dtype(df[col]):
-                                train_data = df[cols].dropna()
-                                X_train = train_data.drop(columns=[col])
-                                y_train = train_data[col]
-
-                                model = MLPRegressor(random_state=42)
-                                model.fit(X_train, y_train)
-
-                                missing_data = df[cols][df[cols][col].isna()]
-                                X_missing = missing_data.drop(columns=[col])
-                                df.loc[df[cols][col].isna(), col] = model.predict(X_missing)
-
-                        cleaning_actions.append(f"Applied Deep Learning imputation on {cols}")
-                        st.success("Deep Learning imputation applied successfully!")
-                    except Exception as e:
-                        st.error(f"Error during Deep Learning imputation: {e}")
         else:
-            st.success("No missing values found!")
 
-    # 2. Duplicate Handling
     with st.expander("🔄 Handle Duplicates", expanded=True):
         if duplicates > 0:
-            st.write(f"Found {duplicates} duplicate rows")
             dup_strategy = st.radio("Duplicate Strategy", [
                 "Remove All Duplicates",
                 "Keep First Occurrence",
@@ -370,146 +391,98 @@ elif app_mode == "Smart Cleaning":
             ])
 
             if st.button("Handle Duplicates"):
                 df = df.drop_duplicates(keep={
                     "Remove All Duplicates": False,
                     "Keep First Occurrence": 'first',
                     "Keep Last Occurrence": 'last'
                 }[dup_strategy])
-                cleaning_actions.append(f"Removed duplicates using strategy: {dup_strategy}")
         else:
-            st.success("No duplicates found!")
 
-    # 3. Data Type Conversion
     with st.expander("🔄 Convert Data Types", expanded=True):
-        st.write("Current Data Types:")
-        st.dataframe(df.dtypes.reset_index().rename(columns={
-            0: 'Type',
-            'index': 'Column'
-        }))
-
-        col_to_convert = st.selectbox("Select column to convert", df.columns)
-        new_type = st.selectbox("New Data Type", [
-            "String", "Integer", "Float",
-            "Boolean", "Datetime", "Category"
-        ])
 
-        if st.button("Convert Data Type"):
-            try:
-                if new_type == "String":
-                    df[col_to_convert] = df[col_to_convert].astype(str)
-                elif new_type == "Integer":
-                    df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce').astype('Int64')
-                elif new_type == "Float":
-                    df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce')
-                elif new_type == "Boolean":
-                    df[col_to_convert] = df[col_to_convert].astype(bool)
-                elif new_type == "Datetime":
-                    df[col_to_convert] = pd.to_datetime(df[col_to_convert], errors='coerce')
-                elif new_type == "Category":
-                    df[col_to_convert] = df[col_to_convert].astype('category')
-
-                cleaning_actions.append(f"Converted {col_to_convert} to {new_type}")
-                st.success("Data type converted successfully!")
-            except Exception as e:
-                st.error(f"Conversion failed: {str(e)}")
 
-    # 4. Outlier Detection & Handling
     with st.expander("📈 Handle Outliers", expanded=True):
         numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
         if numeric_cols:
             outlier_col = st.selectbox("Select numeric column", numeric_cols)
-            threshold = st.slider("Outlier Threshold (Z-Score)", 1.0, 5.0, 3.0)
 
-            z_scores = (df[outlier_col] - df[outlier_col].mean()) / df[outlier_col].std()
-            outliers = df[abs(z_scores) > threshold]
 
-            st.write(f"Detected {len(outliers)} outliers")
-            st.dataframe(outliers)
 
-            if st.button("Handle Outliers"):
-                df = df[abs(z_scores) <= threshold]
-                cleaning_actions.append(f"Removed {len(outliers)} outliers from {outlier_col}")
         else:
-            st.info("No numeric columns found for outlier detection")
-
-    # 5. Text Cleaning
-    with st.expander("📝 Clean Text Data", expanded=True):
-        text_cols = df.select_dtypes(include='object').columns.tolist()
-        if text_cols:
-            text_col = st.selectbox("Select text column", text_cols)
-            options = st.multiselect("Text Cleaning Options", [
-                "Lowercase",
-                "Remove Punctuation",
-                "Remove Extra Spaces",
-                "Remove Stopwords",
-                "Stemming"
-            ])
-
-            if st.button("Clean Text"):
-                if "Lowercase" in options:
-                    df[text_col] = df[text_col].str.lower()
-                if "Remove Punctuation" in options:
-                    df[text_col] = df[text_col].str.replace(r'[^\w\s]', '', regex=True)
-                if "Remove Extra Spaces" in options:
-                    df[text_col] = df[text_col].str.strip().str.replace(r'\s+', ' ', regex=True)
-                if "Remove Stopwords" in options:
-                    from nltk.corpus import stopwords
-                    stop_words = set(stopwords.words('english'))
-                    df[text_col] = df[text_col].apply(
-                        lambda x: ' '.join([word for word in x.split() if word not in stop_words])
-                    )
-                if "Stemming" in options:
-                    from nltk.stem import PorterStemmer
-                    stemmer = PorterStemmer()
-                    df[text_col] = df[text_col].apply(
-                        lambda x: ' '.join([stemmer.stem(word) for word in x.split()])
-                    )
-
-                cleaning_actions.append(f"Cleaned text in {text_col}")
-                st.success("Text cleaned successfully!")
-        else:
-            st.info("No text columns found for cleaning")
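Note on the removed text-cleaning block above: `stopwords.words('english')` raises a `LookupError` unless the NLTK corpus has been fetched once. A minimal setup sketch (not part of this commit):

```python
# One-time NLTK setup assumed by the stopword-removal branch above.
import nltk

nltk.download('stopwords')  # fetch the corpus used by stopwords.words('english')
```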
-
-    # 6. Standardization Methods for Categorical Values
-    with st.expander("🔄 Standardize Categorical Values", expanded=True):
-        cat_cols = df.select_dtypes(include='object').columns.tolist()
-        if cat_cols:
-            cat_col = st.selectbox("Select Categorical Column", cat_cols)
-            standardization_method = st.selectbox("Standardization Method", ["Label Encoding", "One-Hot Encoding"])
-
-            if st.button("Apply Standardization"):
-                try:
-                    if standardization_method == "Label Encoding":
-                        from sklearn.preprocessing import LabelEncoder
-                        le = LabelEncoder()
-                        df[cat_col] = le.fit_transform(df[cat_col])
-                        cleaning_actions.append(f"Applied Label Encoding to {cat_col}")
-                    elif standardization_method == "One-Hot Encoding":
-                        from sklearn.preprocessing import OneHotEncoder
-                        ohe = OneHotEncoder(sparse=False, drop='first')
-                        encoded_cols = ohe.fit_transform(df[[cat_col]])
-                        encoded_df = pd.DataFrame(encoded_cols, columns=ohe.get_feature_names_out([cat_col]))
-                        df = pd.concat([df.drop(columns=[cat_col]), encoded_df], axis=1)
-                        cleaning_actions.append(f"Applied One-Hot Encoding to {cat_col}")
-                    st.success("Standardization applied successfully!")
-                except Exception as e:
-                    st.error(f"Error during standardization: {e}")
-        else:
-            st.info("No categorical columns found for standardization")
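Note: `OneHotEncoder(sparse=False, ...)` in the removed block only works on scikit-learn < 1.2; the parameter was renamed `sparse_output` in 1.2, and `sparse` was removed in 1.4. A sketch of the same encoder under the newer spelling, assuming scikit-learn >= 1.2:

```python
# Same encoder as the removed block, with the renamed keyword argument.
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse_output=False, drop='first')  # dense output, drop first level
```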
 
-    # Save Cleaned Data
     if st.button("💾 Save Cleaned Data"):
         st.session_state.cleaned_data = df
-        st.success("Cleaned data saved successfully!")
 
-    # Show Cleaning Log
     st.subheader("📝 Cleaning Log")
-    if cleaning_actions:
-        st.write("### Applied Transformations")
-        for action in cleaning_actions:
-            st.write(f"- {action}")
-    else:
-        st.info("No transformations applied yet")
 
 # Advanced EDA Section
 elif app_mode == "Advanced EDA":
247
  st.warning("Please upload your data in the Data Upload section first.")
248
  st.stop()
249
 
250
+ # Initialize versioning
251
+ if 'data_versions' not in st.session_state:
252
+ st.session_state.data_versions = [st.session_state.raw_data.copy()]
253
+ st.session_state.current_version = 0
254
 
255
+ def update_version(new_df):
256
+ st.session_state.data_versions = st.session_state.data_versions[:st.session_state.current_version+1]
257
+ st.session_state.data_versions.append(new_df.copy())
258
+ st.session_state.current_version += 1
259
+
260
+ df = st.session_state.data_versions[st.session_state.current_version].copy()
261
+ cleaning_actions = st.session_state.get('cleaning_actions', [])
262
+
263
+ # Version Control
264
+ with st.expander("βͺ Version Control", expanded=True):
265
+ col1, col2 = st.columns(2)
266
+ with col1:
267
+ if st.button("Undo Last Action") and st.session_state.current_version > 0:
268
+ st.session_state.current_version -= 1
269
+ st.experimental_rerun()
270
+ with col2:
271
+ if st.button("Redo Next Action") and st.session_state.current_version < len(st.session_state.data_versions)-1:
272
+ st.session_state.current_version += 1
273
+ st.experimental_rerun()
274
+ st.caption(f"Current Version: {st.session_state.current_version+1}/{len(st.session_state.data_versions)}")
275
+
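Note on the versioning added here: `update_version` implements a standard undo stack, truncating any redo tail before pushing the new state. A minimal sketch of that invariant outside Streamlit (toy integers standing in for DataFrames):

```python
# Undo/redo stack: `current` always indexes the live state; pushing after an
# undo discards the redo tail, exactly as update_version() does above.
versions, current = [0], 0

def push(state):
    global versions, current
    versions = versions[:current + 1]  # drop redo tail
    versions.append(state)
    current += 1

push(1); push(2)
current -= 1          # undo -> back to state 1
push(99)              # a new edit discards the old state 2
assert versions == [0, 1, 99] and current == 2
```

Also note that `st.experimental_rerun()` is deprecated on recent Streamlit releases in favour of `st.rerun()`.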
+    # Data Health Dashboard
+    st.subheader("📊 Data Health Dashboard")
+    with st.expander("Show Comprehensive Data Report"):
+        from pandas_profiling import ProfileReport
+        pr = ProfileReport(df, explorative=True)
+        st_profile_report(pr)
+
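Note: `pandas_profiling` is the legacy package name; the project was renamed ydata-profiling and the old import is deprecated. A sketch of the same report under the newer name, assuming (as the commit already does) that `streamlit-pandas-profiling` supplies `st_profile_report`:

```python
# Same profiling expander with the renamed package; df is the app's working DataFrame.
from ydata_profiling import ProfileReport
from streamlit_pandas_profiling import st_profile_report

pr = ProfileReport(df, explorative=True)
st_profile_report(pr)
```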
+    # Enhanced Health Summary
+    col1, col2, col3, col4 = st.columns(4)
     with col1:
+        st.plotly_chart(px.bar(df.isna().sum(), title="Missing Values per Column").update_layout(showlegend=False))
     with col2:
+        st.plotly_chart(px.pie(values=df.dtypes.value_counts(), names=df.dtypes.value_counts().index,
+                               title="Data Type Distribution"))
     with col3:
+        st.metric("Total Rows", len(df))
+    with col4:
+        st.metric("Total Columns", len(df.columns))
 
     # Cleaning Operations
     st.subheader("🔧 Cleaning Operations")
 
+    # 1. Missing Value Handling - Enhanced
     with st.expander("🕳️ Handle Missing Values", expanded=True):
         missing_cols = df.columns[df.isna().any()].tolist()
         if missing_cols:
 
@@ -281,88 +310,80 @@ elif app_mode == "Smart Cleaning":
                 "Deep Learning Imputation"
             ], horizontal=True)
 
+            preview_expander = st.expander("Preview Data Before/After")
 
+            if method in ["KNN Imputation", "MICE Imputation", "Deep Learning Imputation"]:
+                numeric_cols = df[cols].select_dtypes(include=np.number).columns.tolist()
+                if len(numeric_cols) != len(cols):
+                    st.error("Non-numeric columns selected for numeric imputation. Please select only numeric columns.")
+                    st.stop()
+
+            if st.button(f"Apply {method}"):
+                try:
+                    original_df = df.copy()
+
+                    if method == "Drop Missing":
+                        df.dropna(subset=cols, inplace=True)
+                        action_msg = f"Dropped missing values in {cols}"
+
+                    elif method == "Mean/Median/Mode":
+                        strategy = st.selectbox("Strategy", ["mean", "median", "most_frequent"])
                         for col in cols:
                             if pd.api.types.is_numeric_dtype(df[col]):
+                                df[col].fillna(df[col].agg(strategy), inplace=True)
                             else:
+                                df[col].fillna(df[col].mode()[0], inplace=True)
+                        action_msg = f"Filled missing values in {cols} using {strategy}"
+
+                    elif method == "KNN Imputation":
+                        n_neighbors = st.slider("Number of neighbors", 2, 15, 5)
                         from sklearn.impute import KNNImputer
                         imputer = KNNImputer(n_neighbors=n_neighbors)
                         df[cols] = imputer.fit_transform(df[cols])
+                        action_msg = f"Applied KNN imputation (k={n_neighbors}) on {cols}"
+
+                    elif method == "MICE Imputation":
                         from sklearn.experimental import enable_iterative_imputer
                         from sklearn.impute import IterativeImputer
                         imputer = IterativeImputer(random_state=42)
                         df[cols] = imputer.fit_transform(df[cols])
+                        action_msg = f"Applied MICE imputation on {cols}"
+
+                    elif method == "Deep Learning Imputation":
                         from sklearn.neural_network import MLPRegressor
+                        model = MLPRegressor(hidden_layer_sizes=(100,50), max_iter=1000)
                         for col in cols:
+                            temp_df = df.dropna()
+                            X = temp_df.drop(columns=[col])
+                            y = temp_df[col]
+                            model.fit(X, y)
+                            mask = df[col].isna()
+                            df.loc[mask, col] = model.predict(df.loc[mask].drop(columns=[col]))
+                        action_msg = f"Applied Deep Learning imputation on {cols}"
+
+                    with preview_expander:
+                        col1, col2 = st.columns(2)
+                        with col1:
+                            st.write("Before:", original_df[cols].head(10))
+                        with col2:
+                            st.write("After:", df[cols].head(10))
+
+                    cleaning_actions.append(action_msg)
+                    update_version(df)
+                    st.success(f"{method} applied successfully! ✅")
+
+                except Exception as e:
+                    st.error(f"Error: {str(e)}")
+                    st.stop()
        else:
+            st.success("✨ No missing values found!")
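Note on the consolidated `Apply {method}` handler: the `strategy` selectbox and `n_neighbors` slider are created *inside* the `if st.button(...)` branch, so they only render on the single rerun triggered by the click and their values can never actually be chosen. The usual fix is to declare option widgets before the button; a sketch (names mirror the diff; not part of the commit):

```python
# Declare per-method options first so their values persist across reruns...
if method == "Mean/Median/Mode":
    strategy = st.selectbox("Strategy", ["mean", "median", "most_frequent"])
elif method == "KNN Imputation":
    n_neighbors = st.slider("Number of neighbors", 2, 15, 5)

# ...then read them inside the click handler.
if st.button(f"Apply {method}"):
    ...  # imputation runs with the options selected above
```

The Deep Learning branch also reuses a single `MLPRegressor` across columns and trains on `df.dropna()`, which discards any row with a NaN in *any* column, shrinking the training set when several columns have gaps.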
 
+    # 2. Enhanced Duplicate Handling with Visualization
     with st.expander("🔄 Handle Duplicates", expanded=True):
+        duplicates = df.duplicated().sum()
         if duplicates > 0:
+            st.plotly_chart(px.histogram(df, x=df.duplicated(), title="Duplicate Distribution"))
+
             dup_strategy = st.radio("Duplicate Strategy", [
                 "Remove All Duplicates",
                 "Keep First Occurrence",
@@ -370,146 +391,98 @@ elif app_mode == "Smart Cleaning":
             ])
 
             if st.button("Handle Duplicates"):
+                original_count = len(df)
                 df = df.drop_duplicates(keep={
                     "Remove All Duplicates": False,
                     "Keep First Occurrence": 'first',
                     "Keep Last Occurrence": 'last'
                 }[dup_strategy])
+
+                st.plotly_chart(px.bar(x=["Before", "After"],
+                                       y=[original_count, len(df)],
+                                       title="Row Count Comparison"))
+
+                cleaning_actions.append(f"Removed {original_count - len(df)} duplicates")
+                update_version(df)
+                st.success(f"Removed {original_count - len(df)} duplicates! ✅")
         else:
+            st.success("✨ No duplicates found!")
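Note: `px.histogram(df, x=df.duplicated(), ...)` plots the raw boolean flag as the axis labels. A variant with readable labels, as a sketch using the app's existing `st`/`px`/`df` (not part of the commit):

```python
# Count duplicate vs. unique rows and plot with labeled bars.
import plotly.express as px  # already imported by the app

flag_counts = (df.duplicated()
                 .value_counts()
                 .rename({False: "Unique", True: "Duplicate"}))
st.plotly_chart(px.bar(flag_counts, title="Duplicate vs. Unique Rows"))
```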
 
+    # 3. Enhanced Data Type Conversion with Preview
     with st.expander("🔄 Convert Data Types", expanded=True):
+        col1, col2 = st.columns(2)
+        with col1:
+            st.dataframe(df.dtypes.reset_index().rename(columns={0: 'Type', 'index': 'Column'}))
 
+        with col2:
+            col_to_convert = st.selectbox("Select column to convert", df.columns)
+            new_type = st.selectbox("New Data Type", [
+                "String", "Integer", "Float",
+                "Boolean", "Datetime", "Category"
+            ])
+
+            if st.button("Convert Data Type"):
+                try:
+                    original_dtype = str(df[col_to_convert].dtype)
+
+                    # Conversion logic...
+
+                    st.write("Conversion Summary:")
+                    st.table(pd.DataFrame({
+                        "Column": [col_to_convert],
+                        "Original Type": [original_dtype],
+                        "New Type": [new_type]
+                    }))
+
+                    cleaning_actions.append(f"Converted {col_to_convert} to {new_type}")
+                    update_version(df)
+                    st.success("Data type converted successfully! ✅")
+
+                except Exception as e:
+                    st.error(f"Conversion failed: {str(e)}")
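Note: the `# Conversion logic...` placeholder means this version reports success without ever casting the column. The removed side of the diff implements the cast and could slot in at the placeholder:

```python
# Conversion chain from the removed block (left side of this diff).
if new_type == "String":
    df[col_to_convert] = df[col_to_convert].astype(str)
elif new_type == "Integer":
    df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce').astype('Int64')
elif new_type == "Float":
    df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce')
elif new_type == "Boolean":
    df[col_to_convert] = df[col_to_convert].astype(bool)
elif new_type == "Datetime":
    df[col_to_convert] = pd.to_datetime(df[col_to_convert], errors='coerce')
elif new_type == "Category":
    df[col_to_convert] = df[col_to_convert].astype('category')
```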
 
+    # 4. Enhanced Outlier Handling with Visualization
     with st.expander("📈 Handle Outliers", expanded=True):
         numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
         if numeric_cols:
             outlier_col = st.selectbox("Select numeric column", numeric_cols)
 
+            col1, col2 = st.columns(2)
+            with col1:
+                st.plotly_chart(px.box(df, y=outlier_col, title="Original Distribution"))
+            with col2:
+                st.plotly_chart(px.histogram(df, x=outlier_col, title="Value Distribution"))
 
+            # Outlier handling logic...
 
         else:
+            st.info("ℹ️ No numeric columns found for outlier detection")
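Note: `# Outlier handling logic...` likewise elides the actual filter. The removed side detected and dropped outliers by z-score; a sketch restoring it in the new structure (the `update_version` call is my assumption, added to match the other operations):

```python
# Z-score outlier filter from the removed block.
threshold = st.slider("Outlier Threshold (Z-Score)", 1.0, 5.0, 3.0)
z_scores = (df[outlier_col] - df[outlier_col].mean()) / df[outlier_col].std()
outliers = df[abs(z_scores) > threshold]
st.write(f"Detected {len(outliers)} outliers")

if st.button("Handle Outliers"):
    df = df[abs(z_scores) <= threshold]
    cleaning_actions.append(f"Removed {len(outliers)} outliers from {outlier_col}")
    update_version(df)  # assumption: version this step like the others
```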
 
+    # Save Cleaned Data with Enhanced Feedback
     if st.button("💾 Save Cleaned Data"):
         st.session_state.cleaned_data = df
+        st.balloons()
 
+        # Generate comprehensive report
+        from pandas_profiling import ProfileReport
+        pr = ProfileReport(df, title="Cleaned Data Report")
+        st_profile_report(pr)
+
+    # Show cleaning log with diffs
     st.subheader("📝 Cleaning Log")
+    st.table(pd.DataFrame({
+        "Step": range(1, len(cleaning_actions)+1),
+        "Action": cleaning_actions
+    }))
+
+    # Show dataset comparison
+    col1, col2 = st.columns(2)
+    with col1:
+        st.write("Original Data Shape:", st.session_state.raw_data.shape)
+    with col2:
+        st.write("Cleaned Data Shape:", df.shape)
+
+    st.success("✅ Cleaned data saved successfully! You can now proceed to analysis.")
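Note: `cleaning_actions` is read from session state (`st.session_state.get('cleaning_actions', [])`) but never written back anywhere in this diff, so the log resets on every rerun. A minimal write-back sketch (hypothetical; not part of the commit):

```python
# After appending an action, persist the log so it survives Streamlit reruns.
cleaning_actions.append(action_msg)
st.session_state.cleaning_actions = cleaning_actions
```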
 
 # Advanced EDA Section
 elif app_mode == "Advanced EDA":