Spaces:

CosmickVisions
/

Data-Vision

Sleeping

App Files Files Community

CosmickVisions committed on Feb 28

Commit

2c65c4c

verified ·

1 Parent(s): 30b331d

Update app.py

Browse files

Files changed (1) hide show

app.py +102 -13

app.py CHANGED Viewed

@@ -2,16 +2,19 @@ import streamlit as st
 import pandas as pd
 import numpy as np
 import plotly.express as px
-from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
 from sklearn.linear_model import LinearRegression, LogisticRegression
 from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
 from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
 from sklearn.svm import SVR, SVC
 from sklearn.feature_selection import SelectKBest
 import joblib
 from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, precision_score, recall_score, f1_score
 from sklearn.impute import KNNImputer, SimpleImputer
-from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder
 from sklearn.compose import ColumnTransformer
 from sklearn.pipeline import Pipeline
 from ydata_profiling import ProfileReport
@@ -23,7 +26,6 @@ from io import BytesIO
 import base64
 import mimetypes
 import matplotlib.pyplot as plt
-from sklearn.model_selection import learning_curve
 # Enhanced configuration
 st.set_page_config(
@@ -254,25 +256,85 @@ elif app_mode == "Smart Cleaning":
                 "Drop Missing",
                 "Mean/Median/Mode",
                 "KNN Imputation",
-                "Advanced Imputation"
             ], horizontal=True)
-            if method == "Mean/Median/Mode":
                 strategy = st.selectbox("Strategy", ["mean", "median", "most_frequent"])
                 if st.button("Apply Imputation"):
-                    df[cols] = df[cols].fillna(df[cols].agg(strategy))
-                    cleaning_actions.append(f"Filled missing values in {cols} using {strategy}")
             elif method == "KNN Imputation":
                 n_neighbors = st.slider("Number of neighbors", 2, 15, 5)
                 if st.button("Apply KNN Imputation"):
-                    from sklearn.impute import KNNImputer
-                    imputer = KNNImputer(n_neighbors=n_neighbors)
-                    df[cols] = imputer.fit_transform(df[cols])
-                    cleaning_actions.append(f"Applied KNN imputation (k={n_neighbors}) on {cols}")
-            elif method == "Advanced Imputation":
-                st.write("Coming soon: MICE, Deep Learning imputation")
         else:
             st.success("No missing values found!")
@@ -387,6 +449,33 @@ elif app_mode == "Smart Cleaning":
         else:
             st.info("No text columns found for cleaning")
     # Save Cleaned Data
     if st.button("💾 Save Cleaned Data"):
         st.session_state.cleaned_data = df

 import pandas as pd
 import numpy as np
 import plotly.express as px
+from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, learning_curve
 from sklearn.linear_model import LinearRegression, LogisticRegression
 from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
 from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
 from sklearn.svm import SVR, SVC
 from sklearn.feature_selection import SelectKBest
+from sklearn.experimental import enable_iterative_imputer
+from sklearn.impute import IterativeImputer
+from sklearn.neural_network import MLPRegressor
 import joblib
 from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, precision_score, recall_score, f1_score
 from sklearn.impute import KNNImputer, SimpleImputer
+from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder, LabelEncoder
 from sklearn.compose import ColumnTransformer
 from sklearn.pipeline import Pipeline
 from ydata_profiling import ProfileReport
 import base64
 import mimetypes
 import matplotlib.pyplot as plt
 # Enhanced configuration
 st.set_page_config(
                 "Drop Missing",
                 "Mean/Median/Mode",
                 "KNN Imputation",
+                "MICE Imputation",
+                "Deep Learning Imputation"
             ], horizontal=True)
+            if method == "Drop Missing":
+                if st.button("Apply Drop Missing"):
+                    try:
+                        df.dropna(subset=cols, inplace=True)
+                        cleaning_actions.append(f"Dropped missing values in {cols}")
+                        st.success("Missing values dropped successfully!")
+                    except Exception as e:
+                        st.error(f"Error during dropping missing values: {e}")
+            elif method == "Mean/Median/Mode":
                 strategy = st.selectbox("Strategy", ["mean", "median", "most_frequent"])
                 if st.button("Apply Imputation"):
+                    try:
+                        for col in cols:
+                            if pd.api.types.is_numeric_dtype(df[col]):
+                                if strategy == "most_frequent":
+                                    from sklearn.impute import SimpleImputer
+                                    imputer = SimpleImputer(strategy=strategy)
+                                    df[col] = imputer.fit_transform(df[[col]])
+                                else:
+                                    df[col] = df[col].fillna(df[col].agg(strategy))
+                            else:
+                                st.warning(f"Cannot apply {strategy} to non-numeric column: {col}")
+                        cleaning_actions.append(f"Filled missing values in {cols} using {strategy}")
+                        st.success("Imputation applied successfully!")
+                    except Exception as e:
+                        st.error(f"Error during imputation: {e}")
             elif method == "KNN Imputation":
                 n_neighbors = st.slider("Number of neighbors", 2, 15, 5)
                 if st.button("Apply KNN Imputation"):
+                    try:
+                        from sklearn.impute import KNNImputer
+                        imputer = KNNImputer(n_neighbors=n_neighbors)
+                        df[cols] = imputer.fit_transform(df[cols])
+                        cleaning_actions.append(f"Applied KNN imputation (k={n_neighbors}) on {cols}")
+                        st.success("KNN imputation applied successfully!")
+                    except Exception as e:
+                        st.error(f"Error during KNN imputation: {e}")
+            elif method == "MICE Imputation":
+                if st.button("Apply MICE Imputation"):
+                    try:
+                        from sklearn.experimental import enable_iterative_imputer
+                        from sklearn.impute import IterativeImputer
+                        imputer = IterativeImputer(random_state=42)
+                        df[cols] = imputer.fit_transform(df[cols])
+                        cleaning_actions.append(f"Applied MICE imputation on {cols}")
+                        st.success("MICE imputation applied successfully!")
+                    except Exception as e:
+                        st.error(f"Error during MICE imputation: {e}")
+            elif method == "Deep Learning Imputation":
+                if st.button("Apply Deep Learning Imputation"):
+                    try:
+                        from sklearn.neural_network import MLPRegressor
+                        from sklearn.model_selection import train_test_split
+                        for col in cols:
+                            if pd.api.types.is_numeric_dtype(df[col]):
+                                train_data = df[cols].dropna()
+                                X_train = train_data.drop(columns=[col])
+                                y_train = train_data[col]
+                                model = MLPRegressor(random_state=42)
+                                model.fit(X_train, y_train)
+                                missing_data = df[cols][df[cols][col].isna()]
+                                X_missing = missing_data.drop(columns=[col])
+                                df.loc[df[cols][col].isna(), col] = model.predict(X_missing)
+                        cleaning_actions.append(f"Applied Deep Learning imputation on {cols}")
+                        st.success("Deep Learning imputation applied successfully!")
+                    except Exception as e:
+                        st.error(f"Error during Deep Learning imputation: {e}")
         else:
             st.success("No missing values found!")
         else:
             st.info("No text columns found for cleaning")
+    # 6. Standardization Methods for Categorical Values
+    with st.expander("🔄 Standardize Categorical Values", expanded=True):
+        cat_cols = df.select_dtypes(include='object').columns.tolist()
+        if cat_cols:
+            cat_col = st.selectbox("Select Categorical Column", cat_cols)
+            standardization_method = st.selectbox("Standardization Method", ["Label Encoding", "One-Hot Encoding"])
+            if st.button("Apply Standardization"):
+                try:
+                    if standardization_method == "Label Encoding":
+                        from sklearn.preprocessing import LabelEncoder
+                        le = LabelEncoder()
+                        df[cat_col] = le.fit_transform(df[cat_col])
+                        cleaning_actions.append(f"Applied Label Encoding to {cat_col}")
+                    elif standardization_method == "One-Hot Encoding":
+                        from sklearn.preprocessing import OneHotEncoder
+                        ohe = OneHotEncoder(sparse=False, drop='first')
+                        encoded_cols = ohe.fit_transform(df[[cat_col]])
+                        encoded_df = pd.DataFrame(encoded_cols, columns=ohe.get_feature_names_out([cat_col]))
+                        df = pd.concat([df.drop(columns=[cat_col]), encoded_df], axis=1)
+                        cleaning_actions.append(f"Applied One-Hot Encoding to {cat_col}")
+                    st.success("Standardization applied successfully!")
+                except Exception as e:
+                    st.error(f"Error during standardization: {e}")
+        else:
+            st.info("No categorical columns found for standardization")
     # Save Cleaned Data
     if st.button("💾 Save Cleaned Data"):
         st.session_state.cleaned_data = df