Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -2,16 +2,19 @@ import streamlit as st
|
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
4 |
import plotly.express as px
|
5 |
-
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
|
6 |
from sklearn.linear_model import LinearRegression, LogisticRegression
|
7 |
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
|
8 |
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
|
9 |
from sklearn.svm import SVR, SVC
|
10 |
from sklearn.feature_selection import SelectKBest
|
|
|
|
|
|
|
11 |
import joblib
|
12 |
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, precision_score, recall_score, f1_score
|
13 |
from sklearn.impute import KNNImputer, SimpleImputer
|
14 |
-
from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder
|
15 |
from sklearn.compose import ColumnTransformer
|
16 |
from sklearn.pipeline import Pipeline
|
17 |
from ydata_profiling import ProfileReport
|
@@ -23,7 +26,6 @@ from io import BytesIO
|
|
23 |
import base64
|
24 |
import mimetypes
|
25 |
import matplotlib.pyplot as plt
|
26 |
-
from sklearn.model_selection import learning_curve
|
27 |
|
28 |
# Enhanced configuration
|
29 |
st.set_page_config(
|
@@ -254,25 +256,85 @@ elif app_mode == "Smart Cleaning":
|
|
254 |
"Drop Missing",
|
255 |
"Mean/Median/Mode",
|
256 |
"KNN Imputation",
|
257 |
-
"
|
|
|
258 |
], horizontal=True)
|
259 |
|
260 |
-
if method == "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
261 |
strategy = st.selectbox("Strategy", ["mean", "median", "most_frequent"])
|
262 |
if st.button("Apply Imputation"):
|
263 |
-
|
264 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
265 |
|
266 |
elif method == "KNN Imputation":
|
267 |
n_neighbors = st.slider("Number of neighbors", 2, 15, 5)
|
268 |
if st.button("Apply KNN Imputation"):
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
273 |
|
274 |
-
elif method == "
|
275 |
-
st.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
276 |
else:
|
277 |
st.success("No missing values found!")
|
278 |
|
@@ -387,6 +449,33 @@ elif app_mode == "Smart Cleaning":
|
|
387 |
else:
|
388 |
st.info("No text columns found for cleaning")
|
389 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
390 |
# Save Cleaned Data
|
391 |
if st.button("💾 Save Cleaned Data"):
|
392 |
st.session_state.cleaned_data = df
|
|
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
4 |
import plotly.express as px
|
5 |
+
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, learning_curve
|
6 |
from sklearn.linear_model import LinearRegression, LogisticRegression
|
7 |
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
|
8 |
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
|
9 |
from sklearn.svm import SVR, SVC
|
10 |
from sklearn.feature_selection import SelectKBest
|
11 |
+
from sklearn.experimental import enable_iterative_imputer
|
12 |
+
from sklearn.impute import IterativeImputer
|
13 |
+
from sklearn.neural_network import MLPRegressor
|
14 |
import joblib
|
15 |
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, precision_score, recall_score, f1_score
|
16 |
from sklearn.impute import KNNImputer, SimpleImputer
|
17 |
+
from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder, LabelEncoder
|
18 |
from sklearn.compose import ColumnTransformer
|
19 |
from sklearn.pipeline import Pipeline
|
20 |
from ydata_profiling import ProfileReport
|
|
|
26 |
import base64
|
27 |
import mimetypes
|
28 |
import matplotlib.pyplot as plt
|
|
|
29 |
|
30 |
# Enhanced configuration
|
31 |
st.set_page_config(
|
|
|
256 |
"Drop Missing",
|
257 |
"Mean/Median/Mode",
|
258 |
"KNN Imputation",
|
259 |
+
"MICE Imputation",
|
260 |
+
"Deep Learning Imputation"
|
261 |
], horizontal=True)
|
262 |
|
263 |
+
def _numeric_columns_with_data(frame, columns):
    """Split *columns* into (usable, skipped) for model-based imputers.

    A column is usable when it is numeric and has at least one observed
    value; everything else is returned in ``skipped`` so the caller can tell
    the user about it instead of letting the imputer raise.
    """
    usable = [
        c for c in columns
        if pd.api.types.is_numeric_dtype(frame[c]) and frame[c].notna().any()
    ]
    skipped = [c for c in columns if c not in usable]
    return usable, skipped


# NOTE(review): `df`, `cols`, `method` and `cleaning_actions` come from earlier
# in the script; `cols` is assumed to be a list of column labels of `df`
# selected by the user -- confirm against the widget that produces it.
if method == "Drop Missing":
    if st.button("Apply Drop Missing"):
        try:
            df.dropna(subset=cols, inplace=True)
            cleaning_actions.append(f"Dropped missing values in {cols}")
            st.success("Missing values dropped successfully!")
        except Exception as e:
            st.error(f"Error during dropping missing values: {e}")

elif method == "Mean/Median/Mode":
    strategy = st.selectbox("Strategy", ["mean", "median", "most_frequent"])
    if st.button("Apply Imputation"):
        try:
            imputed_cols = []
            for col in cols:
                observed = df[col].dropna()
                if observed.empty:
                    # Nothing to derive a fill value from.
                    st.warning(f"Column {col} has no observed values; skipped")
                    continue
                if strategy == "most_frequent":
                    # The mode is defined for any dtype, so text columns are
                    # imputed too; on ties the first mode pandas returns wins.
                    fill_value = observed.mode().iloc[0]
                elif pd.api.types.is_numeric_dtype(df[col]):
                    fill_value = observed.agg(strategy)
                else:
                    st.warning(f"Cannot apply {strategy} to non-numeric column: {col}")
                    continue
                df[col] = df[col].fillna(fill_value)
                imputed_cols.append(col)

            # Only log and report success for columns that were really filled.
            if imputed_cols:
                cleaning_actions.append(f"Filled missing values in {imputed_cols} using {strategy}")
                st.success("Imputation applied successfully!")
            else:
                st.info("No columns were imputed")
        except Exception as e:
            st.error(f"Error during imputation: {e}")

elif method == "KNN Imputation":
    n_neighbors = st.slider("Number of neighbors", 2, 15, 5)
    if st.button("Apply KNN Imputation"):
        try:
            from sklearn.impute import KNNImputer

            numeric_cols, skipped_cols = _numeric_columns_with_data(df, cols)
            if skipped_cols:
                st.warning(f"Skipped non-numeric or empty columns: {skipped_cols}")
            if numeric_cols:
                imputer = KNNImputer(n_neighbors=n_neighbors)
                df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
                cleaning_actions.append(f"Applied KNN imputation (k={n_neighbors}) on {numeric_cols}")
                st.success("KNN imputation applied successfully!")
            else:
                st.info("No columns were imputed")
        except Exception as e:
            st.error(f"Error during KNN imputation: {e}")

elif method == "MICE Imputation":
    if st.button("Apply MICE Imputation"):
        try:
            # The experimental flag must be imported before IterativeImputer.
            from sklearn.experimental import enable_iterative_imputer
            from sklearn.impute import IterativeImputer

            numeric_cols, skipped_cols = _numeric_columns_with_data(df, cols)
            if skipped_cols:
                st.warning(f"Skipped non-numeric or empty columns: {skipped_cols}")
            if numeric_cols:
                imputer = IterativeImputer(random_state=42)
                df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
                cleaning_actions.append(f"Applied MICE imputation on {numeric_cols}")
                st.success("MICE imputation applied successfully!")
            else:
                st.info("No columns were imputed")
        except Exception as e:
            st.error(f"Error during MICE imputation: {e}")

elif method == "Deep Learning Imputation":
    if st.button("Apply Deep Learning Imputation"):
        try:
            from sklearn.neural_network import MLPRegressor

            numeric_cols, skipped_cols = _numeric_columns_with_data(df, cols)
            if skipped_cols:
                st.warning(f"Skipped non-numeric or empty columns: {skipped_cols}")

            imputed_cols = []
            # Each target needs at least one other numeric column as a feature.
            if len(numeric_cols) >= 2:
                for col in numeric_cols:
                    missing_mask = df[col].isna()
                    if not missing_mask.any():
                        # predict() rejects an empty sample set.
                        continue

                    # Recomputed per column so rows completed by earlier
                    # iterations can contribute to later models.
                    train_data = df[numeric_cols].dropna()
                    if train_data.empty:
                        st.warning(f"No complete rows available to train a model for {col}; skipped")
                        continue

                    feature_cols = [c for c in numeric_cols if c != col]
                    X_train = train_data[feature_cols]
                    y_train = train_data[col]

                    model = MLPRegressor(random_state=42)
                    model.fit(X_train, y_train)

                    # Rows to fill may also lack other features; the model
                    # cannot take NaN, so stand in the training means.
                    X_missing = df.loc[missing_mask, feature_cols].fillna(X_train.mean())
                    df.loc[missing_mask, col] = model.predict(X_missing)
                    imputed_cols.append(col)
            else:
                st.warning("Deep Learning imputation needs at least two numeric columns")

            if imputed_cols:
                cleaning_actions.append(f"Applied Deep Learning imputation on {imputed_cols}")
                st.success("Deep Learning imputation applied successfully!")
            else:
                st.info("No columns were imputed")
        except Exception as e:
            st.error(f"Error during Deep Learning imputation: {e}")
|
338 |
else:
|
339 |
st.success("No missing values found!")
|
340 |
|
|
|
449 |
else:
|
450 |
st.info("No text columns found for cleaning")
|
451 |
|
452 |
+
# 6. Standardization Methods for Categorical Values
|
453 |
+
# Encode one object-dtype column of `df` either as integer labels or as
# one-hot indicator columns. `df` and `cleaning_actions` come from earlier in
# the script.
with st.expander("🔄 Standardize Categorical Values", expanded=True):
    cat_cols = df.select_dtypes(include='object').columns.tolist()
    if cat_cols:
        cat_col = st.selectbox("Select Categorical Column", cat_cols)
        standardization_method = st.selectbox("Standardization Method", ["Label Encoding", "One-Hot Encoding"])

        if st.button("Apply Standardization"):
            try:
                if standardization_method == "Label Encoding":
                    from sklearn.preprocessing import LabelEncoder

                    le = LabelEncoder()
                    df[cat_col] = le.fit_transform(df[cat_col])
                    cleaning_actions.append(f"Applied Label Encoding to {cat_col}")
                elif standardization_method == "One-Hot Encoding":
                    from sklearn.preprocessing import OneHotEncoder

                    # Newer scikit-learn renamed `sparse` to `sparse_output`
                    # and later dropped the old keyword; try the new name
                    # first and fall back for older installations.
                    try:
                        ohe = OneHotEncoder(sparse_output=False, drop='first')
                    except TypeError:
                        ohe = OneHotEncoder(sparse=False, drop='first')

                    encoded_cols = ohe.fit_transform(df[[cat_col]])
                    # Reuse df's index: concat(axis=1) aligns on index labels,
                    # so a fresh 0..n-1 index would misalign rows whenever df
                    # has had rows dropped earlier.
                    encoded_df = pd.DataFrame(
                        encoded_cols,
                        columns=ohe.get_feature_names_out([cat_col]),
                        index=df.index,
                    )
                    df = pd.concat([df.drop(columns=[cat_col]), encoded_df], axis=1)
                    cleaning_actions.append(f"Applied One-Hot Encoding to {cat_col}")
                st.success("Standardization applied successfully!")
            except Exception as e:
                st.error(f"Error during standardization: {e}")
    else:
        st.info("No categorical columns found for standardization")
|
478 |
+
|
479 |
# Save Cleaned Data
|
480 |
if st.button("💾 Save Cleaned Data"):
|
481 |
st.session_state.cleaned_data = df
|