CosmickVisions committed
Commit 2c65c4c · verified · 1 Parent(s): 30b331d

Update app.py

Files changed (1)
  1. app.py +102 -13
app.py CHANGED
@@ -2,16 +2,19 @@ import streamlit as st
 import pandas as pd
 import numpy as np
 import plotly.express as px
-from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
+from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, learning_curve
 from sklearn.linear_model import LinearRegression, LogisticRegression
 from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
 from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
 from sklearn.svm import SVR, SVC
 from sklearn.feature_selection import SelectKBest
+from sklearn.experimental import enable_iterative_imputer
+from sklearn.impute import IterativeImputer
+from sklearn.neural_network import MLPRegressor
 import joblib
 from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, precision_score, recall_score, f1_score
 from sklearn.impute import KNNImputer, SimpleImputer
-from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder
+from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder, LabelEncoder
 from sklearn.compose import ColumnTransformer
 from sklearn.pipeline import Pipeline
 from ydata_profiling import ProfileReport
@@ -23,7 +26,6 @@ from io import BytesIO
 import base64
 import mimetypes
 import matplotlib.pyplot as plt
-from sklearn.model_selection import learning_curve
 
 # Enhanced configuration
 st.set_page_config(
@@ -254,25 +256,85 @@ elif app_mode == "Smart Cleaning":
                 "Drop Missing",
                 "Mean/Median/Mode",
                 "KNN Imputation",
-                "Advanced Imputation"
+                "MICE Imputation",
+                "Deep Learning Imputation"
             ], horizontal=True)
 
-            if method == "Mean/Median/Mode":
+            if method == "Drop Missing":
+                if st.button("Apply Drop Missing"):
+                    try:
+                        df.dropna(subset=cols, inplace=True)
+                        cleaning_actions.append(f"Dropped missing values in {cols}")
+                        st.success("Missing values dropped successfully!")
+                    except Exception as e:
+                        st.error(f"Error during dropping missing values: {e}")
+
+            elif method == "Mean/Median/Mode":
                 strategy = st.selectbox("Strategy", ["mean", "median", "most_frequent"])
                 if st.button("Apply Imputation"):
-                    df[cols] = df[cols].fillna(df[cols].agg(strategy))
-                    cleaning_actions.append(f"Filled missing values in {cols} using {strategy}")
+                    try:
+                        for col in cols:
+                            if pd.api.types.is_numeric_dtype(df[col]):
+                                if strategy == "most_frequent":
+                                    from sklearn.impute import SimpleImputer
+                                    imputer = SimpleImputer(strategy=strategy)
+                                    df[col] = imputer.fit_transform(df[[col]])
+                                else:
+                                    df[col] = df[col].fillna(df[col].agg(strategy))
+                            else:
+                                st.warning(f"Cannot apply {strategy} to non-numeric column: {col}")
+                        cleaning_actions.append(f"Filled missing values in {cols} using {strategy}")
+                        st.success("Imputation applied successfully!")
+                    except Exception as e:
+                        st.error(f"Error during imputation: {e}")
 
             elif method == "KNN Imputation":
                 n_neighbors = st.slider("Number of neighbors", 2, 15, 5)
                 if st.button("Apply KNN Imputation"):
-                    from sklearn.impute import KNNImputer
-                    imputer = KNNImputer(n_neighbors=n_neighbors)
-                    df[cols] = imputer.fit_transform(df[cols])
-                    cleaning_actions.append(f"Applied KNN imputation (k={n_neighbors}) on {cols}")
+                    try:
+                        from sklearn.impute import KNNImputer
+                        imputer = KNNImputer(n_neighbors=n_neighbors)
+                        df[cols] = imputer.fit_transform(df[cols])
+                        cleaning_actions.append(f"Applied KNN imputation (k={n_neighbors}) on {cols}")
+                        st.success("KNN imputation applied successfully!")
+                    except Exception as e:
+                        st.error(f"Error during KNN imputation: {e}")
+
+            elif method == "MICE Imputation":
+                if st.button("Apply MICE Imputation"):
+                    try:
+                        from sklearn.experimental import enable_iterative_imputer
+                        from sklearn.impute import IterativeImputer
+                        imputer = IterativeImputer(random_state=42)
+                        df[cols] = imputer.fit_transform(df[cols])
+                        cleaning_actions.append(f"Applied MICE imputation on {cols}")
+                        st.success("MICE imputation applied successfully!")
+                    except Exception as e:
+                        st.error(f"Error during MICE imputation: {e}")
 
-            elif method == "Advanced Imputation":
-                st.write("Coming soon: MICE, Deep Learning imputation")
+            elif method == "Deep Learning Imputation":
+                if st.button("Apply Deep Learning Imputation"):
+                    try:
+                        from sklearn.neural_network import MLPRegressor
+                        from sklearn.model_selection import train_test_split
+
+                        for col in cols:
+                            if pd.api.types.is_numeric_dtype(df[col]):
+                                train_data = df[cols].dropna()
+                                X_train = train_data.drop(columns=[col])
+                                y_train = train_data[col]
+
+                                model = MLPRegressor(random_state=42)
+                                model.fit(X_train, y_train)
+
+                                missing_data = df[cols][df[cols][col].isna()]
+                                X_missing = missing_data.drop(columns=[col])
+                                df.loc[df[cols][col].isna(), col] = model.predict(X_missing)
+
+                        cleaning_actions.append(f"Applied Deep Learning imputation on {cols}")
+                        st.success("Deep Learning imputation applied successfully!")
+                    except Exception as e:
+                        st.error(f"Error during Deep Learning imputation: {e}")
         else:
            st.success("No missing values found!")
 
@@ -387,6 +449,33 @@ elif app_mode == "Smart Cleaning":
         else:
             st.info("No text columns found for cleaning")
 
+    # 6. Standardization Methods for Categorical Values
+    with st.expander("🔄 Standardize Categorical Values", expanded=True):
+        cat_cols = df.select_dtypes(include='object').columns.tolist()
+        if cat_cols:
+            cat_col = st.selectbox("Select Categorical Column", cat_cols)
+            standardization_method = st.selectbox("Standardization Method", ["Label Encoding", "One-Hot Encoding"])
+
+            if st.button("Apply Standardization"):
+                try:
+                    if standardization_method == "Label Encoding":
+                        from sklearn.preprocessing import LabelEncoder
+                        le = LabelEncoder()
+                        df[cat_col] = le.fit_transform(df[cat_col])
+                        cleaning_actions.append(f"Applied Label Encoding to {cat_col}")
+                    elif standardization_method == "One-Hot Encoding":
+                        from sklearn.preprocessing import OneHotEncoder
+                        ohe = OneHotEncoder(sparse=False, drop='first')
+                        encoded_cols = ohe.fit_transform(df[[cat_col]])
+                        encoded_df = pd.DataFrame(encoded_cols, columns=ohe.get_feature_names_out([cat_col]))
+                        df = pd.concat([df.drop(columns=[cat_col]), encoded_df], axis=1)
+                        cleaning_actions.append(f"Applied One-Hot Encoding to {cat_col}")
+                    st.success("Standardization applied successfully!")
+                except Exception as e:
+                    st.error(f"Error during standardization: {e}")
+        else:
+            st.info("No categorical columns found for standardization")
+
     # Save Cleaned Data
     if st.button("💾 Save Cleaned Data"):
         st.session_state.cleaned_data = df
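
Note: the new "MICE Imputation" option relies on scikit-learn's IterativeImputer, which models each column with gaps as a function of the other selected columns and fills values iteratively. A minimal standalone sketch of the same call pattern used in the diff (the column names here are hypothetical, not from the app):

import pandas as pd
from sklearn.experimental import enable_iterative_imputer  # noqa: F401 -- required to expose IterativeImputer
from sklearn.impute import IterativeImputer

# Hypothetical numeric frame with gaps, standing in for df[cols] in the app
demo = pd.DataFrame({"age": [25, None, 40, 31], "income": [50_000, 62_000, None, 58_000]})

imputer = IterativeImputer(random_state=42)
# fit_transform returns a NumPy array; assign it back to keep the DataFrame's column labels
demo[["age", "income"]] = imputer.fit_transform(demo[["age", "income"]])
print(demo)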
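
Note: the One-Hot Encoding branch constructs OneHotEncoder(sparse=False, drop='first'). The sparse argument was deprecated in scikit-learn 1.2 in favour of sparse_output and removed in 1.4, so on recent releases that constructor call raises a TypeError. A version-tolerant sketch of the same encoding step, assuming hypothetical data (the df/cat_col names mirror the diff; the fallback logic is an assumption, not part of the commit):

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

df = pd.DataFrame({"city": ["NY", "LA", "NY", "SF"]})  # hypothetical data
cat_col = "city"

try:
    # scikit-learn >= 1.2: dense output is requested with sparse_output
    ohe = OneHotEncoder(sparse_output=False, drop="first")
except TypeError:
    # older scikit-learn releases only accept the sparse keyword
    ohe = OneHotEncoder(sparse=False, drop="first")

encoded = ohe.fit_transform(df[[cat_col]])
encoded_df = pd.DataFrame(encoded, columns=ohe.get_feature_names_out([cat_col]), index=df.index)
df = pd.concat([df.drop(columns=[cat_col]), encoded_df], axis=1)
print(df)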