CosmickVisions committed on
Commit
da4d621
·
verified ·
1 Parent(s): a25447e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +110 -11
app.py CHANGED
@@ -337,7 +337,7 @@ elif app_mode == "Smart Cleaning":
337
  st.subheader("πŸ”§ Cleaning Operations")
338
  tab1, tab2, tab3, tab4 = st.tabs(["Missing Values", "Duplicates", "Data Types", "Outliers"])
339
 
340
- # 1. Missing Value Handling
341
  with tab1:
342
  st.markdown("### πŸ•³οΈ Handle Missing Values")
343
  missing_cols = df.columns[df.isna().any()].tolist()
@@ -346,6 +346,7 @@ elif app_mode == "Smart Cleaning":
346
  cols = st.multiselect("Select columns to clean", missing_cols, default=missing_cols)
347
 
348
  method = st.radio("Imputation Method", [
 
349
  "Drop Missing",
350
  "Mean/Median/Mode",
351
  "KNN Imputation",
@@ -355,11 +356,47 @@ elif app_mode == "Smart Cleaning":
355
 
356
  if st.button(f"Apply {method}"):
357
  try:
358
- original_df = df.copy()
359
- # Imputation logic here...
360
- cleaning_actions.append(f"Applied {method} on {cols}")
361
- update_version(df)
362
- st.success(f"{method} applied successfully! βœ…")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
363
  except Exception as e:
364
  st.error(f"Error: {str(e)}")
365
  else:
@@ -403,7 +440,19 @@ elif app_mode == "Smart Cleaning":
403
  ])
404
  if st.button("Convert Data Type"):
405
  try:
406
- # Conversion logic here...
 
 
 
 
 
 
 
 
 
 
 
 
407
  cleaning_actions.append(f"Converted {col_to_convert} to {new_type}")
408
  update_version(df)
409
  st.success("Data type converted successfully! βœ…")
@@ -417,14 +466,64 @@ elif app_mode == "Smart Cleaning":
417
  if numeric_cols:
418
  outlier_col = st.selectbox("Select numeric column", numeric_cols)
419
  st.plotly_chart(px.box(df, y=outlier_col, title="Outlier Distribution"))
 
420
  if st.button("Remove Outliers"):
421
- # Outlier removal logic here...
422
- cleaning_actions.append(f"Removed outliers from {outlier_col}")
423
- update_version(df)
424
- st.success("Outliers removed successfully! βœ…")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
425
  else:
426
  st.info("ℹ️ No numeric columns found for outlier detection")
427
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
428
  # Save Cleaned Data with Enhanced Feedback
429
  if st.button("πŸ’Ύ Save Cleaned Data"):
430
  st.session_state.cleaned_data = df
 
337
  st.subheader("πŸ”§ Cleaning Operations")
338
  tab1, tab2, tab3, tab4 = st.tabs(["Missing Values", "Duplicates", "Data Types", "Outliers"])
339
 
340
+ # 1. Missing Value Handling
341
  with tab1:
342
  st.markdown("### πŸ•³οΈ Handle Missing Values")
343
  missing_cols = df.columns[df.isna().any()].tolist()
 
346
  cols = st.multiselect("Select columns to clean", missing_cols, default=missing_cols)
347
 
348
  method = st.radio("Imputation Method", [
349
+ "Keep Missing",
350
  "Drop Missing",
351
  "Mean/Median/Mode",
352
  "KNN Imputation",
 
356
 
357
  if st.button(f"Apply {method}"):
358
  try:
359
+ original_df = df.copy() # Store the original df before applying any change
360
+ if missing_value_method == "Drop Missing":
361
+ df = df.dropna(subset=cols) # Drop rows with missing values in selected columns
362
+ cleaning_actions.append(f"Dropped missing values in selected columns")
363
+ elif missing_value_method == "Mean/Median/Mode":
364
+ # Imputation logic here, added to perform the imputation in multiple columns
365
+ for col in cols:
366
+ if df[col].isnull().any(): # Check if missing values exist before imputing
367
+ if pd.api.types.is_numeric_dtype(df[col]):
368
+ df[col] = df[col].fillna(df[col].mean())
369
+ else: # Impute strings with mode
370
+ df[col] = df[col].fillna(df[col].mode()[0])
371
+
372
+ cleaning_actions.append(f"Applied Mean/Median/Mode imputation on {cols}")
373
+
374
+ elif missing_value_method == "KNN Imputation":
375
+ from sklearn.impute import KNNImputer
376
+ imputer = KNNImputer(n_neighbors=5)
377
+ # Ensure numeric data for KNN, select only numeric columns to impute
378
+ numeric_cols = df[cols].select_dtypes(include=np.number).columns
379
+ if not numeric_cols.empty: # Check if there are numeric columns to impute
380
+ df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
381
+ cleaning_actions.append(f"Applied KNN Imputation on {cols}")
382
+ else:
383
+ st.warning("No numeric columns to apply KNN imputation")
384
+ elif missing_value_method == "MICE Imputation":
385
+ from sklearn.impute import IterativeImputer
386
+ # Select numeric columns for MICE
387
+ numeric_cols = df[cols].select_dtypes(include=np.number).columns
388
+ if not numeric_cols.empty: # Check if there are numeric columns to impute
389
+ imputer = IterativeImputer()
390
+ df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
391
+ cleaning_actions.append(f"Applied MICE Imputation on {cols}")
392
+ else:
393
+ st.warning("No numeric columns to apply MICE imputation")
394
+
395
+ elif missing_value_method == "Deep Learning Imputation":
396
+ st.warning("Deep Learning Imputation is not implemented in this example. Please use other methods.")
397
+
398
+ update_version(df) # Update the version after cleaning
399
+ st.success(f"{missing_value_method} applied successfully! βœ…")
400
  except Exception as e:
401
  st.error(f"Error: {str(e)}")
402
  else:
 
440
  ])
441
  if st.button("Convert Data Type"):
442
  try:
443
+ if new_type == "String":
444
+ df[col_to_convert] = df[col_to_convert].astype(str)
445
+ elif new_type == "Integer":
446
+ df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce').astype('Int64')
447
+ elif new_type == "Float":
448
+ df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce')
449
+ elif new_type == "Boolean":
450
+ df[col_to_convert] = df[col_to_convert].astype(bool)
451
+ elif new_type == "Datetime":
452
+ df[col_to_convert] = pd.to_datetime(df[col_to_convert], errors='coerce')
453
+ elif new_type == "Category":
454
+ df[col_to_convert] = df[col_to_convert].astype('category')
455
+
456
  cleaning_actions.append(f"Converted {col_to_convert} to {new_type}")
457
  update_version(df)
458
  st.success("Data type converted successfully! βœ…")
 
466
  if numeric_cols:
467
  outlier_col = st.selectbox("Select numeric column", numeric_cols)
468
  st.plotly_chart(px.box(df, y=outlier_col, title="Outlier Distribution"))
469
+ outlier_method = st.radio("Outlier Handling Method", ["Z-score", "IQR", "Manual"], horizontal=True)
470
  if st.button("Remove Outliers"):
471
+ try:
472
+ original_df = df.copy()
473
+ if outlier_method == "Z-score":
474
+ from scipy import stats
475
+ z_scores = np.abs(stats.zscore(df[outlier_col]))
476
+ df = df[(z_scores < 3)] # Keep only values with zscore less than 3
477
+ cleaning_actions.append(f"Removed outliers from {outlier_col} using Z-score (threshold 3)")
478
+ elif outlier_method == "IQR":
479
+ Q1 = df[outlier_col].quantile(0.25)
480
+ Q3 = df[outlier_col].quantile(0.75)
481
+ IQR = Q3 - Q1
482
+ df = df[~((df[outlier_col] < (Q1 - 1.5 * IQR)) |(df[outlier_col] > (Q3 + 1.5 * IQR)))]
483
+ cleaning_actions.append(f"Removed outliers from {outlier_col} using IQR")
484
+ elif outlier_method == "Manual":
485
+ lower_bound = st.number_input("Lower Bound", value=df[outlier_col].min(), step=1.0)
486
+ upper_bound = st.number_input("Upper Bound", value=df[outlier_col].max(), step=1.0)
487
+ df = df[(df[outlier_col] >= lower_bound) & (df[outlier_col] <= upper_bound)]
488
+ cleaning_actions.append(f"Removed outliers from {outlier_col} using manual bounds")
489
+ update_version(df)
490
+ st.success("Outliers removed successfully! βœ…")
491
+ except Exception as e:
492
+ st.error(f"Outlier removal failed: {str(e)}")
493
  else:
494
  st.info("ℹ️ No numeric columns found for outlier detection")
495
 
496
+ # Drop Column Functionality with Interface
497
+ st.subheader("πŸ—‘οΈ Drop Specific Columns")
498
+ cols_to_drop = st.multiselect("Select Columns to Drop", df.columns)
499
+ if st.button("Drop Selected Columns"):
500
+ try:
501
+ df = df.drop(columns=cols_to_drop) #Drop the cols here.
502
+ cleaning_actions.append(f"Dropped columns: {', '.join(cols_to_drop)}")
503
+ update_version(df)
504
+ st.success(f"Columns dropped successfully! βœ…")
505
+ except (KeyError):
506
+ st.error("Invalid column(s) selected.")
507
+ except Exception as e:
508
+ st.error(f"An unexpected error occurred: {e}")
509
+ # Label Encoding (Categorical to Numeric)
510
+ st.subheader("πŸ”’ Label Encoding")
511
+ if st.button("Encode Categorical Columns"):
512
+ try:
513
+ le = LabelEncoder()
514
+ categorical_cols = df.select_dtypes(include=['object', 'category']).columns
515
+ for col in categorical_cols:
516
+ df[col] = df[col].astype(str) # Ensure all cols are string
517
+ df[col] = le.fit_transform(df[col])
518
+ cleaning_actions.append("Applied Label Encoding to categorical columns")
519
+ update_version(df)
520
+ st.success("Label encoding applied successfully! βœ…")
521
+ except Exception as e:
522
+ st.error(f"Label encoding failed: {str(e)}")
523
+
524
+ # Live Data Preview after every cleaning action
525
+ st.subheader("✨ Live Data Preview")
526
+ st.dataframe(df.head(10)) # show 10 rows
527
  # Save Cleaned Data with Enhanced Feedback
528
  if st.button("πŸ’Ύ Save Cleaned Data"):
529
  st.session_state.cleaned_data = df