Update app.py
app.py CHANGED
@@ -337,7 +337,7 @@ elif app_mode == "Smart Cleaning":
     st.subheader("🧹 Cleaning Operations")
     tab1, tab2, tab3, tab4 = st.tabs(["Missing Values", "Duplicates", "Data Types", "Outliers"])
 
-
+    # 1. Missing Value Handling
     with tab1:
         st.markdown("### 🕳️ Handle Missing Values")
         missing_cols = df.columns[df.isna().any()].tolist()
@@ -346,6 +346,7 @@ elif app_mode == "Smart Cleaning":
         cols = st.multiselect("Select columns to clean", missing_cols, default=missing_cols)
 
         method = st.radio("Imputation Method", [
+            "Keep Missing",
             "Drop Missing",
             "Mean/Median/Mode",
             "KNN Imputation",
@@ -355,11 +356,47 @@ elif app_mode == "Smart Cleaning":
 
         if st.button(f"Apply {method}"):
             try:
-                original_df = df.copy()
-
-
-
-
+                original_df = df.copy()  # Store the original df before applying any change
+                if method == "Drop Missing":
+                    df = df.dropna(subset=cols)  # Drop rows with missing values in the selected columns
+                    cleaning_actions.append("Dropped missing values in selected columns")
+                elif method == "Mean/Median/Mode":
+                    # Impute each selected column: mean for numeric columns, mode for everything else
+                    for col in cols:
+                        if df[col].isnull().any():  # Check if missing values exist before imputing
+                            if pd.api.types.is_numeric_dtype(df[col]):
+                                df[col] = df[col].fillna(df[col].mean())
+                            else:  # Impute strings with the mode
+                                df[col] = df[col].fillna(df[col].mode()[0])
+
+                    cleaning_actions.append(f"Applied Mean/Median/Mode imputation on {cols}")
+
+                elif method == "KNN Imputation":
+                    from sklearn.impute import KNNImputer
+                    imputer = KNNImputer(n_neighbors=5)
+                    # Ensure numeric data for KNN: select only numeric columns to impute
+                    numeric_cols = df[cols].select_dtypes(include=np.number).columns
+                    if not numeric_cols.empty:  # Check if there are numeric columns to impute
+                        df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
+                        cleaning_actions.append(f"Applied KNN Imputation on {cols}")
+                    else:
+                        st.warning("No numeric columns to apply KNN imputation")
+                elif method == "MICE Imputation":
+                    from sklearn.experimental import enable_iterative_imputer  # noqa: F401 -- IterativeImputer is experimental and must be enabled first
+                    from sklearn.impute import IterativeImputer
+                    # Select numeric columns for MICE
+                    numeric_cols = df[cols].select_dtypes(include=np.number).columns
+                    if not numeric_cols.empty:  # Check if there are numeric columns to impute
+                        imputer = IterativeImputer()
+                        df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
+                        cleaning_actions.append(f"Applied MICE Imputation on {cols}")
+                    else:
+                        st.warning("No numeric columns to apply MICE imputation")
+
+                elif method == "Deep Learning Imputation":
+                    st.warning("Deep Learning Imputation is not implemented in this example. Please use other methods.")
+                # "Keep Missing" matches none of the branches above, so the data is left untouched.
+
+                update_version(df)  # Update the version after cleaning
+                st.success(f"{method} applied successfully! ✅")
             except Exception as e:
                 st.error(f"Error: {str(e)}")
         else:
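
The KNN and MICE branches above rely on scikit-learn's KNNImputer and IterativeImputer. As a reference, here is a minimal standalone sketch of both (assuming scikit-learn is installed; IterativeImputer is still experimental and has to be enabled before it can be imported):

import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer  # noqa: F401  (required before the next import)
from sklearn.impute import IterativeImputer

toy = pd.DataFrame({"a": [1.0, np.nan, 3.0, 4.0], "b": [10.0, 20.0, np.nan, 40.0]})

# KNN: fill each gap from the most similar rows (here 2 neighbours)
knn_filled = pd.DataFrame(KNNImputer(n_neighbors=2).fit_transform(toy), columns=toy.columns)

# MICE-style: model each column from the others and iterate until the estimates stabilise
mice_filled = pd.DataFrame(IterativeImputer(random_state=0).fit_transform(toy), columns=toy.columns)

print(knn_filled)
print(mice_filled)

Both imputers return plain NumPy arrays, which is why the hunk assigns the result back into df[numeric_cols] rather than replacing df wholesale.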
@@ -403,7 +440,19 @@ elif app_mode == "Smart Cleaning":
         ])
         if st.button("Convert Data Type"):
             try:
-
+                if new_type == "String":
+                    df[col_to_convert] = df[col_to_convert].astype(str)
+                elif new_type == "Integer":
+                    df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce').astype('Int64')
+                elif new_type == "Float":
+                    df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce')
+                elif new_type == "Boolean":
+                    df[col_to_convert] = df[col_to_convert].astype(bool)
+                elif new_type == "Datetime":
+                    df[col_to_convert] = pd.to_datetime(df[col_to_convert], errors='coerce')
+                elif new_type == "Category":
+                    df[col_to_convert] = df[col_to_convert].astype('category')
+
                 cleaning_actions.append(f"Converted {col_to_convert} to {new_type}")
                 update_version(df)
                 st.success("Data type converted successfully! ✅")
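
In the conversion hunk above, errors='coerce' is what keeps one bad cell from aborting the whole conversion: unparseable values become missing instead of raising. A small pandas-only sketch of that behaviour:

import pandas as pd

s = pd.Series(["1", "2", "oops", None])

as_int = pd.to_numeric(s, errors="coerce").astype("Int64")  # 1, 2, <NA>, <NA> (nullable integer dtype)
as_dt = pd.to_datetime(pd.Series(["2024-01-05", "not a date"]), errors="coerce")  # second value becomes NaT

print(as_int)
print(as_dt)

One caveat in the Boolean branch: .astype(bool) treats every non-empty string as True, including the string "False".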
@@ -417,14 +466,64 @@ elif app_mode == "Smart Cleaning":
         if numeric_cols:
             outlier_col = st.selectbox("Select numeric column", numeric_cols)
             st.plotly_chart(px.box(df, y=outlier_col, title="Outlier Distribution"))
+            outlier_method = st.radio("Outlier Handling Method", ["Z-score", "IQR", "Manual"], horizontal=True)
             if st.button("Remove Outliers"):
-
-
-
-
+                try:
+                    original_df = df.copy()
+                    if outlier_method == "Z-score":
+                        from scipy import stats
+                        z_scores = np.abs(stats.zscore(df[outlier_col]))
+                        df = df[z_scores < 3]  # Keep only rows with a z-score below 3
+                        cleaning_actions.append(f"Removed outliers from {outlier_col} using Z-score (threshold 3)")
+                    elif outlier_method == "IQR":
+                        Q1 = df[outlier_col].quantile(0.25)
+                        Q3 = df[outlier_col].quantile(0.75)
+                        IQR = Q3 - Q1
+                        df = df[~((df[outlier_col] < (Q1 - 1.5 * IQR)) | (df[outlier_col] > (Q3 + 1.5 * IQR)))]
+                        cleaning_actions.append(f"Removed outliers from {outlier_col} using IQR")
+                    elif outlier_method == "Manual":
+                        # Note: widgets created inside this button block only render on the run in which the button was clicked.
+                        lower_bound = st.number_input("Lower Bound", value=df[outlier_col].min(), step=1.0)
+                        upper_bound = st.number_input("Upper Bound", value=df[outlier_col].max(), step=1.0)
+                        df = df[(df[outlier_col] >= lower_bound) & (df[outlier_col] <= upper_bound)]
+                        cleaning_actions.append(f"Removed outliers from {outlier_col} using manual bounds")
+                    update_version(df)
+                    st.success("Outliers removed successfully! ✅")
+                except Exception as e:
+                    st.error(f"Outlier removal failed: {str(e)}")
         else:
             st.info("ℹ️ No numeric columns found for outlier detection")
 
+    # Drop Column Functionality with Interface
+    st.subheader("🗑️ Drop Specific Columns")
+    cols_to_drop = st.multiselect("Select Columns to Drop", df.columns)
+    if st.button("Drop Selected Columns"):
+        try:
+            df = df.drop(columns=cols_to_drop)  # Drop the selected columns here
+            cleaning_actions.append(f"Dropped columns: {', '.join(cols_to_drop)}")
+            update_version(df)
+            st.success("Columns dropped successfully! ✅")
+        except KeyError:
+            st.error("Invalid column(s) selected.")
+        except Exception as e:
+            st.error(f"An unexpected error occurred: {e}")
+    # Label Encoding (Categorical to Numeric)
+    st.subheader("🔢 Label Encoding")
+    if st.button("Encode Categorical Columns"):
+        try:
+            le = LabelEncoder()  # assumes LabelEncoder is imported at the top of app.py
+            categorical_cols = df.select_dtypes(include=['object', 'category']).columns
+            for col in categorical_cols:
+                df[col] = df[col].astype(str)  # Ensure all cols are string
+                df[col] = le.fit_transform(df[col])
+            cleaning_actions.append("Applied Label Encoding to categorical columns")
+            update_version(df)
+            st.success("Label encoding applied successfully! ✅")
+        except Exception as e:
+            st.error(f"Label encoding failed: {str(e)}")
+
+    # Live Data Preview after every cleaning action
+    st.subheader("✨ Live Data Preview")
+    st.dataframe(df.head(10))  # show the first 10 rows
     # Save Cleaned Data with Enhanced Feedback
     if st.button("💾 Save Cleaned Data"):
         st.session_state.cleaned_data = df
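
The IQR branch above applies the usual 1.5 × IQR fences. If the same rule is needed elsewhere in app.py, it could be factored into a small helper along these lines (a sketch only; the helper name is not part of the diff):

import pandas as pd

def iqr_filter(df: pd.DataFrame, column: str, k: float = 1.5) -> pd.DataFrame:
    """Drop rows whose `column` value lies outside the Q1 - k*IQR / Q3 + k*IQR fences."""
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    mask = (df[column] < q1 - k * iqr) | (df[column] > q3 + k * iqr)
    return df[~mask]  # rows where `column` is NaN are kept, matching the hunk above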
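
The hunks call cleaning_actions.append(...), update_version(df) and LabelEncoder() without defining them, so those names are assumed to be set up earlier in app.py. A hypothetical sketch of what that setup could look like (illustrative only; everything beyond the names visible in the diff is a guess):

import streamlit as st
from sklearn.preprocessing import LabelEncoder  # used by the Label Encoding hunk

# Per-session log of the cleaning steps that have been applied.
if "cleaning_actions" not in st.session_state:
    st.session_state.cleaning_actions = []
cleaning_actions = st.session_state.cleaning_actions

def update_version(df):
    """Hypothetical helper: keep a per-session history of dataframe versions."""
    if "versions" not in st.session_state:
        st.session_state.versions = []
    st.session_state.versions.append(df.copy())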
|