CosmickVisions committed on
Commit
0bc0d5d
·
verified ·
1 Parent(s): 3f24c82

Update app.py

Files changed (1)
  1. app.py +127 -82
app.py CHANGED
@@ -259,85 +259,7 @@ if app_mode == "Data Upload":
     pr = ProfileReport(df, explorative=True, title="Data Upload Report")  # Added title to pandas profiling
     st_profile_report(pr)
 
-elif app_mode == "Smart Cleaning":
-    st.title("🧼 Intelligent Data Cleaning")
-    st.markdown("""
-    **Automated Data Cleaning** with smart suggestions and advanced transformations.
-    Clean your data with confidence using AI-powered recommendations.
-    """)
-
-    if 'raw_data' not in st.session_state or st.session_state.raw_data is None:
-        st.warning("Please upload your data in the Data Upload section first.")
-        st.stop()
-
-    # Initialize versioning
-    if 'data_versions' not in st.session_state:
-        st.session_state.data_versions = [st.session_state.raw_data.copy()]
-        st.session_state.current_version = 0
-
-    def update_version(new_df):
-        st.session_state.data_versions = st.session_state.data_versions[:st.session_state.current_version + 1]
-        st.session_state.data_versions.append(new_df.copy())
-        st.session_state.current_version += 1
-
-    df = st.session_state.data_versions[st.session_state.current_version].copy()
-    cleaning_actions = st.session_state.get('cleaning_actions', [])
-
-    # Version Control with Progress Bar
-    with st.expander("⏪ Version Control", expanded=True):
-        st.caption(f"Current Version: {st.session_state.current_version + 1}/{len(st.session_state.data_versions)}")
-        progress = (st.session_state.current_version + 1) / len(st.session_state.data_versions)
-        st.progress(progress)
-
-        col1, col2 = st.columns(2)
-        with col1:
-            if st.button("⏮️ Undo Last Action", disabled=st.session_state.current_version == 0):
-                st.session_state.current_version -= 1
-                st.experimental_rerun()
-        with col2:
-            if st.button("⏭️ Redo Next Action", disabled=st.session_state.current_version == len(st.session_state.data_versions) - 1):
-                st.session_state.current_version += 1
-                st.experimental_rerun()
-    dtype_counts = df.dtypes.astype(str).value_counts()
-
-    # Data Health Dashboard with Cards
-    st.subheader("📊 Data Health Dashboard")
-    with st.expander("Show Comprehensive Data Report", expanded=True):
-        try:  # Wrap pandas profiling in a try/except
-            pr = ProfileReport(df, title="Cleaned Data Report")  # Added title to pandas profiling report
-            st_profile_report(pr)
-        except ValueError as e:
-            st.error(f"Error generating data report: {e}. This is often caused by an empty or unsuitable dataset; check the dataset or the preceding cleaning steps.")
-            st.stop()  # Stop so the data can be fixed first
-    # Enhanced Health Summary with Cards
-    col1, col2, col3, col4 = st.columns(4)
-    with col1:
-        st.metric("Total Rows", len(df), help="Number of rows in the dataset")
-    with col2:
-        st.metric("Total Columns", len(df.columns), help="Number of columns in the dataset")
-    with col3:
-        missing_pct = df.isna().mean().mean()
-        st.metric("Missing Values", f"{missing_pct:.1%}", help="Percentage of missing values in the dataset")
-    with col4:
-        duplicates = df.duplicated().sum()
-        st.metric("Duplicates", duplicates, help="Number of duplicate rows in the dataset")
-
-    # Visualizations for Data Health
-    st.markdown("### 📈 Data Health Visualizations")
-    col1, col2 = st.columns(2)
-    with col1:
-        st.plotly_chart(px.bar(df.isna().sum(), title="Missing Values per Column",
-                               labels={'index': 'Column', 'value': 'Missing Count'},
-                               color=df.isna().sum(), color_continuous_scale="Bluered"))
-    with col2:
-        st.plotly_chart(px.pie(values=df.dtypes.value_counts().tolist(), names=df.dtypes.value_counts().index.astype(str).tolist(),
-                               title="Data Type Distribution", hole=0.3))
-
-    # Cleaning Operations with Tabs
-    st.subheader("🔧 Cleaning Operations")
-    tab1, tab2, tab3, tab4 = st.tabs(["Missing Values", "Duplicates", "Data Types", "Outliers"])
-
-    # 1. Missing Value Handling
+    # 1. Missing Value Handling
     with tab1:
         st.markdown("### 🕳️ Handle Missing Values")
         missing_cols = df.columns[df.isna().any()].tolist()
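A note on the plumbing the hunks below keep calling: `update_version` and the undo/redo buttons (removed above, presumably re-defined elsewhere in the new file, which this diff does not show) maintain a snapshot stack in `st.session_state`. A minimal sketch of that pattern, assuming a DataFrame is already loaded into `st.session_state.raw_data`; newer Streamlit spells the rerun call `st.rerun()`, where the diff uses the older `st.experimental_rerun()`:

```python
import pandas as pd
import streamlit as st

# One-time setup: seed the version stack with the raw data.
if 'data_versions' not in st.session_state:
    st.session_state.data_versions = [st.session_state.raw_data.copy()]
    st.session_state.current_version = 0

def update_version(new_df: pd.DataFrame) -> None:
    # Truncate any redo history, then push the new snapshot.
    versions = st.session_state.data_versions[:st.session_state.current_version + 1]
    versions.append(new_df.copy())
    st.session_state.data_versions = versions
    st.session_state.current_version += 1

# The working frame is always a copy of the current snapshot.
df = st.session_state.data_versions[st.session_state.current_version].copy()

if st.button("Undo", disabled=st.session_state.current_version == 0):
    st.session_state.current_version -= 1
    st.rerun()
```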
@@ -360,10 +282,10 @@ elif app_mode == "Smart Cleaning":
             if missing_value_method == "Drop Missing":
                 df = df.dropna(subset=cols)  # Drop rows with missing values in selected columns
                 cleaning_actions.append(f"Dropped missing values in selected columns")
-            elif missing_value_method == "Mean/Median/Mode":
+            elif missing_value_method == "Mean/Median/Mode":
                 # Allow the user to select the specific imputation method
                 imputation_choice = st.radio("Select Imputation Method", ["Mean", "Median", "Mode"], horizontal=True)
-
+
                 # Imputation logic here, added to perform the imputation in multiple columns
                 for col in cols:
                     if df[col].isnull().any():  # Check if missing values exist before imputing
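The loop above (continued in the next hunk) fills each selected column according to the radio choice. One thing worth knowing when reading it: `Series.mode()` returns an empty Series for an all-NaN column, so indexing `mode()[0]` would raise there. A standalone per-column version with that guard, for reference:

```python
import pandas as pd

def impute(s: pd.Series, how: str) -> pd.Series:
    """Fill NaNs by 'mean', 'median', or 'mode' (mode for non-numeric columns)."""
    if how == "mean" and pd.api.types.is_numeric_dtype(s):
        return s.fillna(s.mean())
    if how == "median" and pd.api.types.is_numeric_dtype(s):
        return s.fillna(s.median())
    modes = s.mode()                      # empty Series when the column is all-NaN
    return s if modes.empty else s.fillna(modes[0])

s = pd.Series(["a", None, "b", "a"])
print(impute(s, "mode").tolist())         # ['a', 'a', 'b', 'a']
```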
@@ -376,7 +298,7 @@ elif app_mode == "Smart Cleaning":
                             df[col] = df[col].fillna(df[col].mode()[0])
                         else:  # Impute strings with mode
                             df[col] = df[col].fillna(df[col].mode()[0])
-                cleaning_actions.append(f"Applied Mean/Median/Mode imputation on {cols}")
+                cleaning_actions.append(f"Applied Mean/Median/Mode imputation on {cols}")
 
             elif missing_value_method == "KNN Imputation":
                 from sklearn.impute import KNNImputer
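For context on that branch: scikit-learn's `KNNImputer` fills each missing entry from the `n_neighbors` most similar rows, and it only accepts numeric input, so it is typically applied to the numeric subset of the frame. A minimal sketch with made-up columns:

```python
import pandas as pd
from sklearn.impute import KNNImputer

df = pd.DataFrame({"age": [25, None, 40, 38], "income": [50, 60, None, 80]})

num_cols = df.select_dtypes("number").columns
imputer = KNNImputer(n_neighbors=2)
# fit_transform returns a NumPy array; write it back into the frame.
df[num_cols] = imputer.fit_transform(df[num_cols])
print(df)
```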
@@ -409,6 +331,129 @@ elif app_mode == "Smart Cleaning":
             else:
                 st.success("✨ No missing values found!")
 
+    # 2. Duplicate Handling
+    with tab2:
+        st.markdown("### 🔄 Handle Duplicates")
+        duplicates = df.duplicated().sum()
+        if duplicates > 0:
+            st.plotly_chart(px.histogram(df, x=df.duplicated(), title="Duplicate Distribution"))
+            dup_strategy = st.radio("Duplicate Strategy", [
+                "Remove All Duplicates",
+                "Keep First Occurrence",
+                "Keep Last Occurrence"
+            ])
+            if st.button("Handle Duplicates"):
+                original_count = len(df)
+                df = df.drop_duplicates(keep={
+                    "Remove All Duplicates": False,
+                    "Keep First Occurrence": 'first',
+                    "Keep Last Occurrence": 'last'
+                }[dup_strategy])
+                cleaning_actions.append(f"Removed {original_count - len(df)} duplicates")
+                update_version(df)
+                st.success(f"Removed {original_count - len(df)} duplicates! ✅")
+        else:
+            st.success("✨ No duplicates found!")
+
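The dict lookup passed to `keep` above is a compact translation from the radio label to pandas semantics: `'first'` and `'last'` retain one row per duplicate group, while `keep=False` drops every row that has any duplicate. A standalone illustration:

```python
import pandas as pd

df = pd.DataFrame({"x": [1, 1, 2, 3, 3, 3]})

print(len(df.drop_duplicates(keep="first")))  # 3: one row per distinct value
print(len(df.drop_duplicates(keep="last")))   # 3: same count, different rows kept
print(len(df.drop_duplicates(keep=False)))    # 1: only the non-duplicated value 2 survives
```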
+    # 3. Data Type Conversion
+    with tab3:
+        st.markdown("### 🔄 Convert Data Types")
+        col1, col2 = st.columns(2)
+        with col1:
+            st.dataframe(df.dtypes.reset_index().rename(columns={0: 'Type', 'index': 'Column'}))
+        with col2:
+            col_to_convert = st.selectbox("Select column to convert", df.columns)
+            new_type = st.selectbox("New Data Type", [
+                "String", "Integer", "Float",
+                "Boolean", "Datetime", "Category"
+            ])
+            if st.button("Convert Data Type"):
+                try:
+                    if new_type == "String":
+                        df[col_to_convert] = df[col_to_convert].astype(str)
+                    elif new_type == "Integer":
+                        df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce').astype('Int64')
+                    elif new_type == "Float":
+                        df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce')
+                    elif new_type == "Boolean":
+                        df[col_to_convert] = df[col_to_convert].astype(bool)
+                    elif new_type == "Datetime":
+                        df[col_to_convert] = pd.to_datetime(df[col_to_convert], errors='coerce')
+                    elif new_type == "Category":
+                        df[col_to_convert] = df[col_to_convert].astype('category')
+
+                    cleaning_actions.append(f"Converted {col_to_convert} to {new_type}")
+                    update_version(df)
+                    st.success("Data type converted successfully! ✅")
+                except Exception as e:
+                    st.error(f"Conversion failed: {str(e)}")
+
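The Integer and Float branches go through `pd.to_numeric(errors='coerce')`, which turns unparseable values into NaN instead of raising, and the nullable `'Int64'` dtype then keeps integer semantics alongside those NaNs. For example:

```python
import pandas as pd

s = pd.Series(["1", "2", "oops", None])

as_float = pd.to_numeric(s, errors="coerce")  # [1.0, 2.0, NaN, NaN]
as_int = as_float.astype("Int64")             # [1, 2, <NA>, <NA>], nullable integers
print(as_int.dtype)                           # Int64
```

One caveat the try/except will not catch: `astype(bool)` maps every non-empty string, including `"False"`, to `True`.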
+    # 4. Outlier Handling
+    with tab4:
+        st.markdown("### 📈 Handle Outliers")
+        numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
+        if numeric_cols:
+            outlier_col = st.selectbox("Select numeric column", numeric_cols)
+            st.plotly_chart(px.box(df, y=outlier_col, title="Outlier Distribution"))
+            outlier_method = st.radio("Outlier Handling Method", ["Z-score", "IQR", "Manual"], horizontal=True)
+            if st.button("Remove Outliers"):
+                try:
+                    original_df = df.copy()
+                    if outlier_method == "Z-score":
+                        from scipy import stats
+                        z_scores = np.abs(stats.zscore(df[outlier_col]))
+                        df = df[(z_scores < 3)]  # Keep only values with z-score less than 3
+                        cleaning_actions.append(f"Removed outliers from {outlier_col} using Z-score (threshold 3)")
+                    elif outlier_method == "IQR":
+                        Q1 = df[outlier_col].quantile(0.25)
+                        Q3 = df[outlier_col].quantile(0.75)
+                        IQR = Q3 - Q1
+                        df = df[~((df[outlier_col] < (Q1 - 1.5 * IQR)) | (df[outlier_col] > (Q3 + 1.5 * IQR)))]
+                        cleaning_actions.append(f"Removed outliers from {outlier_col} using IQR")
+                    elif outlier_method == "Manual":
+                        lower_bound = st.number_input("Lower Bound", value=df[outlier_col].min(), step=1.0)
+                        upper_bound = st.number_input("Upper Bound", value=df[outlier_col].max(), step=1.0)
+                        df = df[(df[outlier_col] >= lower_bound) & (df[outlier_col] <= upper_bound)]
+                        cleaning_actions.append(f"Removed outliers from {outlier_col} using manual bounds")
+                    update_version(df)
+                    st.success("Outliers removed successfully! ✅")
+                except Exception as e:
+                    st.error(f"Outlier removal failed: {str(e)}")
+        else:
+            st.info("ℹ️ No numeric columns found for outlier detection")
+
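Both automatic filters keep rows inside a band: |z| < 3 for Z-score, and [Q1 - 1.5*IQR, Q3 + 1.5*IQR] for IQR. A standalone IQR filter, for reference:

```python
import pandas as pd

def iqr_filter(df: pd.DataFrame, col: str, k: float = 1.5) -> pd.DataFrame:
    """Drop rows where df[col] falls outside [Q1 - k*IQR, Q3 + k*IQR]."""
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    return df[df[col].between(q1 - k * iqr, q3 + k * iqr)]

df = pd.DataFrame({"v": [1, 2, 2, 3, 2, 100]})
print(iqr_filter(df, "v"))  # the row with 100 is filtered out
```

Two reading notes on the committed code: `scipy.stats.zscore` returns NaN for the whole column if any value is NaN unless `nan_policy='omit'` is passed, and the Manual branch creates its `st.number_input` widgets inside the button handler, so in Streamlit's rerun model the bounds only appear on the run triggered by the click.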
+    # Drop Column Functionality with Interface
+    st.subheader("🗑️ Drop Specific Columns")
+    cols_to_drop = st.multiselect("Select Columns to Drop", df.columns)
+    if st.button("Drop Selected Columns"):
+        try:
+            df = df.drop(columns=cols_to_drop)  # Drop the selected columns here
+            cleaning_actions.append(f"Dropped columns: {', '.join(cols_to_drop)}")
+            update_version(df)
+            st.success(f"Columns dropped successfully! ✅")
+        except (KeyError, ValueError) as e:
+            st.error(f"Invalid column(s) selected or other error: {e}")  # Handle KeyErrors/ValueErrors
+        except Exception as e:
+            st.error(f"An unexpected error occurred: {e}")
+    # Label Encoding (Categorical to Numeric)
+    st.subheader("🔢 Label Encoding")
+    if st.button("Encode Categorical Columns"):
+        try:
+            le = LabelEncoder()
+            categorical_cols = df.select_dtypes(include=['object', 'category']).columns
+            for col in categorical_cols:
+                df[col] = df[col].astype(str)  # Ensure all columns are strings first
+                df[col] = le.fit_transform(df[col])
+            cleaning_actions.append("Applied Label Encoding to categorical columns")
+            update_version(df)
+            st.success("Label encoding applied successfully! ✅")
+        except Exception as e:
+            st.error(f"Label encoding failed: {str(e)}")
+
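One design note on the loop above: a single `LabelEncoder` is refit on every column, so each column's fitted mapping is discarded as soon as the loop moves on. Keeping one encoder per column preserves the ability to decode later; a sketch (the `encoders` dict is an illustrative addition, not part of the commit):

```python
import pandas as pd
from sklearn.preprocessing import LabelEncoder

df = pd.DataFrame({"city": ["NY", "LA", "NY"], "tier": ["a", "b", "a"]})

encoders = {}
for col in df.select_dtypes(include=["object", "category"]).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    encoders[col] = le  # keep the fitted encoder for inverse_transform later

print(df)
print(encoders["city"].inverse_transform(df["city"]))  # ['NY' 'LA' 'NY']
```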
+    # Live Data Preview after every cleaning action
+    st.subheader("✨ Live Data Preview")
+    st.dataframe(df.head(10))  # Show the first 10 rows
+
     # 2. Duplicate Handling
     with tab2:
         st.markdown("### 🔄 Handle Duplicates")