Update app.py

app.py CHANGED
@@ -259,85 +259,7 @@ if app_mode == "Data Upload":
        pr = ProfileReport(df, explorative=True, title="Data Upload Report")  # Added title to pandas profiling
        st_profile_report(pr)

-
-    st.title("🧼 Intelligent Data Cleaning")
-    st.markdown("""
-    **Automated Data Cleaning** with smart suggestions and advanced transformations.
-    Clean your data with confidence using AI-powered recommendations.
-    """)
-
-    if 'raw_data' not in st.session_state or st.session_state.raw_data is None:
-        st.warning("Please upload your data in the Data Upload section first.")
-        st.stop()
-
-    # Initialize versioning
-    if 'data_versions' not in st.session_state:
-        st.session_state.data_versions = [st.session_state.raw_data.copy()]
-        st.session_state.current_version = 0
-
-    def update_version(new_df):
-        st.session_state.data_versions = st.session_state.data_versions[:st.session_state.current_version+1]
-        st.session_state.data_versions.append(new_df.copy())
-        st.session_state.current_version += 1
-
-    df = st.session_state.data_versions[st.session_state.current_version].copy()
-    cleaning_actions = st.session_state.get('cleaning_actions', [])
-
-    # Version Control with Progress Bar
-    with st.expander("⏪ Version Control", expanded=True):
-        st.caption(f"Current Version: {st.session_state.current_version+1}/{len(st.session_state.data_versions)}")
-        progress = (st.session_state.current_version + 1) / len(st.session_state.data_versions)
-        st.progress(progress)
-
-        col1, col2 = st.columns(2)
-        with col1:
-            if st.button("⏮️ Undo Last Action", disabled=st.session_state.current_version == 0):
-                st.session_state.current_version -= 1
-                st.experimental_rerun()
-        with col2:
-            if st.button("⏭️ Redo Next Action", disabled=st.session_state.current_version == len(st.session_state.data_versions)-1):
-                st.session_state.current_version += 1
-                st.experimental_rerun()
-    dtype_counts = df.dtypes.astype(str).value_counts()
-
-    # Data Health Dashboard with Cards
-    st.subheader("📊 Data Health Dashboard")
-    with st.expander("Show Comprehensive Data Report", expanded=True):
-        try:  # Add a try/except around pandas profiling
-            pr = ProfileReport(df, title="Cleaned Data Report")  # Add title to pandas profiling report
-            st_profile_report(pr)
-        except ValueError as e:
-            st.error(f"Error generating data report: {e}. This can often be caused by an empty or inappropriate dataset. Try checking the dataset or the cleaning steps.")
-            st.stop()  # stop so the user can fix the data
-    # Enhanced Health Summary with Cards
-    col1, col2, col3, col4 = st.columns(4)
-    with col1:
-        st.metric("Total Rows", len(df), help="Number of rows in the dataset")
-    with col2:
-        st.metric("Total Columns", len(df.columns), help="Number of columns in the dataset")
-    with col3:
-        missing_pct = df.isna().mean().mean()
-        st.metric("Missing Values", f"{missing_pct:.1%}", help="Percentage of missing values in the dataset")
-    with col4:
-        duplicates = df.duplicated().sum()
-        st.metric("Duplicates", duplicates, help="Number of duplicate rows in the dataset")
-
-    # Visualizations for Data Health
-    st.markdown("### 📈 Data Health Visualizations")
-    col1, col2 = st.columns(2)
-    with col1:
-        st.plotly_chart(px.bar(df.isna().sum(), title="Missing Values per Column",
-                               labels={'index': 'Column', 'value': 'Missing Count'},
-                               color=df.isna().sum(), color_continuous_scale="Bluered"))
-    with col2:
-        st.plotly_chart(px.pie(values=df.dtypes.value_counts().tolist(), names=df.dtypes.value_counts().index.astype(str).tolist(),
-                               title="Data Type Distribution", hole=0.3))
-
-    # Cleaning Operations with Tabs
-    st.subheader("🔧 Cleaning Operations")
-    tab1, tab2, tab3, tab4 = st.tabs(["Missing Values", "Duplicates", "Data Types", "Outliers"])
-
-    # 1. Missing Value Handling
+    # 1. Missing Value Handling
    with tab1:
        st.markdown("### 🕳️ Handle Missing Values")
        missing_cols = df.columns[df.isna().any()].tolist()
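Aside: the deleted `update_version` helper (still called by the code added below) implements a standard list-plus-cursor undo/redo stack: pushing a new snapshot first truncates any redo history beyond the current cursor. A minimal standalone sketch of the same pattern outside Streamlit (`VersionStack` is an illustrative name, not from the app):

```python
from copy import deepcopy

class VersionStack:
    """List-plus-cursor undo/redo, mirroring update_version in the diff."""
    def __init__(self, initial):
        self.versions = [deepcopy(initial)]
        self.cursor = 0

    def push(self, state):
        # Drop any redo tail beyond the cursor, then append the new snapshot.
        self.versions = self.versions[:self.cursor + 1]
        self.versions.append(deepcopy(state))
        self.cursor += 1

    def undo(self):
        self.cursor = max(0, self.cursor - 1)
        return self.versions[self.cursor]

    def redo(self):
        self.cursor = min(len(self.versions) - 1, self.cursor + 1)
        return self.versions[self.cursor]
```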
@@ -360,10 +282,10 @@ elif app_mode == "Smart Cleaning":
        if missing_value_method == "Drop Missing":
            df = df.dropna(subset=cols)  # Drop rows with missing values in selected columns
            cleaning_actions.append(f"Dropped missing values in selected columns")
-
+        elif missing_value_method == "Mean/Median/Mode":
            # Allow the user to select the specific imputation method
            imputation_choice = st.radio("Select Imputation Method", ["Mean", "Median", "Mode"], horizontal=True)
-
+
            # Imputation logic here, added to perform the imputation in multiple columns
            for col in cols:
                if df[col].isnull().any():  # Check if missing values exist before imputing
@@ -376,7 +298,7 @@ elif app_mode == "Smart Cleaning":
                        df[col] = df[col].fillna(df[col].mode()[0])
                    else:  # Impute strings with mode
                        df[col] = df[col].fillna(df[col].mode()[0])
-
+            cleaning_actions.append(f"Applied Mean/Median/Mode imputation on {cols}")

        elif missing_value_method == "KNN Imputation":
            from sklearn.impute import KNNImputer
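The interior of the imputation loop (new lines 292-297) is elided by the diff; from the surrounding context it dispatches on `imputation_choice`, using mean or median for numeric columns and falling back to mode otherwise. A plausible standalone sketch of that logic, assuming ordinary pandas calls (`impute_columns` is an illustrative name, not from the app):

```python
import pandas as pd

def impute_columns(df: pd.DataFrame, cols, choice: str) -> pd.DataFrame:
    """Fill missing values per column: mean/median for numerics, mode otherwise."""
    out = df.copy()
    for col in cols:
        if not out[col].isnull().any():
            continue  # nothing to impute in this column
        if pd.api.types.is_numeric_dtype(out[col]) and choice == "Mean":
            out[col] = out[col].fillna(out[col].mean())
        elif pd.api.types.is_numeric_dtype(out[col]) and choice == "Median":
            out[col] = out[col].fillna(out[col].median())
        else:  # "Mode", or any non-numeric column
            out[col] = out[col].fillna(out[col].mode()[0])
    return out
```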
@@ -409,6 +331,129 @@ elif app_mode == "Smart Cleaning":
        else:
            st.success("✨ No missing values found!")

+    # 2. Duplicate Handling
+    with tab2:
+        st.markdown("### 🔍 Handle Duplicates")
+        duplicates = df.duplicated().sum()
+        if duplicates > 0:
+            st.plotly_chart(px.histogram(df, x=df.duplicated(), title="Duplicate Distribution"))
+            dup_strategy = st.radio("Duplicate Strategy", [
+                "Remove All Duplicates",
+                "Keep First Occurrence",
+                "Keep Last Occurrence"
+            ])
+            if st.button("Handle Duplicates"):
+                original_count = len(df)
+                df = df.drop_duplicates(keep={
+                    "Remove All Duplicates": False,
+                    "Keep First Occurrence": 'first',
+                    "Keep Last Occurrence": 'last'
+                }[dup_strategy])
+                cleaning_actions.append(f"Removed {original_count - len(df)} duplicates")
+                update_version(df)
+                st.success(f"Removed {original_count - len(df)} duplicates! ✅")
+        else:
+            st.success("✨ No duplicates found!")
+
+    # 3. Data Type Conversion
+    with tab3:
+        st.markdown("### 🔄 Convert Data Types")
+        col1, col2 = st.columns(2)
+        with col1:
+            st.dataframe(df.dtypes.reset_index().rename(columns={0: 'Type', 'index': 'Column'}))
+        with col2:
+            col_to_convert = st.selectbox("Select column to convert", df.columns)
+            new_type = st.selectbox("New Data Type", [
+                "String", "Integer", "Float",
+                "Boolean", "Datetime", "Category"
+            ])
+            if st.button("Convert Data Type"):
+                try:
+                    if new_type == "String":
+                        df[col_to_convert] = df[col_to_convert].astype(str)
+                    elif new_type == "Integer":
+                        df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce').astype('Int64')
+                    elif new_type == "Float":
+                        df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce')
+                    elif new_type == "Boolean":
+                        df[col_to_convert] = df[col_to_convert].astype(bool)
+                    elif new_type == "Datetime":
+                        df[col_to_convert] = pd.to_datetime(df[col_to_convert], errors='coerce')
+                    elif new_type == "Category":
+                        df[col_to_convert] = df[col_to_convert].astype('category')
+
+                    cleaning_actions.append(f"Converted {col_to_convert} to {new_type}")
+                    update_version(df)
+                    st.success("Data type converted successfully! ✅")
+                except Exception as e:
+                    st.error(f"Conversion failed: {str(e)}")
+
+    # 4. Outlier Handling
+    with tab4:
+        st.markdown("### 📉 Handle Outliers")
+        numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
+        if numeric_cols:
+            outlier_col = st.selectbox("Select numeric column", numeric_cols)
+            st.plotly_chart(px.box(df, y=outlier_col, title="Outlier Distribution"))
+            outlier_method = st.radio("Outlier Handling Method", ["Z-score", "IQR", "Manual"], horizontal=True)
+            if st.button("Remove Outliers"):
+                try:
+                    original_df = df.copy()
+                    if outlier_method == "Z-score":
+                        from scipy import stats
+                        z_scores = np.abs(stats.zscore(df[outlier_col]))
+                        df = df[(z_scores < 3)]  # Keep only values with z-score less than 3
+                        cleaning_actions.append(f"Removed outliers from {outlier_col} using Z-score (threshold 3)")
+                    elif outlier_method == "IQR":
+                        Q1 = df[outlier_col].quantile(0.25)
+                        Q3 = df[outlier_col].quantile(0.75)
+                        IQR = Q3 - Q1
+                        df = df[~((df[outlier_col] < (Q1 - 1.5 * IQR)) | (df[outlier_col] > (Q3 + 1.5 * IQR)))]
+                        cleaning_actions.append(f"Removed outliers from {outlier_col} using IQR")
+                    elif outlier_method == "Manual":
+                        lower_bound = st.number_input("Lower Bound", value=df[outlier_col].min(), step=1.0)
+                        upper_bound = st.number_input("Upper Bound", value=df[outlier_col].max(), step=1.0)
+                        df = df[(df[outlier_col] >= lower_bound) & (df[outlier_col] <= upper_bound)]
+                        cleaning_actions.append(f"Removed outliers from {outlier_col} using manual bounds")
+                    update_version(df)
+                    st.success("Outliers removed successfully! ✅")
+                except Exception as e:
+                    st.error(f"Outlier removal failed: {str(e)}")
+        else:
+            st.info("ℹ️ No numeric columns found for outlier detection")
+
+    # Drop Column Functionality with Interface
+    st.subheader("🗑️ Drop Specific Columns")
+    cols_to_drop = st.multiselect("Select Columns to Drop", df.columns)
+    if st.button("Drop Selected Columns"):
+        try:
+            df = df.drop(columns=cols_to_drop)  # Drop the cols here.
+            cleaning_actions.append(f"Dropped columns: {', '.join(cols_to_drop)}")
+            update_version(df)
+            st.success(f"Columns dropped successfully! ✅")
+        except (KeyError, ValueError) as e:
+            st.error(f"Invalid column(s) selected or other error: {e}")  # Handle ValueErrors
+        except Exception as e:
+            st.error(f"An unexpected error occurred: {e}")
+    # Label Encoding (Categorical to Numeric)
+    st.subheader("🔢 Label Encoding")
+    if st.button("Encode Categorical Columns"):
+        try:
+            le = LabelEncoder()
+            categorical_cols = df.select_dtypes(include=['object', 'category']).columns
+            for col in categorical_cols:
+                df[col] = df[col].astype(str)  # Ensure all cols are string
+                df[col] = le.fit_transform(df[col])
+            cleaning_actions.append("Applied Label Encoding to categorical columns")
+            update_version(df)
+            st.success("Label encoding applied successfully! ✅")
+        except Exception as e:
+            st.error(f"Label encoding failed: {str(e)}")
+
+    # Live Data Preview after every cleaning action
+    st.subheader("✨ Live Data Preview")
+    st.dataframe(df.head(10))  # show 10 rows
+
    # 2. Duplicate Handling
    with tab2:
        st.markdown("### 🔍 Handle Duplicates")
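The added duplicate handler maps the radio label straight onto `drop_duplicates`' `keep` argument via a dict lookup; note that `keep=False` discards every member of a duplicate group rather than keeping one representative. (Note also that the hunk's trailing context retains the file's earlier `# 2. Duplicate Handling` block, so the new file ends up with two `with tab2:` sections.) A small self-contained check of the `keep` mapping:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 1, 2]})
print(df.drop_duplicates(keep="first").index.tolist())  # [0, 2]
print(df.drop_duplicates(keep="last").index.tolist())   # [1, 2]
print(df.drop_duplicates(keep=False).index.tolist())    # [2]: both copies of 1 are dropped
```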
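In the added type-conversion tab, the Integer branch pairs `pd.to_numeric(..., errors='coerce')` with pandas' nullable `Int64` dtype, so unparseable values become `<NA>` instead of raising. A quick illustration:

```python
import pandas as pd

s = pd.Series(["1", "2", "oops"])
print(pd.to_numeric(s, errors="coerce").astype("Int64").tolist())
# [1, 2, <NA>]
```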
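The IQR branch keeps rows inside [Q1 - 1.5·IQR, Q3 + 1.5·IQR], the usual Tukey fence. A worked check of the filter on a toy series:

```python
import pandas as pd

s = pd.Series([1, 2, 3, 4, 100])
q1, q3 = s.quantile(0.25), s.quantile(0.75)   # 2.0 and 4.0
iqr = q3 - q1                                  # 2.0, so fences at -1.0 and 7.0
kept = s[(s >= q1 - 1.5 * iqr) & (s <= q3 + 1.5 * iqr)]
print(kept.tolist())  # [1, 2, 3, 4]; 100 falls outside the upper fence
```

One caveat worth noting: the Manual branch creates its `st.number_input` widgets inside the `if st.button(...)` block, so under Streamlit's rerun model they only exist on the run triggered by the click and effectively filter with their default (min/max) values.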
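In the added Label Encoding section, a single `LabelEncoder` is re-fit on each column, so every categorical column receives its own independent integer mapping, and the prior `astype(str)` guards against mixed-type comparison errors during fitting. A compact equivalent:

```python
from sklearn.preprocessing import LabelEncoder
import pandas as pd

df = pd.DataFrame({"color": ["red", "blue", "red"], "size": ["S", "M", "S"]})
le = LabelEncoder()
for col in df.columns:
    df[col] = le.fit_transform(df[col].astype(str))
print(df)
#    color  size
# 0      1     1
# 1      0     0
# 2      1     1
```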
|