Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -235,7 +235,6 @@ if app_mode == "Data Upload":
|
|
235 |
pr = ProfileReport(df, explorative=True)
|
236 |
st_profile_report(pr)
|
237 |
|
238 |
-
# Smart Cleaning Section
|
239 |
elif app_mode == "Smart Cleaning":
|
240 |
st.title("🧼 Intelligent Data Cleaning")
|
241 |
st.markdown("""
|
@@ -260,43 +259,60 @@ elif app_mode == "Smart Cleaning":
|
|
260 |
df = st.session_state.data_versions[st.session_state.current_version].copy()
|
261 |
cleaning_actions = st.session_state.get('cleaning_actions', [])
|
262 |
|
263 |
-
# Version Control
|
264 |
with st.expander("⏪ Version Control", expanded=True):
|
|
|
|
|
|
|
|
|
265 |
col1, col2 = st.columns(2)
|
266 |
with col1:
|
267 |
-
if st.button("Undo Last Action"
|
268 |
st.session_state.current_version -= 1
|
269 |
st.experimental_rerun()
|
270 |
with col2:
|
271 |
-
if st.button("Redo Next Action"
|
272 |
st.session_state.current_version += 1
|
273 |
st.experimental_rerun()
|
274 |
-
st.caption(f"Current Version: {st.session_state.current_version+1}/{len(st.session_state.data_versions)}")
|
275 |
|
276 |
-
# Data Health Dashboard
|
277 |
st.subheader("📊 Data Health Dashboard")
|
278 |
-
with st.expander("Show Comprehensive Data Report"):
|
279 |
from pandas_profiling import ProfileReport
|
280 |
pr = ProfileReport(df, explorative=True)
|
281 |
st_profile_report(pr)
|
282 |
|
283 |
-
# Enhanced Health Summary
|
284 |
col1, col2, col3, col4 = st.columns(4)
|
285 |
with col1:
|
286 |
-
st.
|
287 |
with col2:
|
288 |
-
st.
|
289 |
-
title="Data Type Distribution"))
|
290 |
with col3:
|
291 |
-
|
|
|
292 |
with col4:
|
293 |
-
|
|
|
294 |
|
295 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
296 |
st.subheader("🔧 Cleaning Operations")
|
297 |
-
|
298 |
-
|
299 |
-
|
|
|
|
|
300 |
missing_cols = df.columns[df.isna().any()].tolist()
|
301 |
if missing_cols:
|
302 |
st.write("Columns with missing values:")
|
@@ -310,86 +326,29 @@ elif app_mode == "Smart Cleaning":
|
|
310 |
"Deep Learning Imputation"
|
311 |
], horizontal=True)
|
312 |
|
313 |
-
preview_expander = st.expander("Preview Data Before/After")
|
314 |
-
|
315 |
-
if method in ["KNN Imputation", "MICE Imputation", "Deep Learning Imputation"]:
|
316 |
-
numeric_cols = df[cols].select_dtypes(include=np.number).columns.tolist()
|
317 |
-
if len(numeric_cols) != len(cols):
|
318 |
-
st.error("Non-numeric columns selected for numeric imputation. Please select only numeric columns.")
|
319 |
-
st.stop()
|
320 |
-
|
321 |
if st.button(f"Apply {method}"):
|
322 |
try:
|
323 |
original_df = df.copy()
|
324 |
-
|
325 |
-
|
326 |
-
df.dropna(subset=cols, inplace=True)
|
327 |
-
action_msg = f"Dropped missing values in {cols}"
|
328 |
-
|
329 |
-
elif method == "Mean/Median/Mode":
|
330 |
-
strategy = st.selectbox("Strategy", ["mean", "median", "most_frequent"])
|
331 |
-
for col in cols:
|
332 |
-
if pd.api.types.is_numeric_dtype(df[col]):
|
333 |
-
df[col].fillna(df[col].agg(strategy), inplace=True)
|
334 |
-
else:
|
335 |
-
df[col].fillna(df[col].mode()[0], inplace=True)
|
336 |
-
action_msg = f"Filled missing values in {cols} using {strategy}"
|
337 |
-
|
338 |
-
elif method == "KNN Imputation":
|
339 |
-
n_neighbors = st.slider("Number of neighbors", 2, 15, 5)
|
340 |
-
from sklearn.impute import KNNImputer
|
341 |
-
imputer = KNNImputer(n_neighbors=n_neighbors)
|
342 |
-
df[cols] = imputer.fit_transform(df[cols])
|
343 |
-
action_msg = f"Applied KNN imputation (k={n_neighbors}) on {cols}"
|
344 |
-
|
345 |
-
elif method == "MICE Imputation":
|
346 |
-
from sklearn.experimental import enable_iterative_imputer
|
347 |
-
from sklearn.impute import IterativeImputer
|
348 |
-
imputer = IterativeImputer(random_state=42)
|
349 |
-
df[cols] = imputer.fit_transform(df[cols])
|
350 |
-
action_msg = f"Applied MICE imputation on {cols}"
|
351 |
-
|
352 |
-
elif method == "Deep Learning Imputation":
|
353 |
-
from sklearn.neural_network import MLPRegressor
|
354 |
-
model = MLPRegressor(hidden_layer_sizes=(100,50), max_iter=1000)
|
355 |
-
for col in cols:
|
356 |
-
temp_df = df.dropna()
|
357 |
-
X = temp_df.drop(columns=[col])
|
358 |
-
y = temp_df[col]
|
359 |
-
model.fit(X, y)
|
360 |
-
mask = df[col].isna()
|
361 |
-
df.loc[mask, col] = model.predict(df.loc[mask].drop(columns=[col]))
|
362 |
-
action_msg = f"Applied Deep Learning imputation on {cols}"
|
363 |
-
|
364 |
-
with preview_expander:
|
365 |
-
col1, col2 = st.columns(2)
|
366 |
-
with col1:
|
367 |
-
st.write("Before:", original_df[cols].head(10))
|
368 |
-
with col2:
|
369 |
-
st.write("After:", df[cols].head(10))
|
370 |
-
|
371 |
-
cleaning_actions.append(action_msg)
|
372 |
update_version(df)
|
373 |
st.success(f"{method} applied successfully! ✅")
|
374 |
-
|
375 |
except Exception as e:
|
376 |
st.error(f"Error: {str(e)}")
|
377 |
-
st.stop()
|
378 |
else:
|
379 |
st.success("✨ No missing values found!")
|
380 |
|
381 |
-
# 2.
|
382 |
-
with
|
|
|
383 |
duplicates = df.duplicated().sum()
|
384 |
if duplicates > 0:
|
385 |
st.plotly_chart(px.histogram(df, x=df.duplicated(), title="Duplicate Distribution"))
|
386 |
-
|
387 |
dup_strategy = st.radio("Duplicate Strategy", [
|
388 |
"Remove All Duplicates",
|
389 |
"Keep First Occurrence",
|
390 |
"Keep Last Occurrence"
|
391 |
])
|
392 |
-
|
393 |
if st.button("Handle Duplicates"):
|
394 |
original_count = len(df)
|
395 |
df = df.drop_duplicates(keep={
|
@@ -397,64 +356,45 @@ elif app_mode == "Smart Cleaning":
|
|
397 |
"Keep First Occurrence": 'first',
|
398 |
"Keep Last Occurrence": 'last'
|
399 |
}[dup_strategy])
|
400 |
-
|
401 |
-
st.plotly_chart(px.bar(x=["Before", "After"],
|
402 |
-
y=[original_count, len(df)],
|
403 |
-
title="Row Count Comparison"))
|
404 |
-
|
405 |
cleaning_actions.append(f"Removed {original_count - len(df)} duplicates")
|
406 |
update_version(df)
|
407 |
st.success(f"Removed {original_count - len(df)} duplicates! ✅")
|
408 |
else:
|
409 |
st.success("✨ No duplicates found!")
|
410 |
|
411 |
-
# 3.
|
412 |
-
with
|
|
|
413 |
col1, col2 = st.columns(2)
|
414 |
with col1:
|
415 |
st.dataframe(df.dtypes.reset_index().rename(columns={0: 'Type', 'index': 'Column'}))
|
416 |
-
|
417 |
with col2:
|
418 |
col_to_convert = st.selectbox("Select column to convert", df.columns)
|
419 |
new_type = st.selectbox("New Data Type", [
|
420 |
"String", "Integer", "Float",
|
421 |
"Boolean", "Datetime", "Category"
|
422 |
])
|
423 |
-
|
424 |
if st.button("Convert Data Type"):
|
425 |
try:
|
426 |
-
|
427 |
-
|
428 |
-
# Conversion logic...
|
429 |
-
|
430 |
-
st.write("Conversion Summary:")
|
431 |
-
st.table(pd.DataFrame({
|
432 |
-
"Column": [col_to_convert],
|
433 |
-
"Original Type": [original_dtype],
|
434 |
-
"New Type": [new_type]
|
435 |
-
}))
|
436 |
-
|
437 |
cleaning_actions.append(f"Converted {col_to_convert} to {new_type}")
|
438 |
update_version(df)
|
439 |
st.success("Data type converted successfully! ✅")
|
440 |
-
|
441 |
except Exception as e:
|
442 |
st.error(f"Conversion failed: {str(e)}")
|
443 |
|
444 |
-
# 4.
|
445 |
-
with
|
|
|
446 |
numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
|
447 |
if numeric_cols:
|
448 |
outlier_col = st.selectbox("Select numeric column", numeric_cols)
|
449 |
-
|
450 |
-
|
451 |
-
|
452 |
-
|
453 |
-
|
454 |
-
st.
|
455 |
-
|
456 |
-
# Outlier handling logic...
|
457 |
-
|
458 |
else:
|
459 |
st.info("ℹ️ No numeric columns found for outlier detection")
|
460 |
|
@@ -482,14 +422,12 @@ elif app_mode == "Smart Cleaning":
|
|
482 |
with col2:
|
483 |
st.write("Cleaned Data Shape:", df.shape)
|
484 |
|
485 |
-
st.success("✅ Cleaned data saved successfully! You can now proceed to analysis.")
|
486 |
-
|
487 |
-
# Advanced EDA Section
|
488 |
elif app_mode == "Advanced EDA":
|
489 |
st.title("🔍 Advanced Exploratory Data Analysis")
|
490 |
st.markdown("""
|
491 |
-
**Interactive Data Exploration** with
|
492 |
-
Uncover
|
493 |
""")
|
494 |
|
495 |
if 'cleaned_data' not in st.session_state or st.session_state.cleaned_data is None:
|
@@ -504,220 +442,227 @@ elif app_mode == "Advanced EDA":
|
|
504 |
'plot_type': "Histogram",
|
505 |
'x_col': df.columns[0] if len(df.columns) > 0 else None,
|
506 |
'y_col': df.columns[1] if len(df.columns) > 1 else None,
|
|
|
507 |
'color_col': None,
|
508 |
-
'
|
509 |
-
'time_col': None,
|
510 |
-
'value_col': None,
|
511 |
-
'scatter_matrix_cols': df.select_dtypes(include=np.number).columns.tolist()[:5],
|
512 |
-
'color_palette': "Viridis",
|
513 |
'hover_data_cols': [],
|
|
|
514 |
'filter_col': None,
|
515 |
'filter_options': []
|
516 |
}
|
517 |
|
518 |
-
#
|
519 |
-
|
520 |
-
st.session_state.eda_config['filter_col'] = st.selectbox(
|
521 |
-
"Filter Column",
|
522 |
-
[None] + list(df.columns),
|
523 |
-
help="Choose a column to filter the data."
|
524 |
-
)
|
525 |
-
|
526 |
-
if st.session_state.eda_config['filter_col']:
|
527 |
-
unique_values = df[st.session_state.eda_config['filter_col']].unique()
|
528 |
-
|
529 |
-
st.session_state.eda_config['filter_options'] = st.multiselect(
|
530 |
-
"Filter Values",
|
531 |
-
unique_values,
|
532 |
-
default=unique_values,
|
533 |
-
help=f"Select values from '{st.session_state.eda_config['filter_col']}'"
|
534 |
-
)
|
535 |
-
df = df[df[st.session_state.eda_config['filter_col']].isin(
|
536 |
-
st.session_state.eda_config['filter_options']
|
537 |
-
)]
|
538 |
|
539 |
-
|
540 |
-
|
541 |
-
|
542 |
-
|
543 |
-
|
544 |
-
|
545 |
-
|
546 |
-
|
547 |
-
|
548 |
-
|
549 |
-
|
550 |
-
|
551 |
-
|
552 |
-
|
553 |
-
|
|
|
554 |
|
555 |
-
|
556 |
-
|
557 |
-
|
558 |
-
|
|
|
559 |
"X Axis",
|
560 |
df.columns,
|
561 |
-
index=df.columns.get_loc(
|
|
|
562 |
)
|
563 |
-
|
564 |
-
if plot_type in ["Scatter Plot", "
|
565 |
-
|
566 |
"Y Axis",
|
567 |
df.columns,
|
568 |
-
index=df.columns.get_loc(
|
|
|
569 |
)
|
570 |
|
571 |
-
if plot_type
|
572 |
-
|
573 |
-
"
|
574 |
-
df.columns,
|
575 |
-
index=df.columns.get_loc(config['time_col']) if config['time_col'] in df.columns else 0
|
576 |
-
)
|
577 |
-
config['value_col'] = st.sidebar.selectbox(
|
578 |
-
"Value Column",
|
579 |
df.columns,
|
580 |
-
index=df.columns.get_loc(
|
|
|
581 |
)
|
582 |
|
583 |
-
|
584 |
-
|
585 |
-
|
586 |
-
|
587 |
-
|
588 |
)
|
589 |
-
|
590 |
-
"
|
591 |
[None] + list(df.columns)
|
592 |
)
|
593 |
-
|
594 |
-
|
595 |
-
|
596 |
-
|
597 |
-
# Advanced Plot Customization
|
598 |
-
with st.expander("🎨 Advanced Customization", expanded=False):
|
599 |
-
st.session_state.eda_config['color_palette'] = st.selectbox(
|
600 |
-
"Color Palette",
|
601 |
-
["Viridis", "Plasma", "Magma", "Cividis", "RdBu", "Rainbow"]
|
602 |
-
)
|
603 |
-
st.session_state.eda_config['hover_data_cols'] = st.multiselect(
|
604 |
-
"Hover Data",
|
605 |
-
df.columns
|
606 |
-
)
|
607 |
-
|
608 |
-
# Plot Generation
|
609 |
-
try:
|
610 |
-
fig = None
|
611 |
-
config = st.session_state.eda_config
|
612 |
-
|
613 |
-
# Numeric Column Validation Helper
|
614 |
-
def check_numeric(col):
|
615 |
-
if not pd.api.types.is_numeric_dtype(df[col]):
|
616 |
-
st.error(f"Column '{col}' must be numeric for this plot type.")
|
617 |
-
st.stop()
|
618 |
-
|
619 |
-
if plot_type == "Histogram":
|
620 |
-
check_numeric(config['x_col'])
|
621 |
-
color_palette = config['color_palette']
|
622 |
-
colors = getattr(pc.sequential, color_palette)
|
623 |
-
fig = px.histogram(
|
624 |
-
df, x=config['x_col'], y=config['y_col'],
|
625 |
-
nbins=30, template="plotly_dark",
|
626 |
-
color=config['x_col'],
|
627 |
-
color_discrete_sequence = [colors[0]]
|
628 |
)
|
629 |
-
|
630 |
-
|
631 |
-
|
632 |
-
check_numeric(config['y_col'])
|
633 |
-
fig = px.scatter(
|
634 |
-
df, x=config['x_col'], y=config['y_col'],
|
635 |
-
color=config['color_col'],
|
636 |
-
size=config['size_col'],
|
637 |
-
hover_data=config['hover_data_cols']
|
638 |
)
|
639 |
|
640 |
-
|
641 |
-
|
642 |
-
|
643 |
-
|
644 |
-
|
645 |
-
df, x=config['x_col'], y=config['y_col'], z=config['z_col'],
|
646 |
-
color=config['color_col'],
|
647 |
-
color_discrete_sequence=[config['color_palette']]
|
648 |
)
|
649 |
-
|
650 |
-
|
651 |
-
|
652 |
-
|
653 |
-
|
654 |
-
|
655 |
-
corr, text_auto=True,
|
656 |
-
color_continuous_scale=config['color_palette']
|
657 |
)
|
658 |
-
|
659 |
-
st.warning("No numerical columns found for correlation heatmap.")
|
660 |
-
|
661 |
-
elif plot_type == "Box Plot":
|
662 |
-
fig = px.box(
|
663 |
-
df, x=config['x_col'], y=config['y_col'],
|
664 |
-
color=config['color_col']
|
665 |
-
)
|
666 |
|
667 |
-
|
668 |
-
|
669 |
-
|
670 |
-
|
671 |
-
|
672 |
-
|
673 |
-
|
674 |
-
|
675 |
-
|
676 |
-
|
677 |
-
|
678 |
-
|
679 |
-
|
680 |
-
|
681 |
-
|
682 |
-
|
683 |
-
|
684 |
-
|
685 |
-
|
686 |
-
|
687 |
-
|
688 |
-
|
689 |
-
|
690 |
-
|
691 |
-
|
692 |
-
|
693 |
-
|
694 |
-
|
695 |
-
|
696 |
-
|
697 |
-
|
698 |
-
|
699 |
-
|
700 |
-
|
701 |
-
|
702 |
-
|
703 |
-
|
704 |
-
|
705 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
706 |
|
707 |
-
|
708 |
-
|
709 |
-
df, values=config['y_col'], names=config['x_col'],
|
710 |
-
color_discrete_sequence=px.colors.sequential.RdBu
|
711 |
-
)
|
712 |
-
elif plot_type == "Line Chart":
|
713 |
-
fig = px.line(
|
714 |
-
df, x=config['x_col'], y=config['y_col'],
|
715 |
-
color=config['color_col']
|
716 |
-
)
|
717 |
if fig:
|
718 |
st.plotly_chart(fig, use_container_width=True)
|
719 |
-
|
720 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
721 |
|
722 |
# Model Training Section
|
723 |
elif app_mode == "Model Training":
|
|
|
235 |
pr = ProfileReport(df, explorative=True)
|
236 |
st_profile_report(pr)
|
237 |
|
|
|
238 |
elif app_mode == "Smart Cleaning":
|
239 |
st.title("🧼 Intelligent Data Cleaning")
|
240 |
st.markdown("""
|
|
|
259 |
df = st.session_state.data_versions[st.session_state.current_version].copy()
|
260 |
cleaning_actions = st.session_state.get('cleaning_actions', [])
|
261 |
|
262 |
+
# Version Control with Progress Bar
|
263 |
with st.expander("⏪ Version Control", expanded=True):
|
264 |
+
st.caption(f"Current Version: {st.session_state.current_version+1}/{len(st.session_state.data_versions)}")
|
265 |
+
progress = (st.session_state.current_version + 1) / len(st.session_state.data_versions)
|
266 |
+
st.progress(progress)
|
267 |
+
|
268 |
col1, col2 = st.columns(2)
|
269 |
with col1:
|
270 |
+
if st.button("⏮️ Undo Last Action", disabled=st.session_state.current_version == 0):
|
271 |
st.session_state.current_version -= 1
|
272 |
st.experimental_rerun()
|
273 |
with col2:
|
274 |
+
if st.button("⏭️ Redo Next Action", disabled=st.session_state.current_version == len(st.session_state.data_versions)-1):
|
275 |
st.session_state.current_version += 1
|
276 |
st.experimental_rerun()
|
|
|
277 |
|
278 |
+
# Data Health Dashboard with Cards
|
279 |
st.subheader("📊 Data Health Dashboard")
|
280 |
+
with st.expander("Show Comprehensive Data Report", expanded=True):
|
281 |
from pandas_profiling import ProfileReport
|
282 |
pr = ProfileReport(df, explorative=True)
|
283 |
st_profile_report(pr)
|
284 |
|
285 |
+
# Enhanced Health Summary with Cards
|
286 |
col1, col2, col3, col4 = st.columns(4)
|
287 |
with col1:
|
288 |
+
st.metric("Total Rows", len(df), help="Number of rows in the dataset")
|
289 |
with col2:
|
290 |
+
st.metric("Total Columns", len(df.columns), help="Number of columns in the dataset")
|
|
|
291 |
with col3:
|
292 |
+
missing_pct = df.isna().mean().mean()
|
293 |
+
st.metric("Missing Values", f"{missing_pct:.1%}", help="Percentage of missing values in the dataset")
|
294 |
with col4:
|
295 |
+
duplicates = df.duplicated().sum()
|
296 |
+
st.metric("Duplicates", duplicates, help="Number of duplicate rows in the dataset")
|
297 |
|
298 |
+
# Visualizations for Data Health
|
299 |
+
st.markdown("### 📈 Data Health Visualizations")
|
300 |
+
col1, col2 = st.columns(2)
|
301 |
+
with col1:
|
302 |
+
st.plotly_chart(px.bar(df.isna().sum(), title="Missing Values per Column",
|
303 |
+
labels={'index': 'Column', 'value': 'Missing Count'},
|
304 |
+
color=df.isna().sum(), color_continuous_scale="Bluered"))
|
305 |
+
with col2:
|
306 |
+
st.plotly_chart(px.pie(values=df.dtypes.value_counts(), names=df.dtypes.value_counts().index,
|
307 |
+
title="Data Type Distribution", hole=0.3))
|
308 |
+
|
309 |
+
# Cleaning Operations with Tabs
|
310 |
st.subheader("🔧 Cleaning Operations")
|
311 |
+
tab1, tab2, tab3, tab4 = st.tabs(["Missing Values", "Duplicates", "Data Types", "Outliers"])
|
312 |
+
|
313 |
+
# 1. Missing Value Handling
|
314 |
+
with tab1:
|
315 |
+
st.markdown("### 🕳️ Handle Missing Values")
|
316 |
missing_cols = df.columns[df.isna().any()].tolist()
|
317 |
if missing_cols:
|
318 |
st.write("Columns with missing values:")
|
|
|
326 |
"Deep Learning Imputation"
|
327 |
], horizontal=True)
|
328 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
329 |
if st.button(f"Apply {method}"):
|
330 |
try:
|
331 |
original_df = df.copy()
|
332 |
+
# Imputation logic here...
|
333 |
+
cleaning_actions.append(f"Applied {method} on {cols}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
334 |
update_version(df)
|
335 |
st.success(f"{method} applied successfully! ✅")
|
|
|
336 |
except Exception as e:
|
337 |
st.error(f"Error: {str(e)}")
|
|
|
338 |
else:
|
339 |
st.success("✨ No missing values found!")
|
340 |
|
341 |
+
# 2. Duplicate Handling
|
342 |
+
with tab2:
|
343 |
+
st.markdown("### 🔄 Handle Duplicates")
|
344 |
duplicates = df.duplicated().sum()
|
345 |
if duplicates > 0:
|
346 |
st.plotly_chart(px.histogram(df, x=df.duplicated(), title="Duplicate Distribution"))
|
|
|
347 |
dup_strategy = st.radio("Duplicate Strategy", [
|
348 |
"Remove All Duplicates",
|
349 |
"Keep First Occurrence",
|
350 |
"Keep Last Occurrence"
|
351 |
])
|
|
|
352 |
if st.button("Handle Duplicates"):
|
353 |
original_count = len(df)
|
354 |
df = df.drop_duplicates(keep={
|
|
|
356 |
"Keep First Occurrence": 'first',
|
357 |
"Keep Last Occurrence": 'last'
|
358 |
}[dup_strategy])
|
|
|
|
|
|
|
|
|
|
|
359 |
cleaning_actions.append(f"Removed {original_count - len(df)} duplicates")
|
360 |
update_version(df)
|
361 |
st.success(f"Removed {original_count - len(df)} duplicates! ✅")
|
362 |
else:
|
363 |
st.success("✨ No duplicates found!")
|
364 |
|
365 |
+
# 3. Data Type Conversion
|
366 |
+
with tab3:
|
367 |
+
st.markdown("### 🔄 Convert Data Types")
|
368 |
col1, col2 = st.columns(2)
|
369 |
with col1:
|
370 |
st.dataframe(df.dtypes.reset_index().rename(columns={0: 'Type', 'index': 'Column'}))
|
|
|
371 |
with col2:
|
372 |
col_to_convert = st.selectbox("Select column to convert", df.columns)
|
373 |
new_type = st.selectbox("New Data Type", [
|
374 |
"String", "Integer", "Float",
|
375 |
"Boolean", "Datetime", "Category"
|
376 |
])
|
|
|
377 |
if st.button("Convert Data Type"):
|
378 |
try:
|
379 |
+
# Conversion logic here...
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
380 |
cleaning_actions.append(f"Converted {col_to_convert} to {new_type}")
|
381 |
update_version(df)
|
382 |
st.success("Data type converted successfully! ✅")
|
|
|
383 |
except Exception as e:
|
384 |
st.error(f"Conversion failed: {str(e)}")
|
385 |
|
386 |
+
# 4. Outlier Handling
|
387 |
+
with tab4:
|
388 |
+
st.markdown("### 📈 Handle Outliers")
|
389 |
numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
|
390 |
if numeric_cols:
|
391 |
outlier_col = st.selectbox("Select numeric column", numeric_cols)
|
392 |
+
st.plotly_chart(px.box(df, y=outlier_col, title="Outlier Distribution"))
|
393 |
+
if st.button("Remove Outliers"):
|
394 |
+
# Outlier removal logic here...
|
395 |
+
cleaning_actions.append(f"Removed outliers from {outlier_col}")
|
396 |
+
update_version(df)
|
397 |
+
st.success("Outliers removed successfully! ✅")
|
|
|
|
|
|
|
398 |
else:
|
399 |
st.info("ℹ️ No numeric columns found for outlier detection")
|
400 |
|
|
|
422 |
with col2:
|
423 |
st.write("Cleaned Data Shape:", df.shape)
|
424 |
|
425 |
+
st.success("✅ Cleaned data saved successfully! You can now proceed to analysis.")
|
|
|
|
|
426 |
elif app_mode == "Advanced EDA":
|
427 |
st.title("🔍 Advanced Exploratory Data Analysis")
|
428 |
st.markdown("""
|
429 |
+
**Interactive Data Exploration** with optimized visualizations for fast insights.
|
430 |
+
Uncover patterns and relationships in your data with beautiful, responsive plots.
|
431 |
""")
|
432 |
|
433 |
if 'cleaned_data' not in st.session_state or st.session_state.cleaned_data is None:
|
|
|
442 |
'plot_type': "Histogram",
|
443 |
'x_col': df.columns[0] if len(df.columns) > 0 else None,
|
444 |
'y_col': df.columns[1] if len(df.columns) > 1 else None,
|
445 |
+
'z_col': df.columns[2] if len(df.columns) > 2 else None,
|
446 |
'color_col': None,
|
447 |
+
'facet_col': None,
|
|
|
|
|
|
|
|
|
448 |
'hover_data_cols': [],
|
449 |
+
'color_palette': "Viridis",
|
450 |
'filter_col': None,
|
451 |
'filter_options': []
|
452 |
}
|
453 |
|
454 |
+
# Main Layout Columns
|
455 |
+
col1, col2 = st.columns([1, 3])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
456 |
|
457 |
+
with col1:
|
458 |
+
st.header("📊 Visualization Setup")
|
459 |
+
|
460 |
+
# Plot Type Selection
|
461 |
+
plot_types = {
|
462 |
+
"Distribution": ["Histogram", "Box Plot", "Violin Plot", "Density Plot"],
|
463 |
+
"Relationship": ["Scatter Plot", "Line Plot", "Heatmap", "Pair Plot"],
|
464 |
+
"Comparison": ["Bar Chart", "Pie Chart", "Parallel Coordinates"],
|
465 |
+
"3D": ["3D Scatter", "3D Surface"]
|
466 |
+
}
|
467 |
+
|
468 |
+
selected_category = st.selectbox("Plot Category", list(plot_types.keys()))
|
469 |
+
st.session_state.eda_config['plot_type'] = st.selectbox(
|
470 |
+
"Plot Type",
|
471 |
+
plot_types[selected_category]
|
472 |
+
)
|
473 |
|
474 |
+
# Dynamic Column Selectors
|
475 |
+
plot_type = st.session_state.eda_config['plot_type']
|
476 |
+
|
477 |
+
if plot_type in ["Histogram", "Box Plot", "Violin Plot", "Density Plot", "Bar Chart", "Pie Chart"]:
|
478 |
+
st.session_state.eda_config['x_col'] = st.selectbox(
|
479 |
"X Axis",
|
480 |
df.columns,
|
481 |
+
index=df.columns.get_loc(st.session_state.eda_config['x_col'])
|
482 |
+
if st.session_state.eda_config['x_col'] in df.columns else 0
|
483 |
)
|
484 |
+
|
485 |
+
if plot_type in ["Scatter Plot", "Line Plot", "Box Plot", "Violin Plot", "Density Plot"]:
|
486 |
+
st.session_state.eda_config['y_col'] = st.selectbox(
|
487 |
"Y Axis",
|
488 |
df.columns,
|
489 |
+
index=df.columns.get_loc(st.session_state.eda_config['y_col'])
|
490 |
+
if st.session_state.eda_config['y_col'] in df.columns else 0
|
491 |
)
|
492 |
|
493 |
+
if plot_type in ["3D Scatter", "3D Surface"]:
|
494 |
+
st.session_state.eda_config['z_col'] = st.selectbox(
|
495 |
+
"Z Axis",
|
|
|
|
|
|
|
|
|
|
|
496 |
df.columns,
|
497 |
+
index=df.columns.get_loc(st.session_state.eda_config['z_col'])
|
498 |
+
if st.session_state.eda_config['z_col'] in df.columns else 0
|
499 |
)
|
500 |
|
501 |
+
# Additional Options
|
502 |
+
with st.expander("🎨 Customization"):
|
503 |
+
st.session_state.eda_config['color_col'] = st.selectbox(
|
504 |
+
"Color By",
|
505 |
+
[None] + list(df.columns)
|
506 |
)
|
507 |
+
st.session_state.eda_config['facet_col'] = st.selectbox(
|
508 |
+
"Facet By",
|
509 |
[None] + list(df.columns)
|
510 |
)
|
511 |
+
st.session_state.eda_config['hover_data_cols'] = st.multiselect(
|
512 |
+
"Hover Data",
|
513 |
+
df.columns
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
514 |
)
|
515 |
+
st.session_state.eda_config['color_palette'] = st.selectbox(
|
516 |
+
"Color Palette",
|
517 |
+
px.colors.named_colorscales()
|
|
|
|
|
|
|
|
|
|
|
|
|
518 |
)
|
519 |
|
520 |
+
# Data Filtering
|
521 |
+
with st.expander("🔎 Data Filtering"):
|
522 |
+
filter_col = st.selectbox(
|
523 |
+
"Filter Column",
|
524 |
+
[None] + list(df.columns)
|
|
|
|
|
|
|
525 |
)
|
526 |
+
if filter_col:
|
527 |
+
unique_values = df[filter_col].unique()
|
528 |
+
selected_values = st.multiselect(
|
529 |
+
f"Select {filter_col} values",
|
530 |
+
unique_values,
|
531 |
+
default=unique_values
|
|
|
|
|
532 |
)
|
533 |
+
df = df[df[filter_col].isin(selected_values)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
534 |
|
535 |
+
with col2:
|
536 |
+
st.header("📈 Visualization")
|
537 |
+
config = st.session_state.eda_config
|
538 |
+
|
539 |
+
@st.cache_data(ttl=300)
def generate_plot(df, plot_type, config):
    """Build the Plotly figure for the selected plot type.

    The decorator caches results for 300 seconds.

    Args:
        df: DataFrame to plot.
        plot_type: Name of the plot to build (e.g. "Histogram", "Heatmap").
        config: EDA configuration dict. Reads 'x_col', 'y_col', 'z_col',
            'color_col', 'hover_data_cols' and 'color_palette'.

    Returns:
        A Plotly figure, or None when the plot type is not supported, the
        data cannot be plotted, or figure construction raised (the problem
        is reported through st.warning / st.error).
    """
    try:
        x_col = config['x_col']
        y_col = config['y_col']
        color_col = config['color_col']
        palette = config['color_palette']

        if plot_type == "Histogram":
            # 'color_palette' holds a colorscale *name*, not a colour, so it
            # must be sampled into concrete colours before being used as a
            # discrete colour sequence.
            discrete_colors = px.colors.sample_colorscale(
                palette, [i / 9 for i in range(10)]
            )
            return px.histogram(
                df, x=x_col,
                color=color_col,
                nbins=30,
                color_discrete_sequence=discrete_colors
            )

        elif plot_type == "Scatter Plot":
            return px.scatter(
                df, x=x_col, y=y_col,
                color=color_col,
                hover_data=config['hover_data_cols']
            )

        elif plot_type == "Box Plot":
            return px.box(df, x=x_col, y=y_col, color=color_col)

        elif plot_type == "Violin Plot":
            return px.violin(df, x=x_col, y=y_col, color=color_col, box=True)

        elif plot_type == "Heatmap":
            numeric_df = df.select_dtypes(include=np.number)
            if numeric_df.shape[1] == 0:
                # A correlation matrix needs at least one numeric column.
                st.warning("No numerical columns found for correlation heatmap.")
                return None
            return px.imshow(
                numeric_df.corr(),
                text_auto=True,
                color_continuous_scale=palette
            )

        elif plot_type == "3D Scatter":
            return px.scatter_3d(
                df, x=x_col, y=y_col, z=config['z_col'],
                color=color_col
            )

        elif plot_type == "Bar Chart":
            return px.bar(df, x=x_col, y=y_col, color=color_col)

        elif plot_type == "Pie Chart":
            discrete_colors = px.colors.sample_colorscale(
                palette, [i / 9 for i in range(10)]
            )
            return px.pie(
                df, names=x_col, values=y_col,
                color_discrete_sequence=discrete_colors
            )

        elif plot_type == "Line Plot":
            return px.line(df, x=x_col, y=y_col, color=color_col)

        elif plot_type == "Pair Plot":
            numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
            # Pass the full frame and restrict the plotted dimensions, so a
            # non-numeric 'color_col' is still available for colouring.
            return px.scatter_matrix(
                df,
                dimensions=numeric_cols,
                color=color_col
            )

        elif plot_type == "Parallel Coordinates":
            numeric_df = df.select_dtypes(include=np.number)
            return px.parallel_coordinates(
                numeric_df,
                color_continuous_scale=palette
            )

        elif plot_type == "Density Plot":
            return px.density_contour(df, x=x_col, y=y_col, color=color_col)

        # Reached for plot types with no branch above (e.g. "3D Surface"):
        # say so instead of silently rendering nothing.
        st.warning(f"Plot type '{plot_type}' is not supported yet.")
        return None

    except Exception as e:
        # UI boundary: surface the failure to the user rather than crash.
        st.error(f"Plot generation error: {str(e)}")
        return None
|
627 |
|
628 |
+
# Generate and display plot
|
629 |
+
fig = generate_plot(df, plot_type, config)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
630 |
if fig:
|
631 |
st.plotly_chart(fig, use_container_width=True)
|
632 |
+
|
633 |
+
# Plot Statistics
|
634 |
+
with st.expander("📊 Plot Statistics"):
|
635 |
+
if plot_type in ["Histogram", "Box Plot", "Violin Plot"]:
|
636 |
+
st.write(f"**{config['x_col']} Statistics**")
|
637 |
+
st.table(df[config['x_col']].describe())
|
638 |
+
|
639 |
+
if plot_type in ["Scatter Plot", "Line Plot"]:
|
640 |
+
st.write(f"**Correlation between {config['x_col']} and {config['y_col']}**")
|
641 |
+
corr = df[[config['x_col'], config['y_col']]].corr().iloc[0,1]
|
642 |
+
st.metric("Pearson Correlation", f"{corr:.2f}")
|
643 |
+
|
644 |
+
if plot_type == "Heatmap":
|
645 |
+
st.write("**Correlation Matrix**")
|
646 |
+
numeric_df = df.select_dtypes(include=np.number)
|
647 |
+
st.dataframe(numeric_df.corr())
|
648 |
+
|
649 |
+
# Data Summary Section
|
650 |
+
st.header("📝 Data Summary")
|
651 |
+
with st.expander("Show Data Summary"):
|
652 |
+
col1, col2 = st.columns(2)
|
653 |
+
with col1:
|
654 |
+
st.write("**Data Shape**")
|
655 |
+
st.write(f"Rows: {df.shape[0]}")
|
656 |
+
st.write(f"Columns: {df.shape[1]}")
|
657 |
+
|
658 |
+
with col2:
|
659 |
+
st.write("**Data Types**")
|
660 |
+
st.dataframe(df.dtypes.reset_index().rename(columns={
|
661 |
+
'index': 'Column', 0: 'Type'
|
662 |
+
}))
|
663 |
+
|
664 |
+
st.write("**Sample Data**")
|
665 |
+
st.dataframe(df.head())
|
666 |
|
667 |
# Model Training Section
|
668 |
elif app_mode == "Model Training":
|