Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -27,103 +27,6 @@ st.set_page_config(
|
|
27 |
)
|
28 |
|
29 |
|
30 |
-
# HTML and CSS for the draggable button
|
31 |
-
html_code = """
|
32 |
-
<style>
|
33 |
-
#floatingButton {
|
34 |
-
position: fixed;
|
35 |
-
bottom: 20px;
|
36 |
-
right: 20px;
|
37 |
-
width: 60px;
|
38 |
-
height: 60px;
|
39 |
-
background-color: #007bff;
|
40 |
-
color: white;
|
41 |
-
border: none;
|
42 |
-
border-radius: 50%;
|
43 |
-
cursor: pointer;
|
44 |
-
font-size: 24px;
|
45 |
-
z-index: 1000;
|
46 |
-
}
|
47 |
-
|
48 |
-
#floatingButton:active {
|
49 |
-
background-color: #0056b3;
|
50 |
-
}
|
51 |
-
|
52 |
-
.draggable {
|
53 |
-
position: absolute;
|
54 |
-
cursor: move;
|
55 |
-
}
|
56 |
-
</style>
|
57 |
-
<button id="floatingButton" class="draggable">+</button>
|
58 |
-
<script>
|
59 |
-
dragElement(document.getElementById("floatingButton"));
|
60 |
-
|
61 |
-
function dragElement(elmnt) {
|
62 |
-
var pos1 = 0, pos2 = 0, pos3 = 0, pos4 = 0;
|
63 |
-
elmnt.onmousedown = dragMouseDown;
|
64 |
-
|
65 |
-
function dragMouseDown(e) {
|
66 |
-
e = e || window.event;
|
67 |
-
e.preventDefault();
|
68 |
-
pos3 = e.clientX;
|
69 |
-
pos4 = e.clientY;
|
70 |
-
document.onmouseup = closeDragElement;
|
71 |
-
document.onmousemove = elementDrag;
|
72 |
-
}
|
73 |
-
|
74 |
-
function elementDrag(e) {
|
75 |
-
e = e || window.event;
|
76 |
-
e.preventDefault();
|
77 |
-
pos1 = pos3 - e.clientX;
|
78 |
-
pos2 = e.clientY;
|
79 |
-
pos3 = e.clientX;
|
80 |
-
pos4 = e.clientY;
|
81 |
-
elmnt.style.top = (elmnt.offsetTop - pos2) + "px";
|
82 |
-
elmnt.style.left = (elmnt.offsetLeft - pos1) + "px";
|
83 |
-
}
|
84 |
-
|
85 |
-
function closeDragElement() {
|
86 |
-
document.onmouseup = null;
|
87 |
-
document.onmousemove = null;
|
88 |
-
}
|
89 |
-
}
|
90 |
-
|
91 |
-
document.getElementById("floatingButton").onclick = function() {
|
92 |
-
var expander = document.getElementById("dataExpander");
|
93 |
-
if (expander.style.display === "none") {
|
94 |
-
expander.style.display = "block";
|
95 |
-
} else {
|
96 |
-
expander.style.display = "none";
|
97 |
-
}
|
98 |
-
fetch("/?show_data=true", {method: "POST"});
|
99 |
-
}
|
100 |
-
</script>
|
101 |
-
"""
|
102 |
-
|
103 |
-
# JavaScript to handle the toggle functionality
|
104 |
-
js_code = """
|
105 |
-
<script>
|
106 |
-
document.addEventListener('DOMContentLoaded', function() {
|
107 |
-
var expander = document.createElement('div');
|
108 |
-
expander.id = "dataExpander";
|
109 |
-
expander.style.display = "none";
|
110 |
-
document.body.appendChild(expander);
|
111 |
-
});
|
112 |
-
</script>
|
113 |
-
"""
|
114 |
-
|
115 |
-
st.markdown(html_code, unsafe_allow_html=True)
|
116 |
-
st.markdown(js_code, unsafe_allow_html=True)
|
117 |
-
|
118 |
-
# Function to show data in an expander
|
119 |
-
def show_data():
|
120 |
-
st.session_state.show_data = not st.session_state.show_data # Toggle the state
|
121 |
-
if st.session_state.show_data:
|
122 |
-
with st.expander("✨ Data Viewport", expanded=True):
|
123 |
-
st.dataframe(df, use_container_width=True)
|
124 |
-
|
125 |
-
# --------------------------
|
126 |
-
|
127 |
# --------------------------
|
128 |
# Custom Styling
|
129 |
# --------------------------
|
@@ -152,6 +55,16 @@ if 'model' not in st.session_state:
|
|
152 |
# --------------------------
|
153 |
# Helper Functions
|
154 |
# --------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
155 |
def generate_quality_report(df):
|
156 |
"""Generate comprehensive data quality report"""
|
157 |
report = {
|
@@ -484,23 +397,29 @@ if app_mode == "Data Upload":
|
|
484 |
|
485 |
except Exception as e:
|
486 |
st.error(f"Error loading file: {str(e)}")
|
487 |
-
|
488 |
-
|
|
|
|
|
|
|
489 |
st.title("🧹 Smart Data Cleaning")
|
490 |
-
|
491 |
if st.session_state.raw_data is None:
|
492 |
st.warning("Please upload data first")
|
493 |
st.stop()
|
494 |
-
|
495 |
-
# Initialize session state
|
496 |
if 'data_versions' not in st.session_state:
|
497 |
st.session_state.data_versions = [st.session_state.raw_data.copy()]
|
498 |
-
|
499 |
-
|
500 |
-
|
|
|
|
|
501 |
# --------------------------
|
502 |
# Data Health Dashboard
|
503 |
# --------------------------
|
|
|
504 |
with st.expander("📊 Data Health Dashboard", expanded=True):
|
505 |
col1, col2, col3 = st.columns(3)
|
506 |
with col1:
|
@@ -509,200 +428,227 @@ elif app_mode == "Data Cleaning":
|
|
509 |
st.metric("Total Rows", len(df))
|
510 |
with col3:
|
511 |
st.metric("Missing Values", df.isna().sum().sum())
|
512 |
-
|
513 |
# Generate quick profile report
|
514 |
if st.button("Generate Data Health Report"):
|
515 |
with st.spinner("Analyzing data..."):
|
516 |
profile = ProfileReport(df, minimal=True)
|
517 |
st_profile_report(profile)
|
518 |
-
|
519 |
# --------------------------
|
520 |
# Undo Functionality
|
521 |
# --------------------------
|
522 |
if len(st.session_state.data_versions) > 1:
|
523 |
if st.button("⏮️ Undo Last Action"):
|
524 |
-
st.session_state.data_versions.pop()
|
525 |
-
|
526 |
-
st.session_state.cleaned_data = df
|
527 |
st.success("Last action undone!")
|
528 |
-
|
|
|
529 |
# --------------------------
|
530 |
# Missing Value Handling
|
531 |
# --------------------------
|
|
|
532 |
with st.expander("🔍 Missing Values Treatment", expanded=True):
|
533 |
missing_cols = df.columns[df.isna().any()].tolist()
|
534 |
if missing_cols:
|
535 |
cols = st.multiselect("Select columns to handle", missing_cols)
|
536 |
method = st.selectbox("Imputation Method", [
|
537 |
-
"Drop Missing",
|
538 |
-
"Mean/Median",
|
539 |
"Custom Value",
|
540 |
"Forward Fill",
|
541 |
"Backward Fill"
|
542 |
])
|
543 |
-
|
544 |
if method == "Custom Value":
|
545 |
custom_val = st.text_input("Enter custom value")
|
546 |
-
|
547 |
-
if st.button("Apply Treatment"):
|
548 |
-
st.session_state.data_versions.append(df.copy())
|
549 |
try:
|
|
|
550 |
if method == "Drop Missing":
|
551 |
-
|
552 |
elif method == "Mean/Median":
|
553 |
for col in cols:
|
554 |
-
if pd.api.types.is_numeric_dtype(
|
555 |
-
|
556 |
else:
|
557 |
-
|
558 |
elif method == "Custom Value" and custom_val:
|
559 |
for col in cols:
|
560 |
-
|
561 |
elif method == "Forward Fill":
|
562 |
-
|
563 |
elif method == "Backward Fill":
|
564 |
-
|
565 |
-
|
566 |
-
|
567 |
-
st.
|
|
|
568 |
except Exception as e:
|
569 |
st.error(f"Error: {str(e)}")
|
570 |
else:
|
571 |
st.success("✨ No missing values found!")
|
572 |
-
|
573 |
# --------------------------
|
574 |
# Data Type Conversion
|
575 |
# --------------------------
|
|
|
576 |
with st.expander("🔄 Data Type Conversion"):
|
577 |
col_to_convert = st.selectbox("Select column", df.columns)
|
578 |
new_type = st.selectbox("New data type", [
|
579 |
-
"String", "Integer", "Float",
|
580 |
"Boolean", "Datetime"
|
581 |
])
|
582 |
-
|
583 |
if new_type == "Datetime":
|
584 |
date_format = st.text_input("Date format (e.g. %Y-%m-%d)", "%Y-%m-%d")
|
585 |
-
|
586 |
-
if st.button("Convert"):
|
587 |
-
st.session_state.data_versions.append(df.copy())
|
588 |
try:
|
|
|
589 |
if new_type == "String":
|
590 |
-
|
591 |
elif new_type == "Integer":
|
592 |
-
if
|
593 |
st.error("Cannot convert text column to integer!")
|
594 |
else:
|
595 |
-
|
596 |
elif new_type == "Float":
|
597 |
-
if
|
598 |
st.error("Cannot convert text column to float!")
|
599 |
else:
|
600 |
-
|
601 |
elif new_type == "Boolean":
|
602 |
-
|
603 |
elif new_type == "Datetime":
|
604 |
-
|
605 |
-
|
606 |
-
|
607 |
-
st.
|
608 |
except Exception as e:
|
609 |
st.error(f"Error: {str(e)}")
|
610 |
-
|
611 |
# --------------------------
|
612 |
# Drop Columns
|
613 |
# --------------------------
|
|
|
614 |
with st.expander("🗑️ Drop Columns"):
|
615 |
columns_to_drop = st.multiselect("Select columns to drop", df.columns)
|
616 |
if columns_to_drop:
|
617 |
st.warning(f"Will drop: {', '.join(columns_to_drop)}")
|
618 |
-
if st.button("Confirm Drop"):
|
619 |
-
|
620 |
-
|
621 |
-
|
622 |
-
st.
|
623 |
-
|
624 |
# --------------------------
|
625 |
# Label Encoding
|
626 |
# --------------------------
|
|
|
627 |
with st.expander("🔢 Label Encoding"):
|
628 |
data_to_encode = st.multiselect("Select categorical columns to encode", df.select_dtypes(include='object').columns)
|
629 |
if data_to_encode:
|
630 |
-
if st.button("Apply Label Encoding"):
|
631 |
-
|
632 |
label_encoders = {}
|
633 |
for col in data_to_encode:
|
634 |
le = LabelEncoder()
|
635 |
-
|
636 |
label_encoders[col] = le
|
637 |
-
|
638 |
-
st.
|
639 |
-
|
640 |
# --------------------------
|
641 |
# StandardScaler
|
642 |
# --------------------------
|
|
|
643 |
with st.expander("📏 StandardScaler"):
|
644 |
scale_cols = st.multiselect("Select numeric columns to scale", df.select_dtypes(include=np.number).columns)
|
645 |
if scale_cols:
|
646 |
-
if st.button("Apply StandardScaler"):
|
647 |
-
st.session_state.data_versions.append(df.copy())
|
648 |
try:
|
|
|
649 |
scaler = StandardScaler()
|
650 |
-
|
651 |
-
|
652 |
-
st.
|
653 |
except Exception as e:
|
654 |
st.error(f"Error: {str(e)}")
|
655 |
-
|
656 |
# --------------------------
|
657 |
# Pattern-Based Cleaning
|
658 |
# --------------------------
|
|
|
659 |
with st.expander("🕵️ Pattern-Based Cleaning"):
|
660 |
selected_col = st.selectbox("Select text column", df.select_dtypes(include='object').columns)
|
661 |
pattern = st.text_input("Regex pattern (e.g. \d+ for numbers)")
|
662 |
replacement = st.text_input("Replacement value")
|
663 |
-
|
664 |
-
if st.button("Apply Pattern Replacement"):
|
665 |
-
st.session_state.data_versions.append(df.copy())
|
666 |
try:
|
667 |
-
|
668 |
-
|
669 |
-
|
|
|
670 |
except Exception as e:
|
671 |
st.error(f"Error: {str(e)}")
|
672 |
-
|
673 |
# --------------------------
|
674 |
# Bulk Operations
|
675 |
# --------------------------
|
|
|
676 |
with st.expander("🚀 Bulk Actions"):
|
677 |
-
if st.button("Auto-Clean Common Issues"):
|
678 |
-
|
679 |
-
|
680 |
-
|
681 |
-
text_cols =
|
682 |
-
|
683 |
-
|
684 |
-
st.
|
685 |
-
|
686 |
# --------------------------
|
687 |
# Cleaned Data Preview
|
688 |
# --------------------------
|
689 |
-
if st.session_state.cleaned_data is not None:
|
|
|
690 |
with st.expander("✨ Cleaned Data Preview", expanded=True):
|
691 |
-
st.dataframe(st.session_state.cleaned_data.head(), use_container_width=True)
|
692 |
|
693 |
|
694 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
695 |
st.title("🔍 Interactive Data Explorer")
|
696 |
-
|
697 |
if st.session_state.cleaned_data is None:
|
698 |
st.warning("Please clean your data first")
|
699 |
st.stop()
|
700 |
-
|
701 |
df = st.session_state.cleaned_data
|
702 |
-
|
703 |
# --------------------------
|
704 |
# Enhanced Data Overview
|
705 |
# --------------------------
|
|
|
706 |
with st.expander("📁 Dataset Overview", expanded=True):
|
707 |
col1, col2, col3, col4 = st.columns(4)
|
708 |
with col1:
|
@@ -715,7 +661,7 @@ elif app_mode == "EDA":
|
|
715 |
with col4:
|
716 |
dupes = df.duplicated().sum()
|
717 |
st.metric("Duplicates", dupes, help="Fully duplicated rows")
|
718 |
-
|
719 |
# Data Preview Tabs
|
720 |
tab1, tab2, tab3 = st.tabs(["Quick Preview", "Column Types", "Missing Matrix"])
|
721 |
with tab1:
|
@@ -727,16 +673,17 @@ elif app_mode == "EDA":
|
|
727 |
with tab3:
|
728 |
fig = px.imshow(df.isna(), color_continuous_scale='gray')
|
729 |
st.plotly_chart(fig, use_container_width=True)
|
730 |
-
|
731 |
# --------------------------
|
732 |
# Smart Visualization Builder
|
733 |
# --------------------------
|
|
|
734 |
st.subheader("📊 Visualization Builder")
|
735 |
-
|
736 |
# Automatic plot type suggestions
|
737 |
numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
|
738 |
categorical_cols = df.select_dtypes(exclude=np.number).columns.tolist()
|
739 |
-
|
740 |
col1, col2 = st.columns([1, 3])
|
741 |
with col1:
|
742 |
# Dynamic plot type filtering
|
@@ -752,81 +699,110 @@ elif app_mode == "EDA":
|
|
752 |
index=0,
|
753 |
help="Automatically filtered based on data types"
|
754 |
)
|
755 |
-
|
756 |
-
#
|
757 |
-
x_axis =
|
758 |
-
|
759 |
-
|
760 |
-
|
761 |
-
|
762 |
-
|
763 |
-
|
764 |
-
|
765 |
-
|
766 |
-
|
767 |
-
|
768 |
-
|
769 |
-
|
770 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
771 |
if plot_type == "Parallel Categories":
|
772 |
dimensions = st.multiselect("Dimensions", df.columns.tolist(), default=df.columns[:3])
|
773 |
-
|
774 |
with col2:
|
775 |
try:
|
776 |
-
#
|
|
|
|
|
777 |
if plot_type == "Scatter Plot":
|
778 |
-
|
779 |
-
|
|
|
780 |
elif plot_type == "Histogram":
|
781 |
-
|
782 |
-
|
|
|
783 |
elif plot_type == "Box Plot":
|
784 |
-
|
|
|
785 |
elif plot_type == "Violin Plot":
|
786 |
-
|
787 |
-
|
|
|
788 |
elif plot_type == "Line Chart":
|
789 |
-
|
|
|
790 |
elif plot_type == "Bar Chart":
|
791 |
-
|
|
|
792 |
elif plot_type == "Correlation Matrix":
|
793 |
-
|
794 |
-
|
795 |
-
|
|
|
|
|
796 |
elif plot_type == "Pair Plot":
|
797 |
-
|
798 |
-
|
|
|
|
|
|
|
|
|
799 |
elif plot_type == "Heatmap":
|
800 |
-
|
|
|
801 |
elif plot_type == "3D Scatter":
|
802 |
-
|
803 |
-
|
|
|
804 |
elif plot_type == "Parallel Categories":
|
805 |
-
|
806 |
-
|
807 |
-
|
|
|
|
|
|
|
808 |
# Interactive plot customization
|
809 |
-
|
810 |
-
|
811 |
-
|
812 |
-
|
813 |
-
|
814 |
-
|
815 |
-
|
816 |
-
|
817 |
-
|
818 |
-
|
819 |
-
|
|
|
820 |
except Exception as e:
|
821 |
st.error(f"Couldn't create visualization: {str(e)}")
|
822 |
st.info("Try selecting different columns or changing the visualization type")
|
823 |
-
|
824 |
# --------------------------
|
825 |
# Advanced Analysis
|
826 |
# --------------------------
|
|
|
827 |
with st.expander("🔬 Deep Analysis Tools", expanded=False):
|
828 |
tab1, tab2, tab3 = st.tabs(["Statistical Tests", "Pattern Explorer", "Data Transformation"])
|
829 |
-
|
830 |
with tab1:
|
831 |
st.subheader("Hypothesis Testing")
|
832 |
col1, col2 = st.columns(2)
|
@@ -834,12 +810,15 @@ elif app_mode == "EDA":
|
|
834 |
test_var = st.selectbox("Test variable", numeric_cols)
|
835 |
with col2:
|
836 |
group_var = st.selectbox("Grouping variable", [None] + categorical_cols)
|
837 |
-
|
838 |
if group_var and st.button("Run ANOVA"):
|
839 |
-
|
840 |
-
|
841 |
-
|
842 |
-
|
|
|
|
|
|
|
843 |
with tab2:
|
844 |
st.subheader("Pattern Discovery")
|
845 |
explore_col = st.selectbox("Column to analyze", df.columns)
|
@@ -848,7 +827,7 @@ elif app_mode == "EDA":
|
|
848 |
if pattern:
|
849 |
matches = df[explore_col].str.contains(pattern).sum()
|
850 |
st.write(f"Found {matches} matches")
|
851 |
-
|
852 |
with tab3:
|
853 |
st.subheader("Data Transformation")
|
854 |
transform_col = st.selectbox("Column to transform", numeric_cols)
|
@@ -859,21 +838,28 @@ elif app_mode == "EDA":
|
|
859 |
df[transform_col] = np.sqrt(df[transform_col])
|
860 |
elif transform_type == "Z-score":
|
861 |
df[transform_col] = (df[transform_col] - df[transform_col].mean())/df[transform_col].std()
|
862 |
-
|
863 |
# --------------------------
|
864 |
# Export & Save
|
865 |
# --------------------------
|
|
|
866 |
st.subheader("💾 Export Options")
|
867 |
col1, col2 = st.columns(2)
|
868 |
with col1:
|
869 |
if st.button("📥 Download Current Visualization"):
|
870 |
-
|
871 |
-
|
|
|
|
|
|
|
872 |
with col2:
|
873 |
if st.button("📊 Export Analysis Report"):
|
874 |
-
|
875 |
-
|
876 |
-
|
|
|
|
|
|
|
877 |
|
878 |
# Streamlit App
|
879 |
elif app_mode == "Model Training":
|
|
|
27 |
)
|
28 |
|
29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
# --------------------------
|
31 |
# Custom Styling
|
32 |
# --------------------------
|
|
|
55 |
# --------------------------
|
56 |
# Helper Functions
|
57 |
# --------------------------
|
58 |
+
def enhance_section_title(title, icon="✨"):
    """Render a section heading as an underlined ``<h2>`` prefixed by an icon.

    Args:
        title: Heading text shown after the icon.
        icon: Text or emoji shown before the title.
    """
    heading_style = "border-bottom: 2px solid #ccc; padding-bottom: 5px;"
    heading_html = f"<h2 style='{heading_style}'>{icon} {title}</h2>"
    # The inline style needs raw HTML, so both values are inserted unescaped.
    st.markdown(heading_html, unsafe_allow_html=True)
|
61 |
+
|
62 |
+
def update_cleaned_data(df):
    """Publish ``df`` as the current cleaned dataset and record it in history.

    Writes ``df`` to ``st.session_state.cleaned_data``, appends a copy of it
    to ``st.session_state.data_versions`` and shows a success message.

    Args:
        df: DataFrame that becomes the new cleaned dataset.
    """
    state = st.session_state
    state.cleaned_data = df
    # The history entry is a copy, i.e. a separate object from the live frame.
    snapshot = df.copy()
    state.data_versions.append(snapshot)
    st.success("Action completed successfully!")
|
67 |
+
|
68 |
def generate_quality_report(df):
|
69 |
"""Generate comprehensive data quality report"""
|
70 |
report = {
|
|
|
397 |
|
398 |
except Exception as e:
|
399 |
st.error(f"Error loading file: {str(e)}")
|
400 |
+
|
401 |
+
# --------------------------
|
402 |
+
# Page Content
|
403 |
+
# --------------------------
|
404 |
+
if st.session_state.get("app_mode") == "Data Cleaning":
|
405 |
st.title("🧹 Smart Data Cleaning")
|
406 |
+
|
407 |
if st.session_state.raw_data is None:
|
408 |
st.warning("Please upload data first")
|
409 |
st.stop()
|
410 |
+
|
411 |
+
# Initialize session state (only if it's not already there)
|
412 |
if 'data_versions' not in st.session_state:
|
413 |
st.session_state.data_versions = [st.session_state.raw_data.copy()]
|
414 |
+
if 'cleaned_data' not in st.session_state: #Added a conditional value
|
415 |
+
st.session_state.cleaned_data = st.session_state.raw_data.copy()
|
416 |
+
|
417 |
+
df = st.session_state.cleaned_data.copy()
|
418 |
+
|
419 |
# --------------------------
|
420 |
# Data Health Dashboard
|
421 |
# --------------------------
|
422 |
+
enhance_section_title("Data Health Dashboard", "📊")
|
423 |
with st.expander("📊 Data Health Dashboard", expanded=True):
|
424 |
col1, col2, col3 = st.columns(3)
|
425 |
with col1:
|
|
|
428 |
st.metric("Total Rows", len(df))
|
429 |
with col3:
|
430 |
st.metric("Missing Values", df.isna().sum().sum())
|
431 |
+
|
432 |
# Generate quick profile report
|
433 |
if st.button("Generate Data Health Report"):
|
434 |
with st.spinner("Analyzing data..."):
|
435 |
profile = ProfileReport(df, minimal=True)
|
436 |
st_profile_report(profile)
|
437 |
+
|
438 |
# --------------------------
|
439 |
# Undo Functionality
|
440 |
# --------------------------
|
441 |
if len(st.session_state.data_versions) > 1:
|
442 |
if st.button("⏮️ Undo Last Action"):
|
443 |
+
st.session_state.data_versions.pop() # Remove current version
|
444 |
+
st.session_state.cleaned_data = st.session_state.data_versions[-1].copy() # Set data
|
|
|
445 |
st.success("Last action undone!")
|
446 |
+
st.experimental_rerun() #Force re-run after undo
|
447 |
+
|
448 |
# --------------------------
|
449 |
# Missing Value Handling
|
450 |
# --------------------------
|
451 |
+
enhance_section_title("Missing Values Treatment", "🔍")
|
452 |
with st.expander("🔍 Missing Values Treatment", expanded=True):
|
453 |
missing_cols = df.columns[df.isna().any()].tolist()
|
454 |
if missing_cols:
|
455 |
cols = st.multiselect("Select columns to handle", missing_cols)
|
456 |
method = st.selectbox("Imputation Method", [
|
457 |
+
"Drop Missing",
|
458 |
+
"Mean/Median",
|
459 |
"Custom Value",
|
460 |
"Forward Fill",
|
461 |
"Backward Fill"
|
462 |
])
|
463 |
+
|
464 |
if method == "Custom Value":
|
465 |
custom_val = st.text_input("Enter custom value")
|
466 |
+
|
467 |
+
if st.button("Apply Treatment (Missing)"):
|
|
|
468 |
try:
|
469 |
+
new_df = df.copy() # Create a copy to modify
|
470 |
if method == "Drop Missing":
|
471 |
+
new_df = new_df.dropna(subset=cols)
|
472 |
elif method == "Mean/Median":
|
473 |
for col in cols:
|
474 |
+
if pd.api.types.is_numeric_dtype(new_df[col]):
|
475 |
+
new_df[col] = new_df[col].fillna(new_df[col].median())
|
476 |
else:
|
477 |
+
new_df[col] = new_df[col].fillna(new_df[col].mode()[0])
|
478 |
elif method == "Custom Value" and custom_val:
|
479 |
for col in cols:
|
480 |
+
new_df[col] = new_df[col].fillna(custom_val)
|
481 |
elif method == "Forward Fill":
|
482 |
+
new_df[cols] = new_df[cols].ffill()
|
483 |
elif method == "Backward Fill":
|
484 |
+
new_df[cols] = new_df[cols].bfill()
|
485 |
+
|
486 |
+
update_cleaned_data(new_df)
|
487 |
+
st.experimental_rerun() #Force re-run after apply
|
488 |
+
|
489 |
except Exception as e:
|
490 |
st.error(f"Error: {str(e)}")
|
491 |
else:
|
492 |
st.success("✨ No missing values found!")
|
493 |
+
|
494 |
# --------------------------
|
495 |
# Data Type Conversion
|
496 |
# --------------------------
|
497 |
+
enhance_section_title("Data Type Conversion", "🔄")
|
498 |
with st.expander("🔄 Data Type Conversion"):
|
499 |
col_to_convert = st.selectbox("Select column", df.columns)
|
500 |
new_type = st.selectbox("New data type", [
|
501 |
+
"String", "Integer", "Float",
|
502 |
"Boolean", "Datetime"
|
503 |
])
|
504 |
+
|
505 |
if new_type == "Datetime":
|
506 |
date_format = st.text_input("Date format (e.g. %Y-%m-%d)", "%Y-%m-%d")
|
507 |
+
|
508 |
+
if st.button("Convert (Data Type)"):
|
|
|
509 |
try:
|
510 |
+
new_df = df.copy()
|
511 |
if new_type == "String":
|
512 |
+
new_df[col_to_convert] = new_df[col_to_convert].astype(str)
|
513 |
elif new_type == "Integer":
|
514 |
+
if new_df[col_to_convert].dtype == 'object':
|
515 |
st.error("Cannot convert text column to integer!")
|
516 |
else:
|
517 |
+
new_df[col_to_convert] = pd.to_numeric(new_df[col_to_convert], errors='coerce').astype('Int64')
|
518 |
elif new_type == "Float":
|
519 |
+
if new_df[col_to_convert].dtype == 'object':
|
520 |
st.error("Cannot convert text column to float!")
|
521 |
else:
|
522 |
+
new_df[col_to_convert] = pd.to_numeric(new_df[col_to_convert], errors='coerce')
|
523 |
elif new_type == "Boolean":
|
524 |
+
new_df[col_to_convert] = new_df[col_to_convert].astype(bool)
|
525 |
elif new_type == "Datetime":
|
526 |
+
new_df[col_to_convert] = pd.to_datetime(new_df[col_to_convert], format=date_format, errors='coerce')
|
527 |
+
|
528 |
+
update_cleaned_data(new_df)
|
529 |
+
st.experimental_rerun() #Force re-run after apply
|
530 |
except Exception as e:
|
531 |
st.error(f"Error: {str(e)}")
|
532 |
+
|
533 |
# --------------------------
|
534 |
# Drop Columns
|
535 |
# --------------------------
|
536 |
+
enhance_section_title("Drop Columns", "🗑️")
|
537 |
with st.expander("🗑️ Drop Columns"):
|
538 |
columns_to_drop = st.multiselect("Select columns to drop", df.columns)
|
539 |
if columns_to_drop:
|
540 |
st.warning(f"Will drop: {', '.join(columns_to_drop)}")
|
541 |
+
if st.button("Confirm Drop (Columns)"):
|
542 |
+
new_df = df.copy()
|
543 |
+
new_df = new_df.drop(columns=columns_to_drop)
|
544 |
+
update_cleaned_data(new_df)
|
545 |
+
st.experimental_rerun() #Force re-run after apply
|
546 |
+
|
547 |
# --------------------------
|
548 |
# Label Encoding
|
549 |
# --------------------------
|
550 |
+
enhance_section_title("Label Encoding", "🔢")
|
551 |
with st.expander("🔢 Label Encoding"):
|
552 |
data_to_encode = st.multiselect("Select categorical columns to encode", df.select_dtypes(include='object').columns)
|
553 |
if data_to_encode:
|
554 |
+
if st.button("Apply Label Encoding (Encoding)"):
|
555 |
+
new_df = df.copy()
|
556 |
label_encoders = {}
|
557 |
for col in data_to_encode:
|
558 |
le = LabelEncoder()
|
559 |
+
new_df[col] = le.fit_transform(new_df[col].astype(str))
|
560 |
label_encoders[col] = le
|
561 |
+
update_cleaned_data(new_df)
|
562 |
+
st.experimental_rerun() #Force re-run after apply
|
563 |
+
|
564 |
# --------------------------
|
565 |
# StandardScaler
|
566 |
# --------------------------
|
567 |
+
enhance_section_title("StandardScaler", "📏")
|
568 |
with st.expander("📏 StandardScaler"):
|
569 |
scale_cols = st.multiselect("Select numeric columns to scale", df.select_dtypes(include=np.number).columns)
|
570 |
if scale_cols:
|
571 |
+
if st.button("Apply StandardScaler (Scaling)"):
|
|
|
572 |
try:
|
573 |
+
new_df = df.copy()
|
574 |
scaler = StandardScaler()
|
575 |
+
new_df[scale_cols] = scaler.fit_transform(new_df[scale_cols])
|
576 |
+
update_cleaned_data(new_df)
|
577 |
+
st.experimental_rerun() #Force re-run after apply
|
578 |
except Exception as e:
|
579 |
st.error(f"Error: {str(e)}")
|
580 |
+
|
581 |
# --------------------------
|
582 |
# Pattern-Based Cleaning
|
583 |
# --------------------------
|
584 |
+
enhance_section_title("Pattern-Based Cleaning", "🕵️")
|
585 |
with st.expander("🕵️ Pattern-Based Cleaning"):
    # Regex find-and-replace applied to one object-dtype column of the
    # working frame `df`.
    selected_col = st.selectbox("Select text column", df.select_dtypes(include='object').columns)
    # Raw string: "\d" in a plain literal is an invalid escape sequence and
    # triggers a warning on current Python; the displayed label is unchanged.
    pattern = st.text_input(r"Regex pattern (e.g. \d+ for numbers)")
    replacement = st.text_input("Replacement value")

    if st.button("Apply Pattern Replacement (Replace)"):
        if selected_col is None or not pattern:
            # Without a column there is nothing to edit, and an empty regex
            # matches between every character; refuse instead of recording a
            # meaningless entry in the undo history.
            st.warning("Select a text column and enter a regex pattern first")
        else:
            try:
                new_df = df.copy()  # leave the working frame untouched until the edit succeeds
                new_df[selected_col] = new_df[selected_col].str.replace(pattern, replacement, regex=True)
                update_cleaned_data(new_df)
                st.experimental_rerun()  # re-run so the page reflects the new data
            except Exception as e:
                # e.g. an invalid regular expression typed by the user
                st.error(f"Error: {str(e)}")
|
598 |
+
|
599 |
# --------------------------
|
600 |
# Bulk Operations
|
601 |
# --------------------------
|
602 |
+
enhance_section_title("Bulk Actions", "🚀")
|
603 |
with st.expander("🚀 Bulk Actions"):
|
604 |
+
if st.button("Auto-Clean Common Issues (Cleaning)"):
|
605 |
+
new_df = df.copy()
|
606 |
+
new_df = new_df.dropna(axis=1, how='all') # Remove empty cols
|
607 |
+
new_df = new_df.convert_dtypes() # Better type inference
|
608 |
+
text_cols = new_df.select_dtypes(include='object').columns
|
609 |
+
new_df[text_cols] = new_df[text_cols].apply(lambda x: x.str.strip())
|
610 |
+
update_cleaned_data(new_df)
|
611 |
+
st.experimental_rerun() #Force re-run after apply
|
612 |
+
|
613 |
# --------------------------
|
614 |
# Cleaned Data Preview
|
615 |
# --------------------------
|
616 |
+
if st.session_state.get("cleaned_data") is not None:
|
617 |
+
enhance_section_title("Cleaned Data Preview", "✨")
|
618 |
with st.expander("✨ Cleaned Data Preview", expanded=True):
|
619 |
+
st.dataframe(st.session_state.cleaned_data.head(), use_container_width=True)
|
620 |
|
621 |
|
622 |
+
import streamlit as st
|
623 |
+
import pandas as pd
|
624 |
+
import numpy as np
|
625 |
+
import plotly.express as px
|
626 |
+
from scipy import stats # For statistical tests
|
627 |
+
from pandas_profiling import ProfileReport # Automated EDA (if you have it installed)
|
628 |
+
|
629 |
+
# --------------------------
|
630 |
+
# Helper Functions
|
631 |
+
# --------------------------
|
632 |
+
def enhance_section_title(title, icon="✨"):
    """Emit an ``<h2>`` section heading with a bottom border and leading icon.

    Args:
        title: Heading text shown after the icon.
        icon: Text or emoji shown before the title.
    """
    # NOTE(review): a function with this same name and body is already
    # defined earlier in this file; this definition rebinds the name.
    template = "<h2 style='border-bottom: 2px solid #ccc; padding-bottom: 5px;'>{} {}</h2>"
    st.markdown(template.format(icon, title), unsafe_allow_html=True)
|
635 |
+
|
636 |
+
# --------------------------
|
637 |
+
# Page Content
|
638 |
+
# --------------------------
|
639 |
+
if st.session_state.get("app_mode") == "EDA":
|
640 |
st.title("🔍 Interactive Data Explorer")
|
641 |
+
|
642 |
if st.session_state.cleaned_data is None:
|
643 |
st.warning("Please clean your data first")
|
644 |
st.stop()
|
645 |
+
|
646 |
df = st.session_state.cleaned_data
|
647 |
+
|
648 |
# --------------------------
|
649 |
# Enhanced Data Overview
|
650 |
# --------------------------
|
651 |
+
enhance_section_title("Dataset Overview", "📁")
|
652 |
with st.expander("📁 Dataset Overview", expanded=True):
|
653 |
col1, col2, col3, col4 = st.columns(4)
|
654 |
with col1:
|
|
|
661 |
with col4:
|
662 |
dupes = df.duplicated().sum()
|
663 |
st.metric("Duplicates", dupes, help="Fully duplicated rows")
|
664 |
+
|
665 |
# Data Preview Tabs
|
666 |
tab1, tab2, tab3 = st.tabs(["Quick Preview", "Column Types", "Missing Matrix"])
|
667 |
with tab1:
|
|
|
673 |
with tab3:
|
674 |
fig = px.imshow(df.isna(), color_continuous_scale='gray')
|
675 |
st.plotly_chart(fig, use_container_width=True)
|
676 |
+
|
677 |
# --------------------------
|
678 |
# Smart Visualization Builder
|
679 |
# --------------------------
|
680 |
+
enhance_section_title("Visualization Builder", "📊")
|
681 |
st.subheader("📊 Visualization Builder")
|
682 |
+
|
683 |
# Automatic plot type suggestions
|
684 |
numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
|
685 |
categorical_cols = df.select_dtypes(exclude=np.number).columns.tolist()
|
686 |
+
|
687 |
col1, col2 = st.columns([1, 3])
|
688 |
with col1:
|
689 |
# Dynamic plot type filtering
|
|
|
699 |
index=0,
|
700 |
help="Automatically filtered based on data types"
|
701 |
)
|
702 |
+
|
703 |
+
# Axis selection - conditionally displayed
|
704 |
+
x_axis = None
|
705 |
+
y_axis = None
|
706 |
+
z_axis = None
|
707 |
+
color_by = "None" # Default color to None
|
708 |
+
|
709 |
+
if plot_type not in ["Correlation Matrix", "Pair Plot"]:
|
710 |
+
x_axis = st.selectbox("X-axis", df.columns, help="Primary dimension for analysis")
|
711 |
+
|
712 |
+
if plot_type in ["Scatter Plot", "Box Plot", "Violin Plot", "Line Chart", "Heatmap"]:
|
713 |
+
y_axis = st.selectbox("Y-axis", df.columns, help="Secondary dimension for analysis")
|
714 |
+
|
715 |
+
if plot_type == "3D Scatter":
|
716 |
+
z_axis = st.selectbox("Z-axis", df.columns, help="Third dimension for analysis")
|
717 |
+
|
718 |
+
# Color encoding
|
719 |
+
if plot_type not in ["Correlation Matrix", "Pair Plot"]:
|
720 |
+
color_options = ["None"] + df.columns.tolist()
|
721 |
+
color_by = st.selectbox("Color encoding", color_options,
|
722 |
+
format_func=lambda x: "No color" if x == "None" else x)
|
723 |
+
|
724 |
+
# Context-aware controls for Parallel Categories
|
725 |
+
dimensions = None
|
726 |
if plot_type == "Parallel Categories":
|
727 |
dimensions = st.multiselect("Dimensions", df.columns.tolist(), default=df.columns[:3])
|
728 |
+
|
729 |
with col2:
|
730 |
try:
|
731 |
+
fig = None # Initialize fig to None
|
732 |
+
|
733 |
+
# Generate appropriate visualization with input validation
|
734 |
if plot_type == "Scatter Plot":
|
735 |
+
if x_axis and y_axis:
|
736 |
+
fig = px.scatter(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None,
|
737 |
+
hover_data=df.columns, trendline="lowess")
|
738 |
elif plot_type == "Histogram":
|
739 |
+
if x_axis:
|
740 |
+
fig = px.histogram(df, x=x_axis, color=color_by if color_by != "None" else None,
|
741 |
+
nbins=30, marginal="box")
|
742 |
elif plot_type == "Box Plot":
|
743 |
+
if x_axis and y_axis:
|
744 |
+
fig = px.box(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None)
|
745 |
elif plot_type == "Violin Plot":
|
746 |
+
if x_axis and y_axis:
|
747 |
+
fig = px.violin(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None,
|
748 |
+
box=True)
|
749 |
elif plot_type == "Line Chart":
|
750 |
+
if x_axis and y_axis:
|
751 |
+
fig = px.line(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None)
|
752 |
elif plot_type == "Bar Chart":
|
753 |
+
if x_axis:
|
754 |
+
fig = px.bar(df, x=x_axis, color=color_by if color_by != "None" else None)
|
755 |
elif plot_type == "Correlation Matrix":
|
756 |
+
numeric_df = df.select_dtypes(include=np.number)
|
757 |
+
if len(numeric_df.columns) > 1:
|
758 |
+
corr = numeric_df.corr()
|
759 |
+
fig = px.imshow(corr, text_auto=True, color_continuous_scale='RdBu_r',
|
760 |
+
zmin=-1, zmax=1)
|
761 |
elif plot_type == "Pair Plot":
|
762 |
+
numeric_df = df.select_dtypes(include=np.number)
|
763 |
+
num_cols = len(numeric_df.columns)
|
764 |
+
if num_cols > 1:
|
765 |
+
dimensions = numeric_df.columns[:min(4, num_cols)].tolist() # Limit to the first 4 for performance
|
766 |
+
fig = px.scatter_matrix(df, dimensions=dimensions,
|
767 |
+
color=color_by if color_by != "None" else None)
|
768 |
elif plot_type == "Heatmap":
|
769 |
+
if x_axis and y_axis:
|
770 |
+
fig = px.density_heatmap(df, x=x_axis, y=y_axis, facet_col=color_by if color_by != "None" else None)
|
771 |
elif plot_type == "3D Scatter":
|
772 |
+
if x_axis and y_axis and z_axis:
|
773 |
+
fig = px.scatter_3d(df, x=x_axis, y=y_axis, z=z_axis,
|
774 |
+
color=color_by if color_by != "None" else None)
|
775 |
elif plot_type == "Parallel Categories":
|
776 |
+
if dimensions:
|
777 |
+
fig = px.parallel_categories(df, dimensions=dimensions,
|
778 |
+
color=color_by if color_by != "None" else None)
|
779 |
+
else:
|
780 |
+
st.error("Please choose the specific plot")
|
781 |
+
|
782 |
# Interactive plot customization
|
783 |
+
if fig: #Only display customization options when we have a plot
|
784 |
+
with st.expander("⚙️ Chart Settings", expanded=False):
|
785 |
+
col1, col2 = st.columns(2)
|
786 |
+
with col1:
|
787 |
+
chart_title = st.text_input("Chart title", f"{plot_type} of {x_axis} vs {y_axis}" if (x_axis and y_axis) else f"{plot_type} of {x_axis}" if x_axis else plot_type)
|
788 |
+
fig.update_layout(title=chart_title)
|
789 |
+
with col2:
|
790 |
+
theme = st.selectbox("Color theme", px.colors.named_colorscales())
|
791 |
+
fig.update_layout(colorway=px.colors.qualitative.Plotly)
|
792 |
+
|
793 |
+
st.plotly_chart(fig, use_container_width=True)
|
794 |
+
|
795 |
except Exception as e:
|
796 |
st.error(f"Couldn't create visualization: {str(e)}")
|
797 |
st.info("Try selecting different columns or changing the visualization type")
|
798 |
+
|
799 |
# --------------------------
|
800 |
# Advanced Analysis
|
801 |
# --------------------------
|
802 |
+
enhance_section_title("Deep Analysis Tools", "🔬")
|
803 |
with st.expander("🔬 Deep Analysis Tools", expanded=False):
|
804 |
tab1, tab2, tab3 = st.tabs(["Statistical Tests", "Pattern Explorer", "Data Transformation"])
|
805 |
+
|
806 |
with tab1:
|
807 |
st.subheader("Hypothesis Testing")
|
808 |
col1, col2 = st.columns(2)
|
|
|
810 |
test_var = st.selectbox("Test variable", numeric_cols)
|
811 |
with col2:
|
812 |
group_var = st.selectbox("Grouping variable", [None] + categorical_cols)
|
813 |
+
|
814 |
if group_var and st.button("Run ANOVA"):
|
815 |
+
if test_var and group_var:
|
816 |
+
groups = df.groupby(group_var)[test_var].apply(list)
|
817 |
+
f_val, p_val = stats.f_oneway(*groups)
|
818 |
+
st.write(f"F-value: {f_val:.2f}, p-value: {p_val:.4f}")
|
819 |
+
else:
|
820 |
+
st.warning("Please select both a Test variable and a Grouping variable for ANOVA.")
|
821 |
+
|
822 |
with tab2:
|
823 |
st.subheader("Pattern Discovery")
|
824 |
explore_col = st.selectbox("Column to analyze", df.columns)
|
|
|
827 |
if pattern:
|
828 |
matches = df[explore_col].str.contains(pattern).sum()
|
829 |
st.write(f"Found {matches} matches")
|
830 |
+
|
831 |
with tab3:
|
832 |
st.subheader("Data Transformation")
|
833 |
transform_col = st.selectbox("Column to transform", numeric_cols)
|
|
|
838 |
df[transform_col] = np.sqrt(df[transform_col])
|
839 |
elif transform_type == "Z-score":
|
840 |
df[transform_col] = (df[transform_col] - df[transform_col].mean())/df[transform_col].std()
|
841 |
+
|
842 |
# --------------------------
|
843 |
# Export & Save
|
844 |
# --------------------------
|
845 |
+
enhance_section_title("Export Options", "💾")
|
846 |
st.subheader("💾 Export Options")
|
847 |
col1, col2 = st.columns(2)
|
848 |
with col1:
    # Save the most recently built chart to a PNG file on the server side.
    if st.button("📥 Download Current Visualization"):
        try:
            # The visualization builder sets `fig = None` before choosing a
            # plot type, so "no chart" shows up as None rather than as an
            # unbound name; check for it explicitly.
            if fig is None:
                st.error("No visualization to download. Please create a chart first.")
            else:
                fig.write_image("visualization.png")
                st.success("Image saved!")
        except NameError:
            # Kept for the case where `fig` was never bound in this run.
            st.error("No visualization to download. Please create a chart first.")
        except Exception as e:
            # write_image can fail (for example when Plotly's static image
            # export engine is not installed); report it instead of crashing.
            st.error(f"Could not save visualization: {str(e)}")
with col2:
    # Write a minimal profiling report of the current frame to an HTML file.
    if st.button("📊 Export Analysis Report"):
        try:
            profile = ProfileReport(df, minimal=True)
            profile.to_file("analysis_report.html")
            st.success("Report generated!")
        except Exception as e:
            # Surface the underlying error so the failure can be diagnosed.
            st.error(f"Could not generate analysis report. Ensure pandas-profiling is installed correctly. ({e})")
|
863 |
|
864 |
# Streamlit App
|
865 |
elif app_mode == "Model Training":
|