Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -632,9 +632,8 @@ if app_mode == "Data Cleaning":
|
|
632 |
st.dataframe(st.session_state.cleaned_data.head(), use_container_width=True)
|
633 |
|
634 |
|
635 |
-
|
636 |
-
|
637 |
-
st.title("π Exploratory Data Analysis")
|
638 |
|
639 |
if st.session_state.cleaned_data is None:
|
640 |
st.warning("Please clean your data first")
|
@@ -643,143 +642,179 @@ def eda():
|
|
643 |
df = st.session_state.cleaned_data
|
644 |
|
645 |
# --------------------------
|
646 |
-
# Data Overview
|
647 |
# --------------------------
|
648 |
-
with st.expander("
|
649 |
-
col1, col2, col3 = st.columns(
|
650 |
with col1:
|
651 |
-
st.metric("Total Rows", df.shape[0])
|
652 |
with col2:
|
653 |
-
st.metric("Total Columns", df.shape[1])
|
654 |
with col3:
|
655 |
-
|
|
|
|
|
|
|
|
|
656 |
|
657 |
-
|
|
|
|
|
658 |
st.dataframe(df.head(), use_container_width=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
659 |
|
660 |
# --------------------------
|
661 |
-
# Visualization
|
662 |
# --------------------------
|
663 |
-
st.subheader("
|
|
|
|
|
|
|
|
|
|
|
664 |
col1, col2 = st.columns([1, 3])
|
665 |
with col1:
|
666 |
-
|
667 |
-
|
668 |
-
|
669 |
-
"
|
670 |
-
|
671 |
-
|
672 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
673 |
|
674 |
-
|
675 |
-
|
676 |
-
|
677 |
-
|
678 |
-
|
|
|
|
|
|
|
|
|
|
|
679 |
|
680 |
with col2:
|
681 |
-
st.subheader("π Visualization")
|
682 |
try:
|
|
|
683 |
if plot_type == "Scatter Plot":
|
684 |
-
fig = px.scatter(df, x=x_axis, y=y_axis, color=color_by,
|
|
|
685 |
elif plot_type == "Histogram":
|
686 |
-
fig = px.histogram(df, x=x_axis, color=color_by,
|
|
|
687 |
elif plot_type == "Box Plot":
|
688 |
-
fig = px.box(df, x=x_axis, y=y_axis, color=color_by
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
689 |
elif plot_type == "Correlation Matrix":
|
690 |
corr = df.select_dtypes(include=np.number).corr()
|
691 |
-
fig = px.imshow(corr, text_auto=True, color_continuous_scale='
|
692 |
-
|
693 |
-
fig = px.line(df, x=x_axis, y=y_axis, color=color_by, facet_col=facet_col)
|
694 |
-
elif plot_type == "Heatmap":
|
695 |
-
fig = go.Figure(data=go.Heatmap(
|
696 |
-
z=df.corr().values,
|
697 |
-
x=df.columns,
|
698 |
-
y=df.columns,
|
699 |
-
colorscale='Viridis'))
|
700 |
-
elif plot_type == "Violin Plot":
|
701 |
-
fig = px.violin(df, x=x_axis, y=y_axis, color=color_by, facet_col=facet_col)
|
702 |
-
elif plot_type == "3D Scatter Plot":
|
703 |
-
fig = px.scatter_3d(df, x=x_axis, y=y_axis, z=z_axis, color=color_by)
|
704 |
-
elif plot_type == "Parallel Coordinates":
|
705 |
-
fig = px.parallel_coordinates(df, color=color_by)
|
706 |
elif plot_type == "Pair Plot":
|
707 |
-
fig = px.scatter_matrix(df,
|
708 |
-
|
709 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
710 |
|
711 |
st.plotly_chart(fig, use_container_width=True)
|
|
|
712 |
except Exception as e:
|
713 |
-
st.error(f"
|
|
|
714 |
|
715 |
# --------------------------
|
716 |
-
#
|
717 |
# --------------------------
|
718 |
-
st.
|
719 |
-
|
720 |
-
|
721 |
-
|
|
|
722 |
col1, col2 = st.columns(2)
|
723 |
with col1:
|
724 |
-
st.
|
725 |
-
fig = px.scatter(df, x=selected_columns[0], y=selected_columns[1], trendline="ols")
|
726 |
-
st.plotly_chart(fig, use_container_width=True)
|
727 |
-
|
728 |
with col2:
|
729 |
-
st.
|
730 |
-
|
731 |
-
|
732 |
-
|
733 |
-
|
734 |
-
|
735 |
-
|
736 |
-
|
737 |
-
|
738 |
-
|
739 |
-
|
740 |
-
st.
|
741 |
-
|
742 |
-
|
743 |
-
|
744 |
-
|
745 |
-
|
746 |
-
|
747 |
-
|
748 |
-
|
749 |
-
|
750 |
-
|
751 |
-
|
752 |
-
|
753 |
-
|
754 |
-
|
755 |
-
fig = px.histogram(df, x=selected_col, nbins=30)
|
756 |
-
st.plotly_chart(fig, use_container_width=True)
|
757 |
-
|
758 |
-
st.write("**Outlier Detection**")
|
759 |
-
Q1 = df[selected_col].quantile(0.25)
|
760 |
-
Q3 = df[selected_col].quantile(0.75)
|
761 |
-
IQR = Q3 - Q1
|
762 |
-
outliers = df[(df[selected_col] < (Q1 - 1.5 * IQR)) | (df[selected_col] > (Q3 + 1.5 * IQR))]
|
763 |
-
st.write(f"Number of outliers: {len(outliers)}")
|
764 |
-
st.dataframe(outliers.head(), use_container_width=True)
|
765 |
-
else:
|
766 |
-
st.write(f"**Value Counts for {selected_col}**")
|
767 |
-
value_counts = df[selected_col].value_counts()
|
768 |
-
st.bar_chart(value_counts)
|
769 |
|
770 |
# --------------------------
|
771 |
-
# Save
|
772 |
# --------------------------
|
773 |
-
st.subheader("💾
|
774 |
-
|
775 |
-
|
|
|
776 |
fig.write_image("visualization.png")
|
777 |
-
st.success("
|
778 |
-
|
779 |
-
|
780 |
-
|
781 |
-
|
782 |
-
|
783 |
|
784 |
# Streamlit App
|
785 |
elif app_mode == "Model Training":
|
|
|
632 |
st.dataframe(st.session_state.cleaned_data.head(), use_container_width=True)
|
633 |
|
634 |
|
635 |
+
elif app_mode == "EDA":
|
636 |
+
st.title("π Interactive Data Explorer")
|
|
|
637 |
|
638 |
if st.session_state.cleaned_data is None:
|
639 |
st.warning("Please clean your data first")
|
|
|
642 |
df = st.session_state.cleaned_data
|
643 |
|
644 |
# --------------------------
|
645 |
+
# Enhanced Data Overview
|
646 |
# --------------------------
|
647 |
+
with st.expander("π Dataset Overview", expanded=True):
|
648 |
+
col1, col2, col3, col4 = st.columns(4)
|
649 |
with col1:
|
650 |
+
st.metric("Total Rows", df.shape[0], help="Number of observations in the dataset")
|
651 |
with col2:
|
652 |
+
st.metric("Total Columns", df.shape[1], help="Number of features in the dataset")
|
653 |
with col3:
|
654 |
+
missing = df.isna().sum().sum()
|
655 |
+
st.metric("Missing Values", f"{missing} ({missing/(df.size)*100:.1f}%)")
|
656 |
+
with col4:
|
657 |
+
dupes = df.duplicated().sum()
|
658 |
+
st.metric("Duplicates", dupes, help="Fully duplicated rows")
|
659 |
|
660 |
+
# Data Preview Tabs
|
661 |
+
tab1, tab2, tab3 = st.tabs(["Quick Preview", "Column Types", "Missing Matrix"])
|
662 |
+
with tab1:
|
663 |
st.dataframe(df.head(), use_container_width=True)
|
664 |
+
with tab2:
|
665 |
+
types = df.dtypes.value_counts().reset_index()
|
666 |
+
types.columns = ['Type', 'Count']
|
667 |
+
st.dataframe(types, use_container_width=True)
|
668 |
+
with tab3:
|
669 |
+
fig = px.imshow(df.isna(), color_continuous_scale='gray')
|
670 |
+
st.plotly_chart(fig, use_container_width=True)
|
671 |
|
672 |
# --------------------------
|
673 |
+
# Smart Visualization Builder
|
674 |
# --------------------------
|
675 |
+
st.subheader("π Visualization Builder")
|
676 |
+
|
677 |
+
# Automatic plot type suggestions
|
678 |
+
numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
|
679 |
+
categorical_cols = df.select_dtypes(exclude=np.number).columns.tolist()
|
680 |
+
|
681 |
col1, col2 = st.columns([1, 3])
|
682 |
with col1:
|
683 |
+
# Dynamic plot type filtering
|
684 |
+
default_plot = "Histogram" if len(numeric_cols) > 0 else "Bar Chart"
|
685 |
+
plot_type = st.selectbox(
|
686 |
+
"Choose visualization type",
|
687 |
+
options=[
|
688 |
+
"Scatter Plot", "Histogram", "Box Plot",
|
689 |
+
"Violin Plot", "Line Chart", "Bar Chart",
|
690 |
+
"Correlation Matrix", "Pair Plot", "Heatmap",
|
691 |
+
"3D Scatter", "Parallel Categories"
|
692 |
+
],
|
693 |
+
index=0,
|
694 |
+
help="Automatically filtered based on data types"
|
695 |
+
)
|
696 |
+
|
697 |
+
# Dynamic axis selection
|
698 |
+
x_axis = st.selectbox("X-axis", df.columns,
|
699 |
+
help="Primary dimension for analysis")
|
700 |
+
y_axis = st.selectbox("Y-axis", [None] + df.columns.tolist(),
|
701 |
+
disabled=plot_type in ["Histogram", "Bar Chart"],
|
702 |
+
help="Secondary dimension for analysis")
|
703 |
|
704 |
+
# Smart color encoding
|
705 |
+
color_options = ["None"] + df.columns.tolist()
|
706 |
+
color_by = st.selectbox("Color encoding", color_options,
|
707 |
+
format_func=lambda x: "No color" if x == "None" else x)
|
708 |
+
|
709 |
+
# Context-aware controls
|
710 |
+
if plot_type in ["3D Scatter", "Parallel Categories"]:
|
711 |
+
z_axis = st.selectbox("Z-axis", [None] + df.columns.tolist())
|
712 |
+
if plot_type == "Parallel Categories":
|
713 |
+
dimensions = st.multiselect("Dimensions", df.columns.tolist(), default=df.columns[:3])
|
714 |
|
715 |
with col2:
|
|
|
716 |
try:
|
717 |
+
# Generate appropriate visualization
|
718 |
if plot_type == "Scatter Plot":
|
719 |
+
fig = px.scatter(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None,
|
720 |
+
hover_data=df.columns, trendline="lowess")
|
721 |
elif plot_type == "Histogram":
|
722 |
+
fig = px.histogram(df, x=x_axis, color=color_by if color_by != "None" else None,
|
723 |
+
nbins=30, marginal="box")
|
724 |
elif plot_type == "Box Plot":
|
725 |
+
fig = px.box(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None)
|
726 |
+
elif plot_type == "Violin Plot":
|
727 |
+
fig = px.violin(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None,
|
728 |
+
box=True)
|
729 |
+
elif plot_type == "Line Chart":
|
730 |
+
fig = px.line(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None)
|
731 |
+
elif plot_type == "Bar Chart":
|
732 |
+
fig = px.bar(df, x=x_axis, color=color_by if color_by != "None" else None)
|
733 |
elif plot_type == "Correlation Matrix":
|
734 |
corr = df.select_dtypes(include=np.number).corr()
|
735 |
+
fig = px.imshow(corr, text_auto=True, color_continuous_scale='RdBu_r',
|
736 |
+
zmin=-1, zmax=1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
737 |
elif plot_type == "Pair Plot":
|
738 |
+
fig = px.scatter_matrix(df, dimensions=numeric_cols[:4],
|
739 |
+
color=color_by if color_by != "None" else None)
|
740 |
+
elif plot_type == "Heatmap":
|
741 |
+
fig = px.density_heatmap(df, x=x_axis, y=y_axis, facet_col=color_by if color_by != "None" else None)
|
742 |
+
elif plot_type == "3D Scatter":
|
743 |
+
fig = px.scatter_3d(df, x=x_axis, y=y_axis, z=z_axis,
|
744 |
+
color=color_by if color_by != "None" else None)
|
745 |
+
elif plot_type == "Parallel Categories":
|
746 |
+
fig = px.parallel_categories(df, dimensions=dimensions,
|
747 |
+
color=color_by if color_by != "None" else None)
|
748 |
+
|
749 |
+
# Interactive plot customization
|
750 |
+
with st.expander("⚙️ Chart Settings", expanded=False):
|
751 |
+
col1, col2 = st.columns(2)
|
752 |
+
with col1:
|
753 |
+
chart_title = st.text_input("Chart title", f"{plot_type} of {x_axis} vs {y_axis}")
|
754 |
+
fig.update_layout(title=chart_title)
|
755 |
+
with col2:
|
756 |
+
theme = st.selectbox("Color theme", px.colors.named_colorscales())
|
757 |
+
fig.update_layout(colorway=px.colors.qualitative.Plotly)
|
758 |
|
759 |
st.plotly_chart(fig, use_container_width=True)
|
760 |
+
|
761 |
except Exception as e:
|
762 |
+
st.error(f"Couldn't create visualization: {str(e)}")
|
763 |
+
st.info("Try selecting different columns or changing the visualization type")
|
764 |
|
765 |
# --------------------------
|
766 |
+
# Advanced Analysis
|
767 |
# --------------------------
|
768 |
+
with st.expander("🔬 Deep Analysis Tools", expanded=False):
|
769 |
+
tab1, tab2, tab3 = st.tabs(["Statistical Tests", "Pattern Explorer", "Data Transformation"])
|
770 |
+
|
771 |
+
with tab1:
|
772 |
+
st.subheader("Hypothesis Testing")
|
773 |
col1, col2 = st.columns(2)
|
774 |
with col1:
|
775 |
+
test_var = st.selectbox("Test variable", numeric_cols)
|
|
|
|
|
|
|
776 |
with col2:
|
777 |
+
group_var = st.selectbox("Grouping variable", [None] + categorical_cols)
|
778 |
+
|
779 |
+
if group_var and st.button("Run ANOVA"):
|
780 |
+
groups = df.groupby(group_var)[test_var].apply(list)
|
781 |
+
f_val, p_val = stats.f_oneway(*groups)
|
782 |
+
st.write(f"F-value: {f_val:.2f}, p-value: {p_val:.4f}")
|
783 |
+
|
784 |
+
with tab2:
|
785 |
+
st.subheader("Pattern Discovery")
|
786 |
+
explore_col = st.selectbox("Column to analyze", df.columns)
|
787 |
+
if pd.api.types.is_string_dtype(df[explore_col]):
|
788 |
+
pattern = st.text_input("Regex pattern")
|
789 |
+
if pattern:
|
790 |
+
matches = df[explore_col].str.contains(pattern).sum()
|
791 |
+
st.write(f"Found {matches} matches")
|
792 |
+
|
793 |
+
with tab3:
|
794 |
+
st.subheader("Data Transformation")
|
795 |
+
transform_col = st.selectbox("Column to transform", numeric_cols)
|
796 |
+
transform_type = st.selectbox("Transformation", ["Log", "Square Root", "Z-score"])
|
797 |
+
if transform_type == "Log":
|
798 |
+
df[transform_col] = np.log1p(df[transform_col])
|
799 |
+
elif transform_type == "Square Root":
|
800 |
+
df[transform_col] = np.sqrt(df[transform_col])
|
801 |
+
elif transform_type == "Z-score":
|
802 |
+
df[transform_col] = (df[transform_col] - df[transform_col].mean())/df[transform_col].std()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
803 |
|
804 |
# --------------------------
|
805 |
+
# Export & Save
|
806 |
# --------------------------
|
807 |
+
st.subheader("💾 Export Options")
|
808 |
+
col1, col2 = st.columns(2)
|
809 |
+
with col1:
|
810 |
+
if st.button("📥 Download Current Visualization"):
|
811 |
fig.write_image("visualization.png")
|
812 |
+
st.success("Image saved!")
|
813 |
+
with col2:
|
814 |
+
if st.button("π Export Analysis Report"):
|
815 |
+
profile = ProfileReport(df, minimal=True)
|
816 |
+
profile.to_file("analysis_report.html")
|
817 |
+
st.success("Report generated!")
|
818 |
|
819 |
# Streamlit App
|
820 |
elif app_mode == "Model Training":
|