Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -519,241 +519,241 @@ elif app_mode == "Advanced EDA":
|
|
519 |
Uncover hidden patterns and relationships in your data.
|
520 |
""")
|
521 |
|
522 |
-
|
523 |
-
|
524 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
525 |
|
526 |
-
|
527 |
-
|
528 |
-
|
529 |
-
|
530 |
-
|
531 |
-
|
532 |
-
|
533 |
-
'y_col': df.columns[1] if len(df.columns) > 1 else None,
|
534 |
-
'z_col': df.columns[2] if len(df.columns) > 2 else None,
|
535 |
-
'color_col': None,
|
536 |
-
'size_col': None,
|
537 |
-
'time_col': None,
|
538 |
-
'value_col': None,
|
539 |
-
'scatter_matrix_cols': df.select_dtypes(include=np.number).columns.tolist()[:5],
|
540 |
-
'color_palette': "Viridis",
|
541 |
-
'hover_data_cols': [],
|
542 |
-
'filter_col': None,
|
543 |
-
'filter_options': []
|
544 |
-
}
|
545 |
|
546 |
-
|
547 |
-
|
548 |
-
st.session_state.eda_config['
|
549 |
-
"Filter
|
550 |
-
|
551 |
-
|
|
|
552 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
553 |
|
554 |
-
|
555 |
-
|
556 |
-
|
557 |
-
|
558 |
-
|
559 |
-
|
560 |
-
|
561 |
-
|
562 |
-
df = df[df[st.session_state.eda_config['filter_col']].isin(
|
563 |
-
st.session_state.eda_config['filter_options']
|
564 |
-
)]
|
565 |
|
566 |
-
|
567 |
-
st.sidebar.
|
568 |
-
|
569 |
-
|
570 |
-
|
571 |
-
|
572 |
-
"Sunburst Chart", "Funnel Chart", "Clustering Analysis"
|
573 |
-
]
|
574 |
-
st.session_state.eda_config['plot_type'] = st.sidebar.selectbox(
|
575 |
-
"Choose Visualization",
|
576 |
-
plot_types,
|
577 |
-
index=0
|
578 |
)
|
579 |
|
580 |
-
|
581 |
-
|
582 |
-
|
583 |
-
|
584 |
-
|
585 |
-
|
586 |
-
|
587 |
-
|
|
|
|
|
|
|
588 |
|
589 |
-
|
590 |
-
|
591 |
-
|
592 |
-
|
593 |
-
|
594 |
-
|
595 |
-
|
|
|
|
|
|
|
596 |
|
597 |
-
|
598 |
-
|
599 |
-
|
600 |
-
|
601 |
-
|
602 |
-
|
603 |
-
|
604 |
-
|
605 |
-
|
606 |
-
[
|
|
|
|
|
|
|
607 |
)
|
608 |
|
609 |
-
|
610 |
-
|
611 |
-
|
612 |
-
|
613 |
-
[
|
|
|
614 |
)
|
615 |
-
st.session_state.eda_config['hover_data_cols'] = st.multiselect(
|
616 |
-
"Hover Data",
|
617 |
-
df.columns
|
618 |
-
)
|
619 |
-
|
620 |
-
# Plot Generation
|
621 |
-
try:
|
622 |
-
fig = None
|
623 |
-
config = st.session_state.eda_config
|
624 |
|
625 |
-
|
626 |
-
|
627 |
-
|
628 |
-
|
629 |
-
|
630 |
-
|
631 |
-
color=config['x_col'],
|
632 |
-
color_discrete_sequence = [colors[0]]
|
633 |
-
)
|
634 |
-
|
635 |
-
elif config['plot_type'] == "Scatter Plot":
|
636 |
-
fig = px.scatter(
|
637 |
-
df, x=config['x_col'], y=config['y_col'],
|
638 |
-
color=config['color_col'],
|
639 |
-
size=config['size_col'],
|
640 |
-
hover_data=config['hover_data_cols']
|
641 |
-
)
|
642 |
|
643 |
-
|
644 |
-
|
645 |
-
|
646 |
-
|
647 |
-
|
|
|
|
|
648 |
)
|
|
|
|
|
649 |
|
650 |
-
|
651 |
-
|
652 |
-
|
653 |
-
|
654 |
-
|
655 |
-
corr, text_auto=True,
|
656 |
-
color_continuous_scale=config['color_palette']
|
657 |
-
)
|
658 |
-
else:
|
659 |
-
st.warning("No numerical columns found for correlation heatmap.")
|
660 |
-
|
661 |
-
elif config['plot_type'] == "Box Plot":
|
662 |
-
fig = px.box(
|
663 |
-
df, x=config['x_col'], y=config['y_col'],
|
664 |
-
color=config['color_col']
|
665 |
-
)
|
666 |
|
667 |
-
|
668 |
-
|
669 |
-
|
670 |
-
|
671 |
-
|
672 |
-
|
673 |
|
674 |
-
|
675 |
-
|
676 |
-
|
677 |
-
|
678 |
-
|
679 |
-
|
680 |
|
681 |
-
|
682 |
-
|
683 |
-
|
684 |
-
|
685 |
-
|
686 |
|
687 |
-
|
688 |
-
|
689 |
-
|
690 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
691 |
|
692 |
-
|
693 |
-
|
694 |
-
|
695 |
-
"Descriptive Statistics",
|
696 |
-
"Correlation Analysis",
|
697 |
-
"Hypothesis Testing",
|
698 |
-
"Distribution Fitting"
|
699 |
])
|
700 |
-
|
701 |
-
|
702 |
-
st.
|
703 |
-
|
704 |
-
|
705 |
-
|
706 |
-
|
707 |
-
|
708 |
-
"
|
709 |
-
|
710 |
-
|
711 |
-
st.write(corr_matrix)
|
712 |
-
st.heatmap(corr_matrix, annot=True, cmap=config['color_palette'])
|
713 |
-
else:
|
714 |
-
st.warning("Need at least 2 numeric columns for correlation analysis")
|
715 |
-
|
716 |
-
elif analysis_type == "Hypothesis Testing":
|
717 |
-
test_type = st.selectbox("Select Test Type", [
|
718 |
-
"T-test", "Chi-Squared Test", "ANOVA", "Mann-Whitney U"
|
719 |
-
])
|
720 |
-
if test_type == "T-test":
|
721 |
-
col1 = st.selectbox("Column 1 (Numeric)", df.select_dtypes(include=np.number).columns)
|
722 |
-
col2 = st.selectbox("Column 2 (Categorical)", df.select_dtypes(include='object').columns)
|
723 |
-
if st.button("Run T-test"):
|
724 |
-
groups = df.groupby(col2)[col1].apply(list)
|
725 |
-
if len(groups) == 2:
|
726 |
-
t_stat, p_value = stats.ttest_ind(groups.iloc[0], groups.iloc[1])
|
727 |
-
st.write(f"T-statistic: {t_stat:.4f}")
|
728 |
-
st.write(f"P-value: {p_value:.4f}")
|
729 |
-
if p_value < 0.05:
|
730 |
-
st.write("Reject the null hypothesis.")
|
731 |
-
else:
|
732 |
-
st.write("Fail to reject the null hypothesis.")
|
733 |
else:
|
734 |
-
st.write("
|
735 |
-
|
736 |
-
|
737 |
-
|
738 |
-
|
739 |
-
|
740 |
-
|
741 |
-
|
742 |
-
|
743 |
-
|
744 |
-
|
745 |
-
|
746 |
-
|
747 |
-
|
748 |
-
|
749 |
-
|
750 |
-
|
751 |
-
|
752 |
-
|
753 |
-
|
754 |
-
|
755 |
-
|
756 |
-
|
|
|
|
|
757 |
|
758 |
# Model Training Section
|
759 |
elif app_mode == "Model Training":
|
|
|
519 |
Uncover hidden patterns and relationships in your data.
|
520 |
""")
|
521 |
|
522 |
+
if 'cleaned_data' not in st.session_state or st.session_state.cleaned_data is None:
|
523 |
+
st.warning("Please clean your data in the Smart Cleaning section first.")
|
524 |
+
st.stop()
|
525 |
+
|
526 |
+
df = st.session_state.cleaned_data.copy()
|
527 |
+
|
528 |
+
# Initialize session state for EDA configuration
|
529 |
+
if 'eda_config' not in st.session_state:
|
530 |
+
st.session_state.eda_config = {
|
531 |
+
'plot_type': "Histogram",
|
532 |
+
'x_col': df.columns[0] if len(df.columns) > 0 else None,
|
533 |
+
'y_col': df.columns[1] if len(df.columns) > 1 else None,
|
534 |
+
'z_col': df.columns[2] if len(df.columns) > 2 else None,
|
535 |
+
'color_col': None,
|
536 |
+
'size_col': None,
|
537 |
+
'time_col': None,
|
538 |
+
'value_col': None,
|
539 |
+
'scatter_matrix_cols': df.select_dtypes(include=np.number).columns.tolist()[:5],
|
540 |
+
'color_palette': "Viridis",
|
541 |
+
'hover_data_cols': [],
|
542 |
+
'filter_col': None,
|
543 |
+
'filter_options': []
|
544 |
+
}
|
545 |
|
546 |
+
# Data Filtering Section
|
547 |
+
with st.expander("π Data Filtering", expanded=True):
|
548 |
+
st.session_state.eda_config['filter_col'] = st.selectbox(
|
549 |
+
"Filter Column",
|
550 |
+
[None] + list(df.columns),
|
551 |
+
help="Choose a column to filter the data."
|
552 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
553 |
|
554 |
+
if st.session_state.eda_config['filter_col']:
|
555 |
+
unique_values = df[st.session_state.eda_config['filter_col']].unique()
|
556 |
+
st.session_state.eda_config['filter_options'] = st.multiselect(
|
557 |
+
"Filter Values",
|
558 |
+
unique_values,
|
559 |
+
default=unique_values,
|
560 |
+
help=f"Select values from '{st.session_state.eda_config['filter_col']}'"
|
561 |
)
|
562 |
+
df = df[df[st.session_state.eda_config['filter_col']].isin(
|
563 |
+
st.session_state.eda_config['filter_options']
|
564 |
+
)]
|
565 |
+
|
566 |
+
# Visualization Type Selection
|
567 |
+
st.sidebar.header("π Visualization Configuration")
|
568 |
+
plot_types = [
|
569 |
+
"Histogram", "Scatter Plot", "Box Plot", "Violin Plot",
|
570 |
+
"Correlation Heatmap", "Parallel Coordinates", "Andrews Curves",
|
571 |
+
"Pair Plot", "Density Contour", "3D Scatter", "Time Series",
|
572 |
+
"Sunburst Chart", "Funnel Chart", "Clustering Analysis"
|
573 |
+
]
|
574 |
+
st.session_state.eda_config['plot_type'] = st.sidebar.selectbox(
|
575 |
+
"Choose Visualization",
|
576 |
+
plot_types,
|
577 |
+
index=0
|
578 |
+
)
|
579 |
|
580 |
+
# Dynamic Controls Based on Plot Type
|
581 |
+
if st.session_state.eda_config['plot_type'] != "Correlation Heatmap":
|
582 |
+
st.session_state.eda_config['x_col'] = st.sidebar.selectbox(
|
583 |
+
"X Axis",
|
584 |
+
df.columns,
|
585 |
+
index=df.columns.get_loc(st.session_state.eda_config['x_col'])
|
586 |
+
if st.session_state.eda_config['x_col'] in df.columns else 0
|
587 |
+
)
|
|
|
|
|
|
|
588 |
|
589 |
+
if st.session_state.eda_config['plot_type'] in ["Scatter Plot", "Box Plot", "Violin Plot", "Time Series", "3D Scatter", "Histogram"]:
|
590 |
+
st.session_state.eda_config['y_col'] = st.sidebar.selectbox(
|
591 |
+
"Y Axis",
|
592 |
+
df.columns,
|
593 |
+
index=df.columns.get_loc(st.session_state.eda_config['y_col'])
|
594 |
+
if st.session_state.eda_config['y_col'] in df.columns else 0
|
|
|
|
|
|
|
|
|
|
|
|
|
595 |
)
|
596 |
|
597 |
+
if st.session_state.eda_config['plot_type'] == "3D Scatter":
|
598 |
+
st.session_state.eda_config['z_col'] = st.sidebar.selectbox(
|
599 |
+
"Z Axis",
|
600 |
+
df.columns,
|
601 |
+
index=df.columns.get_loc(st.session_state.eda_config['z_col'])
|
602 |
+
if st.session_state.eda_config['z_col'] in df.columns else 0
|
603 |
+
)
|
604 |
+
st.session_state.eda_config['color_col'] = st.sidebar.selectbox(
|
605 |
+
"Color by",
|
606 |
+
[None] + list(df.columns)
|
607 |
+
)
|
608 |
|
609 |
+
# Advanced Plot Customization
|
610 |
+
with st.expander("🎨 Advanced Customization", expanded=False):
|
611 |
+
st.session_state.eda_config['color_palette'] = st.selectbox(
|
612 |
+
"Color Palette",
|
613 |
+
["Viridis", "Plasma", "Magma", "Cividis", "RdBu", "Rainbow"]
|
614 |
+
)
|
615 |
+
st.session_state.eda_config['hover_data_cols'] = st.multiselect(
|
616 |
+
"Hover Data",
|
617 |
+
df.columns
|
618 |
+
)
|
619 |
|
620 |
+
# Plot Generation
|
621 |
+
try:
|
622 |
+
fig = None
|
623 |
+
config = st.session_state.eda_config
|
624 |
+
|
625 |
+
if config['plot_type'] == "Histogram":
|
626 |
+
color_palette = config['color_palette']
|
627 |
+
colors = getattr(pc.sequential, color_palette)
|
628 |
+
fig = px.histogram(
|
629 |
+
df, x=config['x_col'], y=config['y_col'],
|
630 |
+
nbins=30, template="plotly_dark",
|
631 |
+
color=config['x_col'],
|
632 |
+
color_discrete_sequence = [colors[0]]
|
633 |
)
|
634 |
|
635 |
+
elif config['plot_type'] == "Scatter Plot":
|
636 |
+
fig = px.scatter(
|
637 |
+
df, x=config['x_col'], y=config['y_col'],
|
638 |
+
color=config['color_col'],
|
639 |
+
size=config['size_col'],
|
640 |
+
hover_data=config['hover_data_cols']
|
641 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
642 |
|
643 |
+
elif config['plot_type'] == "3D Scatter":
|
644 |
+
fig = px.scatter_3d(
|
645 |
+
df, x=config['x_col'], y=config['y_col'], z=config['z_col'],
|
646 |
+
color=config['color_col'],
|
647 |
+
color_discrete_sequence=[config['color_palette']]
|
648 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
649 |
|
650 |
+
elif config['plot_type'] == "Correlation Heatmap":
|
651 |
+
numeric_df = df.select_dtypes(include=np.number)
|
652 |
+
if not numeric_df.empty:
|
653 |
+
corr = numeric_df.corr()
|
654 |
+
fig = px.imshow(
|
655 |
+
corr, text_auto=True,
|
656 |
+
color_continuous_scale=config['color_palette']
|
657 |
)
|
658 |
+
else:
|
659 |
+
st.warning("No numerical columns found for correlation heatmap.")
|
660 |
|
661 |
+
elif config['plot_type'] == "Box Plot":
|
662 |
+
fig = px.box(
|
663 |
+
df, x=config['x_col'], y=config['y_col'],
|
664 |
+
color=config['color_col']
|
665 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
666 |
|
667 |
+
elif config['plot_type'] == "Violin Plot":
|
668 |
+
fig = px.violin(
|
669 |
+
df, x=config['x_col'], y=config['y_col'],
|
670 |
+
box=True, points="all",
|
671 |
+
color=config['color_col']
|
672 |
+
)
|
673 |
|
674 |
+
elif config['plot_type'] == "Time Series":
|
675 |
+
df = df.sort_values(by=config['time_col'])
|
676 |
+
fig = px.line(
|
677 |
+
df, x=config['time_col'], y=config['value_col'],
|
678 |
+
color=config['color_col']
|
679 |
+
)
|
680 |
|
681 |
+
elif config['plot_type'] == "Scatter Matrix":
|
682 |
+
fig = px.scatter_matrix(
|
683 |
+
df, dimensions=config['scatter_matrix_cols'],
|
684 |
+
color=config['color_col']
|
685 |
+
)
|
686 |
|
687 |
+
if fig:
|
688 |
+
st.plotly_chart(fig, use_container_width=True)
|
689 |
+
except Exception as e:
|
690 |
+
st.error(f"An error occurred while generating the plot: {e}")
|
691 |
+
|
692 |
+
# Statistical Analysis Section
|
693 |
+
with st.expander("π Statistical Analysis", expanded=True):
|
694 |
+
analysis_type = st.selectbox("Select Analysis Type", [
|
695 |
+
"Descriptive Statistics",
|
696 |
+
"Correlation Analysis",
|
697 |
+
"Hypothesis Testing",
|
698 |
+
"Distribution Fitting"
|
699 |
+
])
|
700 |
+
|
701 |
+
if analysis_type == "Descriptive Statistics":
|
702 |
+
st.write(df.describe(include='all'))
|
703 |
+
|
704 |
+
elif analysis_type == "Correlation Analysis":
|
705 |
+
numeric_cols = df.select_dtypes(include=np.number).columns
|
706 |
+
if len(numeric_cols) >= 2:
|
707 |
+
corr_method = st.selectbox("Correlation Method", [
|
708 |
+
"Pearson", "Kendall", "Spearman"
|
709 |
+
])
|
710 |
+
corr_matrix = df[numeric_cols].corr(method=corr_method.lower())
|
711 |
+
st.write(corr_matrix)
|
712 |
+
# Streamlit has no heatmap() function; draw the matrix with Plotly, the same way the "Correlation Heatmap" plot type does.
st.plotly_chart(px.imshow(corr_matrix, text_auto=True, color_continuous_scale=config['color_palette']), use_container_width=True)
|
713 |
+
else:
|
714 |
+
st.warning("Need at least 2 numeric columns for correlation analysis")
|
715 |
|
716 |
+
elif analysis_type == "Hypothesis Testing":
|
717 |
+
test_type = st.selectbox("Select Test Type", [
|
718 |
+
"T-test", "Chi-Squared Test", "ANOVA", "Mann-Whitney U"
|
|
|
|
|
|
|
|
|
719 |
])
|
720 |
+
if test_type == "T-test":
|
721 |
+
col1 = st.selectbox("Column 1 (Numeric)", df.select_dtypes(include=np.number).columns)
|
722 |
+
col2 = st.selectbox("Column 2 (Categorical)", df.select_dtypes(include='object').columns)
|
723 |
+
if st.button("Run T-test"):
|
724 |
+
groups = df.groupby(col2)[col1].apply(list)
|
725 |
+
if len(groups) == 2:
|
726 |
+
t_stat, p_value = stats.ttest_ind(groups.iloc[0], groups.iloc[1])
|
727 |
+
st.write(f"T-statistic: {t_stat:.4f}")
|
728 |
+
st.write(f"P-value: {p_value:.4f}")
|
729 |
+
if p_value < 0.05:
|
730 |
+
st.write("Reject the null hypothesis.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
731 |
else:
|
732 |
+
st.write("Fail to reject the null hypothesis.")
|
733 |
+
else:
|
734 |
+
st.write("Select a categorical column with exactly two categories.")
|
735 |
+
|
736 |
+
elif analysis_type == "Distribution Fitting":
|
737 |
+
numeric_col = st.selectbox("Select Numeric Column", df.select_dtypes(include=np.number).columns)
|
738 |
+
dist_types = ["Normal", "Log-Normal", "Exponential", "Gamma"]
|
739 |
+
selected_dist = st.selectbox("Select Distribution Type", dist_types)
|
740 |
+
if st.button("Fit Distribution"):
|
741 |
+
from scipy.stats import norm, lognorm, expon, gamma
|
742 |
+
dist_functions = {
|
743 |
+
"Normal": norm,
|
744 |
+
"Log-Normal": lognorm,
|
745 |
+
"Exponential": expon,
|
746 |
+
"Gamma": gamma
|
747 |
+
}
|
748 |
+
params = dist_functions[selected_dist].fit(df[numeric_col].dropna())
|
749 |
+
st.write(f"Fitted Parameters: {params}")
|
750 |
+
|
751 |
+
# Data Profiling Section
|
752 |
+
with st.expander("π Generate Full Data Profile", expanded=False):
|
753 |
+
if st.button("π Generate Comprehensive Report"):
|
754 |
+
with st.spinner("Generating report..."):
|
755 |
+
pr = ProfileReport(df, explorative=True)
|
756 |
+
st_profile_report(pr)
|
757 |
|
758 |
# Model Training Section
|
759 |
elif app_mode == "Model Training":
|