CosmickVisions committed on
Commit
b065a25
·
verified ·
1 Parent(s): caa2b7a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +212 -212
app.py CHANGED
@@ -519,241 +519,241 @@ elif app_mode == "Advanced EDA":
519
  Uncover hidden patterns and relationships in your data.
520
  """)
521
 
522
- if 'cleaned_data' not in st.session_state or st.session_state.cleaned_data is None:
523
- st.warning("Please clean your data in the Smart Cleaning section first.")
524
- st.stop()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
525
 
526
- df = st.session_state.cleaned_data.copy()
527
-
528
- # Initialize session state for EDA configuration
529
- if 'eda_config' not in st.session_state:
530
- st.session_state.eda_config = {
531
- 'plot_type': "Histogram",
532
- 'x_col': df.columns[0] if len(df.columns) > 0 else None,
533
- 'y_col': df.columns[1] if len(df.columns) > 1 else None,
534
- 'z_col': df.columns[2] if len(df.columns) > 2 else None,
535
- 'color_col': None,
536
- 'size_col': None,
537
- 'time_col': None,
538
- 'value_col': None,
539
- 'scatter_matrix_cols': df.select_dtypes(include=np.number).columns.tolist()[:5],
540
- 'color_palette': "Viridis",
541
- 'hover_data_cols': [],
542
- 'filter_col': None,
543
- 'filter_options': []
544
- }
545
 
546
- # Data Filtering Section
547
- with st.expander("🔎 Data Filtering", expanded=True):
548
- st.session_state.eda_config['filter_col'] = st.selectbox(
549
- "Filter Column",
550
- [None] + list(df.columns),
551
- help="Choose a column to filter the data."
 
552
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
553
 
554
- if st.session_state.eda_config['filter_col']:
555
- unique_values = df[st.session_state.eda_config['filter_col']].unique()
556
- st.session_state.eda_config['filter_options'] = st.multiselect(
557
- "Filter Values",
558
- unique_values,
559
- default=unique_values,
560
- help=f"Select values from '{st.session_state.eda_config['filter_col']}'"
561
- )
562
- df = df[df[st.session_state.eda_config['filter_col']].isin(
563
- st.session_state.eda_config['filter_options']
564
- )]
565
 
566
- # Visualization Type Selection
567
- st.sidebar.header("📊 Visualization Configuration")
568
- plot_types = [
569
- "Histogram", "Scatter Plot", "Box Plot", "Violin Plot",
570
- "Correlation Heatmap", "Parallel Coordinates", "Andrews Curves",
571
- "Pair Plot", "Density Contour", "3D Scatter", "Time Series",
572
- "Sunburst Chart", "Funnel Chart", "Clustering Analysis"
573
- ]
574
- st.session_state.eda_config['plot_type'] = st.sidebar.selectbox(
575
- "Choose Visualization",
576
- plot_types,
577
- index=0
578
  )
579
 
580
- # Dynamic Controls Based on Plot Type
581
- if st.session_state.eda_config['plot_type'] != "Correlation Heatmap":
582
- st.session_state.eda_config['x_col'] = st.sidebar.selectbox(
583
- "X Axis",
584
- df.columns,
585
- index=df.columns.get_loc(st.session_state.eda_config['x_col'])
586
- if st.session_state.eda_config['x_col'] in df.columns else 0
587
- )
 
 
 
588
 
589
- if st.session_state.eda_config['plot_type'] in ["Scatter Plot", "Box Plot", "Violin Plot", "Time Series", "3D Scatter", "Histogram"]:
590
- st.session_state.eda_config['y_col'] = st.sidebar.selectbox(
591
- "Y Axis",
592
- df.columns,
593
- index=df.columns.get_loc(st.session_state.eda_config['y_col'])
594
- if st.session_state.eda_config['y_col'] in df.columns else 0
595
- )
 
 
 
596
 
597
- if st.session_state.eda_config['plot_type'] == "3D Scatter":
598
- st.session_state.eda_config['z_col'] = st.sidebar.selectbox(
599
- "Z Axis",
600
- df.columns,
601
- index=df.columns.get_loc(st.session_state.eda_config['z_col'])
602
- if st.session_state.eda_config['z_col'] in df.columns else 0
603
- )
604
- st.session_state.eda_config['color_col'] = st.sidebar.selectbox(
605
- "Color by",
606
- [None] + list(df.columns)
 
 
 
607
  )
608
 
609
- # Advanced Plot Customization
610
- with st.expander("🎨 Advanced Customization", expanded=False):
611
- st.session_state.eda_config['color_palette'] = st.selectbox(
612
- "Color Palette",
613
- ["Viridis", "Plasma", "Magma", "Cividis", "RdBu", "Rainbow"]
 
614
  )
615
- st.session_state.eda_config['hover_data_cols'] = st.multiselect(
616
- "Hover Data",
617
- df.columns
618
- )
619
-
620
- # Plot Generation
621
- try:
622
- fig = None
623
- config = st.session_state.eda_config
624
 
625
- if config['plot_type'] == "Histogram":
626
- color_palette = config['color_palette']
627
- colors = getattr(pc.sequential, color_palette)
628
- fig = px.histogram(
629
- df, x=config['x_col'], y=config['y_col'],
630
- nbins=30, template="plotly_dark",
631
- color=config['x_col'],
632
- color_discrete_sequence = [colors[0]]
633
- )
634
-
635
- elif config['plot_type'] == "Scatter Plot":
636
- fig = px.scatter(
637
- df, x=config['x_col'], y=config['y_col'],
638
- color=config['color_col'],
639
- size=config['size_col'],
640
- hover_data=config['hover_data_cols']
641
- )
642
 
643
- elif config['plot_type'] == "3D Scatter":
644
- fig = px.scatter_3d(
645
- df, x=config['x_col'], y=config['y_col'], z=config['z_col'],
646
- color=config['color_col'],
647
- color_discrete_sequence=[config['color_palette']]
 
 
648
  )
 
 
649
 
650
- elif config['plot_type'] == "Correlation Heatmap":
651
- numeric_df = df.select_dtypes(include=np.number)
652
- if not numeric_df.empty:
653
- corr = numeric_df.corr()
654
- fig = px.imshow(
655
- corr, text_auto=True,
656
- color_continuous_scale=config['color_palette']
657
- )
658
- else:
659
- st.warning("No numerical columns found for correlation heatmap.")
660
-
661
- elif config['plot_type'] == "Box Plot":
662
- fig = px.box(
663
- df, x=config['x_col'], y=config['y_col'],
664
- color=config['color_col']
665
- )
666
 
667
- elif config['plot_type'] == "Violin Plot":
668
- fig = px.violin(
669
- df, x=config['x_col'], y=config['y_col'],
670
- box=True, points="all",
671
- color=config['color_col']
672
- )
673
 
674
- elif config['plot_type'] == "Time Series":
675
- df = df.sort_values(by=config['time_col'])
676
- fig = px.line(
677
- df, x=config['time_col'], y=config['value_col'],
678
- color=config['color_col']
679
- )
680
 
681
- elif config['plot_type'] == "Scatter Matrix":
682
- fig = px.scatter_matrix(
683
- df, dimensions=config['scatter_matrix_cols'],
684
- color=config['color_col']
685
- )
686
 
687
- if fig:
688
- st.plotly_chart(fig, use_container_width=True)
689
- except Exception as e:
690
- st.error(f"An error occurred while generating the plot: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
691
 
692
- # Statistical Analysis Section
693
- with st.expander("📊 Statistical Analysis", expanded=True):
694
- analysis_type = st.selectbox("Select Analysis Type", [
695
- "Descriptive Statistics",
696
- "Correlation Analysis",
697
- "Hypothesis Testing",
698
- "Distribution Fitting"
699
  ])
700
-
701
- if analysis_type == "Descriptive Statistics":
702
- st.write(df.describe(include='all'))
703
-
704
- elif analysis_type == "Correlation Analysis":
705
- numeric_cols = df.select_dtypes(include=np.number).columns
706
- if len(numeric_cols) >= 2:
707
- corr_method = st.selectbox("Correlation Method", [
708
- "Pearson", "Kendall", "Spearman"
709
- ])
710
- corr_matrix = df[numeric_cols].corr(method=corr_method.lower())
711
- st.write(corr_matrix)
712
- st.heatmap(corr_matrix, annot=True, cmap=config['color_palette'])
713
- else:
714
- st.warning("Need at least 2 numeric columns for correlation analysis")
715
-
716
- elif analysis_type == "Hypothesis Testing":
717
- test_type = st.selectbox("Select Test Type", [
718
- "T-test", "Chi-Squared Test", "ANOVA", "Mann-Whitney U"
719
- ])
720
- if test_type == "T-test":
721
- col1 = st.selectbox("Column 1 (Numeric)", df.select_dtypes(include=np.number).columns)
722
- col2 = st.selectbox("Column 2 (Categorical)", df.select_dtypes(include='object').columns)
723
- if st.button("Run T-test"):
724
- groups = df.groupby(col2)[col1].apply(list)
725
- if len(groups) == 2:
726
- t_stat, p_value = stats.ttest_ind(groups.iloc[0], groups.iloc[1])
727
- st.write(f"T-statistic: {t_stat:.4f}")
728
- st.write(f"P-value: {p_value:.4f}")
729
- if p_value < 0.05:
730
- st.write("Reject the null hypothesis.")
731
- else:
732
- st.write("Fail to reject the null hypothesis.")
733
  else:
734
- st.write("Select a categorical column with exactly two categories.")
735
-
736
- elif analysis_type == "Distribution Fitting":
737
- numeric_col = st.selectbox("Select Numeric Column", df.select_dtypes(include=np.number).columns)
738
- dist_types = ["Normal", "Log-Normal", "Exponential", "Gamma"]
739
- selected_dist = st.selectbox("Select Distribution Type", dist_types)
740
- if st.button("Fit Distribution"):
741
- from scipy.stats import norm, lognorm, expon, gamma
742
- dist_functions = {
743
- "Normal": norm,
744
- "Log-Normal": lognorm,
745
- "Exponential": expon,
746
- "Gamma": gamma
747
- }
748
- params = dist_functions[selected_dist].fit(df[numeric_col].dropna())
749
- st.write(f"Fitted Parameters: {params}")
750
-
751
- # Data Profiling Section
752
- with st.expander("📝 Generate Full Data Profile", expanded=False):
753
- if st.button("🚀 Generate Comprehensive Report"):
754
- with st.spinner("Generating report..."):
755
- pr = ProfileReport(df, explorative=True)
756
- st_profile_report(pr)
 
 
757
 
758
  # Model Training Section
759
  elif app_mode == "Model Training":
 
519
  Uncover hidden patterns and relationships in your data.
520
  """)
521
 
522
+ if 'cleaned_data' not in st.session_state or st.session_state.cleaned_data is None:
523
+ st.warning("Please clean your data in the Smart Cleaning section first.")
524
+ st.stop()
525
+
526
+ df = st.session_state.cleaned_data.copy()
527
+
528
+ # Initialize session state for EDA configuration
529
+ if 'eda_config' not in st.session_state:
530
+ st.session_state.eda_config = {
531
+ 'plot_type': "Histogram",
532
+ 'x_col': df.columns[0] if len(df.columns) > 0 else None,
533
+ 'y_col': df.columns[1] if len(df.columns) > 1 else None,
534
+ 'z_col': df.columns[2] if len(df.columns) > 2 else None,
535
+ 'color_col': None,
536
+ 'size_col': None,
537
+ 'time_col': None,
538
+ 'value_col': None,
539
+ 'scatter_matrix_cols': df.select_dtypes(include=np.number).columns.tolist()[:5],
540
+ 'color_palette': "Viridis",
541
+ 'hover_data_cols': [],
542
+ 'filter_col': None,
543
+ 'filter_options': []
544
+ }
545
 
546
+ # Data Filtering Section
547
+ with st.expander("🔎 Data Filtering", expanded=True):
548
+ st.session_state.eda_config['filter_col'] = st.selectbox(
549
+ "Filter Column",
550
+ [None] + list(df.columns),
551
+ help="Choose a column to filter the data."
552
+ )
 
 
 
 
 
 
 
 
 
 
 
 
553
 
554
+ if st.session_state.eda_config['filter_col']:
555
+ unique_values = df[st.session_state.eda_config['filter_col']].unique()
556
+ st.session_state.eda_config['filter_options'] = st.multiselect(
557
+ "Filter Values",
558
+ unique_values,
559
+ default=unique_values,
560
+ help=f"Select values from '{st.session_state.eda_config['filter_col']}'"
561
  )
562
+ df = df[df[st.session_state.eda_config['filter_col']].isin(
563
+ st.session_state.eda_config['filter_options']
564
+ )]
565
+
566
+ # Visualization Type Selection
567
+ st.sidebar.header("📊 Visualization Configuration")
568
+ plot_types = [
569
+ "Histogram", "Scatter Plot", "Box Plot", "Violin Plot",
570
+ "Correlation Heatmap", "Parallel Coordinates", "Andrews Curves",
571
+ "Pair Plot", "Density Contour", "3D Scatter", "Time Series",
572
+ "Sunburst Chart", "Funnel Chart", "Clustering Analysis"
573
+ ]
574
+ st.session_state.eda_config['plot_type'] = st.sidebar.selectbox(
575
+ "Choose Visualization",
576
+ plot_types,
577
+ index=0
578
+ )
579
 
580
+ # Dynamic Controls Based on Plot Type
581
+ if st.session_state.eda_config['plot_type'] != "Correlation Heatmap":
582
+ st.session_state.eda_config['x_col'] = st.sidebar.selectbox(
583
+ "X Axis",
584
+ df.columns,
585
+ index=df.columns.get_loc(st.session_state.eda_config['x_col'])
586
+ if st.session_state.eda_config['x_col'] in df.columns else 0
587
+ )
 
 
 
588
 
589
+ if st.session_state.eda_config['plot_type'] in ["Scatter Plot", "Box Plot", "Violin Plot", "Time Series", "3D Scatter", "Histogram"]:
590
+ st.session_state.eda_config['y_col'] = st.sidebar.selectbox(
591
+ "Y Axis",
592
+ df.columns,
593
+ index=df.columns.get_loc(st.session_state.eda_config['y_col'])
594
+ if st.session_state.eda_config['y_col'] in df.columns else 0
 
 
 
 
 
 
595
  )
596
 
597
+ if st.session_state.eda_config['plot_type'] == "3D Scatter":
598
+ st.session_state.eda_config['z_col'] = st.sidebar.selectbox(
599
+ "Z Axis",
600
+ df.columns,
601
+ index=df.columns.get_loc(st.session_state.eda_config['z_col'])
602
+ if st.session_state.eda_config['z_col'] in df.columns else 0
603
+ )
604
+ st.session_state.eda_config['color_col'] = st.sidebar.selectbox(
605
+ "Color by",
606
+ [None] + list(df.columns)
607
+ )
608
 
609
+ # Advanced Plot Customization
610
+ with st.expander("🎨 Advanced Customization", expanded=False):
611
+ st.session_state.eda_config['color_palette'] = st.selectbox(
612
+ "Color Palette",
613
+ ["Viridis", "Plasma", "Magma", "Cividis", "RdBu", "Rainbow"]
614
+ )
615
+ st.session_state.eda_config['hover_data_cols'] = st.multiselect(
616
+ "Hover Data",
617
+ df.columns
618
+ )
619
 
620
+ # Plot Generation
621
+ try:
622
+ fig = None
623
+ config = st.session_state.eda_config
624
+
625
+ if config['plot_type'] == "Histogram":
626
+ color_palette = config['color_palette']
627
+ colors = getattr(pc.sequential, color_palette)
628
+ fig = px.histogram(
629
+ df, x=config['x_col'], y=config['y_col'],
630
+ nbins=30, template="plotly_dark",
631
+ color=config['x_col'],
632
+ color_discrete_sequence = [colors[0]]
633
  )
634
 
635
+ elif config['plot_type'] == "Scatter Plot":
636
+ fig = px.scatter(
637
+ df, x=config['x_col'], y=config['y_col'],
638
+ color=config['color_col'],
639
+ size=config['size_col'],
640
+ hover_data=config['hover_data_cols']
641
  )
 
 
 
 
 
 
 
 
 
642
 
643
+ elif config['plot_type'] == "3D Scatter":
644
+ fig = px.scatter_3d(
645
+ df, x=config['x_col'], y=config['y_col'], z=config['z_col'],
646
+ color=config['color_col'],
647
+ color_discrete_sequence=[config['color_palette']]
648
+ )
 
 
 
 
 
 
 
 
 
 
 
649
 
650
+ elif config['plot_type'] == "Correlation Heatmap":
651
+ numeric_df = df.select_dtypes(include=np.number)
652
+ if not numeric_df.empty:
653
+ corr = numeric_df.corr()
654
+ fig = px.imshow(
655
+ corr, text_auto=True,
656
+ color_continuous_scale=config['color_palette']
657
  )
658
+ else:
659
+ st.warning("No numerical columns found for correlation heatmap.")
660
 
661
+ elif config['plot_type'] == "Box Plot":
662
+ fig = px.box(
663
+ df, x=config['x_col'], y=config['y_col'],
664
+ color=config['color_col']
665
+ )
 
 
 
 
 
 
 
 
 
 
 
666
 
667
+ elif config['plot_type'] == "Violin Plot":
668
+ fig = px.violin(
669
+ df, x=config['x_col'], y=config['y_col'],
670
+ box=True, points="all",
671
+ color=config['color_col']
672
+ )
673
 
674
+ elif config['plot_type'] == "Time Series":
675
+ df = df.sort_values(by=config['time_col'])
676
+ fig = px.line(
677
+ df, x=config['time_col'], y=config['value_col'],
678
+ color=config['color_col']
679
+ )
680
 
681
+ elif config['plot_type'] == "Scatter Matrix":
682
+ fig = px.scatter_matrix(
683
+ df, dimensions=config['scatter_matrix_cols'],
684
+ color=config['color_col']
685
+ )
686
 
687
+ if fig:
688
+ st.plotly_chart(fig, use_container_width=True)
689
+ except Exception as e:
690
+ st.error(f"An error occurred while generating the plot: {e}")
691
+
692
+ # Statistical Analysis Section
693
+ with st.expander("📊 Statistical Analysis", expanded=True):
694
+ analysis_type = st.selectbox("Select Analysis Type", [
695
+ "Descriptive Statistics",
696
+ "Correlation Analysis",
697
+ "Hypothesis Testing",
698
+ "Distribution Fitting"
699
+ ])
700
+
701
+ if analysis_type == "Descriptive Statistics":
702
+ st.write(df.describe(include='all'))
703
+
704
+ elif analysis_type == "Correlation Analysis":
705
+ numeric_cols = df.select_dtypes(include=np.number).columns
706
+ if len(numeric_cols) >= 2:
707
+ corr_method = st.selectbox("Correlation Method", [
708
+ "Pearson", "Kendall", "Spearman"
709
+ ])
710
+ corr_matrix = df[numeric_cols].corr(method=corr_method.lower())
711
+ st.write(corr_matrix)
712
+ st.heatmap(corr_matrix, annot=True, cmap=config['color_palette'])
713
+ else:
714
+ st.warning("Need at least 2 numeric columns for correlation analysis")
715
 
716
+ elif analysis_type == "Hypothesis Testing":
717
+ test_type = st.selectbox("Select Test Type", [
718
+ "T-test", "Chi-Squared Test", "ANOVA", "Mann-Whitney U"
 
 
 
 
719
  ])
720
+ if test_type == "T-test":
721
+ col1 = st.selectbox("Column 1 (Numeric)", df.select_dtypes(include=np.number).columns)
722
+ col2 = st.selectbox("Column 2 (Categorical)", df.select_dtypes(include='object').columns)
723
+ if st.button("Run T-test"):
724
+ groups = df.groupby(col2)[col1].apply(list)
725
+ if len(groups) == 2:
726
+ t_stat, p_value = stats.ttest_ind(groups.iloc[0], groups.iloc[1])
727
+ st.write(f"T-statistic: {t_stat:.4f}")
728
+ st.write(f"P-value: {p_value:.4f}")
729
+ if p_value < 0.05:
730
+ st.write("Reject the null hypothesis.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
731
  else:
732
+ st.write("Fail to reject the null hypothesis.")
733
+ else:
734
+ st.write("Select a categorical column with exactly two categories.")
735
+
736
+ elif analysis_type == "Distribution Fitting":
737
+ numeric_col = st.selectbox("Select Numeric Column", df.select_dtypes(include=np.number).columns)
738
+ dist_types = ["Normal", "Log-Normal", "Exponential", "Gamma"]
739
+ selected_dist = st.selectbox("Select Distribution Type", dist_types)
740
+ if st.button("Fit Distribution"):
741
+ from scipy.stats import norm, lognorm, expon, gamma
742
+ dist_functions = {
743
+ "Normal": norm,
744
+ "Log-Normal": lognorm,
745
+ "Exponential": expon,
746
+ "Gamma": gamma
747
+ }
748
+ params = dist_functions[selected_dist].fit(df[numeric_col].dropna())
749
+ st.write(f"Fitted Parameters: {params}")
750
+
751
+ # Data Profiling Section
752
+ with st.expander("📝 Generate Full Data Profile", expanded=False):
753
+ if st.button("🚀 Generate Comprehensive Report"):
754
+ with st.spinner("Generating report..."):
755
+ pr = ProfileReport(df, explorative=True)
756
+ st_profile_report(pr)
757
 
758
  # Model Training Section
759
  elif app_mode == "Model Training":