CosmickVisions committed on
Commit
79b3b0f
·
verified ·
1 Parent(s): b065a25

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +148 -213
app.py CHANGED
@@ -519,241 +519,176 @@ elif app_mode == "Advanced EDA":
519
  Uncover hidden patterns and relationships in your data.
520
  """)
521
 
522
- if 'cleaned_data' not in st.session_state or st.session_state.cleaned_data is None:
523
- st.warning("Please clean your data in the Smart Cleaning section first.")
524
- st.stop()
525
-
526
- df = st.session_state.cleaned_data.copy()
527
-
528
- # Initialize session state for EDA configuration
529
- if 'eda_config' not in st.session_state:
530
- st.session_state.eda_config = {
531
- 'plot_type': "Histogram",
532
- 'x_col': df.columns[0] if len(df.columns) > 0 else None,
533
- 'y_col': df.columns[1] if len(df.columns) > 1 else None,
534
- 'z_col': df.columns[2] if len(df.columns) > 2 else None,
535
- 'color_col': None,
536
- 'size_col': None,
537
- 'time_col': None,
538
- 'value_col': None,
539
- 'scatter_matrix_cols': df.select_dtypes(include=np.number).columns.tolist()[:5],
540
- 'color_palette': "Viridis",
541
- 'hover_data_cols': [],
542
- 'filter_col': None,
543
- 'filter_options': []
544
- }
545
 
546
- # Data Filtering Section
547
- with st.expander("🔎 Data Filtering", expanded=True):
548
- st.session_state.eda_config['filter_col'] = st.selectbox(
549
- "Filter Column",
550
- [None] + list(df.columns),
551
- help="Choose a column to filter the data."
552
- )
 
 
 
 
 
 
 
 
 
 
 
 
553
 
554
- if st.session_state.eda_config['filter_col']:
555
- unique_values = df[st.session_state.eda_config['filter_col']].unique()
556
- st.session_state.eda_config['filter_options'] = st.multiselect(
557
- "Filter Values",
558
- unique_values,
559
- default=unique_values,
560
- help=f"Select values from '{st.session_state.eda_config['filter_col']}'"
561
  )
562
- df = df[df[st.session_state.eda_config['filter_col']].isin(
563
- st.session_state.eda_config['filter_options']
564
- )]
565
-
566
- # Visualization Type Selection
567
- st.sidebar.header("📊 Visualization Configuration")
568
- plot_types = [
569
- "Histogram", "Scatter Plot", "Box Plot", "Violin Plot",
570
- "Correlation Heatmap", "Parallel Coordinates", "Andrews Curves",
571
- "Pair Plot", "Density Contour", "3D Scatter", "Time Series",
572
- "Sunburst Chart", "Funnel Chart", "Clustering Analysis"
573
- ]
574
- st.session_state.eda_config['plot_type'] = st.sidebar.selectbox(
575
- "Choose Visualization",
576
- plot_types,
577
- index=0
578
- )
579
 
580
- # Dynamic Controls Based on Plot Type
581
- if st.session_state.eda_config['plot_type'] != "Correlation Heatmap":
582
- st.session_state.eda_config['x_col'] = st.sidebar.selectbox(
583
- "X Axis",
584
- df.columns,
585
- index=df.columns.get_loc(st.session_state.eda_config['x_col'])
586
- if st.session_state.eda_config['x_col'] in df.columns else 0
587
- )
588
 
589
- if st.session_state.eda_config['plot_type'] in ["Scatter Plot", "Box Plot", "Violin Plot", "Time Series", "3D Scatter", "Histogram"]:
590
- st.session_state.eda_config['y_col'] = st.sidebar.selectbox(
591
- "Y Axis",
592
- df.columns,
593
- index=df.columns.get_loc(st.session_state.eda_config['y_col'])
594
- if st.session_state.eda_config['y_col'] in df.columns else 0
595
- )
 
 
596
 
597
- if st.session_state.eda_config['plot_type'] == "3D Scatter":
598
- st.session_state.eda_config['z_col'] = st.sidebar.selectbox(
599
- "Z Axis",
600
- df.columns,
601
- index=df.columns.get_loc(st.session_state.eda_config['z_col'])
602
- if st.session_state.eda_config['z_col'] in df.columns else 0
603
- )
604
- st.session_state.eda_config['color_col'] = st.sidebar.selectbox(
605
- "Color by",
606
- [None] + list(df.columns)
 
 
607
  )
608
 
609
- # Advanced Plot Customization
610
- with st.expander("🎨 Advanced Customization", expanded=False):
611
- st.session_state.eda_config['color_palette'] = st.selectbox(
612
- "Color Palette",
613
- ["Viridis", "Plasma", "Magma", "Cividis", "RdBu", "Rainbow"]
614
- )
615
- st.session_state.eda_config['hover_data_cols'] = st.multiselect(
616
- "Hover Data",
617
- df.columns
618
- )
619
 
620
- # Plot Generation
621
- try:
622
- fig = None
623
- config = st.session_state.eda_config
624
-
625
- if config['plot_type'] == "Histogram":
626
- color_palette = config['color_palette']
627
- colors = getattr(pc.sequential, color_palette)
628
- fig = px.histogram(
629
- df, x=config['x_col'], y=config['y_col'],
630
- nbins=30, template="plotly_dark",
631
- color=config['x_col'],
632
- color_discrete_sequence = [colors[0]]
633
  )
634
 
635
- elif config['plot_type'] == "Scatter Plot":
636
- fig = px.scatter(
637
- df, x=config['x_col'], y=config['y_col'],
638
- color=config['color_col'],
639
- size=config['size_col'],
640
- hover_data=config['hover_data_cols']
 
 
 
 
641
  )
642
 
643
- elif config['plot_type'] == "3D Scatter":
644
- fig = px.scatter_3d(
645
- df, x=config['x_col'], y=config['y_col'], z=config['z_col'],
646
- color=config['color_col'],
647
- color_discrete_sequence=[config['color_palette']]
648
  )
 
 
 
 
 
 
 
 
 
649
 
650
- elif config['plot_type'] == "Correlation Heatmap":
651
- numeric_df = df.select_dtypes(include=np.number)
652
- if not numeric_df.empty:
653
- corr = numeric_df.corr()
654
- fig = px.imshow(
655
- corr, text_auto=True,
656
- color_continuous_scale=config['color_palette']
 
657
  )
658
- else:
659
- st.warning("No numerical columns found for correlation heatmap.")
660
 
661
- elif config['plot_type'] == "Box Plot":
662
- fig = px.box(
663
- df, x=config['x_col'], y=config['y_col'],
664
- color=config['color_col']
665
- )
 
 
666
 
667
- elif config['plot_type'] == "Violin Plot":
668
- fig = px.violin(
669
- df, x=config['x_col'], y=config['y_col'],
670
- box=True, points="all",
671
- color=config['color_col']
672
- )
673
 
674
- elif config['plot_type'] == "Time Series":
675
- df = df.sort_values(by=config['time_col'])
676
- fig = px.line(
677
- df, x=config['time_col'], y=config['value_col'],
678
- color=config['color_col']
679
- )
 
 
 
 
680
 
681
- elif config['plot_type'] == "Scatter Matrix":
682
- fig = px.scatter_matrix(
683
- df, dimensions=config['scatter_matrix_cols'],
684
- color=config['color_col']
685
- )
686
 
687
- if fig:
688
- st.plotly_chart(fig, use_container_width=True)
689
- except Exception as e:
690
- st.error(f"An error occurred while generating the plot: {e}")
691
-
692
- # Statistical Analysis Section
693
- with st.expander("📊 Statistical Analysis", expanded=True):
694
- analysis_type = st.selectbox("Select Analysis Type", [
695
- "Descriptive Statistics",
696
- "Correlation Analysis",
697
- "Hypothesis Testing",
698
- "Distribution Fitting"
699
- ])
700
-
701
- if analysis_type == "Descriptive Statistics":
702
- st.write(df.describe(include='all'))
703
-
704
- elif analysis_type == "Correlation Analysis":
705
- numeric_cols = df.select_dtypes(include=np.number).columns
706
- if len(numeric_cols) >= 2:
707
- corr_method = st.selectbox("Correlation Method", [
708
- "Pearson", "Kendall", "Spearman"
709
- ])
710
- corr_matrix = df[numeric_cols].corr(method=corr_method.lower())
711
- st.write(corr_matrix)
712
- st.heatmap(corr_matrix, annot=True, cmap=config['color_palette'])
713
- else:
714
- st.warning("Need at least 2 numeric columns for correlation analysis")
715
 
716
- elif analysis_type == "Hypothesis Testing":
717
- test_type = st.selectbox("Select Test Type", [
718
- "T-test", "Chi-Squared Test", "ANOVA", "Mann-Whitney U"
719
- ])
720
- if test_type == "T-test":
721
- col1 = st.selectbox("Column 1 (Numeric)", df.select_dtypes(include=np.number).columns)
722
- col2 = st.selectbox("Column 2 (Categorical)", df.select_dtypes(include='object').columns)
723
- if st.button("Run T-test"):
724
- groups = df.groupby(col2)[col1].apply(list)
725
- if len(groups) == 2:
726
- t_stat, p_value = stats.ttest_ind(groups.iloc[0], groups.iloc[1])
727
- st.write(f"T-statistic: {t_stat:.4f}")
728
- st.write(f"P-value: {p_value:.4f}")
729
- if p_value < 0.05:
730
- st.write("Reject the null hypothesis.")
731
- else:
732
- st.write("Fail to reject the null hypothesis.")
733
- else:
734
- st.write("Select a categorical column with exactly two categories.")
735
-
736
- elif analysis_type == "Distribution Fitting":
737
- numeric_col = st.selectbox("Select Numeric Column", df.select_dtypes(include=np.number).columns)
738
- dist_types = ["Normal", "Log-Normal", "Exponential", "Gamma"]
739
- selected_dist = st.selectbox("Select Distribution Type", dist_types)
740
- if st.button("Fit Distribution"):
741
- from scipy.stats import norm, lognorm, expon, gamma
742
- dist_functions = {
743
- "Normal": norm,
744
- "Log-Normal": lognorm,
745
- "Exponential": expon,
746
- "Gamma": gamma
747
- }
748
- params = dist_functions[selected_dist].fit(df[numeric_col].dropna())
749
- st.write(f"Fitted Parameters: {params}")
750
-
751
- # Data Profiling Section
752
- with st.expander("📝 Generate Full Data Profile", expanded=False):
753
- if st.button("🚀 Generate Comprehensive Report"):
754
- with st.spinner("Generating report..."):
755
- pr = ProfileReport(df, explorative=True)
756
- st_profile_report(pr)
757
 
758
  # Model Training Section
759
  elif app_mode == "Model Training":
 
519
  Uncover hidden patterns and relationships in your data.
520
  """)
521
 
522
+ if 'cleaned_data' not in st.session_state or st.session_state.cleaned_data is None:
523
+ st.warning("Please clean your data in the Smart Cleaning section first.")
524
+ st.stop()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
525
 
526
+ df = st.session_state.cleaned_data.copy()
527
+
528
+ # Initialize session state for EDA configuration
529
+ if 'eda_config' not in st.session_state:
530
+ st.session_state.eda_config = {
531
+ 'plot_type': "Histogram",
532
+ 'x_col': df.columns[0] if len(df.columns) > 0 else None,
533
+ 'y_col': df.columns[1] if len(df.columns) > 1 else None,
534
+ 'z_col': df.columns[2] if len(df.columns) > 2 else None,
535
+ 'color_col': None,
536
+ 'size_col': None,
537
+ 'time_col': None,
538
+ 'value_col': None,
539
+ 'scatter_matrix_cols': df.select_dtypes(include=np.number).columns.tolist()[:5],
540
+ 'color_palette': "Viridis",
541
+ 'hover_data_cols': [],
542
+ 'filter_col': None,
543
+ 'filter_options': []
544
+ }
545
 
546
+ # Data Filtering Section
547
+ with st.expander("🔎 Data Filtering", expanded=True):
548
+ st.session_state.eda_config['filter_col'] = st.selectbox(
549
+ "Filter Column",
550
+ [None] + list(df.columns),
551
+ help="Choose a column to filter the data."
 
552
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
553
 
554
+ if st.session_state.eda_config['filter_col']:
555
+ unique_values = df[st.session_state.eda_config['filter_col']].unique()
 
 
 
 
 
 
556
 
557
+ st.session_state.eda_config['filter_options'] = st.multiselect(
558
+ "Filter Values",
559
+ unique_values,
560
+ default=unique_values,
561
+ help=f"Select values from '{st.session_state.eda_config['filter_col']}'"
562
+ )
563
+ df = df[df[st.session_state.eda_config['filter_col']].isin(
564
+ st.session_state.eda_config['filter_options']
565
+ )]
566
 
567
+ # Visualization Type Selection
568
+ st.sidebar.header("📊 Visualization Configuration")
569
+ plot_types = [
570
+ "Histogram", "Scatter Plot", "Box Plot", "Violin Plot",
571
+ "Correlation Heatmap", "Parallel Coordinates", "Andrews Curves",
572
+ "Pair Plot", "Density Contour", "3D Scatter", "Time Series",
573
+ "Sunburst Chart", "Funnel Chart", "Clustering Analysis"
574
+ ]
575
+ st.session_state.eda_config['plot_type'] = st.sidebar.selectbox(
576
+ "Choose Visualization",
577
+ plot_types,
578
+ index=0
579
  )
580
 
581
+ # Dynamic Controls Based on Plot Type
582
+ if st.session_state.eda_config['plot_type'] != "Correlation Heatmap":
583
+ st.session_state.eda_config['x_col'] = st.sidebar.selectbox(
584
+ "X Axis",
585
+ df.columns,
586
+ index=df.columns.get_loc(st.session_state.eda_config['x_col'])
587
+ if st.session_state.eda_config['x_col'] in df.columns else 0
588
+ )
 
 
589
 
590
+ if st.session_state.eda_config['plot_type'] in ["Scatter Plot", "Box Plot", "Violin Plot", "Time Series", "3D Scatter", "Histogram"]:
591
+ st.session_state.eda_config['y_col'] = st.sidebar.selectbox(
592
+ "Y Axis",
593
+ df.columns,
594
+ index=df.columns.get_loc(st.session_state.eda_config['y_col'])
595
+ if st.session_state.eda_config['y_col'] in df.columns else 0
 
 
 
 
 
 
 
596
  )
597
 
598
+ if st.session_state.eda_config['plot_type'] == "3D Scatter":
599
+ st.session_state.eda_config['z_col'] = st.sidebar.selectbox(
600
+ "Z Axis",
601
+ df.columns,
602
+ index=df.columns.get_loc(st.session_state.eda_config['z_col'])
603
+ if st.session_state.eda_config['z_col'] in df.columns else 0
604
+ )
605
+ st.session_state.eda_config['color_col'] = st.sidebar.selectbox(
606
+ "Color by",
607
+ [None] + list(df.columns)
608
  )
609
 
610
+ # Advanced Plot Customization
611
+ with st.expander("🎨 Advanced Customization", expanded=False):
612
+ st.session_state.eda_config['color_palette'] = st.selectbox(
613
+ "Color Palette",
614
+ ["Viridis", "Plasma", "Magma", "Cividis", "RdBu", "Rainbow"]
615
  )
616
+ st.session_state.eda_config['hover_data_cols'] = st.multiselect(
617
+ "Hover Data",
618
+ df.columns
619
+ )
620
+
621
+ # Plot Generation
622
+ try:
623
+ fig = None
624
+ config = st.session_state.eda_config
625
 
626
+ if config['plot_type'] == "Histogram":
627
+ color_palette = config['color_palette']
628
+ colors = getattr(pc.sequential, color_palette)
629
+ fig = px.histogram(
630
+ df, x=config['x_col'], y=config['y_col'],
631
+ nbins=30, template="plotly_dark",
632
+ color=config['x_col'],
633
+ color_discrete_sequence = [colors[0]]
634
  )
 
 
635
 
636
+ elif config['plot_type'] == "Scatter Plot":
637
+ fig = px.scatter(
638
+ df, x=config['x_col'], y=config['y_col'],
639
+ color=config['color_col'],
640
+ size=config['size_col'],
641
+ hover_data=config['hover_data_cols']
642
+ )
643
 
644
+ elif config['plot_type'] == "3D Scatter":
645
+ fig = px.scatter_3d(
646
+ df, x=config['x_col'], y=config['y_col'], z=config['z_col'],
647
+ color=config['color_col'],
648
+ color_discrete_sequence=[config['color_palette']]
649
+ )
650
 
651
+ elif config['plot_type'] == "Correlation Heatmap":
652
+ numeric_df = df.select_dtypes(include=np.number)
653
+ if not numeric_df.empty:
654
+ corr = numeric_df.corr()
655
+ fig = px.imshow(
656
+ corr, text_auto=True,
657
+ color_continuous_scale=config['color_palette']
658
+ )
659
+ else:
660
+ st.warning("No numerical columns found for correlation heatmap.")
661
 
662
+ elif config['plot_type'] == "Box Plot":
663
+ fig = px.box(
664
+ df, x=config['x_col'], y=config['y_col'],
665
+ color=config['color_col']
666
+ )
667
 
668
+ elif config['plot_type'] == "Violin Plot":
669
+ fig = px.violin(
670
+ df, x=config['x_col'], y=config['y_col'],
671
+ box=True, points="all",
672
+ color=config['color_col']
673
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
674
 
675
+ elif config['plot_type'] == "Time Series":
676
+ df = df.sort_values(by=config['time_col'])
677
+ fig = px.line(
678
+ df, x=config['time_col'], y=config['value_col'],
679
+ color=config['color_col']
680
+ )
681
+
682
+ elif config['plot_type'] == "Scatter Matrix":
683
+ fig = px.scatter_matrix(
684
+ df, dimensions=config['scatter_matrix_cols'],
685
+ color=config['color_col']
686
+ )
687
+
688
+ if fig:
689
+ st.plotly_chart(fig, use_container_width=True)
690
+ except Exception as e:
691
+ st.error(f"An error occurred while generating the plot: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
692
 
693
  # Model Training Section
694
  elif app_mode == "Model Training":