CosmickVisions committed
Commit 98ce78a · verified · 1 Parent(s): f1cf7f1

Update app.py

Files changed (1):
  1. app.py +237 -251
app.py CHANGED
@@ -27,103 +27,6 @@ st.set_page_config(
  )
 
 
- # HTML and CSS for the draggable button
- html_code = """
- <style>
- #floatingButton {
- position: fixed;
- bottom: 20px;
- right: 20px;
- width: 60px;
- height: 60px;
- background-color: #007bff;
- color: white;
- border: none;
- border-radius: 50%;
- cursor: pointer;
- font-size: 24px;
- z-index: 1000;
- }
-
- #floatingButton:active {
- background-color: #0056b3;
- }
-
- .draggable {
- position: absolute;
- cursor: move;
- }
- </style>
- <button id="floatingButton" class="draggable">+</button>
- <script>
- dragElement(document.getElementById("floatingButton"));
-
- function dragElement(elmnt) {
- var pos1 = 0, pos2 = 0, pos3 = 0, pos4 = 0;
- elmnt.onmousedown = dragMouseDown;
-
- function dragMouseDown(e) {
- e = e || window.event;
- e.preventDefault();
- pos3 = e.clientX;
- pos4 = e.clientY;
- document.onmouseup = closeDragElement;
- document.onmousemove = elementDrag;
- }
-
- function elementDrag(e) {
- e = e || window.event;
- e.preventDefault();
- pos1 = pos3 - e.clientX;
- pos2 = e.clientY;
- pos3 = e.clientX;
- pos4 = e.clientY;
- elmnt.style.top = (elmnt.offsetTop - pos2) + "px";
- elmnt.style.left = (elmnt.offsetLeft - pos1) + "px";
- }
-
- function closeDragElement() {
- document.onmouseup = null;
- document.onmousemove = null;
- }
- }
-
- document.getElementById("floatingButton").onclick = function() {
- var expander = document.getElementById("dataExpander");
- if (expander.style.display === "none") {
- expander.style.display = "block";
- } else {
- expander.style.display = "none";
- }
- fetch("/?show_data=true", {method: "POST"});
- }
- </script>
- """
-
- # JavaScript to handle the toggle functionality
- js_code = """
- <script>
- document.addEventListener('DOMContentLoaded', function() {
- var expander = document.createElement('div');
- expander.id = "dataExpander";
- expander.style.display = "none";
- document.body.appendChild(expander);
- });
- </script>
- """
-
- st.markdown(html_code, unsafe_allow_html=True)
- st.markdown(js_code, unsafe_allow_html=True)
-
- # Function to show data in an expander
- def show_data():
- st.session_state.show_data = not st.session_state.show_data # Toggle the state
- if st.session_state.show_data:
- with st.expander("✨ Data Viewport", expanded=True):
- st.dataframe(df, use_container_width=True)
-
- # --------------------------
-
  # --------------------------
  # Custom Styling
  # --------------------------
@@ -152,6 +55,16 @@ if 'model' not in st.session_state:
  # --------------------------
  # Helper Functions
  # --------------------------
+ def enhance_section_title(title, icon="✨"):
+ """Helper function to create a styled section title with an icon."""
+ st.markdown(f"<h2 style='border-bottom: 2px solid #ccc; padding-bottom: 5px;'>{icon} {title}</h2>", unsafe_allow_html=True)
+
+ def update_cleaned_data(df):
+ """Updates the cleaned data in session state."""
+ st.session_state.cleaned_data = df
+ st.session_state.data_versions.append(df.copy()) # Append to history
+ st.success("Action completed successfully!")
+
  def generate_quality_report(df):
  """Generate comprehensive data quality report"""
  report = {
@@ -484,23 +397,29 @@ if app_mode == "Data Upload":
 
  except Exception as e:
  st.error(f"Error loading file: {str(e)}")
-
- elif app_mode == "Data Cleaning":
+
+ # --------------------------
+ # Page Content
+ # --------------------------
+ if st.session_state.get("app_mode") == "Data Cleaning":
  st.title("🧹 Smart Data Cleaning")
-
+
  if st.session_state.raw_data is None:
  st.warning("Please upload data first")
  st.stop()
-
- # Initialize session state for undo functionality
+
+ # Initialize session state (only if it's not already there)
  if 'data_versions' not in st.session_state:
  st.session_state.data_versions = [st.session_state.raw_data.copy()]
-
- df = st.session_state.data_versions[-1].copy()
-
+ if 'cleaned_data' not in st.session_state: #Added a conditional value
+ st.session_state.cleaned_data = st.session_state.raw_data.copy()
+
+ df = st.session_state.cleaned_data.copy()
+
  # --------------------------
  # Data Health Dashboard
  # --------------------------
+ enhance_section_title("Data Health Dashboard", "📊")
  with st.expander("📊 Data Health Dashboard", expanded=True):
  col1, col2, col3 = st.columns(3)
  with col1:
@@ -509,200 +428,227 @@ elif app_mode == "Data Cleaning":
  st.metric("Total Rows", len(df))
  with col3:
  st.metric("Missing Values", df.isna().sum().sum())
-
+
  # Generate quick profile report
  if st.button("Generate Data Health Report"):
  with st.spinner("Analyzing data..."):
  profile = ProfileReport(df, minimal=True)
  st_profile_report(profile)
-
+
  # --------------------------
  # Undo Functionality
  # --------------------------
  if len(st.session_state.data_versions) > 1:
  if st.button("⏮️ Undo Last Action"):
- st.session_state.data_versions.pop()
- df = st.session_state.data_versions[-1].copy()
- st.session_state.cleaned_data = df
+ st.session_state.data_versions.pop() # Remove current version
+ st.session_state.cleaned_data = st.session_state.data_versions[-1].copy() # Set data
  st.success("Last action undone!")
-
+ st.experimental_rerun() #Force re-run after undo
+
  # --------------------------
  # Missing Value Handling
  # --------------------------
+ enhance_section_title("Missing Values Treatment", "🔍")
  with st.expander("🔍 Missing Values Treatment", expanded=True):
  missing_cols = df.columns[df.isna().any()].tolist()
  if missing_cols:
  cols = st.multiselect("Select columns to handle", missing_cols)
  method = st.selectbox("Imputation Method", [
- "Drop Missing",
- "Mean/Median",
+ "Drop Missing",
+ "Mean/Median",
  "Custom Value",
  "Forward Fill",
  "Backward Fill"
  ])
-
+
  if method == "Custom Value":
  custom_val = st.text_input("Enter custom value")
-
- if st.button("Apply Treatment"):
- st.session_state.data_versions.append(df.copy())
+
+ if st.button("Apply Treatment (Missing)"):
  try:
+ new_df = df.copy() # Create a copy to modify
  if method == "Drop Missing":
- df = df.dropna(subset=cols)
+ new_df = new_df.dropna(subset=cols)
  elif method == "Mean/Median":
  for col in cols:
- if pd.api.types.is_numeric_dtype(df[col]):
- df[col] = df[col].fillna(df[col].median())
+ if pd.api.types.is_numeric_dtype(new_df[col]):
+ new_df[col] = new_df[col].fillna(new_df[col].median())
  else:
- df[col] = df[col].fillna(df[col].mode()[0])
+ new_df[col] = new_df[col].fillna(new_df[col].mode()[0])
  elif method == "Custom Value" and custom_val:
  for col in cols:
- df[col] = df[col].fillna(custom_val)
+ new_df[col] = new_df[col].fillna(custom_val)
  elif method == "Forward Fill":
- df[cols] = df[cols].ffill()
+ new_df[cols] = new_df[cols].ffill()
  elif method == "Backward Fill":
- df[cols] = df[cols].bfill()
-
- st.session_state.cleaned_data = df
- st.success("Missing values handled successfully!")
+ new_df[cols] = new_df[cols].bfill()
+
+ update_cleaned_data(new_df)
+ st.experimental_rerun() #Force re-run after apply
+
  except Exception as e:
  st.error(f"Error: {str(e)}")
  else:
  st.success("✨ No missing values found!")
-
+
  # --------------------------
  # Data Type Conversion
  # --------------------------
+ enhance_section_title("Data Type Conversion", "🔄")
  with st.expander("🔄 Data Type Conversion"):
  col_to_convert = st.selectbox("Select column", df.columns)
  new_type = st.selectbox("New data type", [
- "String", "Integer", "Float",
+ "String", "Integer", "Float",
  "Boolean", "Datetime"
  ])
-
+
  if new_type == "Datetime":
  date_format = st.text_input("Date format (e.g. %Y-%m-%d)", "%Y-%m-%d")
-
- if st.button("Convert"):
- st.session_state.data_versions.append(df.copy())
+
+ if st.button("Convert (Data Type)"):
  try:
+ new_df = df.copy()
  if new_type == "String":
- df[col_to_convert] = df[col_to_convert].astype(str)
+ new_df[col_to_convert] = new_df[col_to_convert].astype(str)
  elif new_type == "Integer":
- if df[col_to_convert].dtype == 'object':
+ if new_df[col_to_convert].dtype == 'object':
  st.error("Cannot convert text column to integer!")
  else:
- df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce').astype('Int64')
+ new_df[col_to_convert] = pd.to_numeric(new_df[col_to_convert], errors='coerce').astype('Int64')
  elif new_type == "Float":
- if df[col_to_convert].dtype == 'object':
+ if new_df[col_to_convert].dtype == 'object':
  st.error("Cannot convert text column to float!")
  else:
- df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce')
+ new_df[col_to_convert] = pd.to_numeric(new_df[col_to_convert], errors='coerce')
  elif new_type == "Boolean":
- df[col_to_convert] = df[col_to_convert].astype(bool)
+ new_df[col_to_convert] = new_df[col_to_convert].astype(bool)
  elif new_type == "Datetime":
- df[col_to_convert] = pd.to_datetime(df[col_to_convert], format=date_format, errors='coerce')
-
- st.session_state.cleaned_data = df
- st.success("Conversion successful!")
+ new_df[col_to_convert] = pd.to_datetime(new_df[col_to_convert], format=date_format, errors='coerce')
+
+ update_cleaned_data(new_df)
+ st.experimental_rerun() #Force re-run after apply
  except Exception as e:
  st.error(f"Error: {str(e)}")
-
+
  # --------------------------
  # Drop Columns
  # --------------------------
+ enhance_section_title("Drop Columns", "🗑️")
  with st.expander("🗑️ Drop Columns"):
  columns_to_drop = st.multiselect("Select columns to drop", df.columns)
  if columns_to_drop:
  st.warning(f"Will drop: {', '.join(columns_to_drop)}")
- if st.button("Confirm Drop"):
- st.session_state.data_versions.append(df.copy())
- df = df.drop(columns=columns_to_drop)
- st.session_state.cleaned_data = df
- st.success("Selected columns dropped successfully!")
-
+ if st.button("Confirm Drop (Columns)"):
+ new_df = df.copy()
+ new_df = new_df.drop(columns=columns_to_drop)
+ update_cleaned_data(new_df)
+ st.experimental_rerun() #Force re-run after apply
+
  # --------------------------
  # Label Encoding
  # --------------------------
+ enhance_section_title("Label Encoding", "🔢")
  with st.expander("🔢 Label Encoding"):
  data_to_encode = st.multiselect("Select categorical columns to encode", df.select_dtypes(include='object').columns)
  if data_to_encode:
- if st.button("Apply Label Encoding"):
- st.session_state.data_versions.append(df.copy())
+ if st.button("Apply Label Encoding (Encoding)"):
+ new_df = df.copy()
  label_encoders = {}
  for col in data_to_encode:
  le = LabelEncoder()
- df[col] = le.fit_transform(df[col].astype(str))
+ new_df[col] = le.fit_transform(new_df[col].astype(str))
  label_encoders[col] = le
- st.session_state.cleaned_data = df
- st.success("Label encoding applied successfully!")
-
+ update_cleaned_data(new_df)
+ st.experimental_rerun() #Force re-run after apply
+
  # --------------------------
  # StandardScaler
  # --------------------------
+ enhance_section_title("StandardScaler", "📏")
  with st.expander("📏 StandardScaler"):
  scale_cols = st.multiselect("Select numeric columns to scale", df.select_dtypes(include=np.number).columns)
  if scale_cols:
- if st.button("Apply StandardScaler"):
- st.session_state.data_versions.append(df.copy())
+ if st.button("Apply StandardScaler (Scaling)"):
  try:
+ new_df = df.copy()
  scaler = StandardScaler()
- df[scale_cols] = scaler.fit_transform(df[scale_cols])
- st.session_state.cleaned_data = df
- st.success("Standard scaling applied successfully!")
+ new_df[scale_cols] = scaler.fit_transform(new_df[scale_cols])
+ update_cleaned_data(new_df)
+ st.experimental_rerun() #Force re-run after apply
  except Exception as e:
  st.error(f"Error: {str(e)}")
-
+
  # --------------------------
  # Pattern-Based Cleaning
  # --------------------------
+ enhance_section_title("Pattern-Based Cleaning", "🕵️")
  with st.expander("🕵️ Pattern-Based Cleaning"):
  selected_col = st.selectbox("Select text column", df.select_dtypes(include='object').columns)
  pattern = st.text_input("Regex pattern (e.g. \d+ for numbers)")
  replacement = st.text_input("Replacement value")
-
- if st.button("Apply Pattern Replacement"):
- st.session_state.data_versions.append(df.copy())
+
+ if st.button("Apply Pattern Replacement (Replace)"):
  try:
- df[selected_col] = df[selected_col].str.replace(pattern, replacement, regex=True)
- st.session_state.cleaned_data = df
- st.success("Pattern replacement applied successfully!")
+ new_df = df.copy()
+ new_df[selected_col] = new_df[selected_col].str.replace(pattern, replacement, regex=True)
+ update_cleaned_data(new_df)
+ st.experimental_rerun() #Force re-run after apply
  except Exception as e:
  st.error(f"Error: {str(e)}")
-
+
  # --------------------------
  # Bulk Operations
  # --------------------------
+ enhance_section_title("Bulk Actions", "🚀")
  with st.expander("🚀 Bulk Actions"):
- if st.button("Auto-Clean Common Issues"):
- st.session_state.data_versions.append(df.copy())
- df = df.dropna(axis=1, how='all') # Remove empty cols
- df = df.convert_dtypes() # Better type inference
- text_cols = df.select_dtypes(include='object').columns
- df[text_cols] = df[text_cols].apply(lambda x: x.str.strip())
- st.session_state.cleaned_data = df
- st.success("Bulk cleaning completed!")
-
+ if st.button("Auto-Clean Common Issues (Cleaning)"):
+ new_df = df.copy()
+ new_df = new_df.dropna(axis=1, how='all') # Remove empty cols
+ new_df = new_df.convert_dtypes() # Better type inference
+ text_cols = new_df.select_dtypes(include='object').columns
+ new_df[text_cols] = new_df[text_cols].apply(lambda x: x.str.strip())
+ update_cleaned_data(new_df)
+ st.experimental_rerun() #Force re-run after apply
+
  # --------------------------
  # Cleaned Data Preview
  # --------------------------
- if st.session_state.cleaned_data is not None:
+ if st.session_state.get("cleaned_data") is not None:
+ enhance_section_title("Cleaned Data Preview", "✨")
  with st.expander("✨ Cleaned Data Preview", expanded=True):
- st.dataframe(st.session_state.cleaned_data.head(), use_container_width=True)
+ st.dataframe(st.session_state.cleaned_data.head(), use_container_width=True)
 
 
- elif app_mode == "EDA":
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ import plotly.express as px
+ from scipy import stats # For statistical tests
+ from pandas_profiling import ProfileReport # Automated EDA (if you have it installed)
+
+ # --------------------------
+ # Helper Functions
+ # --------------------------
+ def enhance_section_title(title, icon="✨"):
+ """Helper function to create a styled section title with an icon."""
+ st.markdown(f"<h2 style='border-bottom: 2px solid #ccc; padding-bottom: 5px;'>{icon} {title}</h2>", unsafe_allow_html=True)
+
+ # --------------------------
+ # Page Content
+ # --------------------------
+ if st.session_state.get("app_mode") == "EDA":
  st.title("🔍 Interactive Data Explorer")
-
+
  if st.session_state.cleaned_data is None:
  st.warning("Please clean your data first")
  st.stop()
-
+
  df = st.session_state.cleaned_data
-
+
  # --------------------------
  # Enhanced Data Overview
  # --------------------------
+ enhance_section_title("Dataset Overview", "📁")
  with st.expander("📁 Dataset Overview", expanded=True):
  col1, col2, col3, col4 = st.columns(4)
  with col1:
@@ -715,7 +661,7 @@ elif app_mode == "EDA":
  with col4:
  dupes = df.duplicated().sum()
  st.metric("Duplicates", dupes, help="Fully duplicated rows")
-
+
  # Data Preview Tabs
  tab1, tab2, tab3 = st.tabs(["Quick Preview", "Column Types", "Missing Matrix"])
  with tab1:
@@ -727,16 +673,17 @@ elif app_mode == "EDA":
  with tab3:
  fig = px.imshow(df.isna(), color_continuous_scale='gray')
  st.plotly_chart(fig, use_container_width=True)
-
+
  # --------------------------
  # Smart Visualization Builder
  # --------------------------
+ enhance_section_title("Visualization Builder", "📊")
  st.subheader("📊 Visualization Builder")
-
+
  # Automatic plot type suggestions
  numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
  categorical_cols = df.select_dtypes(exclude=np.number).columns.tolist()
-
+
  col1, col2 = st.columns([1, 3])
  with col1:
  # Dynamic plot type filtering
@@ -752,81 +699,110 @@ elif app_mode == "EDA":
  index=0,
  help="Automatically filtered based on data types"
  )
-
- # Dynamic axis selection
- x_axis = st.selectbox("X-axis", df.columns,
- help="Primary dimension for analysis")
- y_axis = st.selectbox("Y-axis", [None] + df.columns.tolist(),
- disabled=plot_type in ["Histogram", "Bar Chart"],
- help="Secondary dimension for analysis")
-
- # Smart color encoding
- color_options = ["None"] + df.columns.tolist()
- color_by = st.selectbox("Color encoding", color_options,
- format_func=lambda x: "No color" if x == "None" else x)
-
- # Context-aware controls
- if plot_type in ["3D Scatter", "Parallel Categories"]:
- z_axis = st.selectbox("Z-axis", [None] + df.columns.tolist())
+
+ # Axis selection - conditionally displayed
+ x_axis = None
+ y_axis = None
+ z_axis = None
+ color_by = "None" # Default color to None
+
+ if plot_type not in ["Correlation Matrix", "Pair Plot"]:
+ x_axis = st.selectbox("X-axis", df.columns, help="Primary dimension for analysis")
+
+ if plot_type in ["Scatter Plot", "Box Plot", "Violin Plot", "Line Chart", "Heatmap"]:
+ y_axis = st.selectbox("Y-axis", df.columns, help="Secondary dimension for analysis")
+
+ if plot_type == "3D Scatter":
+ z_axis = st.selectbox("Z-axis", df.columns, help="Third dimension for analysis")
+
+ # Color encoding
+ if plot_type not in ["Correlation Matrix", "Pair Plot"]:
+ color_options = ["None"] + df.columns.tolist()
+ color_by = st.selectbox("Color encoding", color_options,
+ format_func=lambda x: "No color" if x == "None" else x)
+
+ # Context-aware controls for Parallel Categories
+ dimensions = None
  if plot_type == "Parallel Categories":
  dimensions = st.multiselect("Dimensions", df.columns.tolist(), default=df.columns[:3])
-
+
  with col2:
  try:
- # Generate appropriate visualization
+ fig = None # Initialize fig to None
+
+ # Generate appropriate visualization with input validation
  if plot_type == "Scatter Plot":
- fig = px.scatter(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None,
- hover_data=df.columns, trendline="lowess")
+ if x_axis and y_axis:
+ fig = px.scatter(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None,
+ hover_data=df.columns, trendline="lowess")
  elif plot_type == "Histogram":
- fig = px.histogram(df, x=x_axis, color=color_by if color_by != "None" else None,
- nbins=30, marginal="box")
+ if x_axis:
+ fig = px.histogram(df, x=x_axis, color=color_by if color_by != "None" else None,
+ nbins=30, marginal="box")
  elif plot_type == "Box Plot":
- fig = px.box(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None)
+ if x_axis and y_axis:
+ fig = px.box(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None)
  elif plot_type == "Violin Plot":
- fig = px.violin(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None,
- box=True)
+ if x_axis and y_axis:
+ fig = px.violin(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None,
+ box=True)
  elif plot_type == "Line Chart":
- fig = px.line(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None)
+ if x_axis and y_axis:
+ fig = px.line(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None)
  elif plot_type == "Bar Chart":
- fig = px.bar(df, x=x_axis, color=color_by if color_by != "None" else None)
+ if x_axis:
+ fig = px.bar(df, x=x_axis, color=color_by if color_by != "None" else None)
  elif plot_type == "Correlation Matrix":
- corr = df.select_dtypes(include=np.number).corr()
- fig = px.imshow(corr, text_auto=True, color_continuous_scale='RdBu_r',
- zmin=-1, zmax=1)
+ numeric_df = df.select_dtypes(include=np.number)
+ if len(numeric_df.columns) > 1:
+ corr = numeric_df.corr()
+ fig = px.imshow(corr, text_auto=True, color_continuous_scale='RdBu_r',
+ zmin=-1, zmax=1)
  elif plot_type == "Pair Plot":
- fig = px.scatter_matrix(df, dimensions=numeric_cols[:4],
- color=color_by if color_by != "None" else None)
+ numeric_df = df.select_dtypes(include=np.number)
+ num_cols = len(numeric_df.columns)
+ if num_cols > 1:
+ dimensions = numeric_df.columns[:min(4, num_cols)].tolist() # Limit to the first 4 for performance
+ fig = px.scatter_matrix(df, dimensions=dimensions,
+ color=color_by if color_by != "None" else None)
  elif plot_type == "Heatmap":
- fig = px.density_heatmap(df, x=x_axis, y=y_axis, facet_col=color_by if color_by != "None" else None)
+ if x_axis and y_axis:
+ fig = px.density_heatmap(df, x=x_axis, y=y_axis, facet_col=color_by if color_by != "None" else None)
  elif plot_type == "3D Scatter":
- fig = px.scatter_3d(df, x=x_axis, y=y_axis, z=z_axis,
- color=color_by if color_by != "None" else None)
+ if x_axis and y_axis and z_axis:
+ fig = px.scatter_3d(df, x=x_axis, y=y_axis, z=z_axis,
+ color=color_by if color_by != "None" else None)
  elif plot_type == "Parallel Categories":
- fig = px.parallel_categories(df, dimensions=dimensions,
- color=color_by if color_by != "None" else None)
-
+ if dimensions:
+ fig = px.parallel_categories(df, dimensions=dimensions,
+ color=color_by if color_by != "None" else None)
+ else:
+ st.error("Please choose the specific plot")
+
  # Interactive plot customization
- with st.expander("⚙️ Chart Settings", expanded=False):
- col1, col2 = st.columns(2)
- with col1:
- chart_title = st.text_input("Chart title", f"{plot_type} of {x_axis} vs {y_axis}")
- fig.update_layout(title=chart_title)
- with col2:
- theme = st.selectbox("Color theme", px.colors.named_colorscales())
- fig.update_layout(colorway=px.colors.qualitative.Plotly)
-
- st.plotly_chart(fig, use_container_width=True)
-
+ if fig: #Only display customization options when we have a plot
+ with st.expander("⚙️ Chart Settings", expanded=False):
+ col1, col2 = st.columns(2)
+ with col1:
+ chart_title = st.text_input("Chart title", f"{plot_type} of {x_axis} vs {y_axis}" if (x_axis and y_axis) else f"{plot_type} of {x_axis}" if x_axis else plot_type)
+ fig.update_layout(title=chart_title)
+ with col2:
+ theme = st.selectbox("Color theme", px.colors.named_colorscales())
+ fig.update_layout(colorway=px.colors.qualitative.Plotly)
+
+ st.plotly_chart(fig, use_container_width=True)
+
  except Exception as e:
  st.error(f"Couldn't create visualization: {str(e)}")
  st.info("Try selecting different columns or changing the visualization type")
-
+
  # --------------------------
  # Advanced Analysis
  # --------------------------
+ enhance_section_title("Deep Analysis Tools", "🔬")
  with st.expander("🔬 Deep Analysis Tools", expanded=False):
  tab1, tab2, tab3 = st.tabs(["Statistical Tests", "Pattern Explorer", "Data Transformation"])
-
+
  with tab1:
  st.subheader("Hypothesis Testing")
  col1, col2 = st.columns(2)
@@ -834,12 +810,15 @@ elif app_mode == "EDA":
  test_var = st.selectbox("Test variable", numeric_cols)
  with col2:
  group_var = st.selectbox("Grouping variable", [None] + categorical_cols)
-
+
  if group_var and st.button("Run ANOVA"):
- groups = df.groupby(group_var)[test_var].apply(list)
- f_val, p_val = stats.f_oneway(*groups)
- st.write(f"F-value: {f_val:.2f}, p-value: {p_val:.4f}")
-
+ if test_var and group_var:
+ groups = df.groupby(group_var)[test_var].apply(list)
+ f_val, p_val = stats.f_oneway(*groups)
+ st.write(f"F-value: {f_val:.2f}, p-value: {p_val:.4f}")
+ else:
+ st.warning("Please select both a Test variable and a Grouping variable for ANOVA.")
+
  with tab2:
  st.subheader("Pattern Discovery")
  explore_col = st.selectbox("Column to analyze", df.columns)
@@ -848,7 +827,7 @@ elif app_mode == "EDA":
  if pattern:
  matches = df[explore_col].str.contains(pattern).sum()
  st.write(f"Found {matches} matches")
-
+
  with tab3:
  st.subheader("Data Transformation")
  transform_col = st.selectbox("Column to transform", numeric_cols)
@@ -859,21 +838,28 @@ elif app_mode == "EDA":
  df[transform_col] = np.sqrt(df[transform_col])
  elif transform_type == "Z-score":
  df[transform_col] = (df[transform_col] - df[transform_col].mean())/df[transform_col].std()
-
+
  # --------------------------
  # Export & Save
  # --------------------------
+ enhance_section_title("Export Options", "💾")
  st.subheader("💾 Export Options")
  col1, col2 = st.columns(2)
  with col1:
  if st.button("📥 Download Current Visualization"):
- fig.write_image("visualization.png")
- st.success("Image saved!")
+ try:
+ fig.write_image("visualization.png")
+ st.success("Image saved!")
+ except NameError:
+ st.error("No visualization to download. Please create a chart first.")
  with col2:
  if st.button("📊 Export Analysis Report"):
- profile = ProfileReport(df, minimal=True)
- profile.to_file("analysis_report.html")
- st.success("Report generated!")
+ try:
+ profile = ProfileReport(df, minimal=True)
+ profile.to_file("analysis_report.html")
+ st.success("Report generated!")
+ except Exception as e:
+ st.error(f"Could not generate analysis report. Ensure pandas-profiling is installed correctly.")
 
  # Streamlit App
  elif app_mode == "Model Training":