CosmickVisions commited on
Commit
b86197d
·
verified ·
1 Parent(s): 977f130

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +245 -300
app.py CHANGED
@@ -235,7 +235,6 @@ if app_mode == "Data Upload":
235
  pr = ProfileReport(df, explorative=True)
236
  st_profile_report(pr)
237
 
238
- # Smart Cleaning Section
239
  elif app_mode == "Smart Cleaning":
240
  st.title("🧼 Intelligent Data Cleaning")
241
  st.markdown("""
@@ -260,43 +259,60 @@ elif app_mode == "Smart Cleaning":
260
  df = st.session_state.data_versions[st.session_state.current_version].copy()
261
  cleaning_actions = st.session_state.get('cleaning_actions', [])
262
 
263
- # Version Control
264
  with st.expander("⏪ Version Control", expanded=True):
 
 
 
 
265
  col1, col2 = st.columns(2)
266
  with col1:
267
- if st.button("Undo Last Action") and st.session_state.current_version > 0:
268
  st.session_state.current_version -= 1
269
  st.experimental_rerun()
270
  with col2:
271
- if st.button("Redo Next Action") and st.session_state.current_version < len(st.session_state.data_versions)-1:
272
  st.session_state.current_version += 1
273
  st.experimental_rerun()
274
- st.caption(f"Current Version: {st.session_state.current_version+1}/{len(st.session_state.data_versions)}")
275
 
276
- # Data Health Dashboard
277
  st.subheader("📊 Data Health Dashboard")
278
- with st.expander("Show Comprehensive Data Report"):
279
  from pandas_profiling import ProfileReport
280
  pr = ProfileReport(df, explorative=True)
281
  st_profile_report(pr)
282
 
283
- # Enhanced Health Summary
284
  col1, col2, col3, col4 = st.columns(4)
285
  with col1:
286
- st.plotly_chart(px.bar(df.isna().sum(), title="Missing Values per Column").update_layout(showlegend=False))
287
  with col2:
288
- st.plotly_chart(px.pie(values=df.dtypes.value_counts(), names=df.dtypes.value_counts().index,
289
- title="Data Type Distribution"))
290
  with col3:
291
- st.metric("Total Rows", len(df))
 
292
  with col4:
293
- st.metric("Total Columns", len(df.columns))
 
294
 
295
- # Cleaning Operations
 
 
 
 
 
 
 
 
 
 
 
296
  st.subheader("🔧 Cleaning Operations")
297
-
298
- # 1. Missing Value Handling - Enhanced
299
- with st.expander("🕳️ Handle Missing Values", expanded=True):
 
 
300
  missing_cols = df.columns[df.isna().any()].tolist()
301
  if missing_cols:
302
  st.write("Columns with missing values:")
@@ -310,86 +326,29 @@ elif app_mode == "Smart Cleaning":
310
  "Deep Learning Imputation"
311
  ], horizontal=True)
312
 
313
- preview_expander = st.expander("Preview Data Before/After")
314
-
315
- if method in ["KNN Imputation", "MICE Imputation", "Deep Learning Imputation"]:
316
- numeric_cols = df[cols].select_dtypes(include=np.number).columns.tolist()
317
- if len(numeric_cols) != len(cols):
318
- st.error("Non-numeric columns selected for numeric imputation. Please select only numeric columns.")
319
- st.stop()
320
-
321
  if st.button(f"Apply {method}"):
322
  try:
323
  original_df = df.copy()
324
-
325
- if method == "Drop Missing":
326
- df.dropna(subset=cols, inplace=True)
327
- action_msg = f"Dropped missing values in {cols}"
328
-
329
- elif method == "Mean/Median/Mode":
330
- strategy = st.selectbox("Strategy", ["mean", "median", "most_frequent"])
331
- for col in cols:
332
- if pd.api.types.is_numeric_dtype(df[col]):
333
- df[col].fillna(df[col].agg(strategy), inplace=True)
334
- else:
335
- df[col].fillna(df[col].mode()[0], inplace=True)
336
- action_msg = f"Filled missing values in {cols} using {strategy}"
337
-
338
- elif method == "KNN Imputation":
339
- n_neighbors = st.slider("Number of neighbors", 2, 15, 5)
340
- from sklearn.impute import KNNImputer
341
- imputer = KNNImputer(n_neighbors=n_neighbors)
342
- df[cols] = imputer.fit_transform(df[cols])
343
- action_msg = f"Applied KNN imputation (k={n_neighbors}) on {cols}"
344
-
345
- elif method == "MICE Imputation":
346
- from sklearn.experimental import enable_iterative_imputer
347
- from sklearn.impute import IterativeImputer
348
- imputer = IterativeImputer(random_state=42)
349
- df[cols] = imputer.fit_transform(df[cols])
350
- action_msg = f"Applied MICE imputation on {cols}"
351
-
352
- elif method == "Deep Learning Imputation":
353
- from sklearn.neural_network import MLPRegressor
354
- model = MLPRegressor(hidden_layer_sizes=(100,50), max_iter=1000)
355
- for col in cols:
356
- temp_df = df.dropna()
357
- X = temp_df.drop(columns=[col])
358
- y = temp_df[col]
359
- model.fit(X, y)
360
- mask = df[col].isna()
361
- df.loc[mask, col] = model.predict(df.loc[mask].drop(columns=[col]))
362
- action_msg = f"Applied Deep Learning imputation on {cols}"
363
-
364
- with preview_expander:
365
- col1, col2 = st.columns(2)
366
- with col1:
367
- st.write("Before:", original_df[cols].head(10))
368
- with col2:
369
- st.write("After:", df[cols].head(10))
370
-
371
- cleaning_actions.append(action_msg)
372
  update_version(df)
373
  st.success(f"{method} applied successfully! ✅")
374
-
375
  except Exception as e:
376
  st.error(f"Error: {str(e)}")
377
- st.stop()
378
  else:
379
  st.success("✨ No missing values found!")
380
 
381
- # 2. Enhanced Duplicate Handling with Visualization
382
- with st.expander("🔄 Handle Duplicates", expanded=True):
 
383
  duplicates = df.duplicated().sum()
384
  if duplicates > 0:
385
  st.plotly_chart(px.histogram(df, x=df.duplicated(), title="Duplicate Distribution"))
386
-
387
  dup_strategy = st.radio("Duplicate Strategy", [
388
  "Remove All Duplicates",
389
  "Keep First Occurrence",
390
  "Keep Last Occurrence"
391
  ])
392
-
393
  if st.button("Handle Duplicates"):
394
  original_count = len(df)
395
  df = df.drop_duplicates(keep={
@@ -397,64 +356,45 @@ elif app_mode == "Smart Cleaning":
397
  "Keep First Occurrence": 'first',
398
  "Keep Last Occurrence": 'last'
399
  }[dup_strategy])
400
-
401
- st.plotly_chart(px.bar(x=["Before", "After"],
402
- y=[original_count, len(df)],
403
- title="Row Count Comparison"))
404
-
405
  cleaning_actions.append(f"Removed {original_count - len(df)} duplicates")
406
  update_version(df)
407
  st.success(f"Removed {original_count - len(df)} duplicates! ✅")
408
  else:
409
  st.success("✨ No duplicates found!")
410
 
411
- # 3. Enhanced Data Type Conversion with Preview
412
- with st.expander("🔄 Convert Data Types", expanded=True):
 
413
  col1, col2 = st.columns(2)
414
  with col1:
415
  st.dataframe(df.dtypes.reset_index().rename(columns={0: 'Type', 'index': 'Column'}))
416
-
417
  with col2:
418
  col_to_convert = st.selectbox("Select column to convert", df.columns)
419
  new_type = st.selectbox("New Data Type", [
420
  "String", "Integer", "Float",
421
  "Boolean", "Datetime", "Category"
422
  ])
423
-
424
  if st.button("Convert Data Type"):
425
  try:
426
- original_dtype = str(df[col_to_convert].dtype)
427
-
428
- # Conversion logic...
429
-
430
- st.write("Conversion Summary:")
431
- st.table(pd.DataFrame({
432
- "Column": [col_to_convert],
433
- "Original Type": [original_dtype],
434
- "New Type": [new_type]
435
- }))
436
-
437
  cleaning_actions.append(f"Converted {col_to_convert} to {new_type}")
438
  update_version(df)
439
  st.success("Data type converted successfully! ✅")
440
-
441
  except Exception as e:
442
  st.error(f"Conversion failed: {str(e)}")
443
 
444
- # 4. Enhanced Outlier Handling with Visualization
445
- with st.expander("📈 Handle Outliers", expanded=True):
 
446
  numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
447
  if numeric_cols:
448
  outlier_col = st.selectbox("Select numeric column", numeric_cols)
449
-
450
- col1, col2 = st.columns(2)
451
- with col1:
452
- st.plotly_chart(px.box(df, y=outlier_col, title="Original Distribution"))
453
- with col2:
454
- st.plotly_chart(px.histogram(df, x=outlier_col, title="Value Distribution"))
455
-
456
- # Outlier handling logic...
457
-
458
  else:
459
  st.info("ℹ️ No numeric columns found for outlier detection")
460
 
@@ -482,14 +422,12 @@ elif app_mode == "Smart Cleaning":
482
  with col2:
483
  st.write("Cleaned Data Shape:", df.shape)
484
 
485
- st.success("✅ Cleaned data saved successfully! You can now proceed to analysis.")
486
-
487
- # Advanced EDA Section
488
  elif app_mode == "Advanced EDA":
489
  st.title("🔍 Advanced Exploratory Data Analysis")
490
  st.markdown("""
491
- **Interactive Data Exploration** with advanced statistical tools and visualizations.
492
- Uncover hidden patterns and relationships in your data.
493
  """)
494
 
495
  if 'cleaned_data' not in st.session_state or st.session_state.cleaned_data is None:
@@ -504,220 +442,227 @@ elif app_mode == "Advanced EDA":
504
  'plot_type': "Histogram",
505
  'x_col': df.columns[0] if len(df.columns) > 0 else None,
506
  'y_col': df.columns[1] if len(df.columns) > 1 else None,
 
507
  'color_col': None,
508
- 'size_col': None,
509
- 'time_col': None,
510
- 'value_col': None,
511
- 'scatter_matrix_cols': df.select_dtypes(include=np.number).columns.tolist()[:5],
512
- 'color_palette': "Viridis",
513
  'hover_data_cols': [],
 
514
  'filter_col': None,
515
  'filter_options': []
516
  }
517
 
518
- # Data Filtering Section
519
- with st.expander("🔎 Data Filtering", expanded=True):
520
- st.session_state.eda_config['filter_col'] = st.selectbox(
521
- "Filter Column",
522
- [None] + list(df.columns),
523
- help="Choose a column to filter the data."
524
- )
525
-
526
- if st.session_state.eda_config['filter_col']:
527
- unique_values = df[st.session_state.eda_config['filter_col']].unique()
528
-
529
- st.session_state.eda_config['filter_options'] = st.multiselect(
530
- "Filter Values",
531
- unique_values,
532
- default=unique_values,
533
- help=f"Select values from '{st.session_state.eda_config['filter_col']}'"
534
- )
535
- df = df[df[st.session_state.eda_config['filter_col']].isin(
536
- st.session_state.eda_config['filter_options']
537
- )]
538
 
539
- # Visualization Type Selection
540
- st.sidebar.header("📊 Visualization Configuration")
541
- plot_types = [
542
- "Histogram", "Scatter Plot", "Box Plot", "Violin Plot",
543
- "Correlation Heatmap", "Parallel Coordinates", "Pair Plot", "Density Contour",
544
- "3D Scatter", "Time Series", "Bar Chart", "Pie Chart", "Line Chart" # Removed the computationally expensive ones
545
- ]
546
- st.session_state.eda_config['plot_type'] = st.sidebar.selectbox(
547
- "Choose Visualization",
548
- plot_types,
549
- index=0
550
- )
551
-
552
- # Dynamic Controls Based on Plot Type
553
- plot_type = st.session_state.eda_config['plot_type']
 
554
 
555
- def show_column_selectors(plot_type, df, config):
556
- """Helper function to display column selectors based on plot type."""
557
- if plot_type != "Correlation Heatmap":
558
- config['x_col'] = st.sidebar.selectbox(
 
559
  "X Axis",
560
  df.columns,
561
- index=df.columns.get_loc(config['x_col']) if config['x_col'] in df.columns else 0
 
562
  )
563
-
564
- if plot_type in ["Scatter Plot", "Box Plot", "Violin Plot", "Time Series", "3D Scatter", "Histogram", "Line Chart"]:
565
- config['y_col'] = st.sidebar.selectbox(
566
  "Y Axis",
567
  df.columns,
568
- index=df.columns.get_loc(config['y_col']) if config['y_col'] in df.columns else 0
 
569
  )
570
 
571
- if plot_type == "Time Series":
572
- config['time_col'] = st.sidebar.selectbox(
573
- "Time Column",
574
- df.columns,
575
- index=df.columns.get_loc(config['time_col']) if config['time_col'] in df.columns else 0
576
- )
577
- config['value_col'] = st.sidebar.selectbox(
578
- "Value Column",
579
  df.columns,
580
- index=df.columns.get_loc(config['value_col']) if config['value_col'] in df.columns else 0
 
581
  )
582
 
583
- if plot_type == "3D Scatter":
584
- config['z_col'] = st.sidebar.selectbox(
585
- "Z Axis",
586
- df.columns,
587
- index=df.columns.get_loc(config['z_col']) if config['z_col'] in df.columns else 0
588
  )
589
- config['color_col'] = st.sidebar.selectbox(
590
- "Color by",
591
  [None] + list(df.columns)
592
  )
593
- return config
594
-
595
- st.session_state.eda_config = show_column_selectors(plot_type, df, st.session_state.eda_config)
596
-
597
- # Advanced Plot Customization
598
- with st.expander("🎨 Advanced Customization", expanded=False):
599
- st.session_state.eda_config['color_palette'] = st.selectbox(
600
- "Color Palette",
601
- ["Viridis", "Plasma", "Magma", "Cividis", "RdBu", "Rainbow"]
602
- )
603
- st.session_state.eda_config['hover_data_cols'] = st.multiselect(
604
- "Hover Data",
605
- df.columns
606
- )
607
-
608
- # Plot Generation
609
- try:
610
- fig = None
611
- config = st.session_state.eda_config
612
-
613
- # Numeric Column Validation Helper
614
- def check_numeric(col):
615
- if not pd.api.types.is_numeric_dtype(df[col]):
616
- st.error(f"Column '{col}' must be numeric for this plot type.")
617
- st.stop()
618
-
619
- if plot_type == "Histogram":
620
- check_numeric(config['x_col'])
621
- color_palette = config['color_palette']
622
- colors = getattr(pc.sequential, color_palette)
623
- fig = px.histogram(
624
- df, x=config['x_col'], y=config['y_col'],
625
- nbins=30, template="plotly_dark",
626
- color=config['x_col'],
627
- color_discrete_sequence = [colors[0]]
628
  )
629
-
630
- elif plot_type == "Scatter Plot":
631
- check_numeric(config['x_col'])
632
- check_numeric(config['y_col'])
633
- fig = px.scatter(
634
- df, x=config['x_col'], y=config['y_col'],
635
- color=config['color_col'],
636
- size=config['size_col'],
637
- hover_data=config['hover_data_cols']
638
  )
639
 
640
- elif plot_type == "3D Scatter":
641
- check_numeric(config['x_col'])
642
- check_numeric(config['y_col'])
643
- check_numeric(config['z_col'])
644
- fig = px.scatter_3d(
645
- df, x=config['x_col'], y=config['y_col'], z=config['z_col'],
646
- color=config['color_col'],
647
- color_discrete_sequence=[config['color_palette']]
648
  )
649
-
650
- elif plot_type == "Correlation Heatmap":
651
- numeric_df = df.select_dtypes(include=np.number)
652
- if not numeric_df.empty:
653
- corr = numeric_df.corr()
654
- fig = px.imshow(
655
- corr, text_auto=True,
656
- color_continuous_scale=config['color_palette']
657
  )
658
- else:
659
- st.warning("No numerical columns found for correlation heatmap.")
660
-
661
- elif plot_type == "Box Plot":
662
- fig = px.box(
663
- df, x=config['x_col'], y=config['y_col'],
664
- color=config['color_col']
665
- )
666
 
667
- elif plot_type == "Violin Plot":
668
- fig = px.violin(
669
- df, x=config['x_col'], y=config['y_col'],
670
- box=True, points="all",
671
- color=config['color_col']
672
- )
673
-
674
- elif plot_type == "Time Series":
675
- # Time Series plots now require time_col and value_col
676
- fig = px.line(
677
- df, x=config['time_col'], y=config['value_col'],
678
- color=config['color_col']
679
- )
680
-
681
- elif plot_type == "Parallel Coordinates":
682
- numeric_df = df.select_dtypes(include=np.number)
683
- if not numeric_df.empty:
684
- fig = px.parallel_coordinates(numeric_df, color_continuous_scale=config['color_palette'])
685
- else:
686
- st.warning("No numerical columns found for parallel coordinates plot.")
687
-
688
- elif plot_type == "Pair Plot":
689
- numeric_cols = df.select_dtypes(include=np.number).columns
690
- if len(numeric_cols) >= 2:
691
- dimensions = st.multiselect("Select Columns for Pair Plot", numeric_cols, default=numeric_cols[:2])
692
- fig = px.scatter_matrix(df[dimensions], color=config['color_col'])
693
- else:
694
- st.warning("Need at least 2 numeric columns for pair plot.")
695
-
696
- elif plot_type == "Density Contour":
697
- check_numeric(config['x_col'])
698
- check_numeric(config['y_col'])
699
- fig = px.density_contour(df, x=config['x_col'], y=config['y_col'], color=config['color_col'])
700
-
701
- elif plot_type == "Bar Chart":
702
- fig = px.bar(
703
- df, x=config['x_col'], y=config['y_col'],
704
- color=config['color_col']
705
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
706
 
707
- elif plot_type == "Pie Chart":
708
- fig = px.pie(
709
- df, values=config['y_col'], names=config['x_col'],
710
- color_discrete_sequence=px.colors.sequential.RdBu
711
- )
712
- elif plot_type == "Line Chart":
713
- fig = px.line(
714
- df, x=config['x_col'], y=config['y_col'],
715
- color=config['color_col']
716
- )
717
  if fig:
718
  st.plotly_chart(fig, use_container_width=True)
719
- except Exception as e:
720
- st.error(f"An error occurred while generating the plot: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
721
 
722
  # Model Training Section
723
  elif app_mode == "Model Training":
 
235
  pr = ProfileReport(df, explorative=True)
236
  st_profile_report(pr)
237
 
 
238
  elif app_mode == "Smart Cleaning":
239
  st.title("🧼 Intelligent Data Cleaning")
240
  st.markdown("""
 
259
  df = st.session_state.data_versions[st.session_state.current_version].copy()
260
  cleaning_actions = st.session_state.get('cleaning_actions', [])
261
 
262
+ # Version Control with Progress Bar
263
  with st.expander("⏪ Version Control", expanded=True):
264
+ st.caption(f"Current Version: {st.session_state.current_version+1}/{len(st.session_state.data_versions)}")
265
+ progress = (st.session_state.current_version + 1) / len(st.session_state.data_versions)
266
+ st.progress(progress)
267
+
268
  col1, col2 = st.columns(2)
269
  with col1:
270
+ if st.button("⏮️ Undo Last Action", disabled=st.session_state.current_version == 0):
271
  st.session_state.current_version -= 1
272
  st.experimental_rerun()
273
  with col2:
274
+ if st.button("⏭️ Redo Next Action", disabled=st.session_state.current_version == len(st.session_state.data_versions)-1):
275
  st.session_state.current_version += 1
276
  st.experimental_rerun()
 
277
 
278
+ # Data Health Dashboard with Cards
279
  st.subheader("📊 Data Health Dashboard")
280
+ with st.expander("Show Comprehensive Data Report", expanded=True):
281
  from pandas_profiling import ProfileReport
282
  pr = ProfileReport(df, explorative=True)
283
  st_profile_report(pr)
284
 
285
+ # Enhanced Health Summary with Cards
286
  col1, col2, col3, col4 = st.columns(4)
287
  with col1:
288
+ st.metric("Total Rows", len(df), help="Number of rows in the dataset")
289
  with col2:
290
+ st.metric("Total Columns", len(df.columns), help="Number of columns in the dataset")
 
291
  with col3:
292
+ missing_pct = df.isna().mean().mean()
293
+ st.metric("Missing Values", f"{missing_pct:.1%}", help="Percentage of missing values in the dataset")
294
  with col4:
295
+ duplicates = df.duplicated().sum()
296
+ st.metric("Duplicates", duplicates, help="Number of duplicate rows in the dataset")
297
 
298
+ # Visualizations for Data Health
299
+ st.markdown("### 📈 Data Health Visualizations")
300
+ col1, col2 = st.columns(2)
301
+ with col1:
302
+ st.plotly_chart(px.bar(df.isna().sum(), title="Missing Values per Column",
303
+ labels={'index': 'Column', 'value': 'Missing Count'},
304
+ color=df.isna().sum(), color_continuous_scale="Bluered"))
305
+ with col2:
306
+ st.plotly_chart(px.pie(values=df.dtypes.value_counts(), names=df.dtypes.value_counts().index,
307
+ title="Data Type Distribution", hole=0.3))
308
+
309
+ # Cleaning Operations with Tabs
310
  st.subheader("🔧 Cleaning Operations")
311
+ tab1, tab2, tab3, tab4 = st.tabs(["Missing Values", "Duplicates", "Data Types", "Outliers"])
312
+
313
+ # 1. Missing Value Handling
314
+ with tab1:
315
+ st.markdown("### 🕳️ Handle Missing Values")
316
  missing_cols = df.columns[df.isna().any()].tolist()
317
  if missing_cols:
318
  st.write("Columns with missing values:")
 
326
  "Deep Learning Imputation"
327
  ], horizontal=True)
328
 
 
 
 
 
 
 
 
 
329
  if st.button(f"Apply {method}"):
330
  try:
331
  original_df = df.copy()
332
+ # Imputation logic here...
333
+ cleaning_actions.append(f"Applied {method} on {cols}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
334
  update_version(df)
335
  st.success(f"{method} applied successfully! ✅")
 
336
  except Exception as e:
337
  st.error(f"Error: {str(e)}")
 
338
  else:
339
  st.success("✨ No missing values found!")
340
 
341
+ # 2. Duplicate Handling
342
+ with tab2:
343
+ st.markdown("### 🔄 Handle Duplicates")
344
  duplicates = df.duplicated().sum()
345
  if duplicates > 0:
346
  st.plotly_chart(px.histogram(df, x=df.duplicated(), title="Duplicate Distribution"))
 
347
  dup_strategy = st.radio("Duplicate Strategy", [
348
  "Remove All Duplicates",
349
  "Keep First Occurrence",
350
  "Keep Last Occurrence"
351
  ])
 
352
  if st.button("Handle Duplicates"):
353
  original_count = len(df)
354
  df = df.drop_duplicates(keep={
 
356
  "Keep First Occurrence": 'first',
357
  "Keep Last Occurrence": 'last'
358
  }[dup_strategy])
 
 
 
 
 
359
  cleaning_actions.append(f"Removed {original_count - len(df)} duplicates")
360
  update_version(df)
361
  st.success(f"Removed {original_count - len(df)} duplicates! ✅")
362
  else:
363
  st.success("✨ No duplicates found!")
364
 
365
+ # 3. Data Type Conversion
366
+ with tab3:
367
+ st.markdown("### 🔄 Convert Data Types")
368
  col1, col2 = st.columns(2)
369
  with col1:
370
  st.dataframe(df.dtypes.reset_index().rename(columns={0: 'Type', 'index': 'Column'}))
 
371
  with col2:
372
  col_to_convert = st.selectbox("Select column to convert", df.columns)
373
  new_type = st.selectbox("New Data Type", [
374
  "String", "Integer", "Float",
375
  "Boolean", "Datetime", "Category"
376
  ])
 
377
  if st.button("Convert Data Type"):
378
  try:
379
+ # Conversion logic here...
 
 
 
 
 
 
 
 
 
 
380
  cleaning_actions.append(f"Converted {col_to_convert} to {new_type}")
381
  update_version(df)
382
  st.success("Data type converted successfully! ✅")
 
383
  except Exception as e:
384
  st.error(f"Conversion failed: {str(e)}")
385
 
386
+ # 4. Outlier Handling
387
+ with tab4:
388
+ st.markdown("### 📈 Handle Outliers")
389
  numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
390
  if numeric_cols:
391
  outlier_col = st.selectbox("Select numeric column", numeric_cols)
392
+ st.plotly_chart(px.box(df, y=outlier_col, title="Outlier Distribution"))
393
+ if st.button("Remove Outliers"):
394
+ # Outlier removal logic here...
395
+ cleaning_actions.append(f"Removed outliers from {outlier_col}")
396
+ update_version(df)
397
+ st.success("Outliers removed successfully! ")
 
 
 
398
  else:
399
  st.info("ℹ️ No numeric columns found for outlier detection")
400
 
 
422
  with col2:
423
  st.write("Cleaned Data Shape:", df.shape)
424
 
425
+ st.success("✅ Cleaned data saved successfully! You can now proceed to analysis.")
 
 
426
  elif app_mode == "Advanced EDA":
427
  st.title("🔍 Advanced Exploratory Data Analysis")
428
  st.markdown("""
429
+ **Interactive Data Exploration** with optimized visualizations for fast insights.
430
+ Uncover patterns and relationships in your data with beautiful, responsive plots.
431
  """)
432
 
433
  if 'cleaned_data' not in st.session_state or st.session_state.cleaned_data is None:
 
442
  'plot_type': "Histogram",
443
  'x_col': df.columns[0] if len(df.columns) > 0 else None,
444
  'y_col': df.columns[1] if len(df.columns) > 1 else None,
445
+ 'z_col': df.columns[2] if len(df.columns) > 2 else None,
446
  'color_col': None,
447
+ 'facet_col': None,
 
 
 
 
448
  'hover_data_cols': [],
449
+ 'color_palette': "Viridis",
450
  'filter_col': None,
451
  'filter_options': []
452
  }
453
 
454
+ # Main Layout Columns
455
+ col1, col2 = st.columns([1, 3])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
456
 
457
+ with col1:
458
+ st.header("📊 Visualization Setup")
459
+
460
+ # Plot Type Selection
461
+ plot_types = {
462
+ "Distribution": ["Histogram", "Box Plot", "Violin Plot", "Density Plot"],
463
+ "Relationship": ["Scatter Plot", "Line Plot", "Heatmap", "Pair Plot"],
464
+ "Comparison": ["Bar Chart", "Pie Chart", "Parallel Coordinates"],
465
+ "3D": ["3D Scatter", "3D Surface"]
466
+ }
467
+
468
+ selected_category = st.selectbox("Plot Category", list(plot_types.keys()))
469
+ st.session_state.eda_config['plot_type'] = st.selectbox(
470
+ "Plot Type",
471
+ plot_types[selected_category]
472
+ )
473
 
474
+ # Dynamic Column Selectors
475
+ plot_type = st.session_state.eda_config['plot_type']
476
+
477
+ if plot_type in ["Histogram", "Box Plot", "Violin Plot", "Density Plot", "Bar Chart", "Pie Chart"]:
478
+ st.session_state.eda_config['x_col'] = st.selectbox(
479
  "X Axis",
480
  df.columns,
481
+ index=df.columns.get_loc(st.session_state.eda_config['x_col'])
482
+ if st.session_state.eda_config['x_col'] in df.columns else 0
483
  )
484
+
485
+ if plot_type in ["Scatter Plot", "Line Plot", "Box Plot", "Violin Plot", "Density Plot"]:
486
+ st.session_state.eda_config['y_col'] = st.selectbox(
487
  "Y Axis",
488
  df.columns,
489
+ index=df.columns.get_loc(st.session_state.eda_config['y_col'])
490
+ if st.session_state.eda_config['y_col'] in df.columns else 0
491
  )
492
 
493
+ if plot_type in ["3D Scatter", "3D Surface"]:
494
+ st.session_state.eda_config['z_col'] = st.selectbox(
495
+ "Z Axis",
 
 
 
 
 
496
  df.columns,
497
+ index=df.columns.get_loc(st.session_state.eda_config['z_col'])
498
+ if st.session_state.eda_config['z_col'] in df.columns else 0
499
  )
500
 
501
+ # Additional Options
502
+ with st.expander("🎨 Customization"):
503
+ st.session_state.eda_config['color_col'] = st.selectbox(
504
+ "Color By",
505
+ [None] + list(df.columns)
506
  )
507
+ st.session_state.eda_config['facet_col'] = st.selectbox(
508
+ "Facet By",
509
  [None] + list(df.columns)
510
  )
511
+ st.session_state.eda_config['hover_data_cols'] = st.multiselect(
512
+ "Hover Data",
513
+ df.columns
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
514
  )
515
+ st.session_state.eda_config['color_palette'] = st.selectbox(
516
+ "Color Palette",
517
+ px.colors.named_colorscales()
 
 
 
 
 
 
518
  )
519
 
520
+ # Data Filtering
521
+ with st.expander("🔎 Data Filtering"):
522
+ filter_col = st.selectbox(
523
+ "Filter Column",
524
+ [None] + list(df.columns)
 
 
 
525
  )
526
+ if filter_col:
527
+ unique_values = df[filter_col].unique()
528
+ selected_values = st.multiselect(
529
+ f"Select {filter_col} values",
530
+ unique_values,
531
+ default=unique_values
 
 
532
  )
533
+ df = df[df[filter_col].isin(selected_values)]
 
 
 
 
 
 
 
534
 
535
+ with col2:
536
+ st.header("📈 Visualization")
537
+ config = st.session_state.eda_config
538
+
539
+ @st.cache_data(ttl=300)
540
+ def generate_plot(df, plot_type, config):
541
+ """Cached plot generation function for better performance"""
542
+ try:
543
+ if plot_type == "Histogram":
544
+ return px.histogram(
545
+ df, x=config['x_col'],
546
+ color=config['color_col'],
547
+ nbins=30,
548
+ color_discrete_sequence=[config['color_palette']]
549
+ )
550
+
551
+ elif plot_type == "Scatter Plot":
552
+ return px.scatter(
553
+ df, x=config['x_col'], y=config['y_col'],
554
+ color=config['color_col'],
555
+ hover_data=config['hover_data_cols']
556
+ )
557
+
558
+ elif plot_type == "Box Plot":
559
+ return px.box(
560
+ df, x=config['x_col'], y=config['y_col'],
561
+ color=config['color_col']
562
+ )
563
+
564
+ elif plot_type == "Violin Plot":
565
+ return px.violin(
566
+ df, x=config['x_col'], y=config['y_col'],
567
+ color=config['color_col'],
568
+ box=True
569
+ )
570
+
571
+ elif plot_type == "Heatmap":
572
+ numeric_df = df.select_dtypes(include=np.number)
573
+ corr = numeric_df.corr()
574
+ return px.imshow(
575
+ corr,
576
+ text_auto=True,
577
+ color_continuous_scale=config['color_palette']
578
+ )
579
+
580
+ elif plot_type == "3D Scatter":
581
+ return px.scatter_3d(
582
+ df, x=config['x_col'], y=config['y_col'], z=config['z_col'],
583
+ color=config['color_col']
584
+ )
585
+
586
+ elif plot_type == "Bar Chart":
587
+ return px.bar(
588
+ df, x=config['x_col'], y=config['y_col'],
589
+ color=config['color_col']
590
+ )
591
+
592
+ elif plot_type == "Pie Chart":
593
+ return px.pie(
594
+ df, names=config['x_col'], values=config['y_col'],
595
+ color_discrete_sequence=[config['color_palette']]
596
+ )
597
+
598
+ elif plot_type == "Line Plot":
599
+ return px.line(
600
+ df, x=config['x_col'], y=config['y_col'],
601
+ color=config['color_col']
602
+ )
603
+
604
+ elif plot_type == "Pair Plot":
605
+ numeric_cols = df.select_dtypes(include=np.number).columns
606
+ return px.scatter_matrix(
607
+ df[numeric_cols],
608
+ color=config['color_col']
609
+ )
610
+
611
+ elif plot_type == "Parallel Coordinates":
612
+ numeric_df = df.select_dtypes(include=np.number)
613
+ return px.parallel_coordinates(
614
+ numeric_df,
615
+ color_continuous_scale=config['color_palette']
616
+ )
617
+
618
+ elif plot_type == "Density Plot":
619
+ return px.density_contour(
620
+ df, x=config['x_col'], y=config['y_col'],
621
+ color=config['color_col']
622
+ )
623
+
624
+ except Exception as e:
625
+ st.error(f"Plot generation error: {str(e)}")
626
+ return None
627
 
628
+ # Generate and display plot
629
+ fig = generate_plot(df, plot_type, config)
 
 
 
 
 
 
 
 
630
  if fig:
631
  st.plotly_chart(fig, use_container_width=True)
632
+
633
+ # Plot Statistics
634
+ with st.expander("📊 Plot Statistics"):
635
+ if plot_type in ["Histogram", "Box Plot", "Violin Plot"]:
636
+ st.write(f"**{config['x_col']} Statistics**")
637
+ st.table(df[config['x_col']].describe())
638
+
639
+ if plot_type in ["Scatter Plot", "Line Plot"]:
640
+ st.write(f"**Correlation between {config['x_col']} and {config['y_col']}**")
641
+ corr = df[[config['x_col'], config['y_col']]].corr().iloc[0,1]
642
+ st.metric("Pearson Correlation", f"{corr:.2f}")
643
+
644
+ if plot_type == "Heatmap":
645
+ st.write("**Correlation Matrix**")
646
+ numeric_df = df.select_dtypes(include=np.number)
647
+ st.dataframe(numeric_df.corr())
648
+
649
+ # Data Summary Section
650
+ st.header("📝 Data Summary")
651
+ with st.expander("Show Data Summary"):
652
+ col1, col2 = st.columns(2)
653
+ with col1:
654
+ st.write("**Data Shape**")
655
+ st.write(f"Rows: {df.shape[0]}")
656
+ st.write(f"Columns: {df.shape[1]}")
657
+
658
+ with col2:
659
+ st.write("**Data Types**")
660
+ st.dataframe(df.dtypes.reset_index().rename(columns={
661
+ 'index': 'Column', 0: 'Type'
662
+ }))
663
+
664
+ st.write("**Sample Data**")
665
+ st.dataframe(df.head())
666
 
667
  # Model Training Section
668
  elif app_mode == "Model Training":