CosmickVisions committed on
Commit f859c5c · verified · 1 Parent(s): 3fddd77

Update app.py

Files changed (1): app.py (+261 −559)

app.py CHANGED
@@ -1,6 +1,24 @@
- import streamlit as st
  import pandas as pd
  import numpy as np
  import plotly.express as px
  from scipy import stats
  import plotly.colors as pc
@@ -27,16 +45,14 @@ from sklearn.svm import SVR, SVC
  from sklearn.feature_selection import SelectKBest
  from sklearn.experimental import enable_iterative_imputer
  from sklearn.impute import IterativeImputer
- from sklearn.neural_network import MLPRegressor
- from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, precision_score, recall_score, f1_score
  from sklearn.impute import KNNImputer, SimpleImputer
  from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder, LabelEncoder
  from sklearn.compose import ColumnTransformer
  from sklearn.pipeline import Pipeline
- from ydata_profiling import ProfileReport
- from streamlit_pandas_profiling import st_profile_report
-

  # Enhanced configuration
@@ -51,7 +67,12 @@ if 'raw_data' not in st.session_state:
      st.session_state.raw_data = None
  if 'cleaned_data' not in st.session_state:
      st.session_state.cleaned_data = None
-
  # Security: Set allowed file types
  ALLOWED_EXTENSIONS = {'csv', 'xlsx', 'parquet', 'feather'}
  MAX_FILE_SIZE_MB = 250  # 250MB limit
@@ -60,15 +81,15 @@ def validate_file(file):
      """Comprehensive file validation"""
      if not file:
          return False, "No file uploaded"
-
      extension = file.name.split('.')[-1].lower()
      if extension not in ALLOWED_EXTENSIONS:
          return False, f"Unsupported file type: {extension}"
-
      file_size_mb = file.size / (1024 * 1024)
      if file_size_mb > MAX_FILE_SIZE_MB:
          return False, f"File size exceeds {MAX_FILE_SIZE_MB}MB limit"
-
      return True, ""

  @st.cache_data(ttl=3600, show_spinner="Analyzing data quality...")
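Note: validate_file returns an (ok, message) tuple rather than raising, so callers have to gate on the flag. A minimal caller sketch; the st.file_uploader wiring below is illustrative and not part of this hunk:

    # Hypothetical caller: gate loading on the (ok, message) tuple from validate_file.
    uploaded_file = st.file_uploader("Upload dataset", type=list(ALLOWED_EXTENSIONS))
    ok, message = validate_file(uploaded_file)
    if not ok:
        st.error(message)  # e.g. "Unsupported file type: txt"
        st.stop()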
@@ -164,10 +185,10 @@ if app_mode == "Data Upload":
          df = pd.read_parquet(uploaded_file)
      elif uploaded_file.name.endswith('.feather'):
          df = pd.read_feather(uploaded_file)
-
      st.session_state.raw_data = df
      st.success("Dataset loaded successfully!")
-
  except Exception as e:
      st.error(f"Error loading file: {str(e)}")
      st.stop()
@@ -181,7 +202,7 @@ if app_mode == "Data Upload":
  # Data Health Dashboard
  st.subheader("📊 Data Health Dashboard")
  report = enhanced_quality_report(df)
-
  col1, col2, col3, col4 = st.columns(4)
  col1.metric("Total Rows", report['basic_stats']['rows'])
  col2.metric("Total Columns", report['basic_stats']['columns'])
@@ -192,11 +213,11 @@ if app_mode == "Data Upload":
  with st.expander("🔍 Deep Column Analysis", expanded=True):
      selected_col = st.selectbox("Select column to inspect", df.columns)
      col_info = report['column_analysis'][selected_col]
-
      st.write(f"**Type:** {col_info['type']}")
      st.write(f"**Unique Values:** {col_info['unique']}")
      st.write(f"**Missing Values:** {col_info['missing']} ({col_info['missing']/len(df):.1%})")
-
      if pd.api.types.is_numeric_dtype(df[selected_col]):
          st.write("**Distribution:**")
          st.line_chart(df[selected_col])
@@ -218,7 +239,7 @@ if app_mode == "Data Upload":
          recommendations.append(f"⚠️ Consider dropping {col} (>{50}% missing)")
      if data['unique'] == len(df):
          recommendations.append(f"🔍 Investigate {col} - potential unique identifier")
-
  if recommendations:
      st.write("### Recommended Actions")
      for rec in recommendations[:5]:  # Show top 5
@@ -234,7 +255,7 @@ if app_mode == "Data Upload":
  # Advanced Profiling
  if st.button("🚀 Generate Full Data Profile"):
      with st.spinner("Generating comprehensive report..."):
-         pr = ProfileReport(df, explorative=True)
          st_profile_report(pr)

elif app_mode == "Smart Cleaning":
@@ -266,7 +287,7 @@ elif app_mode == "Smart Cleaning":
  st.caption(f"Current Version: {st.session_state.current_version+1}/{len(st.session_state.data_versions)}")
  progress = (st.session_state.current_version + 1) / len(st.session_state.data_versions)
  st.progress(progress)
-
  col1, col2 = st.columns(2)
  with col1:
      if st.button("⏮️ Undo Last Action", disabled=st.session_state.current_version == 0):
@@ -281,7 +302,7 @@ elif app_mode == "Smart Cleaning":
  st.subheader("📊 Data Health Dashboard")
  with st.expander("Show Comprehensive Data Report", expanded=True):
      from pandas_profiling import ProfileReport
-     pr = ProfileReport(df, explorative=True)
      st_profile_report(pr)

  # Enhanced Health Summary with Cards
@@ -301,11 +322,11 @@ elif app_mode == "Smart Cleaning":
  st.markdown("### 📈 Data Health Visualizations")
  col1, col2 = st.columns(2)
  with col1:
-     st.plotly_chart(px.bar(df.isna().sum(), title="Missing Values per Column",
-                            labels={'index': 'Column', 'value': 'Missing Count'},
                             color=df.isna().sum(), color_continuous_scale="Bluered"))
  with col2:
-     st.plotly_chart(px.pie(values=df.dtypes.value_counts(), names=df.dtypes.value_counts().index,
                             title="Data Type Distribution", hole=0.3))

  # Cleaning Operations with Tabs
@@ -319,15 +340,15 @@ elif app_mode == "Smart Cleaning":
      if missing_cols:
          st.write("Columns with missing values:")
          cols = st.multiselect("Select columns to clean", missing_cols, default=missing_cols)
-
          method = st.radio("Imputation Method", [
-             "Drop Missing",
-             "Mean/Median/Mode",
-             "KNN Imputation",
-             "MICE Imputation",
              "Deep Learning Imputation"
          ], horizontal=True)
-
          if st.button(f"Apply {method}"):
              try:
                  original_df = df.copy()
@@ -348,7 +369,7 @@ elif app_mode == "Smart Cleaning":
      st.plotly_chart(px.histogram(df, x=df.duplicated(), title="Duplicate Distribution"))
      dup_strategy = st.radio("Duplicate Strategy", [
          "Remove All Duplicates",
-         "Keep First Occurrence",
          "Keep Last Occurrence"
      ])
      if st.button("Handle Duplicates"):
@@ -373,7 +394,7 @@ elif app_mode == "Smart Cleaning":
  with col2:
      col_to_convert = st.selectbox("Select column to convert", df.columns)
      new_type = st.selectbox("New Data Type", [
-         "String", "Integer", "Float",
          "Boolean", "Datetime", "Category"
      ])
      if st.button("Convert Data Type"):
@@ -404,27 +425,27 @@ elif app_mode == "Smart Cleaning":
  if st.button("💾 Save Cleaned Data"):
      st.session_state.cleaned_data = df
      st.balloons()
-
      # Generate comprehensive report
      from pandas_profiling import ProfileReport
      pr = ProfileReport(df, title="Cleaned Data Report")
      st_profile_report(pr)
-
      # Show cleaning log with diffs
      st.subheader("📝 Cleaning Log")
      st.table(pd.DataFrame({
          "Step": range(1, len(cleaning_actions)+1),
          "Action": cleaning_actions
      }))
-
      # Show dataset comparison
      col1, col2 = st.columns(2)
      with col1:
          st.write("Original Data Shape:", st.session_state.raw_data.shape)
      with col2:
          st.write("Cleaned Data Shape:", df.shape)
-
-     st.success("✅ Cleaned data saved successfully! You can now proceed to analysis.")
elif app_mode == "Advanced EDA":
    st.title("🔍 Advanced Exploratory Data Analysis")
    st.markdown("""
@@ -458,7 +479,7 @@ elif app_mode == "Advanced EDA":

  with col1:
      st.header("📊 Visualization Setup")
-
      # Plot Type Selection
      plot_types = {
          "Distribution": ["Histogram", "Box Plot", "Violin Plot", "Density Plot"],
@@ -466,7 +487,7 @@ elif app_mode == "Advanced EDA":
          "Comparison": ["Bar Chart", "Pie Chart", "Parallel Coordinates"],
          "3D": ["3D Scatter", "3D Surface"]
      }
-
      selected_category = st.selectbox("Plot Category", list(plot_types.keys()))
      st.session_state.eda_config['plot_type'] = st.selectbox(
          "Plot Type",
@@ -475,28 +496,28 @@ elif app_mode == "Advanced EDA":

      # Dynamic Column Selectors
      plot_type = st.session_state.eda_config['plot_type']
-
      if plot_type in ["Histogram", "Box Plot", "Violin Plot", "Density Plot", "Bar Chart", "Pie Chart"]:
          st.session_state.eda_config['x_col'] = st.selectbox(
              "X Axis",
              df.columns,
-             index=df.columns.get_loc(st.session_state.eda_config['x_col'])
              if st.session_state.eda_config['x_col'] in df.columns else 0
          )
-
      if plot_type in ["Scatter Plot", "Line Plot", "Box Plot", "Violin Plot", "Density Plot"]:
          st.session_state.eda_config['y_col'] = st.selectbox(
              "Y Axis",
              df.columns,
-             index=df.columns.get_loc(st.session_state.eda_config['y_col'])
              if st.session_state.eda_config['y_col'] in df.columns else 0
          )
-
      if plot_type in ["3D Scatter", "3D Surface"]:
          st.session_state.eda_config['z_col'] = st.selectbox(
              "Z Axis",
              df.columns,
-             index=df.columns.get_loc(st.session_state.eda_config['z_col'])
              if st.session_state.eda_config['z_col'] in df.columns else 0
          )
@@ -537,92 +558,92 @@ elif app_mode == "Advanced EDA":
  with col2:
      st.header("📈 Visualization")
      config = st.session_state.eda_config
-
      @st.cache_data(ttl=300)
      def generate_plot(df, plot_type, config):
          """Cached plot generation function for better performance"""
          try:
              if plot_type == "Histogram":
                  return px.histogram(
-                     df, x=config['x_col'],
                      color=config['color_col'],
                      nbins=30,
                      color_discrete_sequence=[config['color_palette']]
                  )
-
              elif plot_type == "Scatter Plot":
                  return px.scatter(
                      df, x=config['x_col'], y=config['y_col'],
                      color=config['color_col'],
                      hover_data=config['hover_data_cols']
                  )
-
              elif plot_type == "Box Plot":
                  return px.box(
                      df, x=config['x_col'], y=config['y_col'],
                      color=config['color_col']
                  )
-
              elif plot_type == "Violin Plot":
                  return px.violin(
                      df, x=config['x_col'], y=config['y_col'],
                      color=config['color_col'],
                      box=True
                  )
-
              elif plot_type == "Heatmap":
                  numeric_df = df.select_dtypes(include=np.number)
                  corr = numeric_df.corr()
                  return px.imshow(
-                     corr,
                      text_auto=True,
                      color_continuous_scale=config['color_palette']
                  )
-
              elif plot_type == "3D Scatter":
                  return px.scatter_3d(
                      df, x=config['x_col'], y=config['y_col'], z=config['z_col'],
                      color=config['color_col']
                  )
-
              elif plot_type == "Bar Chart":
                  return px.bar(
                      df, x=config['x_col'], y=config['y_col'],
                      color=config['color_col']
                  )
-
              elif plot_type == "Pie Chart":
                  return px.pie(
                      df, names=config['x_col'], values=config['y_col'],
                      color_discrete_sequence=[config['color_palette']]
                  )
-
              elif plot_type == "Line Plot":
                  return px.line(
                      df, x=config['x_col'], y=config['y_col'],
                      color=config['color_col']
                  )
-
              elif plot_type == "Pair Plot":
                  numeric_cols = df.select_dtypes(include=np.number).columns
                  return px.scatter_matrix(
                      df[numeric_cols],
                      color=config['color_col']
                  )
-
              elif plot_type == "Parallel Coordinates":
                  numeric_df = df.select_dtypes(include=np.number)
                  return px.parallel_coordinates(
                      numeric_df,
                      color_continuous_scale=config['color_palette']
                  )
-
              elif plot_type == "Density Plot":
                  return px.density_contour(
                      df, x=config['x_col'], y=config['y_col'],
                      color=config['color_col']
                  )
-
          except Exception as e:
              st.error(f"Plot generation error: {str(e)}")
              return None
@@ -631,18 +652,18 @@ elif app_mode == "Advanced EDA":
      fig = generate_plot(df, plot_type, config)
      if fig:
          st.plotly_chart(fig, use_container_width=True)
-
      # Plot Statistics
      with st.expander("📊 Plot Statistics"):
          if plot_type in ["Histogram", "Box Plot", "Violin Plot"]:
              st.write(f"**{config['x_col']} Statistics**")
              st.table(df[config['x_col']].describe())
-
          if plot_type in ["Scatter Plot", "Line Plot"]:
              st.write(f"**Correlation between {config['x_col']} and {config['y_col']}**")
              corr = df[[config['x_col'], config['y_col']]].corr().iloc[0,1]
              st.metric("Pearson Correlation", f"{corr:.2f}")
-
          if plot_type == "Heatmap":
              st.write("**Correlation Matrix**")
              numeric_df = df.select_dtypes(include=np.number)
@@ -656,16 +677,16 @@ elif app_mode == "Advanced EDA":
          st.write("**Data Shape**")
          st.write(f"Rows: {df.shape[0]}")
          st.write(f"Columns: {df.shape[1]}")
-
      with col2:
          st.write("**Data Types**")
          st.dataframe(df.dtypes.reset_index().rename(columns={
              'index': 'Column', 0: 'Type'
          }))
-
      st.write("**Sample Data**")
      st.dataframe(df.head())
-
# Model Training Section
elif app_mode == "Model Training":
    st.title("🚂 Model Training Studio")
@@ -714,42 +735,45 @@ elif app_mode == "Model Training":
  model_options = ["Logistic Regression", "Decision Tree", "Random Forest", "Gradient Boosting", "SVM", "Neural Network", "KNN", "Naive Bayes"]
  model_name = st.selectbox("Select Model", model_options, help="Choose a model.")

- # Hyperparameter Tuning
- st.subheader("🎛️ Hyperparameter Tuning")
- with st.expander("Configure Hyperparameters", expanded=True):
-     if model_name == "Random Forest":
-         n_estimators = st.slider("Number of Estimators", 10, 200, 100)
-         max_depth = st.slider("Max Depth", 3, 20, 10)
-         min_samples_split = st.slider("Min Samples Split", 2, 10, 2)
-         min_samples_leaf = st.slider("Min Samples Leaf", 1, 10, 1)
-         hyperparams = {
-             'n_estimators': n_estimators,
-             'max_depth': max_depth,
-             'min_samples_split': min_samples_split,
-             'min_samples_leaf': min_samples_leaf
-         }
-     elif model_name == "Gradient Boosting":
-         learning_rate = st.slider("Learning Rate", 0.01, 1.0, 0.1)
-         n_estimators = st.slider("Number of Estimators", 10, 200, 100)
-         max_depth = st.slider("Max Depth", 3, 20, 10)
-         hyperparams = {
-             'learning_rate': learning_rate,
-             'n_estimators': n_estimators,
-             'max_depth': max_depth
-         }
-     elif model_name == "Neural Network":
-         hidden_layers = st.slider("Number of Hidden Layers", 1, 5, 2)
-         neurons_per_layer = st.slider("Neurons per Layer", 10, 200, 50)
-         epochs = st.slider("Epochs", 10, 200, 50)
-         batch_size = st.slider("Batch Size", 16, 128, 32)
-         hyperparams = {
-             'hidden_layers': hidden_layers,
-             'neurons_per_layer': neurons_per_layer,
-             'epochs': epochs,
-             'batch_size': batch_size
-         }
-     else:
-         hyperparams = {}

  # Train-Test Split
  st.subheader("✂️ Train-Test Split")
@@ -819,39 +843,69 @@ elif app_mode == "Model Training":
      else:
          model = SVC()
  elif model_name == "Neural Network":
-     if problem_type == "Regression":
-         model = MLPRegressor(
-             hidden_layer_sizes=[hyperparams['neurons_per_layer']] * hyperparams['hidden_layers'],
-             max_iter=hyperparams['epochs'],
-             batch_size=hyperparams['batch_size']
-         )
-     else:
-         model = MLPClassifier(
-             hidden_layer_sizes=[hyperparams['neurons_per_layer']] * hyperparams['hidden_layers'],
-             max_iter=hyperparams['epochs'],
-             batch_size=hyperparams['batch_size']
-         )
  elif model_name == "KNN":
      model = KNeighborsClassifier()
  elif model_name == "Naive Bayes":
      model = GaussianNB()

  # Train the model
- model.fit(X_train_processed, y_train)

  # Store model and preprocessor
  st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
  st.session_state.preprocessor = preprocessor

- # Store the test data
  st.session_state.X_train_selected = X_train_processed
  st.session_state.X_test_selected = X_test_processed
  st.session_state.y_train = y_train
  st.session_state.y_test = y_test

  # Model Evaluation
- y_pred = model.predict(X_test_processed)
  if problem_type == "Regression":
      mse = mean_squared_error(y_test, y_pred)
      rmse = np.sqrt(mse)
      mae = mean_absolute_error(y_test, y_pred)
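The removed scikit-learn path above mapped the slider values onto MLPRegressor/MLPClassifier by repeating the layer width. A minimal standalone sketch of that mapping, with the slider values hard-coded for illustration:

    from sklearn.neural_network import MLPRegressor

    # hidden_layers=2, neurons_per_layer=50 -> hidden_layer_sizes of (50, 50)
    hyperparams = {'hidden_layers': 2, 'neurons_per_layer': 50, 'epochs': 50, 'batch_size': 32}
    model = MLPRegressor(
        hidden_layer_sizes=[hyperparams['neurons_per_layer']] * hyperparams['hidden_layers'],
        max_iter=hyperparams['epochs'],       # note: sklearn counts solver iterations, not true epochs
        batch_size=hyperparams['batch_size'],
    )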
@@ -860,7 +914,11 @@ elif app_mode == "Model Training":
      st.write(f"Root Mean Squared Error: {rmse:.4f}")
      st.write(f"Mean Absolute Error: {mae:.4f}")
      st.write(f"R-squared: {r2:.4f}")
- else:
      accuracy = accuracy_score(y_test, y_pred)
      precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
      recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
@@ -871,6 +929,10 @@ elif app_mode == "Model Training":
      st.write(f"F1 Score: {f1:.4f}")
      st.write("Classification Report:")
      st.text(classification_report(y_test, y_pred))

      # Visualization
      st.subheader("📊 Model Performance Visualization")
@@ -882,7 +944,33 @@ elif app_mode == "Model Training":
      ax.set_ylabel('Predicted')
      ax.set_title('Actual vs Predicted')
      st.pyplot(fig)
- else:
      conf_matrix = confusion_matrix(y_test, y_pred)
      fig, ax = plt.subplots()
      sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
@@ -890,7 +978,6 @@ elif app_mode == "Model Training":
      ax.set_ylabel('True Labels')
      ax.set_title('Confusion Matrix')
      st.pyplot(fig)
-
  st.success("Model trained successfully!")
  except Exception as e:
      st.error(f"An error occurred during training: {e}")
@@ -908,180 +995,6 @@ elif app_mode == "Model Training":
      st.warning("No trained model available. Train a model first to enable saving.")


- # Visualization Lab Section
- elif app_mode == "Visualization Lab":
-     st.title("🔬 Visualization Lab")
-     st.markdown("""
-     **Explore and Visualize Your Data** with advanced plotting tools and interactive visualizations.
-     Uncover hidden patterns and relationships in your data.
-     """)
-
-     if 'cleaned_data' not in st.session_state or st.session_state.cleaned_data is None:
-         st.warning("Please clean your data in the Smart Cleaning section first.")
-         st.stop()
-
-     df = st.session_state.cleaned_data.copy()
-
-     # Visualization Type Selection
-     st.subheader("📊 Choose Visualization Type")
-     plot_types = [
-         "Histogram", "Scatter Plot", "Box Plot", "Violin Plot",
-         "Correlation Heatmap", "Parallel Coordinates", "Andrews Curves",
-         "Pair Plot", "Density Contour", "3D Scatter", "Time Series",
-         "Sunburst Chart", "Funnel Chart", "Clustering Analysis"
-     ]
-     plot_type = st.selectbox("Select Visualization Type", plot_types)
-
-     # Dynamic Controls Based on Plot Type
-     if plot_type != "Correlation Heatmap":
-         x_col = st.selectbox("X Axis", df.columns)
-
-     if plot_type in ["Scatter Plot", "Box Plot", "Violin Plot", "Time Series", "3D Scatter", "Histogram"]:
-         y_col = st.selectbox("Y Axis", df.columns)
-
-     if plot_type == "3D Scatter":
-         z_col = st.selectbox("Z Axis", df.columns)
-     color_col = st.selectbox("Color by", [None] + list(df.columns))
-
-     # Advanced Plot Customization
-     with st.expander("🎨 Advanced Customization", expanded=False):
-         color_palette = st.selectbox("Color Palette", ["Viridis", "Plasma", "Magma", "Cividis", "RdBu", "Rainbow"])
-         hover_data_cols = st.multiselect("Hover Data", df.columns)
-
-     # Plot Generation
-     try:
-         fig = None
-
-         if plot_type == "Histogram":
-             fig = px.histogram(
-                 df, x=x_col, y=y_col,
-                 nbins=30, template="plotly_dark",
-                 color_discrete_sequence=[color_palette]
-             )
-
-         elif plot_type == "Scatter Plot":
-             fig = px.scatter(
-                 df, x=x_col, y=y_col,
-                 color=color_col,
-                 size=hover_data_cols,
-                 hover_data=hover_data_cols
-             )
-
-         elif plot_type == "3D Scatter":
-             fig = px.scatter_3d(
-                 df, x=x_col, y=y_col, z=z_col,
-                 color=color_col,
-                 color_discrete_sequence=[color_palette]
-             )
-
-         elif plot_type == "Correlation Heatmap":
-             numeric_df = df.select_dtypes(include=np.number)
-             if not numeric_df.empty:
-                 corr = numeric_df.corr()
-                 fig = px.imshow(
-                     corr, text_auto=True,
-                     color_continuous_scale=color_palette
-                 )
-             else:
-                 st.warning("No numerical columns found for correlation heatmap.")
-
-         elif plot_type == "Box Plot":
-             fig = px.box(
-                 df, x=x_col, y=y_col,
-                 color=color_col
-             )
-
-         elif plot_type == "Violin Plot":
-             fig = px.violin(
-                 df, x=x_col, y=y_col,
-                 box=True, points="all",
-                 color=color_col
-             )
-
-         elif plot_type == "Time Series":
-             df = df.sort_values(by=x_col)
-             fig = px.line(
-                 df, x=x_col, y=y_col,
-                 color=color_col
-             )
-
-         elif plot_type == "Scatter Matrix":
-             fig = px.scatter_matrix(
-                 df, dimensions=[x_col, y_col],
-                 color=color_col
-             )
-
-         if fig:
-             st.plotly_chart(fig, use_container_width=True)
-     except Exception as e:
-         st.error(f"An error occurred while generating the plot: {e}")
-
-     # Statistical Analysis Section
-     with st.expander("📊 Statistical Analysis", expanded=True):
-         analysis_type = st.selectbox("Select Analysis Type", [
-             "Descriptive Statistics",
-             "Correlation Analysis",
-             "Hypothesis Testing",
-             "Distribution Fitting"
-         ])
-
-         if analysis_type == "Descriptive Statistics":
-             st.write(df.describe(include='all'))
-
-         elif analysis_type == "Correlation Analysis":
-             numeric_cols = df.select_dtypes(include=np.number).columns
-             if len(numeric_cols) >= 2:
-                 corr_method = st.selectbox("Correlation Method", [
-                     "Pearson", "Kendall", "Spearman"
-                 ])
-                 corr_matrix = df[numeric_cols].corr(method=corr_method.lower())
-                 st.write(corr_matrix)
-                 st.heatmap(corr_matrix, annot=True, cmap=color_palette)
-             else:
-                 st.warning("Need at least 2 numeric columns for correlation analysis")
-
-         elif analysis_type == "Hypothesis Testing":
-             test_type = st.selectbox("Select Test Type", [
-                 "T-test", "Chi-Squared Test", "ANOVA", "Mann-Whitney U"
-             ])
-             if test_type == "T-test":
-                 col1 = st.selectbox("Column 1 (Numeric)", df.select_dtypes(include=np.number).columns)
-                 col2 = st.selectbox("Column 2 (Categorical)", df.select_dtypes(include='object').columns)
-                 if st.button("Run T-test"):
-                     groups = df.groupby(col2)[col1].apply(list)
-                     if len(groups) == 2:
-                         t_stat, p_value = stats.ttest_ind(groups.iloc[0], groups.iloc[1])
-                         st.write(f"T-statistic: {t_stat:.4f}")
-                         st.write(f"P-value: {p_value:.4f}")
-                         if p_value < 0.05:
-                             st.write("Reject the null hypothesis.")
-                         else:
-                             st.write("Fail to reject the null hypothesis.")
-                     else:
-                         st.write("Select a categorical column with exactly two categories.")
-
-         elif analysis_type == "Distribution Fitting":
-             numeric_col = st.selectbox("Select Numeric Column", df.select_dtypes(include=np.number).columns)
-             dist_types = ["Normal", "Log-Normal", "Exponential", "Gamma"]
-             selected_dist = st.selectbox("Select Distribution Type", dist_types)
-             if st.button("Fit Distribution"):
-                 from scipy.stats import norm, lognorm, expon, gamma
-                 dist_functions = {
-                     "Normal": norm,
-                     "Log-Normal": lognorm,
-                     "Exponential": expon,
-                     "Gamma": gamma
-                 }
-                 params = dist_functions[selected_dist].fit(df[numeric_col].dropna())
-                 st.write(f"Fitted Parameters: {params}")
-
-     # Data Profiling Section
-     with st.expander("📝 Generate Full Data Profile", expanded=False):
-         if st.button("🚀 Generate Comprehensive Report"):
-             with st.spinner("Generating report..."):
-                 pr = ProfileReport(df, explorative=True)
-                 st_profile_report(pr)
-
  # Insights Section
  elif app_mode == "Insights":
      st.title("📊 Model Insights & Explainability")
1112
  'Feature': feature_names,
1113
  'Importance': importances
1114
  }).sort_values('Importance', ascending=False)
1115
-
1116
  fig, ax = plt.subplots()
1117
  sns.barplot(x='Importance', y='Feature', data=importance_df.head(10), ax=ax)
1118
  ax.set_title('Top 10 Feature Importances')
@@ -1125,22 +1038,44 @@ elif app_mode == "Insights":
  if st.checkbox("Calculate SHAP Values (Warning: May be slow for large datasets)"):
      try:
          import shap
-         explainer = shap.TreeExplainer(model)
-         shap_values = explainer.shap_values(st.session_state.X_test_selected)
-
-         # Summary Plot
-         st.write("### Summary Plot")
-         fig, ax = plt.subplots()
-         shap.summary_plot(shap_values, st.session_state.X_test_selected, feature_names=preprocessor.get_feature_names_out())
-         st.pyplot(fig)

-         # Force Plot for Individual Predictions
-         st.write("### Individual Prediction Explanation")
-         sample_idx = st.slider("Select Sample Index", 0, len(st.session_state.X_test_selected)-1, 0)
-         fig, ax = plt.subplots()
-         shap.force_plot(explainer.expected_value, shap_values[sample_idx], st.session_state.X_test_selected[sample_idx],
-                         feature_names=preprocessor.get_feature_names_out(), matplotlib=True, show=False)
-         st.pyplot(fig)
      except Exception as e:
          st.error(f"SHAP calculation failed: {e}")
@@ -1152,8 +1087,8 @@ elif app_mode == "Insights":
  from sklearn.inspection import PartialDependenceDisplay
  fig, ax = plt.subplots()
  PartialDependenceDisplay.from_estimator(
-     model, st.session_state.X_test_selected,
-     features=[feature_to_plot],
      feature_names=preprocessor.get_feature_names_out(),
      ax=ax
  )
@@ -1167,14 +1102,14 @@ elif app_mode == "Insights":
      'metric': [],
      'value': []
  }
-
  if hasattr(model, 'predict'):
      y_pred = model.predict(st.session_state.X_test_selected)
      mse = mean_squared_error(st.session_state.y_test, y_pred)
      performance_history['timestamp'].append(datetime.now())
      performance_history['metric'].append('MSE')
      performance_history['value'].append(mse)
-
  performance_df = pd.DataFrame(performance_history)
  st.line_chart(performance_df.set_index('timestamp'))
@@ -1203,7 +1138,6 @@ elif app_mode == "Insights":
      st.success("Insights exported successfully!")
  except Exception as e:
      st.error(f"Export failed: {e}")
-

# Predictions Section
elif app_mode == "Predictions":
@@ -1236,9 +1170,9 @@ elif app_mode == "Predictions":
  input_df = pd.DataFrame([input_data])
  input_processed = preprocessor.transform(input_df)
  prediction = model.predict(input_processed)[0]
-
  st.write(f"**Prediction:** {prediction}")
-
  if hasattr(model, 'predict_proba'):
      probabilities = model.predict_proba(input_processed)[0]
      st.write("**Prediction Probabilities:**")
@@ -1248,14 +1182,20 @@ elif app_mode == "Predictions":
  if st.checkbox("Show SHAP Explanation"):
      try:
          import shap
-         explainer = shap.TreeExplainer(model)
-         shap_values = explainer.shap_values(input_processed)
-
          st.write("### SHAP Values")
          fig, ax = plt.subplots()
-         shap.force_plot(explainer.expected_value, shap_values, input_processed,
                          feature_names=feature_names, matplotlib=True, show=False)
          st.pyplot(fig)
      except Exception as e:
          st.error(f"SHAP calculation failed: {e}")
@@ -1328,243 +1268,5 @@ elif app_mode == "Predictions":
          pdf.cell(200, 10, txt=f"Problem Type: {'Regression' if hasattr(model, 'predict') else 'Classification'}", ln=True)
          pdf.output("predictions_report.pdf")
          st.success("Predictions exported successfully!")
-     except Exception as e:
-         st.error(f"Export failed: {e}")
-
- # Neural Network Studio Section
- elif app_mode == "Neural Network Studio":
-     st.title("🧠 Neural Network Studio")
-     st.markdown("""
-     **Build and Train Neural Networks** with advanced configurations and visualizations.
-     Explore deep learning models with ease.
-     """)
-
-     if 'cleaned_data' not in st.session_state or st.session_state.cleaned_data is None:
-         st.warning("Please clean your data in the Smart Cleaning section first.")
-         st.stop()
-
-     df = st.session_state.cleaned_data.copy()
-
-     # Target Variable Selection
-     st.subheader("🎯 Target Variable")
-     target_column = st.selectbox("Select Target Variable", df.columns, help="Choose the column to predict.")
-
-     # Problem Type Selection
-     st.subheader("📝 Problem Type")
-     problem_type = st.radio("Select Problem Type", ["Regression", "Classification"], help="Choose the type of machine learning problem.")
-
-     # Feature Selection
-     st.subheader("🔧 Feature Selection")
-     use_all_features = st.checkbox("Use All Features", value=True, help="Select to use all features for training. Deselect to manually choose features.")
-     if use_all_features:
-         feature_columns = df.drop(columns=[target_column]).columns.tolist()
-     else:
-         feature_columns = st.multiselect("Select Feature Columns", df.drop(columns=[target_column]).columns, help="Choose the features you want to use for prediction.")
-
-     # Neural Network Configuration
-     st.subheader("⚙️ Neural Network Configuration")
-     with st.expander("Configure Neural Network", expanded=True):
-         col1, col2 = st.columns(2)
-         with col1:
-             hidden_layers = st.slider("Number of Hidden Layers", 1, 5, 2)
-             neurons_per_layer = st.slider("Neurons per Layer", 10, 200, 50)
-             activation = st.selectbox("Activation Function",
-                                       ["relu", "tanh", "sigmoid", "selu", "swish"])
-             dropout_rate = st.slider("Dropout Rate", 0.0, 0.5, 0.2)
-             initializer = st.selectbox("Weight Initializer",
-                                        ["glorot_uniform", "he_normal", "lecun_uniform"])
-
-         with col2:
-             learning_rate = st.slider("Learning Rate", 0.0001, 0.1, 0.001, format="%.4f")
-             optimizer_choice = st.selectbox("Optimizer",
-                                             ["Adam", "Nadam", "RMSprop", "SGD"])
-             batch_norm = st.checkbox("Batch Normalization", value=True)
-             regularization = st.checkbox("L2 Regularization")
-             epochs = st.slider("Epochs", 10, 200, 50)
-             batch_size = st.slider("Batch Size", 16, 128, 32)
-
-     # Train-Test Split
-     st.subheader("✂️ Train-Test Split")
-     test_size = st.slider("Test Size", 0.1, 0.5, 0.2, help="Proportion of the dataset to include in the test split.")
-
-     # Model Training
-     if st.button("🚀 Train Neural Network"):
-         with st.spinner("Training neural network..."):
-             try:
-                 X = df[feature_columns]
-                 y = df[target_column]
-
-                 # Train-Test Split
-                 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
-
-                 # Preprocessing Pipeline
-                 numeric_features = X.select_dtypes(include=np.number).columns
-                 categorical_features = X.select_dtypes(exclude=np.number).columns
-
-                 numeric_transformer = Pipeline(steps=[
-                     ('imputer', SimpleImputer(strategy='median')),
-                     ('scaler', StandardScaler())
-                 ])
-
-                 categorical_transformer = Pipeline(steps=[
-                     ('imputer', SimpleImputer(strategy='most_frequent')),
-                     ('onehot', OneHotEncoder(handle_unknown='ignore'))
-                 ])
-
-                 preprocessor = ColumnTransformer(
-                     transformers=[
-                         ('num', numeric_transformer, numeric_features),
-                         ('cat', categorical_transformer, categorical_features)
-                     ])
-
-                 X_train_processed = preprocessor.fit_transform(X_train)
-                 X_test_processed = preprocessor.transform(X_test)
-
-                 # Build neural network with advanced features
-                 model = keras.Sequential()
-                 model.add(layers.Input(shape=(X_train_processed.shape[1],)))
-
-                 for _ in range(hidden_layers):
-                     # Create configurable layers
-                     layer_config = {
-                         'units': neurons_per_layer,
-                         'activation': activation,
-                         'kernel_initializer': initializer
-                     }
-
-                     if regularization:
-                         layer_config['kernel_regularizer'] = keras.regularizers.l2(0.01)
-
-                     model.add(layers.Dense(**layer_config))
-
-                     if batch_norm:
-                         model.add(layers.BatchNormalization())
-
-                     if dropout_rate > 0:
-                         model.add(layers.Dropout(dropout_rate))
-
-                 # Output layer
-                 output_activation = 'linear' if problem_type == "Regression" else 'softmax'
-                 output_units = 1 if problem_type == "Regression" else len(np.unique(y_train))
-                 model.add(layers.Dense(output_units, activation=output_activation))
-
-                 # Configure optimizer
-                 optimizers = {
-                     "Adam": keras.optimizers.Adam(learning_rate=learning_rate),
-                     "Nadam": keras.optimizers.Nadam(learning_rate=learning_rate),
-                     "RMSprop": keras.optimizers.RMSprop(learning_rate=learning_rate),
-                     "SGD": keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9)
-                 }
-
-                 optimizer = optimizers[optimizer_choice]
-
-                 # Compile the model
-                 model.compile(optimizer=optimizer,
-                               loss='mse' if problem_type == "Regression" else 'sparse_categorical_crossentropy',
-                               metrics=['mae'] if problem_type == "Regression" else ['accuracy'])
-
-                 # Add callbacks section
-                 with st.expander("Advanced Training Options"):
-                     early_stopping = st.checkbox("Early Stopping", value=True)
-                     reduce_lr = st.checkbox("Reduce Learning Rate on Plateau")
-                     patience = st.slider("Patience Epochs", 5, 20, 10) if early_stopping else 0
-
-                 callbacks_list = []
-                 if early_stopping:
-                     callbacks_list.append(
-                         callbacks.EarlyStopping(patience=patience, restore_best_weights=True))
-                 if reduce_lr:
-                     callbacks_list.append(
-                         callbacks.ReduceLROnPlateau(factor=0.2, patience=patience//2))
-
-                 # Train the model with callbacks
-                 history = model.fit(
-                     X_train_processed, y_train,
-                     epochs=epochs,
-                     batch_size=batch_size,
-                     validation_split=0.2,
-                     callbacks=callbacks_list,
-                     verbose=0
-                 )
-
-                 # Store model and preprocessor
-                 st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
-                 st.session_state.preprocessor = preprocessor
-
-                 # Store the test data
-                 st.session_state.X_train_selected = X_train_processed
-                 st.session_state.X_test_selected = X_test_processed
-                 st.session_state.y_train = y_train
-                 st.session_state.y_test = y_test
-
-                 # Model Evaluation
-                 y_pred = model.predict(X_test_processed)
-
-                 # Post-processing for classification
-                 if problem_type == "Classification":
-                     y_pred = np.argmax(y_pred, axis=1)  # Convert probabilities to class labels
-
-                 if problem_type == "Regression":
-                     mse = mean_squared_error(y_test, y_pred)
-                     rmse = np.sqrt(mse)
-                     mae = mean_absolute_error(y_test, y_pred)
-                     r2 = r2_score(y_test, y_pred)
-                     st.write(f"Mean Squared Error: {mse:.4f}")
-                     st.write(f"Root Mean Squared Error: {rmse:.4f}")
-                     st.write(f"Mean Absolute Error: {mae:.4f}")
-                     st.write(f"R-squared: {r2:.4f}")
-                 else:
-                     accuracy = accuracy_score(y_test, y_pred)
-                     precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
-                     recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
-                     f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
-                     st.write(f"Accuracy: {accuracy:.4f}")
-                     st.write(f"Precision: {precision:.4f}")
-                     st.write(f"Recall: {recall:.4f}")
-                     st.write(f"F1 Score: {f1:.4f}")
-                     st.write("Classification Report:")
-                     st.text(classification_report(y_test, y_pred))
-
-                 # Visualization with multiple metrics
-                 st.subheader("📊 Training History")
-                 fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
-
-                 # Plot loss
-                 ax1.plot(history.history['loss'], label='Train Loss')
-                 ax1.plot(history.history['val_loss'], label='Validation Loss')
-                 ax1.set_title('Loss Evolution')
-                 ax1.set_xlabel('Epoch')
-                 ax1.set_ylabel('Loss')
-                 ax1.legend()
-
-                 # Plot accuracy/metric
-                 if problem_type == "Classification":
-                     ax2.plot(history.history['accuracy'], label='Train Accuracy')
-                     ax2.plot(history.history['val_accuracy'], label='Validation Accuracy')
-                     ax2.set_title('Accuracy Evolution')
-                     ax2.set_ylabel('Accuracy')
-                 else:
-                     ax2.plot(history.history['mae'], label='Train MAE')
-                     ax2.plot(history.history['val_mae'], label='Validation MAE')
-                     ax2.set_title('MAE Evolution')
-                     ax2.set_ylabel('MAE')
-
-                 ax2.set_xlabel('Epoch')
-                 ax2.legend()
-                 st.pyplot(fig)
-
-                 st.success("Neural network trained successfully!")
-             except Exception as e:
-                 st.error(f"An error occurred during training: {e}")
-
-     # Model Saving
-     if st.session_state.model is not None:
-         st.subheader("💾 Save Model")
-         model_filename = st.text_input("Enter Model Filename (without extension)", "neural_network")
-         if st.button("Save Model"):
-             try:
-                 # Save the entire Keras model including architecture and weights
-                 st.session_state.model.named_steps['model'].save(f"{model_filename}.h5")  # Saves as a HDF5 file
-                 st.success(f"Model saved as {model_filename}.h5")
-             except Exception as e:
-                 st.error(f"Error saving model: {e}")
+ import gradio as gr
+ import numpy as np
  import pandas as pd
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ import io
+ import os
+ from tensorflow.keras.models import Sequential
+ from tensorflow.keras.layers import Dense, Dropout
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
+ from sklearn.model_selection import train_test_split
+ import re
+ # Pydantic is now in pydantic-settings, fixed
+ from pydantic_settings import BaseSettings  # Fix: import from pydantic_settings
+ # pandas_profiling import and fix
+ from ydata_profiling import ProfileReport
+ from streamlit_pandas_profiling import st_profile_report
+
+ import streamlit as st
  import numpy as np
+ import pandas as pd
  import plotly.express as px
  from scipy import stats
  import plotly.colors as pc
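The comment in the hunk above refers to pydantic v2 moving BaseSettings into the separate pydantic-settings package. A minimal sketch of the imported class in use; the settings field below is hypothetical and not part of this commit:

    from pydantic_settings import BaseSettings

    class AppSettings(BaseSettings):
        # Hypothetical field mirroring the MAX_FILE_SIZE_MB constant; can be set via environment.
        max_file_size_mb: int = 250

    settings = AppSettings()  # field values can be overridden by environment variables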
 
  from sklearn.feature_selection import SelectKBest
  from sklearn.experimental import enable_iterative_imputer
  from sklearn.impute import IterativeImputer
+ from sklearn.neural_network import MLPRegressor, MLPClassifier
+ from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
  from sklearn.impute import KNNImputer, SimpleImputer
  from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder, LabelEncoder
  from sklearn.compose import ColumnTransformer
  from sklearn.pipeline import Pipeline

+ from datetime import datetime  # Import datetime

  # Enhanced configuration

      st.session_state.raw_data = None
  if 'cleaned_data' not in st.session_state:
      st.session_state.cleaned_data = None
+
+ if 'model' not in st.session_state:
+     st.session_state.model = None
+ if 'preprocessor' not in st.session_state:
+     st.session_state.preprocessor = None
+
  # Security: Set allowed file types
  ALLOWED_EXTENSIONS = {'csv', 'xlsx', 'parquet', 'feather'}
  MAX_FILE_SIZE_MB = 250  # 250MB limit

      """Comprehensive file validation"""
      if not file:
          return False, "No file uploaded"
+
      extension = file.name.split('.')[-1].lower()
      if extension not in ALLOWED_EXTENSIONS:
          return False, f"Unsupported file type: {extension}"
+
      file_size_mb = file.size / (1024 * 1024)
      if file_size_mb > MAX_FILE_SIZE_MB:
          return False, f"File size exceeds {MAX_FILE_SIZE_MB}MB limit"
+
      return True, ""

  @st.cache_data(ttl=3600, show_spinner="Analyzing data quality...")

          df = pd.read_parquet(uploaded_file)
      elif uploaded_file.name.endswith('.feather'):
          df = pd.read_feather(uploaded_file)
+
      st.session_state.raw_data = df
      st.success("Dataset loaded successfully!")
+
  except Exception as e:
      st.error(f"Error loading file: {str(e)}")
      st.stop()

  # Data Health Dashboard
  st.subheader("📊 Data Health Dashboard")
  report = enhanced_quality_report(df)
+
  col1, col2, col3, col4 = st.columns(4)
  col1.metric("Total Rows", report['basic_stats']['rows'])
  col2.metric("Total Columns", report['basic_stats']['columns'])

  with st.expander("🔍 Deep Column Analysis", expanded=True):
      selected_col = st.selectbox("Select column to inspect", df.columns)
      col_info = report['column_analysis'][selected_col]
+
      st.write(f"**Type:** {col_info['type']}")
      st.write(f"**Unique Values:** {col_info['unique']}")
      st.write(f"**Missing Values:** {col_info['missing']} ({col_info['missing']/len(df):.1%})")
+
      if pd.api.types.is_numeric_dtype(df[selected_col]):
          st.write("**Distribution:**")
          st.line_chart(df[selected_col])

          recommendations.append(f"⚠️ Consider dropping {col} (>{50}% missing)")
      if data['unique'] == len(df):
          recommendations.append(f"🔍 Investigate {col} - potential unique identifier")
+
  if recommendations:
      st.write("### Recommended Actions")
      for rec in recommendations[:5]:  # Show top 5

  # Advanced Profiling
  if st.button("🚀 Generate Full Data Profile"):
      with st.spinner("Generating comprehensive report..."):
+         pr = ProfileReport(df, explorative=True, title="Data Upload Report")  # Added title to pandas profiling
          st_profile_report(pr)

elif app_mode == "Smart Cleaning":
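The titled ProfileReport hunk above pairs ydata-profiling with streamlit-pandas-profiling. A minimal standalone sketch of that pairing; the toy DataFrame is illustrative:

    import pandas as pd
    from ydata_profiling import ProfileReport
    from streamlit_pandas_profiling import st_profile_report

    df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})  # toy data
    pr = ProfileReport(df, explorative=True, title="Data Upload Report")
    st_profile_report(pr)  # renders the interactive report inside the Streamlit app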
 
287
  st.caption(f"Current Version: {st.session_state.current_version+1}/{len(st.session_state.data_versions)}")
288
  progress = (st.session_state.current_version + 1) / len(st.session_state.data_versions)
289
  st.progress(progress)
290
+
291
  col1, col2 = st.columns(2)
292
  with col1:
293
  if st.button("⏮️ Undo Last Action", disabled=st.session_state.current_version == 0):
 
302
  st.subheader("📊 Data Health Dashboard")
303
  with st.expander("Show Comprehensive Data Report", expanded=True):
304
  from pandas_profiling import ProfileReport
305
+ pr = ProfileReport(df, title="Smart Cleaning Data Report") # Add title to pandas profiling report
306
  st_profile_report(pr)
307
 
308
  # Enhanced Health Summary with Cards
 
322
  st.markdown("### 📈 Data Health Visualizations")
323
  col1, col2 = st.columns(2)
324
  with col1:
325
+ st.plotly_chart(px.bar(df.isna().sum(), title="Missing Values per Column",
326
+ labels={'index': 'Column', 'value': 'Missing Count'},
327
  color=df.isna().sum(), color_continuous_scale="Bluered"))
328
  with col2:
329
+ st.plotly_chart(px.pie(values=df.dtypes.value_counts(), names=df.dtypes.value_counts().index,
330
  title="Data Type Distribution", hole=0.3))
331
 
332
  # Cleaning Operations with Tabs
 
340
  if missing_cols:
341
  st.write("Columns with missing values:")
342
  cols = st.multiselect("Select columns to clean", missing_cols, default=missing_cols)
343
+
344
  method = st.radio("Imputation Method", [
345
+ "Drop Missing",
346
+ "Mean/Median/Mode",
347
+ "KNN Imputation",
348
+ "MICE Imputation",
349
  "Deep Learning Imputation"
350
  ], horizontal=True)
351
+
352
  if st.button(f"Apply {method}"):
353
  try:
354
  original_df = df.copy()
 
369
  st.plotly_chart(px.histogram(df, x=df.duplicated(), title="Duplicate Distribution"))
370
  dup_strategy = st.radio("Duplicate Strategy", [
371
  "Remove All Duplicates",
372
+ "Keep First Occurrence",
373
  "Keep Last Occurrence"
374
  ])
375
  if st.button("Handle Duplicates"):
 
394
  with col2:
395
  col_to_convert = st.selectbox("Select column to convert", df.columns)
396
  new_type = st.selectbox("New Data Type", [
397
+ "String", "Integer", "Float",
398
  "Boolean", "Datetime", "Category"
399
  ])
400
  if st.button("Convert Data Type"):
 
425
  if st.button("💾 Save Cleaned Data"):
426
  st.session_state.cleaned_data = df
427
  st.balloons()
428
+
429
  # Generate comprehensive report
430
  from pandas_profiling import ProfileReport
431
  pr = ProfileReport(df, title="Cleaned Data Report")
432
  st_profile_report(pr)
433
+
434
  # Show cleaning log with diffs
435
  st.subheader("📝 Cleaning Log")
436
  st.table(pd.DataFrame({
437
  "Step": range(1, len(cleaning_actions)+1),
438
  "Action": cleaning_actions
439
  }))
440
+
441
  # Show dataset comparison
442
  col1, col2 = st.columns(2)
443
  with col1:
444
  st.write("Original Data Shape:", st.session_state.raw_data.shape)
445
  with col2:
446
  st.write("Cleaned Data Shape:", df.shape)
447
+
448
+ st.success("✅ Cleaned data saved successfully! You can now proceed to analysis.")
449
  elif app_mode == "Advanced EDA":
450
  st.title("🔍 Advanced Exploratory Data Analysis")
451
  st.markdown("""
 
479
 
480
  with col1:
481
  st.header("📊 Visualization Setup")
482
+
483
  # Plot Type Selection
484
  plot_types = {
485
  "Distribution": ["Histogram", "Box Plot", "Violin Plot", "Density Plot"],
 
487
  "Comparison": ["Bar Chart", "Pie Chart", "Parallel Coordinates"],
488
  "3D": ["3D Scatter", "3D Surface"]
489
  }
490
+
491
  selected_category = st.selectbox("Plot Category", list(plot_types.keys()))
492
  st.session_state.eda_config['plot_type'] = st.selectbox(
493
  "Plot Type",
 
496
 
497
  # Dynamic Column Selectors
498
  plot_type = st.session_state.eda_config['plot_type']
499
+
500
  if plot_type in ["Histogram", "Box Plot", "Violin Plot", "Density Plot", "Bar Chart", "Pie Chart"]:
501
  st.session_state.eda_config['x_col'] = st.selectbox(
502
  "X Axis",
503
  df.columns,
504
+ index=df.columns.get_loc(st.session_state.eda_config['x_col'])
505
  if st.session_state.eda_config['x_col'] in df.columns else 0
506
  )
507
+
508
  if plot_type in ["Scatter Plot", "Line Plot", "Box Plot", "Violin Plot", "Density Plot"]:
509
  st.session_state.eda_config['y_col'] = st.selectbox(
510
  "Y Axis",
511
  df.columns,
512
+ index=df.columns.get_loc(st.session_state.eda_config['y_col'])
513
  if st.session_state.eda_config['y_col'] in df.columns else 0
514
  )
515
+
516
  if plot_type in ["3D Scatter", "3D Surface"]:
517
  st.session_state.eda_config['z_col'] = st.selectbox(
518
  "Z Axis",
519
  df.columns,
520
+ index=df.columns.get_loc(st.session_state.eda_config['z_col'])
521
  if st.session_state.eda_config['z_col'] in df.columns else 0
522
  )
523
 
 
558
  with col2:
559
  st.header("📈 Visualization")
560
  config = st.session_state.eda_config
561
+
562
  @st.cache_data(ttl=300)
563
  def generate_plot(df, plot_type, config):
564
  """Cached plot generation function for better performance"""
565
  try:
566
  if plot_type == "Histogram":
567
  return px.histogram(
568
+ df, x=config['x_col'],
569
  color=config['color_col'],
570
  nbins=30,
571
  color_discrete_sequence=[config['color_palette']]
572
  )
573
+
574
  elif plot_type == "Scatter Plot":
575
  return px.scatter(
576
  df, x=config['x_col'], y=config['y_col'],
577
  color=config['color_col'],
578
  hover_data=config['hover_data_cols']
579
  )
580
+
581
  elif plot_type == "Box Plot":
582
  return px.box(
583
  df, x=config['x_col'], y=config['y_col'],
584
  color=config['color_col']
585
  )
586
+
587
  elif plot_type == "Violin Plot":
588
  return px.violin(
589
  df, x=config['x_col'], y=config['y_col'],
590
  color=config['color_col'],
591
  box=True
592
  )
593
+
594
  elif plot_type == "Heatmap":
595
  numeric_df = df.select_dtypes(include=np.number)
596
  corr = numeric_df.corr()
597
  return px.imshow(
598
+ corr,
599
  text_auto=True,
600
  color_continuous_scale=config['color_palette']
601
  )
602
+
603
  elif plot_type == "3D Scatter":
604
  return px.scatter_3d(
605
  df, x=config['x_col'], y=config['y_col'], z=config['z_col'],
606
  color=config['color_col']
607
  )
608
+
609
  elif plot_type == "Bar Chart":
610
  return px.bar(
611
  df, x=config['x_col'], y=config['y_col'],
612
  color=config['color_col']
613
  )
614
+
615
  elif plot_type == "Pie Chart":
616
  return px.pie(
617
  df, names=config['x_col'], values=config['y_col'],
618
  color_discrete_sequence=[config['color_palette']]
619
  )
620
+
621
  elif plot_type == "Line Plot":
622
  return px.line(
623
  df, x=config['x_col'], y=config['y_col'],
624
  color=config['color_col']
625
  )
626
+
627
  elif plot_type == "Pair Plot":
628
  numeric_cols = df.select_dtypes(include=np.number).columns
629
  return px.scatter_matrix(
630
  df[numeric_cols],
631
  color=config['color_col']
632
  )
633
+
634
  elif plot_type == "Parallel Coordinates":
635
  numeric_df = df.select_dtypes(include=np.number)
636
  return px.parallel_coordinates(
637
  numeric_df,
638
  color_continuous_scale=config['color_palette']
639
  )
640
+
641
  elif plot_type == "Density Plot":
642
  return px.density_contour(
643
  df, x=config['x_col'], y=config['y_col'],
644
  color=config['color_col']
645
  )
646
+
647
  except Exception as e:
648
  st.error(f"Plot generation error: {str(e)}")
649
  return None
 
652
  fig = generate_plot(df, plot_type, config)
653
  if fig:
654
  st.plotly_chart(fig, use_container_width=True)
655
+
656
  # Plot Statistics
657
  with st.expander("📊 Plot Statistics"):
658
  if plot_type in ["Histogram", "Box Plot", "Violin Plot"]:
659
  st.write(f"**{config['x_col']} Statistics**")
660
  st.table(df[config['x_col']].describe())
661
+
662
  if plot_type in ["Scatter Plot", "Line Plot"]:
663
  st.write(f"**Correlation between {config['x_col']} and {config['y_col']}**")
664
  corr = df[[config['x_col'], config['y_col']]].corr().iloc[0,1]
665
  st.metric("Pearson Correlation", f"{corr:.2f}")
666
+
667
  if plot_type == "Heatmap":
668
  st.write("**Correlation Matrix**")
669
  numeric_df = df.select_dtypes(include=np.number)
 
677
  st.write("**Data Shape**")
678
  st.write(f"Rows: {df.shape[0]}")
679
  st.write(f"Columns: {df.shape[1]}")
680
+
681
  with col2:
682
  st.write("**Data Types**")
683
  st.dataframe(df.dtypes.reset_index().rename(columns={
684
  'index': 'Column', 0: 'Type'
685
  }))
686
+
687
  st.write("**Sample Data**")
688
  st.dataframe(df.head())
689
+
690
  # Model Training Section
691
  elif app_mode == "Model Training":
692
  st.title("🚂 Model Training Studio")
 
735
  model_options = ["Logistic Regression", "Decision Tree", "Random Forest", "Gradient Boosting", "SVM", "Neural Network", "KNN", "Naive Bayes"]
736
  model_name = st.selectbox("Select Model", model_options, help="Choose a model.")
737
 
738
+ elif model_name == "Gradient Boosting":
739
+ learning_rate = st.slider("Learning Rate", 0.01, 1.0, 0.1)
740
+ n_estimators = st.slider("Number of Estimators", 10, 200, 100)
741
+ max_depth = st.slider("Max Depth", 3, 20, 10)
742
+ hyperparams = {
743
+ 'learning_rate': learning_rate,
744
+ 'n_estimators': n_estimators,
745
+ 'max_depth': max_depth
746
+ }
747
+ elif model_name == "Neural Network":
748
+ hidden_layers = st.slider("Number of Hidden Layers", 1, 5, 2)
749
+ neurons_per_layer = st.slider("Neurons per Layer", 10, 200, 50)
750
+ activation = st.selectbox("Activation Function",
751
+ ["relu", "tanh", "sigmoid", "selu", "swish"])
752
+ dropout_rate = st.slider("Dropout Rate", 0.0, 0.5, 0.2)
753
+ initializer = st.selectbox("Weight Initializer",
754
+ ["glorot_uniform", "he_normal", "lecun_uniform"])
755
+ learning_rate = st.slider("Learning Rate", 0.0001, 0.1, 0.001, format="%.4f")
756
+ optimizer_choice = st.selectbox("Optimizer",
757
+ ["Adam", "Nadam", "RMSprop", "SGD"])
758
+ batch_norm = st.checkbox("Batch Normalization", value=True)
759
+ regularization = st.checkbox("L2 Regularization")
760
+ epochs = st.slider("Epochs", 10, 200, 50)
761
+ batch_size = st.slider("Batch Size", 16, 128, 32)
762
+ hyperparams = {
763
+ 'hidden_layers': hidden_layers,
764
+ 'neurons_per_layer': neurons_per_layer,
765
+ 'activation': activation,
766
+ 'dropout_rate': dropout_rate,
767
+ 'initializer': initializer,
768
+ 'learning_rate': learning_rate,
769
+ 'optimizer_choice': optimizer_choice,
770
+ 'batch_norm': batch_norm,
771
+ 'regularization': regularization,
772
+ 'epochs': epochs,
773
+ 'batch_size': batch_size,
774
+ }
775
+ else:
776
+ hyperparams = {}
777
 
778
  # Train-Test Split
779
  st.subheader("✂️ Train-Test Split")
 
843
  else:
844
  model = SVC()
845
  elif model_name == "Neural Network":
846
+ from tensorflow.keras.models import Sequential
847
+ from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
848
+ from tensorflow.keras.optimizers import Adam, Nadam, RMSprop, SGD
849
+
850
+ # Build a new model with the parameters
851
+ model = Sequential()
852
+ model.add(layers.Input(shape=(X_train_processed.shape[1],)))
853
+
854
+ for i in range(hyperparams['hidden_layers']):
855
+ model.add(Dense(hyperparams['neurons_per_layer'],
856
+ activation=hyperparams['activation'],
857
+ kernel_initializer=hyperparams['initializer']))
858
+ if hyperparams['batch_norm']:
859
+ model.add(BatchNormalization())
860
+ model.add(Dropout(hyperparams['dropout_rate']))
861
+
862
+ # Output layer
863
+ output_activation = 'linear' if problem_type == "Regression" else 'softmax'
864
+ output_units = 1 if problem_type == "Regression" else len(np.unique(y_train))
865
+ model.add(Dense(output_units, activation=output_activation))
866
+
867
+ # Configure optimizer
868
+ optimizers = {
869
+ "Adam": Adam(learning_rate=hyperparams['learning_rate']),
870
+ "Nadam": Nadam(learning_rate=hyperparams['learning_rate']),
871
+ "RMSprop": RMSprop(learning_rate=hyperparams['learning_rate']),
872
+ "SGD": SGD(learning_rate=hyperparams['learning_rate'], momentum=0.9)
873
+ }
874
+ optimizer = optimizers[hyperparams['optimizer_choice']]
875
+
876
+ model.compile(optimizer=optimizer,
877
+ loss='mse' if problem_type == "Regression" else 'sparse_categorical_crossentropy',
878
+ metrics=['mae'] if problem_type == "Regression" else ['accuracy'])
879
  elif model_name == "KNN":
880
+ from sklearn.neighbors import KNeighborsClassifier
881
  model = KNeighborsClassifier()
882
  elif model_name == "Naive Bayes":
883
+ from sklearn.naive_bayes import GaussianNB
884
  model = GaussianNB()
885
 
886
  # Train the model
887
+ if model_name == "Neural Network": # Only for the neural network
888
+ history = model.fit(X_train_processed, y_train,
889
+ epochs=hyperparams['epochs'],
890
+ batch_size=hyperparams['batch_size'],
891
+ validation_data=(X_test_processed, y_test),
892
+ verbose=0)
893
 
894
+ else:
895
+ model.fit(X_train_processed, y_train)
896
  # Store model and preprocessor
897
  st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
898
  st.session_state.preprocessor = preprocessor
899
 
900
+ # Store the test data for insights and predictions
901
  st.session_state.X_train_selected = X_train_processed
902
  st.session_state.X_test_selected = X_test_processed
903
  st.session_state.y_train = y_train
904
  st.session_state.y_test = y_test
905
 
906
  # Model Evaluation
 
907
  if problem_type == "Regression":
908
+ y_pred = model.predict(X_test_processed)
909
  mse = mean_squared_error(y_test, y_pred)
910
  rmse = np.sqrt(mse)
911
  mae = mean_absolute_error(y_test, y_pred)
 
  st.write(f"Root Mean Squared Error: {rmse:.4f}")
  st.write(f"Mean Absolute Error: {mae:.4f}")
  st.write(f"R-squared: {r2:.4f}")
+ else: # Classification
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
+ if model_name == "Neural Network": # the Keras model returns class probabilities
+ y_pred = np.argmax(model.predict(X_test_processed), axis=1)
+ else:
+ y_pred = model.predict(X_test_processed)
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
  recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
 
  st.write(f"F1 Score: {f1:.4f}")
  st.write("Classification Report:")
  st.text(classification_report(y_test, y_pred))
+ # Confusion matrix
+ st.write("Confusion Matrix:")
+ conf_matrix = confusion_matrix(y_test, y_pred)
+ st.write(conf_matrix)

  # Visualization
  st.subheader("📊 Model Performance Visualization")
 
  ax.set_ylabel('Predicted')
  ax.set_title('Actual vs Predicted')
  st.pyplot(fig)
+ elif model_name == "Neural Network":
+ fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
+ ax1.plot(history.history['loss'], label='Train Loss')
+ ax1.plot(history.history['val_loss'], label='Validation Loss')
+ ax1.set_title('Loss Evolution')
+ ax1.set_xlabel('Epoch')
+ ax1.set_ylabel('Loss')
+ ax1.legend()
+
+ # Plot accuracy/metric
+ if problem_type == "Classification":
+ ax2.plot(history.history['accuracy'], label='Train Accuracy')
+ ax2.plot(history.history['val_accuracy'], label='Validation Accuracy')
+ ax2.set_title('Accuracy Evolution')
+ ax2.set_ylabel('Accuracy')
+ else:
+ ax2.plot(history.history['mae'], label='Train MAE')
+ ax2.plot(history.history['val_mae'], label='Validation MAE')
+ ax2.set_title('MAE Evolution')
+ ax2.set_ylabel('MAE')
+
+ ax2.set_xlabel('Epoch')
+ ax2.legend()
+ st.pyplot(fig)
+
+ else: # Classification confusion matrix
+ from sklearn.metrics import confusion_matrix
  conf_matrix = confusion_matrix(y_test, y_pred)
  fig, ax = plt.subplots()
  sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)

  ax.set_ylabel('True Labels')
  ax.set_title('Confusion Matrix')
  st.pyplot(fig)
 
  st.success("Model trained successfully!")
  except Exception as e:
  st.error(f"An error occurred during training: {e}")

  st.warning("No trained model available. Train a model first to enable saving.")
  # Insights Section
  elif app_mode == "Insights":
  st.title("📊 Model Insights & Explainability")
 
  'Feature': feature_names,
  'Importance': importances
  }).sort_values('Importance', ascending=False)
+
  fig, ax = plt.subplots()
  sns.barplot(x='Importance', y='Feature', data=importance_df.head(10), ax=ax)
  ax.set_title('Top 10 Feature Importances')
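+ # Editorial sketch, not part of this commit: for estimators without
+ # feature_importances_, permutation importance is a model-agnostic fallback:
+ # from sklearn.inspection import permutation_importance
+ # result = permutation_importance(model, st.session_state.X_test_selected,
+ #                                 st.session_state.y_test, n_repeats=5)
+ # importances = result.importances_mean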
 
  if st.checkbox("Calculate SHAP Values (Warning: May be slow for large datasets)"):
  try:
  import shap

+ # Choose an explainer: TreeExplainer is fast for tree ensembles (heuristic:
+ # they expose feature_importances_); otherwise fall back to the model-agnostic
+ # KernelExplainer fitted on a small background sample of training rows
+ if hasattr(model, 'feature_importances_'):
+ explainer = shap.TreeExplainer(model)
+ else:
+ explainer = shap.KernelExplainer(model.predict, st.session_state.X_train_selected[:100, :])
+
+ shap_values = explainer.shap_values(st.session_state.X_test_selected)
+ feature_names = preprocessor.get_feature_names_out()
+
+ # Summary Plot (bar form for a cleaner visualization)
+ st.write("### Summary Plot")
+ fig, ax = plt.subplots()
+ shap.summary_plot(shap_values, features=st.session_state.X_test_selected, feature_names=feature_names, show=False, plot_type="bar")
+ st.pyplot(fig)
+
+ # Force Plot for Individual Predictions
+ # (for multiclass models shap_values is a list with one array per class)
+ st.write("### Individual Prediction Explanation")
+ sample_idx = st.slider("Select Sample Index", 0, len(st.session_state.X_test_selected) - 1, 0)
+ fig, ax = plt.subplots()
+ shap.force_plot(explainer.expected_value, shap_values[sample_idx], st.session_state.X_test_selected[sample_idx],
+ feature_names=feature_names, matplotlib=True, show=False)
+ st.pyplot(fig)
+
  except Exception as e:
  st.error(f"SHAP calculation failed: {e}")
  from sklearn.inspection import PartialDependenceDisplay
  fig, ax = plt.subplots()
  PartialDependenceDisplay.from_estimator(
+ model, st.session_state.X_test_selected,
+ features=[feature_to_plot],
  feature_names=preprocessor.get_feature_names_out(),
  ax=ax
  )
 
  'metric': [],
  'value': []
  }
+
+ # Note: performance_history is rebuilt on every rerun, so the chart below
+ # shows only the current evaluation snapshot
  if hasattr(model, 'predict'):
  y_pred = model.predict(st.session_state.X_test_selected)
  mse = mean_squared_error(st.session_state.y_test, y_pred)
  performance_history['timestamp'].append(datetime.now())
  performance_history['metric'].append('MSE')
  performance_history['value'].append(mse)
+
  performance_df = pd.DataFrame(performance_history)
  st.line_chart(performance_df.set_index('timestamp'))
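+ # Editorial sketch, not part of this commit: to accumulate metrics across
+ # reruns instead of a single snapshot, the history could live in session state:
+ # if 'perf_history' not in st.session_state:
+ #     st.session_state.perf_history = []
+ # st.session_state.perf_history.append(
+ #     {'timestamp': datetime.now(), 'metric': 'MSE', 'value': mse})
+ # history_df = pd.DataFrame(st.session_state.perf_history)
+ # st.line_chart(history_df.set_index('timestamp')['value'])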
  st.success("Insights exported successfully!")
  except Exception as e:
  st.error(f"Export failed: {e}")

  # Predictions Section
  elif app_mode == "Predictions":

  input_df = pd.DataFrame([input_data])
  input_processed = preprocessor.transform(input_df)
+ # Note: if the trained model is the Keras network, predict returns class
+ # probabilities and the result would need an argmax to yield a label
  prediction = model.predict(input_processed)[0]
+
  st.write(f"**Prediction:** {prediction}")
+
  if hasattr(model, 'predict_proba'):
  probabilities = model.predict_proba(input_processed)[0]
  st.write("**Prediction Probabilities:**")
 
  if st.checkbox("Show SHAP Explanation"):
  try:
  import shap
+ # Same explainer choice as in Insights: TreeExplainer for tree ensembles,
+ # otherwise the model-agnostic KernelExplainer on a background sample
+ if hasattr(model, 'feature_importances_'):
+ explainer = shap.TreeExplainer(model)
+ else:
+ explainer = shap.KernelExplainer(model.predict, st.session_state.X_train_selected[:100, :])
+ shap_values = explainer.shap_values(input_processed)
+
  st.write("### SHAP Values")
  fig, ax = plt.subplots()
+ shap.force_plot(explainer.expected_value, shap_values, input_processed,
  feature_names=feature_names, matplotlib=True, show=False)
  st.pyplot(fig)
+
  except Exception as e:
  st.error(f"SHAP calculation failed: {e}")

+ # classifiers expose predict_proba, regressors generally do not (heuristic)
+ pdf.cell(200, 10, txt=f"Problem Type: {'Classification' if hasattr(model, 'predict_proba') else 'Regression'}", ln=True)
  pdf.output("predictions_report.pdf")
  st.success("Predictions exported successfully!")
+ except Exception as e:
+ st.error(f"An unexpected error occurred: {e}")