CosmickVisions committed on
Commit 213f577 · verified · 1 Parent(s): b9d21cf

Update app.py

Files changed (1):
  1. app.py +255 -1293
app.py CHANGED
@@ -1,1360 +1,322 @@
-import gradio as gr
-import numpy as np
 import pandas as pd
-import matplotlib.pyplot as plt
-import seaborn as sns
-import io
-import os
-from tensorflow.keras.models import Sequential
-from tensorflow.keras.layers import Dense, Dropout
 from sklearn.preprocessing import StandardScaler, LabelEncoder
 from sklearn.model_selection import train_test_split
-import re
-# Pydantic is now in pydantic-settings, fixed
-from pydantic_settings import BaseSettings  # Fix: import from pydantic_settings
-# pandas_profiling import and fix
 from ydata_profiling import ProfileReport
 from streamlit_pandas_profiling import st_profile_report
-
-
-import streamlit as st
-import numpy as np
-import pandas as pd
-import plotly.express as px
-from scipy import stats
-import plotly.colors as pc
 import joblib
-from io import StringIO
-import requests
-import asyncio
-from io import BytesIO
-import base64
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers, callbacks
-from tensorflow.keras.utils import to_categorical
-from keras.models import Sequential
-from keras.layers import Dense
-import mimetypes
-import tensorflow
-import matplotlib.pyplot as plt
-from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, learning_curve
-from sklearn.linear_model import LinearRegression, LogisticRegression
-from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
-from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
-from sklearn.svm import SVR, SVC
-from sklearn.feature_selection import SelectKBest
-from sklearn.experimental import enable_iterative_imputer
-from sklearn.impute import IterativeImputer
-from sklearn.neural_network import MLPRegressor, MLPClassifier
-from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
-from sklearn.impute import KNNImputer, SimpleImputer
-from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder, LabelEncoder
-from sklearn.compose import ColumnTransformer
-from sklearn.pipeline import Pipeline
-
-from datetime import datetime  # Import datetime
-
 
-# Enhanced configuration
 st.set_page_config(
-    page_title="Executive Insights Pro",
     layout="wide",
-    page_icon="📈",
     initial_sidebar_state="expanded"
 )
-# Initial session state setup (at the top of your script)
 if 'raw_data' not in st.session_state:
     st.session_state.raw_data = None
 if 'cleaned_data' not in st.session_state:
     st.session_state.cleaned_data = None
-
 if 'model' not in st.session_state:
     st.session_state.model = None
-if 'preprocessor' not in st.session_state:
-    st.session_state.preprocessor = None
-
-# Security: Set allowed file types
-ALLOWED_EXTENSIONS = {'csv', 'xlsx', 'parquet', 'feather'}
-MAX_FILE_SIZE_MB = 250  # 250MB limit
-
-def validate_file(file):
-    """Comprehensive file validation"""
-    if not file:
-        return False, "No file uploaded"
 
-    extension = file.name.split('.')[-1].lower()
-    if extension not in ALLOWED_EXTENSIONS:
-        return False, f"Unsupported file type: {extension}"
-
-    file_size_mb = file.size / (1024 * 1024)
-    if file_size_mb > MAX_FILE_SIZE_MB:
-        return False, f"File size exceeds {MAX_FILE_SIZE_MB}MB limit"
-
-    return True, ""
-
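Note: the removed `validate_file` helper returns a `(bool, str)` tuple rather than raising, so callers branch on the flag before loading anything. A minimal sketch of that contract, using a hypothetical stand-in for Streamlit's `UploadedFile` (only `.name` and `.size` are consulted):

```python
from dataclasses import dataclass

@dataclass
class FakeUpload:
    # Stand-in for Streamlit's UploadedFile; only these two attributes are used.
    name: str
    size: int  # bytes

ok, msg = validate_file(FakeUpload("sales.csv", 12_000_000))
print(ok, msg)  # True ''
ok, msg = validate_file(FakeUpload("notes.txt", 1_000))
print(ok, msg)  # False 'Unsupported file type: txt'
```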
-@st.cache_data(ttl=3600, show_spinner="Analyzing data quality...")
-def enhanced_quality_report(df):
     """Generate comprehensive data quality report"""
     report = {
-        'basic_stats': {
             'rows': df.shape[0],
             'columns': df.shape[1],
-            'missing_values': df.isna().sum().sum(),
             'duplicates': df.duplicated().sum()
         },
-        'column_analysis': {},
-        'data_health_score': 100  # Starting score
     }
-
     for col in df.columns:
         col_report = {
             'type': str(df[col].dtype),
             'unique': df[col].nunique(),
             'missing': df[col].isna().sum(),
-            'samples': df[col].dropna().sample(3).tolist() if df[col].dtype == 'object' else []
         }
-
-        # Numeric specific checks
         if pd.api.types.is_numeric_dtype(df[col]):
             col_report.update({
                 'mean': df[col].mean(),
                 'std': df[col].std(),
-                'zeros': (df[col] == 0).sum(),
-                'negatives': (df[col] < 0).sum() if df[col].dtype != 'uint' else 0,
-                'outliers': detect_outliers(df[col])
-            })
-            report['data_health_score'] -= 2  # Deduct 2% per numeric column
-
-        # Categorical specific checks
-        if pd.api.types.is_string_dtype(df[col]):
-            col_report.update({
-                'top_value': df[col].mode()[0] if not df[col].empty else None,
-                'top_freq': df[col].value_counts().iloc[0]/len(df) if not df[col].empty else 0
             })
-        report['data_health_score'] -= 1  # Deduct 1% per string column
-
-        report['column_analysis'][col] = col_report
-    report['data_health_score'] = max(report['data_health_score'], 0)
-
     return report
 
-def detect_outliers(series):
-    """Detect outliers using IQR method"""
-    q1 = series.quantile(0.25)
-    q3 = series.quantile(0.75)
-    iqr = q3 - q1
-    return ((series < (q1 - 1.5 * iqr)) | (series > (q3 + 1.5 * iqr))).sum()
-
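Note: `detect_outliers` counts values outside the standard 1.5×IQR fences. A quick self-contained check of the same rule, assuming only pandas:

```python
import pandas as pd

s = pd.Series([1, 2, 2, 3, 3, 3, 4, 100])  # 100 is an obvious outlier
q1, q3 = s.quantile(0.25), s.quantile(0.75)
iqr = q3 - q1
# Flag anything beyond the 1.5*IQR fences on either side.
mask = (s < q1 - 1.5 * iqr) | (s > q3 + 1.5 * iqr)
print(int(mask.sum()))  # 1 -> same count detect_outliers(s) would return
```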
-# Define app_mode for navigation
-app_mode = st.sidebar.selectbox(
-    "Select Page",
-    ["Data Upload", "Smart Cleaning", "Advanced EDA", "Model Training", "Insights", "Predictions", "Neural Network Studio"],
-    help="Choose the section to navigate to."
-)
-
-
-# Initialize df globally
-df = pd.DataFrame()
-
-# --- Data Upload Page ---
-# Data Upload Page
 if app_mode == "Data Upload":
-    st.title("📥 Smart Data Hub")
-    st.markdown("""
-    **Upload your dataset** (CSV, Excel, Parquet) for comprehensive analysis. Get instant data health insights and quality assessment.
-    """)
-
-    # File upload
-    uploaded_file = st.file_uploader("Drag & drop or browse files", type=list(ALLOWED_EXTENSIONS))
-
     if uploaded_file:
-        # Validate file
-        is_valid, message = validate_file(uploaded_file)
-        if not is_valid:
-            st.error(f"Upload error: {message}")
-            st.stop()
-
-        # Load data with progress
-        with st.spinner(f"Loading {uploaded_file.name} ..."):
-            try:
-                if uploaded_file.name.endswith('.csv'):
-                    df = pd.read_csv(uploaded_file, low_memory=False)
-                elif uploaded_file.name.endswith(('.xlsx', '.xls')):
-                    df = pd.read_excel(uploaded_file)
-                elif uploaded_file.name.endswith('.parquet'):
-                    df = pd.read_parquet(uploaded_file)
-                elif uploaded_file.name.endswith('.feather'):
-                    df = pd.read_feather(uploaded_file)
-                st.session_state.raw_data = df
-                st.success("Dataset loaded successfully!")
-            except Exception as e:
-                st.error(f"Error loading file: {str(e)}")
-                st.stop()
-
-        # In your Data Upload section, add this when new data is uploaded
-        if uploaded_file is not None:
-            # Reset models when new data is uploaded
-            st.session_state.model = None
-            st.session_state.preprocessor = None
-
-        # Data Health Dashboard
-        st.subheader("📊 Data Health Dashboard")
-        report = enhanced_quality_report(df)
-
-        col1, col2, col3, col4 = st.columns(4)
-        col1.metric("Total Rows", report['basic_stats']['rows'])
-        col2.metric("Total Columns", report['basic_stats']['columns'])
-        col3.metric("Missing Values", report['basic_stats']['missing_values'])
-        col4.metric("Data Health Score", f"{report['data_health_score']}/100")
-
-        # Column Explorer
-        with st.expander("🔍 Deep Column Analysis", expanded=True):
-            selected_col = st.selectbox("Select column to inspect", df.columns)
-            col_info = report['column_analysis'][selected_col]
-
-            st.write(f"**Type:** {col_info['type']}")
-            st.write(f"**Unique Values:** {col_info['unique']}")
-            st.write(f"**Missing Values:** {col_info['missing']} ({col_info['missing']/len(df):.1%})")
-
-            if pd.api.types.is_numeric_dtype(df[selected_col]):
-                st.write("**Distribution:**")
-                st.line_chart(df[selected_col])
-                st.write(f"**Outliers Detected:** {col_info['outliers']}")
-            else:
-                st.write("**Most Common Values:**")
-                top_values = df[selected_col].value_counts().head(5)
-                st.bar_chart(top_values)
-
-        # Smart Recommendations
-        with st.expander("💡 Cleaning Recommendations"):
-            recommendations = []
-            if report['basic_stats']['duplicates'] > 0:
-                recommendations.append(f"🚨 Remove {report['basic_stats']['duplicates']} duplicate rows")
-            if report['basic_stats']['missing_values'] > 0:
-                recommendations.append("🔧 Apply advanced imputation strategies")
-            for col, data in report['column_analysis'].items():
-                if data['missing'] > 0.5 * len(df):
-                    recommendations.append(f"⚠️ Consider dropping {col} (>{50}% missing)")
-                if data['unique'] == len(df):
-                    recommendations.append(f"🔍 Investigate {col} - potential unique identifier")
-
-            if recommendations:
-                st.write("### Recommended Actions")
-                for rec in recommendations[:5]:  # Show top 5
-                    st.write(f"- {rec}")
             else:
-                st.success("No critical issues detected - your data looks healthy!")
-
-        # Data Preview
-        with st.expander("🔎 Data Preview", expanded=True):
-            preview_size = st.slider("Preview rows", 5, 100, 15)
-            st.dataframe(df.head(preview_size).style.highlight_null(color='#FF6666'))
 
-        # Advanced Profiling
-        if st.button("🚀 Generate Full Data Profile"):
-            with st.spinner("Generating comprehensive report..."):
-                pr = ProfileReport(df, explorative=True, title="Data Upload Report")  # Added title to pandas profiling
-                st_profile_report(pr)
 
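Note: `ProfileReport` does the heavy lifting behind the "Generate Full Data Profile" button; outside Streamlit the same report can be rendered straight to HTML. A minimal sketch, assuming `ydata-profiling` is installed:

```python
import pandas as pd
from ydata_profiling import ProfileReport

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "y"]})
# Same object st_profile_report() embeds; here exported as a standalone page.
ProfileReport(df, explorative=True, title="Data Upload Report").to_file("report.html")
```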
-    # Cleaning Operations with Tabs
-    st.subheader("🔧 Cleaning Operations")
-    tab1, tab2, tab3, tab4 = st.tabs(["Missing Values", "Duplicates", "Data Types", "Outliers"])
-
-    # 1. Missing Value Handling
-    with tab1:
-        st.markdown("### 🕳️ Handle Missing Values")
-        missing_cols = df.columns[df.isna().any()].tolist()
-        if missing_cols:
-            st.write("Columns with missing values:")
-            cols = st.multiselect("Select columns to clean", missing_cols, default=missing_cols)
-
-            method = st.radio("Imputation Method", [
-                "Drop Missing",
-                "Mean/Median/Mode",
-                "KNN Imputation",
-                "MICE Imputation",
-                "Deep Learning Imputation"
-            ], horizontal=True)
-
-            if method == "Mean/Median/Mode":
-                imputation_choice = st.radio("Select Imputation Method", ["Mean", "Median", "Mode"], horizontal=True)
-
-            if st.button(f"Apply {method}"):
-                try:
-                    original_df = df.copy()
-                    if method == "Mean/Median/Mode":
                         for col in cols:
-                            if df[col].isnull().any():  # Check if missing values exist before imputing
-                                if pd.api.types.is_numeric_dtype(df[col]):
-                                    if imputation_choice == "Mean":
-                                        df[col] = df[col].fillna(df[col].mean())
-                                    elif imputation_choice == "Median":
-                                        df[col] = df[col].fillna(df[col].median())
-                                    elif imputation_choice == "Mode":
-                                        df[col] = df[col].fillna(df[col].mode()[0])
-                                else:  # Impute strings with mode
-                                    df[col] = df[col].fillna(df[col].mode()[0])
-                    # Add logic for other methods here...
-                    cleaning_actions.append(f"Applied {method} on {cols}")
-                    update_version(df)
-                    st.success(f"{method} applied successfully! ✅")
-                except Exception as e:
-                    st.error(f"Error: {str(e)}")
-        else:
-            st.success("✨ No missing values found!")
-
-    # 2. Duplicate Handling
-    with tab2:
-        st.markdown("### 🔄 Handle Duplicates")
-        duplicates = df.duplicated().sum()
-        if duplicates > 0:
-            st.plotly_chart(px.histogram(df, x=df.duplicated(), title="Duplicate Distribution"))
-            dup_strategy = st.radio("Duplicate Strategy", [
-                "Remove All Duplicates",
-                "Keep First Occurrence",
-                "Keep Last Occurrence"
-            ])
-            if st.button("Handle Duplicates"):
-                original_count = len(df)
-                df = df.drop_duplicates(keep={
-                    "Remove All Duplicates": False,
-                    "Keep First Occurrence": 'first',
-                    "Keep Last Occurrence": 'last'
-                }[dup_strategy])
-                cleaning_actions.append(f"Removed {original_count - len(df)} duplicates")
-                update_version(df)
-                st.success(f"Removed {original_count - len(df)} duplicates! ✅")
-        else:
-            st.success("✨ No duplicates found!")
-
-    # 3. Data Type Conversion
-    with tab3:
-        st.markdown("### 🔄 Convert Data Types")
-        col1, col2 = st.columns(2)
-        with col1:
-            st.dataframe(df.dtypes.reset_index().rename(columns={0: 'Type', 'index': 'Column'}))
-        with col2:
-            col_to_convert = st.selectbox("Select column to convert", df.columns)
-            new_type = st.selectbox("New Data Type", [
-                "String", "Integer", "Float",
-                "Boolean", "Datetime", "Category"
             ])
-            if st.button("Convert Data Type"):
                 try:
                     if new_type == "String":
                         df[col_to_convert] = df[col_to_convert].astype(str)
                     elif new_type == "Integer":
                         df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce').astype('Int64')
-                    elif new_type == "Float":
-                        df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce')
-                    elif new_type == "Boolean":
-                        df[col_to_convert] = df[col_to_convert].astype(bool)
-                    elif new_type == "Datetime":
-                        df[col_to_convert] = pd.to_datetime(df[col_to_convert], errors='coerce')
-                    elif new_type == "Category":
-                        df[col_to_convert] = df[col_to_convert].astype('category')
-                    cleaning_actions.append(f"Converted {col_to_convert} to {new_type}")
-                    update_version(df)
-                    st.success("Data type converted successfully! ✅")
                 except Exception as e:
-                    st.error(f"Conversion failed: {str(e)}")
-
-    # 4. Outlier Handling
-    with tab4:
-        st.markdown("### 📈 Handle Outliers")
-        numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
-        if numeric_cols:
-            outlier_col = st.selectbox("Select numeric column", numeric_cols)
-            st.plotly_chart(px.box(df, y=outlier_col, title="Outlier Distribution"))
-            if st.button("Remove Outliers"):
-                # Outlier removal logic here...
-                cleaning_actions.append(f"Removed outliers from {outlier_col}")
-                update_version(df)
-                st.success("Outliers removed successfully! ✅")
-        else:
-            st.info("ℹ️ No numeric columns found for outlier detection")
-
-
-    # Drop Column Functionality with Interface
-    st.subheader("🗑️ Drop Specific Columns")
-    cols_to_drop = st.multiselect("Select Columns to Drop", df.columns)
-    if st.button("Drop Selected Columns"):
-        try:
-            df = df.drop(columns=cols_to_drop)  # Drop the cols here.
-            cleaning_actions.append(f"Dropped columns: {', '.join(cols_to_drop)}")
-            update_version(df)
-            st.success(f"Columns dropped successfully! ✅")
-        except (KeyError, ValueError) as e:
-            st.error(f"Invalid column(s) selected or other error: {e}")  # Handle ValueErrors
-        except Exception as e:
-            st.error(f"An unexpected error occurred: {e}")
-    # Label Encoding (Categorical to Numeric)
-    st.subheader("🔢 Label Encoding")
-    if st.button("Encode Categorical Columns"):
-        try:
-            le = LabelEncoder()
-            categorical_cols = df.select_dtypes(include=['object', 'category']).columns
-            for col in categorical_cols:
-                df[col] = df[col].astype(str)  # Ensure all cols are string
-                df[col] = le.fit_transform(df[col])
-            cleaning_actions.append("Applied Label Encoding to categorical columns")
-            update_version(df)
-            st.success("Label encoding applied successfully! ✅")
-        except Exception as e:
-            st.error(f"Label encoding failed: {str(e)}")
-
-    # Live Data Preview after every cleaning action
-    st.subheader("✨ Live Data Preview")
-    st.dataframe(df.head(10))  # show 10 rows
-
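Note: the duplicate handler maps the three UI strategies onto `DataFrame.drop_duplicates(keep=...)`, where `keep=False` drops every copy rather than keeping one. The mapping in isolation:

```python
import pandas as pd

df = pd.DataFrame({"x": [1, 1, 2]})
keep_for = {
    "Remove All Duplicates": False,   # drop every duplicated row
    "Keep First Occurrence": "first",
    "Keep Last Occurrence": "last",
}
print(len(df.drop_duplicates(keep=keep_for["Remove All Duplicates"])))  # 1 (only the unique row)
print(len(df.drop_duplicates(keep=keep_for["Keep First Occurrence"])))  # 2
```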
-    # 2. Duplicate Handling
-    with tab2:
-        st.markdown("### 🔄 Handle Duplicates")
-        duplicates = df.duplicated().sum()
-        if duplicates > 0:
-            st.plotly_chart(px.histogram(df, x=df.duplicated(), title="Duplicate Distribution"))
-            dup_strategy = st.radio("Duplicate Strategy", [
-                "Remove All Duplicates",
-                "Keep First Occurrence",
-                "Keep Last Occurrence"
-            ])
-            if st.button("Handle Duplicates"):
-                original_count = len(df)
-                df = df.drop_duplicates(keep={
-                    "Remove All Duplicates": False,
-                    "Keep First Occurrence": 'first',
-                    "Keep Last Occurrence": 'last'
-                }[dup_strategy])
-                cleaning_actions.append(f"Removed {original_count - len(df)} duplicates")
-                update_version(df)
-                st.success(f"Removed {original_count - len(df)} duplicates! ✅")
-        else:
-            st.success("✨ No duplicates found!")
-
-    # 3. Data Type Conversion
-    with tab3:
-        st.markdown("### 🔄 Convert Data Types")
-        col1, col2 = st.columns(2)
-        with col1:
-            st.dataframe(df.dtypes.reset_index().rename(columns={0: 'Type', 'index': 'Column'}))
-        with col2:
-            col_to_convert = st.selectbox("Select column to convert", df.columns)
-            new_type = st.selectbox("New Data Type", [
-                "String", "Integer", "Float",
-                "Boolean", "Datetime", "Category"
-            ])
-            if st.button("Convert Data Type"):
-                try:
-                    if new_type == "String":
-                        df[col_to_convert] = df[col_to_convert].astype(str)
-                    elif new_type == "Integer":
-                        df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce').astype('Int64')
-                    elif new_type == "Float":
-                        df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce')
-                    elif new_type == "Boolean":
-                        df[col_to_convert] = df[col_to_convert].astype(bool)
-                    elif new_type == "Datetime":
-                        df[col_to_convert] = pd.to_datetime(df[col_to_convert], errors='coerce')
-                    elif new_type == "Category":
-                        df[col_to_convert] = df[col_to_convert].astype('category')
-
-                    cleaning_actions.append(f"Converted {col_to_convert} to {new_type}")
-                    update_version(df)
-                    st.success("Data type converted successfully! ✅")
-                except Exception as e:
-                    st.error(f"Conversion failed: {str(e)}")
-
-    # 4. Outlier Handling
-    with tab4:
-        st.markdown("### 📈 Handle Outliers")
-        numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
-        if numeric_cols:
-            outlier_col = st.selectbox("Select numeric column", numeric_cols)
-            st.plotly_chart(px.box(df, y=outlier_col, title="Outlier Distribution"))
-            outlier_method = st.radio("Outlier Handling Method", ["Z-score", "IQR", "Manual"], horizontal=True)
-            if st.button("Remove Outliers"):
-                try:
-                    original_df = df.copy()
-                    if outlier_method == "Z-score":
-                        from scipy import stats
-                        z_scores = np.abs(stats.zscore(df[outlier_col]))
-                        df = df[(z_scores < 3)]  # Keep only values with zscore less than 3
-                        cleaning_actions.append(f"Removed outliers from {outlier_col} using Z-score (threshold 3)")
-                    elif outlier_method == "IQR":
-                        Q1 = df[outlier_col].quantile(0.25)
-                        Q3 = df[outlier_col].quantile(0.75)
-                        IQR = Q3 - Q1
-                        df = df[~((df[outlier_col] < (Q1 - 1.5 * IQR)) | (df[outlier_col] > (Q3 + 1.5 * IQR)))]
-                        cleaning_actions.append(f"Removed outliers from {outlier_col} using IQR")
-                    elif outlier_method == "Manual":
-                        lower_bound = st.number_input("Lower Bound", value=df[outlier_col].min(), step=1.0)
-                        upper_bound = st.number_input("Upper Bound", value=df[outlier_col].max(), step=1.0)
-                        df = df[(df[outlier_col] >= lower_bound) & (df[outlier_col] <= upper_bound)]
-                        cleaning_actions.append(f"Removed outliers from {outlier_col} using manual bounds")
-                    update_version(df)
-                    st.success("Outliers removed successfully! ✅")
-                except Exception as e:
-                    st.error(f"Outlier removal failed: {str(e)}")
-        else:
-            st.info("ℹ️ No numeric columns found for outlier detection")
-
-    # Drop Column Functionality with Interface
-    st.subheader("🗑️ Drop Specific Columns")
-    cols_to_drop = st.multiselect("Select Columns to Drop", df.columns)
-    if st.button("Drop Selected Columns"):
-        try:
-            df = df.drop(columns=cols_to_drop)  # Drop the cols here.
-            cleaning_actions.append(f"Dropped columns: {', '.join(cols_to_drop)}")
-            update_version(df)
-            st.success(f"Columns dropped successfully! ✅")
-        except (KeyError):
-            st.error("Invalid column(s) selected.")
-        except Exception as e:
-            st.error(f"An unexpected error occurred: {e}")
-    # Label Encoding (Categorical to Numeric)
-    st.subheader("🔢 Label Encoding")
-    if st.button("Encode Categorical Columns"):
-        try:
-            le = LabelEncoder()
-            categorical_cols = df.select_dtypes(include=['object', 'category']).columns
-            for col in categorical_cols:
-                df[col] = df[col].astype(str)  # Ensure all cols are string
-                df[col] = le.fit_transform(df[col])
-            cleaning_actions.append("Applied Label Encoding to categorical columns")
-            update_version(df)
-            st.success("Label encoding applied successfully! ✅")
-        except Exception as e:
-            st.error(f"Label encoding failed: {str(e)}")
-
-    # Live Data Preview after every cleaning action
-    st.subheader("✨ Live Data Preview")
-    st.dataframe(df.head(10))  # show 10 rows
-    # Save Cleaned Data with Enhanced Feedback
-    if st.button("💾 Save Cleaned Data"):
-        st.session_state.cleaned_data = df
-        st.balloons()
-
-        # Generate comprehensive report
-        from pandas_profiling import ProfileReport
-        pr = ProfileReport(df, title="Cleaned Data Report")
-        st_profile_report(pr)
-
-        # Show cleaning log with diffs
-        st.subheader("📝 Cleaning Log")
-        st.table(pd.DataFrame({
-            "Step": range(1, len(cleaning_actions)+1),
-            "Action": cleaning_actions
-        }))
-
-        # Show dataset comparison
-        col1, col2 = st.columns(2)
-        with col1:
-            st.write("Original Data Shape:", st.session_state.raw_data.shape)
-        with col2:
-            st.write("Cleaned Data Shape:", df.shape)
-
-        st.success("✅ Cleaned data saved successfully! You can now proceed to analysis.")
-elif app_mode == "Advanced EDA":
-    st.title("🔍 Advanced Exploratory Data Analysis")
-    st.markdown("""
-    **Interactive Data Exploration** with optimized visualizations for fast insights.
-    Uncover patterns and relationships in your data with beautiful, responsive plots.
-    """)
 
-    if 'cleaned_data' not in st.session_state or st.session_state.cleaned_data is None:
-        st.warning("Please clean your data in the Smart Cleaning section first.")
         st.stop()
-
-    df = st.session_state.cleaned_data.copy()
-
-    # Initialize session state for EDA configuration
-    if 'eda_config' not in st.session_state:
-        st.session_state.eda_config = {
-            'plot_type': "Histogram",
-            'x_col': df.columns[0] if len(df.columns) > 0 else None,
-            'y_col': df.columns[1] if len(df.columns) > 1 else None,
-            'z_col': df.columns[2] if len(df.columns) > 2 else None,
-            'color_col': None,
-            'facet_col': None,
-            'hover_data_cols': [],
-            'color_palette': "Viridis",
-            'filter_col': None,
-            'filter_options': []
-        }
-
-    # Main Layout Columns
     col1, col2 = st.columns([1, 3])
-
     with col1:
-        st.header("📊 Visualization Setup")
-
-        # Plot Type Selection
-        plot_types = {
-            "Distribution": ["Histogram", "Box Plot", "Violin Plot", "Density Plot"],
-            "Relationship": ["Scatter Plot", "Line Plot", "Heatmap", "Pair Plot"],
-            "Comparison": ["Bar Chart", "Pie Chart", "Parallel Coordinates"],
-            "3D": ["3D Scatter", "3D Surface"]
-        }
-
-        selected_category = st.selectbox("Plot Category", list(plot_types.keys()))
-        st.session_state.eda_config['plot_type'] = st.selectbox(
-            "Plot Type",
-            plot_types[selected_category]
-        )
-
-        # Dynamic Column Selectors
-        plot_type = st.session_state.eda_config['plot_type']
-
-        if plot_type in ["Histogram", "Box Plot", "Violin Plot", "Density Plot", "Bar Chart", "Pie Chart"]:
-            st.session_state.eda_config['x_col'] = st.selectbox(
-                "X Axis",
-                df.columns,
-                index=df.columns.get_loc(st.session_state.eda_config['x_col'])
-                if st.session_state.eda_config['x_col'] in df.columns else 0
-            )
-
-        if plot_type in ["Scatter Plot", "Line Plot", "Box Plot", "Violin Plot", "Density Plot"]:
-            st.session_state.eda_config['y_col'] = st.selectbox(
-                "Y Axis",
-                df.columns,
-                index=df.columns.get_loc(st.session_state.eda_config['y_col'])
-                if st.session_state.eda_config['y_col'] in df.columns else 0
-            )
-
-        if plot_type in ["3D Scatter", "3D Surface"]:
-            st.session_state.eda_config['z_col'] = st.selectbox(
-                "Z Axis",
-                df.columns,
-                index=df.columns.get_loc(st.session_state.eda_config['z_col'])
-                if st.session_state.eda_config['z_col'] in df.columns else 0
-            )
-
-        # Additional Options
-        with st.expander("🎨 Customization"):
-            st.session_state.eda_config['color_col'] = st.selectbox(
-                "Color By",
-                [None] + list(df.columns)
-            )
-            st.session_state.eda_config['facet_col'] = st.selectbox(
-                "Facet By",
-                [None] + list(df.columns)
-            )
-            st.session_state.eda_config['hover_data_cols'] = st.multiselect(
-                "Hover Data",
-                df.columns
-            )
-            st.session_state.eda_config['color_palette'] = st.selectbox(
-                "Color Palette",
-                px.colors.named_colorscales()
-            )
-
-        # Data Filtering
-        with st.expander("🔎 Data Filtering"):
-            filter_col = st.selectbox(
-                "Filter Column",
-                [None] + list(df.columns)
-            )
-            if filter_col:
-                unique_values = df[filter_col].unique()
-                selected_values = st.multiselect(
-                    f"Select {filter_col} values",
-                    unique_values,
-                    default=unique_values
-                )
-                df = df[df[filter_col].isin(selected_values)]
-
     with col2:
-        st.header("📈 Visualization")
-        config = st.session_state.eda_config
-
-        @st.cache_data(ttl=300)
-        def generate_plot(df, plot_type, config):
-            """Cached plot generation function for better performance"""
-            try:
-                if plot_type == "Histogram":
-                    return px.histogram(
-                        df, x=config['x_col'],
-                        color=config['color_col'],
-                        nbins=30,
-                        color_discrete_sequence=[config['color_palette']]
-                    )
-
-                elif plot_type == "Scatter Plot":
-                    return px.scatter(
-                        df, x=config['x_col'], y=config['y_col'],
-                        color=config['color_col'],
-                        hover_data=config['hover_data_cols']
-                    )
-
-                elif plot_type == "Box Plot":
-                    return px.box(
-                        df, x=config['x_col'], y=config['y_col'],
-                        color=config['color_col']
-                    )
-
-                elif plot_type == "Violin Plot":
-                    return px.violin(
-                        df, x=config['x_col'], y=config['y_col'],
-                        color=config['color_col'],
-                        box=True
-                    )
-
-                elif plot_type == "Heatmap":
-                    numeric_df = df.select_dtypes(include=np.number)
-                    corr = numeric_df.corr()
-                    return px.imshow(
-                        corr,
-                        text_auto=True,
-                        color_continuous_scale=config['color_palette']
-                    )
-
-                elif plot_type == "3D Scatter":
-                    return px.scatter_3d(
-                        df, x=config['x_col'], y=config['y_col'], z=config['z_col'],
-                        color=config['color_col']
-                    )
-
-                elif plot_type == "Bar Chart":
-                    return px.bar(
-                        df, x=config['x_col'], y=config['y_col'],
-                        color=config['color_col']
-                    )
-
-                elif plot_type == "Pie Chart":
-                    return px.pie(
-                        df, names=config['x_col'], values=config['y_col'],
-                        color_discrete_sequence=[config['color_palette']]
-                    )
-
-                elif plot_type == "Line Plot":
-                    return px.line(
-                        df, x=config['x_col'], y=config['y_col'],
-                        color=config['color_col']
-                    )
-
-                elif plot_type == "Pair Plot":
-                    numeric_cols = df.select_dtypes(include=np.number).columns
-                    return px.scatter_matrix(
-                        df[numeric_cols],
-                        color=config['color_col']
-                    )
-
-                elif plot_type == "Parallel Coordinates":
-                    numeric_df = df.select_dtypes(include=np.number)
-                    return px.parallel_coordinates(
-                        numeric_df,
-                        color_continuous_scale=config['color_palette']
-                    )
-
-                elif plot_type == "Density Plot":
-                    return px.density_contour(
-                        df, x=config['x_col'], y=config['y_col'],
-                        color=config['color_col']
-                    )
-
-            except Exception as e:
-                st.error(f"Plot generation error: {str(e)}")
-                return None
-
-        # Generate and display plot
-        fig = generate_plot(df, plot_type, config)
-        if fig:
             st.plotly_chart(fig, use_container_width=True)
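Note: `generate_plot` combines `@st.cache_data` with a plot-type dispatch, so a repeated call with the same dataframe and settings is served from cache instead of being rebuilt. The same pattern reduced to its skeleton, assuming the config dict stays hashable by Streamlit:

```python
import streamlit as st
import plotly.express as px

@st.cache_data(ttl=300)
def cached_plot(df, plot_type, config):
    # Streamlit hashes the arguments; identical calls within the TTL
    # return the stored figure rather than recomputing it.
    builders = {
        "Histogram": lambda: px.histogram(df, x=config["x_col"]),
        "Scatter Plot": lambda: px.scatter(df, x=config["x_col"], y=config["y_col"]),
    }
    builder = builders.get(plot_type)
    return builder() if builder else None
```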
 
-        # Plot Statistics
-        with st.expander("📊 Plot Statistics"):
-            if plot_type in ["Histogram", "Box Plot", "Violin Plot"]:
-                st.write(f"**{config['x_col']} Statistics**")
-                st.table(df[config['x_col']].describe())
-
-            if plot_type in ["Scatter Plot", "Line Plot"]:
-                st.write(f"**Correlation between {config['x_col']} and {config['y_col']}**")
-                corr = df[[config['x_col'], config['y_col']]].corr().iloc[0,1]
-                st.metric("Pearson Correlation", f"{corr:.2f}")
-
-            if plot_type == "Heatmap":
-                st.write("**Correlation Matrix**")
-                numeric_df = df.select_dtypes(include=np.number)
-                st.dataframe(numeric_df.corr())
-
-    # Data Summary Section
-    st.header("📝 Data Summary")
-    with st.expander("Show Data Summary"):
-        col1, col2 = st.columns(2)
-        with col1:
-            st.write("**Data Shape**")
-            st.write(f"Rows: {df.shape[0]}")
-            st.write(f"Columns: {df.shape[1]}")
-
-        with col2:
-            st.write("**Data Types**")
-            st.dataframe(df.dtypes.reset_index().rename(columns={
-                'index': 'Column', 0: 'Type'
-            }))
-
-        st.write("**Sample Data**")
-        st.dataframe(df.head())
-
-    # Model Selection
-    st.subheader("🤖 Model Selection")
-    if problem_type == "Regression":
-        model_options = ["Linear Regression", "Decision Tree", "Random Forest", "Gradient Boosting", "SVM", "Neural Network"]
-    else:  # Classification
-        model_options = ["Logistic Regression", "Decision Tree", "Random Forest", "Gradient Boosting", "SVM", "Neural Network", "KNN", "Naive Bayes"]
-    model_name = st.selectbox("Select Model", model_options, help="Choose a model.")
-
-    # Hyperparameter Tuning
-    st.subheader("🎛️ Hyperparameter Tuning")
-    with st.expander("Configure Hyperparameters", expanded=True):
-        if model_name == "Random Forest":
-            n_estimators = st.slider("Number of Estimators", 10, 200, 100)
-            max_depth = st.slider("Max Depth", 3, 20, 10)
-            min_samples_split = st.slider("Min Samples Split", 2, 10, 2)
-            min_samples_leaf = st.slider("Min Samples Leaf", 1, 10, 1)
-            hyperparams = {
-                'n_estimators': n_estimators,
-                'max_depth': max_depth,
-                'min_samples_split': min_samples_split,
-                'min_samples_leaf': min_samples_leaf
-            }
-        elif model_name == "Gradient Boosting":  # Correct placement of elif
-            learning_rate = st.slider("Learning Rate", 0.01, 1.0, 0.1)
-            n_estimators = st.slider("Number of Estimators", 10, 200, 100)
-            max_depth = st.slider("Max Depth", 3, 20, 10)
-            hyperparams = {
-                'learning_rate': learning_rate,
-                'n_estimators': n_estimators,
-                'max_depth': max_depth
-            }
-        elif model_name == "Neural Network":
-            from tensorflow.keras.models import Sequential
-            from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
-            from tensorflow.keras.optimizers import Adam, Nadam, RMSprop, SGD
-
-            hidden_layers = st.slider("Number of Hidden Layers", 1, 5, 2)
-            neurons_per_layer = st.slider("Neurons per Layer", 10, 200, 50)
-            activation = st.selectbox("Activation Function",
-                                      ["relu", "tanh", "sigmoid", "selu", "swish"])
-            dropout_rate = st.slider("Dropout Rate", 0.0, 0.5, 0.2)
-            initializer = st.selectbox("Weight Initializer",
-                                       ["glorot_uniform", "he_normal", "lecun_uniform"])
-            learning_rate = st.slider("Learning Rate", 0.0001, 0.1, 0.001, format="%.4f")
-            optimizer_choice = st.selectbox("Optimizer",
-                                            ["Adam", "Nadam", "RMSprop", "SGD"])
-            batch_norm = st.checkbox("Batch Normalization", value=True)
-            regularization = st.checkbox("L2 Regularization")
-            epochs = st.slider("Epochs", 10, 200, 50)
-            batch_size = st.slider("Batch Size", 16, 128, 32)
-            hyperparams = {
-                'hidden_layers': hidden_layers,
-                'neurons_per_layer': neurons_per_layer,
-                'activation': activation,
-                'dropout_rate': dropout_rate,
-                'initializer': initializer,
-                'learning_rate': learning_rate,
-                'optimizer_choice': optimizer_choice,
-                'batch_norm': batch_norm,
-                'regularization': regularization,
-                'epochs': epochs,
-                'batch_size': batch_size
-            }
-        else:
-            hyperparams = {}
-
-    # Train-Test Split
-    st.subheader("✂️ Train-Test Split")
-    test_size = st.slider("Test Size", 0.1, 0.5, 0.2, help="Proportion of the dataset to include in the test split.")
-
-    # Model Training
-    if st.button("🚀 Train Model"):
-        with st.spinner("Training model..."):
-            try:
-                X = df[feature_columns]
-                y = df[target_column]
-
-                # Check if X is empty
-                if X.empty:
-                    st.error("No features were selected. Please select feature columns.")
-                    st.stop()
-
-                # Train-Test Split
-                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
-
-                # Preprocessing Pipeline
-                numeric_features = X.select_dtypes(include=np.number).columns
-                categorical_features = X.select_dtypes(exclude=np.number).columns
-
-                numeric_transformer = Pipeline(steps=[
-                    ('imputer', SimpleImputer(strategy='median')),
-                    ('scaler', StandardScaler())
-                ])
-
-                categorical_transformer = Pipeline(steps=[
-                    ('imputer', SimpleImputer(strategy='most_frequent')),
-                    ('onehot', OneHotEncoder(handle_unknown='ignore'))
-                ])
-
-                preprocessor = ColumnTransformer(
-                    transformers=[
-                        ('num', numeric_transformer, numeric_features),
-                        ('cat', categorical_transformer, categorical_features)
-                    ])
-
-                X_train_processed = preprocessor.fit_transform(X_train)
-                X_test_processed = preprocessor.transform(X_test)
-
-                # Model Training
-                if model_name == "Linear Regression":
-                    model = LinearRegression()
-                elif model_name == "Logistic Regression":
-                    model = LogisticRegression(max_iter=1000)
-                elif model_name == "Decision Tree":
-                    if problem_type == "Regression":
-                        model = DecisionTreeRegressor()
-                    else:
-                        model = DecisionTreeClassifier()
-                elif model_name == "Random Forest":
-                    if problem_type == "Regression":
-                        model = RandomForestRegressor(**hyperparams)
-                    else:
-                        model = RandomForestClassifier(**hyperparams)
-                elif model_name == "Gradient Boosting":
-                    if problem_type == "Regression":
-                        model = GradientBoostingRegressor(**hyperparams)
-                    else:
-                        model = GradientBoostingClassifier(**hyperparams)
-                elif model_name == "SVM":
-                    if problem_type == "Regression":
-                        model = SVR()
-                    else:
-                        model = SVC()
-                elif model_name == "Neural Network":
-                    from tensorflow.keras.models import Sequential
-                    from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
-                    from tensorflow.keras.optimizers import Adam, Nadam, RMSprop, SGD
-
-                    # Build a new model with the parameters
-                    model = Sequential()
-                    model.add(layers.Input(shape=(X_train_processed.shape[1],)))
-
-                    for i in range(hyperparams['hidden_layers']):
-                        model.add(Dense(hyperparams['neurons_per_layer'],
-                                        activation=hyperparams['activation'],
-                                        kernel_initializer=hyperparams['initializer']))
-                        if hyperparams['batch_norm']:
-                            model.add(BatchNormalization())
-                        model.add(Dropout(hyperparams['dropout_rate']))
-
-                    # Output layer
-                    output_activation = 'linear' if problem_type == "Regression" else 'softmax'
-                    output_units = 1 if problem_type == "Regression" else len(np.unique(y_train))
-                    model.add(Dense(output_units, activation=output_activation))
-
-                    # Configure optimizer
-                    optimizers = {
-                        "Adam": Adam(learning_rate=hyperparams['learning_rate']),
-                        "Nadam": Nadam(learning_rate=hyperparams['learning_rate']),
-                        "RMSprop": RMSprop(learning_rate=hyperparams['learning_rate']),
-                        "SGD": SGD(learning_rate=hyperparams['learning_rate'], momentum=0.9)
-                    }
-                    optimizer = optimizers[hyperparams['optimizer_choice']]
-
-                    model.compile(optimizer=optimizer,
-                                  loss='mse' if problem_type == "Regression" else 'sparse_categorical_crossentropy',
-                                  metrics=['mae'] if problem_type == "Regression" else ['accuracy'])
-                elif model_name == "KNN":
-                    from sklearn.neighbors import KNeighborsClassifier
-                    model = KNeighborsClassifier()
-                elif model_name == "Naive Bayes":
-                    from sklearn.naive_bayes import GaussianNB
-                    model = GaussianNB()
-
-                # Train the model
-                if model_name == "Neural Network":  # Only for the neural network
-                    history = model.fit(X_train_processed, y_train,
-                                        epochs=hyperparams['epochs'],
-                                        batch_size=hyperparams['batch_size'],
-                                        validation_data=(X_test_processed, y_test),
-                                        verbose=0)
-
-                else:
-                    model.fit(X_train_processed, y_train)
-                # Store model and preprocessor
-                st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
-                st.session_state.preprocessor = preprocessor
-
-                # Store the test data for insights and predictions
-                st.session_state.X_train_selected = X_train_processed
-                st.session_state.X_test_selected = X_test_processed
-                st.session_state.y_train = y_train
-                st.session_state.y_test = y_test
-
-                # Model Evaluation
-                if problem_type == "Regression":
-                    y_pred = model.predict(X_test_processed)
-                    mse = mean_squared_error(y_test, y_pred)
-                    rmse = np.sqrt(mse)
-                    mae = mean_absolute_error(y_test, y_pred)
-                    r2 = r2_score(y_test, y_pred)
-                    st.write(f"Mean Squared Error: {mse:.4f}")
-                    st.write(f"Root Mean Squared Error: {rmse:.4f}")
-                    st.write(f"Mean Absolute Error: {mae:.4f}")
-                    st.write(f"R-squared: {r2:.4f}")
-                else:  # Classification
-                    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
-                    y_pred = model.predict(X_test_processed)
-                    if model_name == "Neural Network":  # Neural network outputs probabilities
-                        y_pred = np.argmax(model.predict(X_test_processed), axis=1)
-                    accuracy = accuracy_score(y_test, y_pred)
-                    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
-                    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
-                    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
-                    st.write(f"Accuracy: {accuracy:.4f}")
-                    st.write(f"Precision: {precision:.4f}")
-                    st.write(f"Recall: {recall:.4f}")
-                    st.write(f"F1 Score: {f1:.4f}")
-                    st.write("Classification Report:")
-                    st.text(classification_report(y_test, y_pred))
-                    # confusion matrix
-                    st.write("Confusion Matrix:")
-                    conf_matrix = confusion_matrix(y_test, y_pred)
-                    st.write(conf_matrix)
-
-                # Visualization
-                st.subheader("📊 Model Performance Visualization")
-                if problem_type == "Regression":
-                    fig, ax = plt.subplots()
-                    ax.scatter(y_test, y_pred)
-                    ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
-                    ax.set_xlabel('Actual')
-                    ax.set_ylabel('Predicted')
-                    ax.set_title('Actual vs Predicted')
-                    st.pyplot(fig)
-                elif model_name == "Neural Network":
-                    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
-                    ax1.plot(history.history['loss'], label='Train Loss')
-                    ax1.plot(history.history['val_loss'], label='Validation Loss')
-                    ax1.set_title('Loss Evolution')
-                    ax1.set_xlabel('Epoch')
-                    ax1.set_ylabel('Loss')
-                    ax1.legend()
-
-                    # Plot accuracy/metric
-                    if problem_type == "Classification":
-                        ax2.plot(history.history['accuracy'], label='Train Accuracy')
-                        ax2.plot(history.history['val_accuracy'], label='Validation Accuracy')
-                        ax2.set_title('Accuracy Evolution')
-                        ax2.set_ylabel('Accuracy')
-                    else:
-                        ax2.plot(history.history['mae'], label='Train MAE')
-                        ax2.plot(history.history['val_mae'], label='Validation MAE')
-                        ax2.set_title('MAE Evolution')
-                        ax2.set_ylabel('MAE')
-
-                    ax2.set_xlabel('Epoch')
-                    ax2.legend()
-                    st.pyplot(fig)
-
-                else:  # Classification confusion matrix
-                    from sklearn.metrics import confusion_matrix
-                    conf_matrix = confusion_matrix(y_test, y_pred)
-                    fig, ax = plt.subplots()
-                    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
-                    ax.set_xlabel('Predicted Labels')
-                    ax.set_ylabel('True Labels')
-                    ax.set_title('Confusion Matrix')
-                    st.pyplot(fig)
-                st.success("Model trained successfully!")
-            except Exception as e:
-                st.error(f"An error occurred during training: {e}")
-
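Note: the Neural Network branch builds a configurable `Sequential` stack (Dense → optional BatchNorm → Dropout) and switches the head and loss on the problem type. The core of that construction as a sketch, assuming TensorFlow/Keras is available (parameter names here are illustrative, not the app's):

```python
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam

def build_net(n_features, n_classes=None, hidden=2, units=50, dropout=0.2):
    # n_classes=None -> regression head; otherwise a softmax classifier.
    model = Sequential()
    model.add(Input(shape=(n_features,)))
    for _ in range(hidden):
        model.add(Dense(units, activation="relu"))
        model.add(BatchNormalization())
        model.add(Dropout(dropout))
    if n_classes is None:
        model.add(Dense(1, activation="linear"))
        model.compile(optimizer=Adam(1e-3), loss="mse", metrics=["mae"])
    else:
        model.add(Dense(n_classes, activation="softmax"))
        model.compile(optimizer=Adam(1e-3),
                      loss="sparse_categorical_crossentropy",
                      metrics=["accuracy"])
    return model
```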
-    if st.session_state.model is not None:
-        st.subheader("💾 Save Model")
-        model_filename = st.text_input("Enter Model Filename (without extension)", "trained_model")
-        if st.button("Save Model"):
-            try:
-                joblib.dump(st.session_state.model, f"{model_filename}.joblib")
-                st.success(f"Model saved as {model_filename}.joblib")
-            except Exception as e:
-                st.error(f"Error saving model: {e}")
-    else:
-        st.warning("No trained model available. Train a model first to enable saving.")
-
-
-# Insights Section
-elif app_mode == "Insights":
-    st.title("📊 Model Insights & Explainability")
-    st.markdown("""
-    **Understand and Interpret Your Model** with advanced explainability tools and visualizations.
-    Gain deeper insights into model behavior and predictions.
-    """)
-
-    if 'model' not in st.session_state or st.session_state.model is None:
-        st.warning("Please train a model in the Model Training section first.")
         st.stop()
-
-    model = st.session_state.model.steps[-1][1]  # Get the trained model
-    preprocessor = st.session_state.model.steps[0][1]  # Get the preprocessor
-
-    # Model Summary
-    st.subheader("📝 Model Summary")
-    st.write(f"**Model Type:** {type(model).__name__}")
-    st.write(f"**Problem Type:** {'Regression' if hasattr(model, 'predict') else 'Classification'}")
-    st.write(f"**Training Date:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
-
-    # Feature Importance
-    st.subheader("🔍 Feature Importance")
-    if hasattr(model, 'feature_importances_'):
-        importances = model.feature_importances_
-        feature_names = preprocessor.get_feature_names_out()
-        importance_df = pd.DataFrame({
-            'Feature': feature_names,
-            'Importance': importances
-        }).sort_values('Importance', ascending=False)
-
-        fig, ax = plt.subplots()
-        sns.barplot(x='Importance', y='Feature', data=importance_df.head(10), ax=ax)
-        ax.set_title('Top 10 Feature Importances')
-        st.pyplot(fig)
-    else:
-        st.info("Feature importance not available for this model type.")
-
-    # SHAP Values
-    st.subheader("📊 SHAP Values")
-    if st.checkbox("Calculate SHAP Values (Warning: May be slow for large datasets)"):
         try:
-            import shap
-
-            # Use KernelExplainer for models that don't have a built-in explainer
-            if not hasattr(model, 'predict'):
-                explainer = shap.KernelExplainer(model.predict, st.session_state.X_train_selected[:100, :])  # Use a sample of training data
-
-                shap_values = explainer.shap_values(st.session_state.X_test_selected)
-                feature_names = preprocessor.get_feature_names_out()
-                # Summary Plot
-                st.write("### Summary Plot")
-                fig, ax = plt.subplots()
-                shap.summary_plot(shap_values, features=st.session_state.X_test_selected, feature_names=feature_names, show=False, plot_type="bar")  # Change to bar for a cleaner visualization
-                st.pyplot(fig)
-
-                # Force Plot for Individual Predictions
-                st.write("### Individual Prediction Explanation")
-                sample_idx = st.slider("Select Sample Index", 0, len(st.session_state.X_test_selected) - 1, 0)
-                fig, ax = plt.subplots()
-                shap.force_plot(explainer.expected_value, shap_values[sample_idx], st.session_state.X_test_selected[sample_idx],
-                                feature_names=feature_names, matplotlib=True, show=False)
-                st.pyplot(fig)
             else:
-                explainer = shap.TreeExplainer(model)
-                shap_values = explainer.shap_values(st.session_state.X_test_selected)
-                feature_names = preprocessor.get_feature_names_out()
-                # Summary Plot
-                st.write("### Summary Plot")
-                fig, ax = plt.subplots()
-                shap.summary_plot(shap_values, features=st.session_state.X_test_selected, feature_names=feature_names, show=False, plot_type="bar")  # Change to bar for a cleaner visualization
-                st.pyplot(fig)
-
-                # Force Plot for Individual Predictions
-                st.write("### Individual Prediction Explanation")
-                sample_idx = st.slider("Select Sample Index", 0, len(st.session_state.X_test_selected) - 1, 0)
-                fig, ax = plt.subplots()
-                shap.force_plot(explainer.expected_value, shap_values[sample_idx], st.session_state.X_test_selected[sample_idx],
-                                feature_names=feature_names, matplotlib=True, show=False)
-                st.pyplot(fig)
-
-        except Exception as e:
-            st.error(f"SHAP calculation failed: {e}")
-
- # Partial Dependence Plots
1171
- st.subheader("📈 Partial Dependence Plots")
1172
- if hasattr(model, 'predict'):
1173
- feature_to_plot = st.selectbox("Select Feature for PDP", preprocessor.get_feature_names_out())
1174
- if st.button("Generate PDP"):
1175
- from sklearn.inspection import PartialDependenceDisplay
1176
- fig, ax = plt.subplots()
1177
- PartialDependenceDisplay.from_estimator(
1178
- model, st.session_state.X_test_selected,
1179
- features=[feature_to_plot],
1180
- feature_names=preprocessor.get_feature_names_out(),
1181
- ax=ax
1182
  )
1183
- st.pyplot(fig)
1184
-
1185
- # Model Performance Over Time
1186
- st.subheader("⏳ Model Performance Over Time")
1187
- if st.checkbox("Track Performance Over Time"):
1188
- performance_history = {
1189
- 'timestamp': [],
1190
- 'metric': [],
1191
- 'value': []
1192
- }
1193
-
1194
- if hasattr(model, 'predict'):
1195
- y_pred = model.predict(st.session_state.X_test_selected)
1196
- mse = mean_squared_error(st.session_state.y_test, y_pred)
1197
- performance_history['timestamp'].append(datetime.now())
1198
- performance_history['metric'].append('MSE')
1199
- performance_history['value'].append(mse)
1200
-
1201
- performance_df = pd.DataFrame(performance_history)
1202
- st.line_chart(performance_df.set_index('timestamp'))
1203
-
1204
- # Model Debugging
1205
- st.subheader("🐛 Model Debugging")
1206
- if st.checkbox("Enable Debug Mode"):
1207
- st.write("### Model Parameters")
1208
- st.json(model.get_params())
1209
-
1210
- st.write("### Training Data Summary")
1211
- st.write(f"Number of Samples: {st.session_state.X_train_selected.shape[0]}")
1212
- st.write(f"Number of Features: {st.session_state.X_train_selected.shape[1]}")
1213
-
1214
- # Export Insights
1215
- st.subheader("💾 Export Insights")
1216
- if st.button("Export Insights as PDF"):
1217
- try:
1218
- from fpdf import FPDF
1219
- pdf = FPDF()
1220
- pdf.add_page()
1221
- pdf.set_font("Arial", size=12)
1222
- pdf.cell(200, 10, txt="Model Insights Report", ln=True, align='C')
1223
- pdf.cell(200, 10, txt=f"Model Type: {type(model).__name__}", ln=True)
1224
- pdf.cell(200, 10, txt=f"Problem Type: {'Regression' if hasattr(model, 'predict') else 'Classification'}", ln=True)
1225
- pdf.output("model_insights.pdf")
1226
- st.success("Insights exported successfully!")
1227
  except Exception as e:
1228
- st.error(f"Export failed: {e}")
1229
 
1230
- # Predictions Section
1231
  elif app_mode == "Predictions":
1232
- st.title("🔮 Prediction Studio")
1233
- st.markdown("""
1234
- **Make Predictions** with your trained model and explore prediction explanations.
1235
- Generate batch predictions and export results.
1236
- """)
1237
-
1238
- if 'model' not in st.session_state or st.session_state.model is None:
1239
- st.warning("Please train a model in the Model Training section first.")
1240
  st.stop()
1241
-
1242
- model = st.session_state.model.steps[-1][1] # Get the trained model
1243
- preprocessor = st.session_state.model.steps[0][1] # Get the preprocessor
1244
-
1245
- # Single Prediction
1246
- st.subheader("🎯 Single Prediction")
1247
- input_data = {}
1248
- feature_names = preprocessor.get_feature_names_out()
1249
- for feature in feature_names:
1250
- if feature in st.session_state.cleaned_data.columns:
1251
- if pd.api.types.is_numeric_dtype(st.session_state.cleaned_data[feature]):
1252
- input_data[feature] = st.number_input(f"Enter {feature}", value=st.session_state.cleaned_data[feature].mean())
1253
- else:
1254
- input_data[feature] = st.selectbox(f"Select {feature}", st.session_state.cleaned_data[feature].unique())
1255
-
1256
- if st.button("Make Prediction"):
1257
- try:
1258
- input_df = pd.DataFrame([input_data])
1259
- input_processed = preprocessor.transform(input_df)
1260
- prediction = model.predict(input_processed)[0]
1261
-
1262
- st.write(f"**Prediction:** {prediction}")
1263
-
1264
- if hasattr(model, 'predict_proba'):
1265
- probabilities = model.predict_proba(input_processed)[0]
1266
- st.write("**Prediction Probabilities:**")
1267
- st.bar_chart(probabilities)
1268
-
1269
- # SHAP Explanation
1270
- if st.checkbox("Show SHAP Explanation"):
1271
- try:
1272
- import shap
1273
- # Use KernelExplainer or TreeExplainer, checking if the model has the property first
1274
- if hasattr(model, 'predict'):
1275
- explainer = shap.TreeExplainer(model)
1276
- shap_values = explainer.shap_values(input_processed)
1277
- else:
1278
- explainer = shap.KernelExplainer(model.predict, st.session_state.X_train_selected[:100, :])
1279
- shap_values = explainer.shap_values(input_processed)
1280
-
1281
- st.write("### SHAP Values")
1282
- fig, ax = plt.subplots()
1283
- shap.force_plot(explainer.expected_value, shap_values, input_processed,
1284
- feature_names=feature_names, matplotlib=True, show=False)
1285
- st.pyplot(fig)
1286
-
1287
- except Exception as e:
1288
- st.error(f"SHAP calculation failed: {e}")
1289
-
1290
- except Exception as e:
1291
- st.error(f"Prediction failed: {e}")
1292
-
1293
- # Batch Predictions
1294
- st.subheader("📂 Batch Predictions")
1295
- batch_file = st.file_uploader("Upload CSV for Batch Predictions", type=["csv"])
1296
- if batch_file is not None:
1297
- try:
1298
- batch_df = pd.read_csv(batch_file)
1299
- batch_processed = preprocessor.transform(batch_df)
1300
- batch_predictions = model.predict(batch_processed)
1301
- batch_df['Prediction'] = batch_predictions
1302
-
1303
- if hasattr(model, 'predict_proba'):
1304
- probabilities = model.predict_proba(batch_processed)
1305
- for i in range(probabilities.shape[1]):
1306
- batch_df[f'Probability_Class_{i}'] = probabilities[:, i]
1307
-
1308
- st.write("### Predictions Preview")
1309
- st.dataframe(batch_df.head())
1310
-
1311
- # Download Predictions
1312
- csv = batch_df.to_csv(index=False)
1313
- b64 = base64.b64encode(csv.encode()).decode()
1314
- href = f'<a href="data:file/csv;base64,{b64}" download="predictions.csv">Download Predictions CSV</a>'
1315
- st.markdown(href, unsafe_allow_html=True)
1316
-
1317
- except Exception as e:
1318
- st.error(f"Batch prediction failed: {e}")
1319
-
1320
- # Prediction Analysis
1321
- st.subheader("📊 Prediction Analysis")
1322
- if st.checkbox("Analyze Predictions"):
1323
- try:
1324
- y_pred = model.predict(st.session_state.X_test_selected)
1325
- y_test = st.session_state.y_test
1326
-
1327
- if hasattr(model, 'predict'):
1328
- fig, ax = plt.subplots()
1329
- ax.scatter(y_test, y_pred)
1330
- ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
1331
- ax.set_xlabel('Actual')
1332
- ax.set_ylabel('Predicted')
1333
- ax.set_title('Actual vs Predicted')
1334
- st.pyplot(fig)
1335
- else:
1336
- conf_matrix = confusion_matrix(y_test, y_pred)
1337
- fig, ax = plt.subplots()
1338
- sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
1339
- ax.set_xlabel('Predicted Labels')
1340
- ax.set_ylabel('True Labels')
1341
- ax.set_title('Confusion Matrix')
1342
- st.pyplot(fig)
1343
- except Exception as e:
1344
- st.error(f"Prediction analysis failed: {e}")
1345
-
1346
- # Prediction Export
1347
- st.subheader("💾 Export Predictions")
1348
- if st.button("Export Predictions as PDF"):
1349
- try:
1350
- from fpdf import FPDF
1351
- pdf = FPDF()
1352
- pdf.add_page()
1353
- pdf.set_font("Arial", size=12)
1354
- pdf.cell(200, 10, txt="Predictions Report", ln=True, align='C')
1355
- pdf.cell(200, 10, txt=f"Model Type: {type(model).__name__}", ln=True)
1356
- pdf.cell(200, 10, txt=f"Problem Type: {'Regression' if hasattr(model, 'predict') else 'Classification'}", ln=True)
1357
- pdf.output("predictions_report.pdf")
1358
- st.success("Predictions exported successfully!")
1359
- except Exception as e:
1360
- st.error(f"An unexpected error occurred: {e}")
 
1
+ import streamlit as st
 
2
  import pandas as pd
3
+ import numpy as np
4
+ import plotly.express as px
5
+ import plotly.graph_objects as go
 
 
 
6
  from sklearn.preprocessing import StandardScaler, LabelEncoder
7
  from sklearn.model_selection import train_test_split
8
+ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
9
+ from sklearn.metrics import accuracy_score, mean_squared_error
 
 
10
  from ydata_profiling import ProfileReport
11
  from streamlit_pandas_profiling import st_profile_report
 
 
 
 
 
 
 
 
12
  import joblib
13
+ import shap
14
+ from datetime import datetime
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
+ # --------------------------
17
+ # Page Configuration
18
+ # --------------------------
19
  st.set_page_config(
20
+ page_title="DataInsight Pro",
21
+ page_icon="🔮",
22
  layout="wide",
 
23
  initial_sidebar_state="expanded"
24
  )
25
+
26
+ # --------------------------
27
+ # Custom Styling
28
+ # --------------------------
29
+ st.markdown("""
30
+ <style>
31
+ .main {background-color: #f8f9fa;}
32
+ .sidebar .sidebar-content {background-color: #2c3e50;}
33
+ .stButton>button {background-color: #3498db; color: white;}
34
+ .stTextInput>div>div>input {border: 1px solid #3498db;}
35
+ .stSelectbox>div>div>select {border: 1px solid #3498db;}
36
+ .stSlider>div>div>div>div {background-color: #3498db;}
37
+ .metric {padding: 15px; background-color: white; border-radius: 10px; box-shadow: 0 2px 5px rgba(0,0,0,0.1);}
38
+ </style>
39
+ """, unsafe_allow_html=True)
40
+
41
+ # --------------------------
42
+ # Session State Initialization
43
+ # --------------------------
44
  if 'raw_data' not in st.session_state:
45
  st.session_state.raw_data = None
46
  if 'cleaned_data' not in st.session_state:
47
  st.session_state.cleaned_data = None
 
48
  if 'model' not in st.session_state:
49
  st.session_state.model = None
 
 
 
 
 
 
 
 
 
 
 
50
 
51
+ # --------------------------
+ # Helper Functions
+ # --------------------------
+ def generate_quality_report(df):
      """Generate comprehensive data quality report"""
      report = {
+         'basic': {
              'rows': df.shape[0],
              'columns': df.shape[1],
+             'missing': df.isna().sum().sum(),
              'duplicates': df.duplicated().sum()
          },
+         'columns': {}
      }
      for col in df.columns:
          col_report = {
              'type': str(df[col].dtype),
              'unique': df[col].nunique(),
              'missing': df[col].isna().sum(),
          }
          if pd.api.types.is_numeric_dtype(df[col]):
              col_report.update({
                  'mean': df[col].mean(),
                  'std': df[col].std(),
+                 'zeros': (df[col] == 0).sum()
              })
+         report['columns'][col] = col_report
      return report
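+
+ # A minimal usage sketch (assumption — the helper above is not wired into any
+ # page in this commit); the nested dict renders cleanly with st.json:
+ #     report = generate_quality_report(df)
+ #     st.json(report['basic'])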
+
+ # --------------------------
+ # Sidebar Navigation
+ # --------------------------
+ with st.sidebar:
+     st.title("🔮 DataInsight Pro")
+     app_mode = st.selectbox(
+         "Navigation",
+         ["Data Upload", "Data Cleaning", "EDA", "Model Training", "Predictions"],
+         format_func=lambda x: f"📌 {x}"
+     )
+     st.markdown("---")
+     st.markdown("Created by [Your Name]")
+     st.markdown("v1.2 | © 2024")
+
+ # --------------------------
+ # Main App Pages
+ # --------------------------
  if app_mode == "Data Upload":
+     st.title("📤 Data Upload & Profiling")
+
+     uploaded_file = st.file_uploader("Upload your dataset (CSV/XLSX)", type=["csv", "xlsx"])
+
      if uploaded_file:
+         try:
+             if uploaded_file.name.endswith('.csv'):
+                 df = pd.read_csv(uploaded_file)
              else:
+                 df = pd.read_excel(uploaded_file)
+
+             st.session_state.raw_data = df
+
+             col1, col2, col3 = st.columns(3)
+             with col1:
+                 st.metric("Rows", df.shape[0])
+             with col2:
+                 st.metric("Columns", df.shape[1])
+             with col3:
+                 st.metric("Missing Values", int(df.isna().sum().sum()))  # cast NumPy int for st.metric
+
+             with st.expander("Data Preview", expanded=True):
+                 st.dataframe(df.head(10), use_container_width=True)
+
+             if st.button("Generate Full Profile Report"):
+                 with st.spinner("Generating comprehensive analysis..."):
+                     pr = ProfileReport(df, explorative=True)
+                     st_profile_report(pr)
+
+         except Exception as e:
+             st.error(f"Error loading file: {str(e)}")
+
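+ # Hedged sketch (assumption, not part of this commit): caching the parse step
+ # with st.cache_data would avoid re-reading the upload on every widget rerun:
+ #     @st.cache_data
+ #     def load_dataset(file):
+ #         return pd.read_csv(file) if file.name.endswith('.csv') else pd.read_excel(file)
+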
+ elif app_mode == "Data Cleaning":
131
+ st.title("🧹 Smart Data Cleaning")
 
 
 
132
 
133
+ if st.session_state.raw_data is None:
134
+ st.warning("Please upload data first")
135
+ st.stop()
136
 
137
+     df = st.session_state.raw_data.copy()
+
+     # Missing Value Handling
+     with st.expander("🔍 Missing Values Treatment", expanded=True):
+         missing_cols = df.columns[df.isna().any()].tolist()
+         if missing_cols:
+             cols = st.multiselect("Select columns to handle", missing_cols)
+             method = st.selectbox("Imputation Method", [
+                 "Drop Missing",
+                 "Mean/Median",
+                 "Custom Value"
+             ])
+             # Fix: "Custom Value" was offered but had no input widget or branch
+             custom_value = st.text_input("Custom fill value") if method == "Custom Value" else None
+
+             if st.button("Apply Treatment"):
+                 if method == "Drop Missing":
+                     df = df.dropna(subset=cols)
+                 elif method == "Mean/Median":
                      for col in cols:
+                         if pd.api.types.is_numeric_dtype(df[col]):
+                             df[col] = df[col].fillna(df[col].median())
+                 else:
+                     df[cols] = df[cols].fillna(custom_value)
+                 st.session_state.cleaned_data = df
+                 st.success("Missing values handled successfully!")
+         else:
+             st.success("No missing values found!")
+
+     # Data Type Conversion
+     with st.expander("🔄 Data Type Conversion"):
+         col_to_convert = st.selectbox("Select column", df.columns)
+         new_type = st.selectbox("New data type", [
+             "String", "Integer", "Float",
+             "Boolean", "Datetime"
          ])
+
+         if st.button("Convert"):
              try:
                  if new_type == "String":
                      df[col_to_convert] = df[col_to_convert].astype(str)
                  elif new_type == "Integer":
                      df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce').astype('Int64')
+                 elif new_type == "Float":  # fix: Float/Boolean/Datetime were offered but unhandled
+                     df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce')
+                 elif new_type == "Boolean":
+                     df[col_to_convert] = df[col_to_convert].astype(bool)  # naive truthiness cast
+                 elif new_type == "Datetime":
+                     df[col_to_convert] = pd.to_datetime(df[col_to_convert], errors='coerce')
+                 st.session_state.cleaned_data = df
+                 st.success("Conversion successful!")
              except Exception as e:
+                 st.error(f"Error: {str(e)}")
+
+     if st.session_state.cleaned_data is not None:
+         with st.expander("✨ Cleaned Data Preview"):
+             st.dataframe(st.session_state.cleaned_data.head(), use_container_width=True)
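+
+     # Hedged sketch (assumption, not part of this commit): offering the cleaned
+     # frame for download would close the loop on this page, e.g.
+     #     st.download_button("Download cleaned CSV",
+     #                        st.session_state.cleaned_data.to_csv(index=False),
+     #                        file_name="cleaned.csv")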
+ elif app_mode == "EDA":
186
+ st.title("🔍 Exploratory Data Analysis")
187
+
188
+ if st.session_state.cleaned_data is None:
189
+ st.warning("Please clean your data first")
190
  st.stop()
191
+
192
+ df = st.session_state.cleaned_data
193
+
194
+ # Visualization Selector
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
  col1, col2 = st.columns([1, 3])
 
196
  with col1:
197
+ st.subheader("Visualization Setup")
198
+ plot_type = st.selectbox("Choose plot type", [
199
+ "Scatter Plot", "Histogram",
200
+ "Box Plot", "Correlation Matrix"
201
+ ])
202
+
203
+ x_axis = st.selectbox("X-Axis", df.columns)
204
+ y_axis = st.selectbox("Y-Axis", df.columns) if plot_type in ["Scatter Plot", "Box Plot"] else None
205
+ color_by = st.selectbox("Color By", [None] + df.columns.tolist())
206
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
      with col2:
+         st.subheader("Visualization")
+         try:
+             if plot_type == "Scatter Plot":
+                 fig = px.scatter(df, x=x_axis, y=y_axis, color=color_by)
+             elif plot_type == "Histogram":
+                 fig = px.histogram(df, x=x_axis, color=color_by)
+             elif plot_type == "Box Plot":
+                 fig = px.box(df, x=x_axis, y=y_axis, color=color_by)
+             elif plot_type == "Correlation Matrix":
+                 corr = df.select_dtypes(include=np.number).corr()
+                 fig = px.imshow(corr, text_auto=True)
+
              st.plotly_chart(fig, use_container_width=True)
+         except Exception as e:
+             st.error(f"Visualization error: {str(e)}")
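+
+         # Hedged sketch (assumption, not part of this commit): the rendered
+         # figure could also be offered as a download inside the try block, e.g.
+         #     st.download_button("Download chart (HTML)", fig.to_html(), file_name="chart.html")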
+
+ elif app_mode == "Model Training":
+     st.title("🤖 Intelligent Model Training")
+
+     if st.session_state.cleaned_data is None:
+         st.warning("Please clean your data first")
          st.stop()
+
+     df = st.session_state.cleaned_data
+
+     # Model Setup
+     col1, col2 = st.columns(2)
+     with col1:
+         target = st.selectbox("Select Target Variable", df.columns)
+         problem_type = st.selectbox("Problem Type", ["Classification", "Regression"])
+     with col2:
+         features = st.multiselect("Select Features", df.columns.drop(target))
+         test_size = st.slider("Test Size", 0.1, 0.5, 0.2)
+
+     if st.button("Train Model"):
          try:
+             X = df[features]
+             y = df[target]
+
+             # Preprocessing
+             X = pd.get_dummies(X)
+             y = LabelEncoder().fit_transform(y) if problem_type == "Classification" else y
+
+             X_train, X_test, y_train, y_test = train_test_split(
+                 X, y, test_size=test_size, random_state=42
+             )
+
+             # Model Training
+             if problem_type == "Classification":
+                 model = RandomForestClassifier()
              else:
+                 model = RandomForestRegressor()
+
+             model.fit(X_train, y_train)
+             st.session_state.model = model
+
+             # Evaluation
+             y_pred = model.predict(X_test)
+             if problem_type == "Classification":
+                 accuracy = accuracy_score(y_test, y_pred)
+                 st.metric("Accuracy", f"{accuracy:.2%}")
+             else:
+                 mse = mean_squared_error(y_test, y_pred)
+                 st.metric("MSE", f"{mse:.2f}")
+
+             # Feature Importance
+             fig = px.bar(
+                 x=model.feature_importances_,
+                 y=X.columns,
+                 orientation='h',
+                 title="Feature Importance"
              )
+             st.plotly_chart(fig, use_container_width=True)
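+
+             # Hedged addition (assumption — joblib and datetime are imported at
+             # the top but unused in this commit): persist the trained model so it
+             # can be reloaded later; the filename pattern is illustrative.
+             model_path = f"model_{datetime.now():%Y%m%d_%H%M%S}.joblib"
+             joblib.dump(model, model_path)
+             st.caption(f"Model saved to {model_path}")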
+
          except Exception as e:
+             st.error(f"Training failed: {str(e)}")
+
  elif app_mode == "Predictions":
286
+ st.title("🔮 Predictive Analytics")
287
+
288
+ if st.session_state.model is None:
289
+ st.warning("Please train a model first")
 
 
 
 
290
  st.stop()
291
+
292
+     model = st.session_state.model
+
+     # Prediction Interface
+     col1, col2 = st.columns(2)
+     with col1:
+         st.subheader("Input Parameters")
+         input_data = {}
+         for feature in model.feature_names_in_:
+             input_data[feature] = st.number_input(feature)
+
+     with col2:
+         st.subheader("Prediction Result")
+         if st.button("Generate Prediction"):
+             try:
+                 input_df = pd.DataFrame([input_data])
+                 prediction = model.predict(input_df)[0]
+                 st.metric("Predicted Value", str(prediction))  # cast: st.metric expects int/float/str
+
+                 # SHAP Explanation — TreeExplainer output shapes vary by model and
+                 # shap version: classifiers can return one array per class, and
+                 # expected_value can be a scalar or an array, so normalize both
+                 # (the unguarded [0] indexing crashed for regressors).
+                 explainer = shap.TreeExplainer(model)
+                 shap_values = explainer.shap_values(input_df)
+                 values = shap_values[0] if isinstance(shap_values, list) else shap_values
+                 base_value = np.ravel(explainer.expected_value)[0]
+                 fig = shap.force_plot(
+                     base_value,
+                     values,
+                     input_df.iloc[0],
+                     matplotlib=False
+                 )
+                 st.components.v1.html(shap.getjs() + fig.html(), height=300)
+
+             except Exception as e:
+                 st.error(f"Prediction failed: {str(e)}")
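+
+ # Note (assumptions about intent): model.feature_names_in_ reflects the one-hot
+ # columns produced by pd.get_dummies at training time, so the numeric inputs
+ # above follow that encoded layout; for classification, predictions surface as
+ # label-encoded integers because the LabelEncoder is not kept in session state.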