Update app.py
app.py CHANGED

@@ -3,6 +3,15 @@ import pandas as pd
 import numpy as np
 import plotly.express as px
 import plotly.graph_objects as go
+from sklearn.impute import SimpleImputer
+from sklearn.model_selection import GridSearchCV
+from sklearn.linear_model import LogisticRegression
+from sklearn.svm import SVC
+from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
+from sklearn.neural_network import MLPRegressor, MLPClassifier
+from sklearn.metrics import confusion_matrix, classification_report, r2_score
+from sklearn.model_selection import cross_val_score
+import scipy.stats as stats
 import matplotlib.pyplot as plt  # For SHAP charts
 from scipy.stats import pearsonr, spearmanr
 from sklearn.inspection import permutation_importance
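
The nine added imports supply the estimators, tuning, and metrics utilities used by the Model Training changes below. `sklearn.model_selection` is imported on two separate lines; the pair could be collapsed into one (a cosmetic consolidation, not part of the commit):

    from sklearn.model_selection import GridSearchCV, cross_val_score
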
@@ -356,20 +365,20 @@ if app_mode == "Data Upload":
 # --------------------------
 elif app_mode == "Data Cleaning":
     st.title("🧹 Smart Data Cleaning")
-
-
+
+    # Check for raw data FIRST
+    if 'raw_data' not in st.session_state:
         st.warning("Please upload data first")
         st.stop()
-
-    #
-    df = st.session_state.cleaned_data.copy()  # Changed line
-
-    # Initialize session state
+
+    # Initialize data_versions and cleaned_data together
     if 'data_versions' not in st.session_state:
         st.session_state.data_versions = [st.session_state.raw_data.copy()]
     if 'cleaned_data' not in st.session_state:
         st.session_state.cleaned_data = st.session_state.raw_data.copy()
-
+
+    # Now safely use cleaned_data
+    df = st.session_state.cleaned_data.copy()

     # --------------------------
     # Data Health Dashboard
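
Every cleaning action below funnels through `update_cleaned_data(new_df)`, a helper defined elsewhere in app.py that this diff never shows. A minimal sketch of what it presumably does, given the `data_versions` undo stack initialized above (the body is an assumption, not the commit's code):

    def update_cleaned_data(new_df):
        # Assumed helper: push a copy onto the undo stack, then publish it
        st.session_state.data_versions.append(new_df.copy())
        st.session_state.cleaned_data = new_df
        st.success("Changes applied!")
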
@@ -394,11 +403,12 @@ elif app_mode == "Data Cleaning":
     # --------------------------
     # Undo Functionality
     # --------------------------
-
+    # In Data Cleaning page's Undo section:
+    if 'data_versions' in st.session_state and len(st.session_state.data_versions) > 1:
         if st.button("⏮️ Undo Last Action"):
-            st.session_state.data_versions.pop()
-            st.session_state.cleaned_data = st.session_state.data_versions[-1].copy()
-            st.
+            st.session_state.data_versions.pop()
+            st.session_state.cleaned_data = st.session_state.data_versions[-1].copy()
+            st.rerun()

     # --------------------------
     # Missing Value Handling
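
The undo block's new code calls `st.rerun()`, Streamlit's stable rerun API, while the EDA and Model Training hunks below still use the deprecated `st.experimental_rerun()`. If the Space must also run on older Streamlit releases, a small shim keeps both paths working (hypothetical helper, not in the commit):

    def force_rerun():
        # Prefer the stable API; fall back on older Streamlit versions
        if hasattr(st, "rerun"):
            st.rerun()
        else:
            st.experimental_rerun()
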
@@ -439,7 +449,8 @@ elif app_mode == "Data Cleaning":
                     new_df[cols] = new_df[cols].bfill()

                 update_cleaned_data(new_df)
-
+                st.rerun()  # Force re-run after apply
+
             except Exception as e:
                 st.error(f"Error: {str(e)}")
         else:
@@ -480,6 +491,7 @@ elif app_mode == "Data Cleaning":
                 new_df[col_to_convert] = pd.to_datetime(new_df[col_to_convert], format=date_format, errors='coerce')

                 update_cleaned_data(new_df)
+                st.rerun()  # Force re-run after apply
             except Exception as e:
                 st.error(f"Error: {str(e)}")

@@ -495,6 +507,7 @@ elif app_mode == "Data Cleaning":
             new_df = df.copy()
             new_df = new_df.drop(columns=columns_to_drop)
             update_cleaned_data(new_df)
+            st.rerun()  # Force re-run after apply

     # --------------------------
     # Label Encoding
@@ -511,6 +524,7 @@ elif app_mode == "Data Cleaning":
                 new_df[col] = le.fit_transform(new_df[col].astype(str))
                 label_encoders[col] = le
             update_cleaned_data(new_df)
+            st.rerun()  # Force re-run after apply

     # --------------------------
     # StandardScaler
@@ -525,6 +539,7 @@ elif app_mode == "Data Cleaning":
                 scaler = StandardScaler()
                 new_df[scale_cols] = scaler.fit_transform(new_df[scale_cols])
                 update_cleaned_data(new_df)
+                st.rerun()  # Force re-run after apply
             except Exception as e:
                 st.error(f"Error: {str(e)}")

@@ -558,6 +573,7 @@ elif app_mode == "Data Cleaning":
             text_cols = new_df.select_dtypes(include='object').columns
             new_df[text_cols] = new_df[text_cols].apply(lambda x: x.str.strip())
             update_cleaned_data(new_df)
+            st.rerun()  # Force re-run after apply

     # --------------------------
     # Cleaned Data Preview
@@ -565,11 +581,10 @@ elif app_mode == "Data Cleaning":
     if st.session_state.get("cleaned_data") is not None:
         enhance_section_title("Cleaned Data Preview", "✨")
         with st.expander("✨ Cleaned Data Preview", expanded=True):
-            st.dataframe(
-
-
-
-            )
+            st.dataframe(st.session_state.cleaned_data.head(), use_container_width=True)
+
+
+

     # --------------------------
     # EDA
@@ -577,11 +592,31 @@ elif app_mode == "Data Cleaning":
 elif app_mode == "EDA":
     st.title("🔍 Interactive Data Explorer")

-
-
-    st.
+    # Universal check for all dependent pages
+    if 'cleaned_data' not in st.session_state:
+        st.warning("No cleaned data found! Please either:")
+
+        col1, col2 = st.columns(2)
+        with col1:
+            if st.button("↩️ Go to Data Cleaning"):
+                st.session_state.app_mode = "Data Cleaning"
+                st.experimental_rerun()
+
+        with col2:
+            uploaded_clean = st.file_uploader("📤 Or upload clean data",
+                                              type=["csv", "xlsx"])
+            if uploaded_clean:
+                try:
+                    st.session_state.cleaned_data = pd.read_csv(uploaded_clean)
+                    st.success("Loaded clean data!")
+                    st.experimental_rerun()
+                except Exception as e:
+                    st.error(f"Invalid file: {str(e)}")
+
+        st.stop()  # Halt execution until resolved

-
+    # Only reaches here if cleaned_data exists
+    df = st.session_state.cleaned_data.copy()

     # --------------------------
     # Enhanced Data Overview
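
The same "universal check" block is pasted verbatim into the Model Training hunk below. Since every page past Data Upload needs this guard, it could live in one shared helper; a refactoring sketch under that assumption (note also that the uploader accepts .xlsx but the handler always calls `pd.read_csv`, so Excel files would need a `pd.read_excel` branch):

    def require_cleaned_data():
        # Hypothetical shared guard for EDA, Model Training, and Predictions
        if 'cleaned_data' not in st.session_state:
            st.warning("No cleaned data found! Please clean or upload data first.")
            uploaded_clean = st.file_uploader("📤 Or upload clean data", type=["csv", "xlsx"])
            if uploaded_clean:
                name = uploaded_clean.name.lower()
                st.session_state.cleaned_data = (
                    pd.read_excel(uploaded_clean) if name.endswith(".xlsx")
                    else pd.read_csv(uploaded_clean)
                )
                st.rerun()
            st.stop()
        return st.session_state.cleaned_data.copy()
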
@@ -910,24 +945,32 @@ elif app_mode == "EDA":
 elif app_mode == "Model Training":
     st.title("🤖 Intelligent Model Training")

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Universal check for all dependent pages
+    if 'cleaned_data' not in st.session_state:
+        st.warning("No cleaned data found! Please either:")
+
+        col1, col2 = st.columns(2)
+        with col1:
+            if st.button("↩️ Go to Data Cleaning"):
+                st.session_state.app_mode = "Data Cleaning"
+                st.experimental_rerun()
+
+        with col2:
+            uploaded_clean = st.file_uploader("📤 Or upload clean data",
+                                              type=["csv", "xlsx"])
+            if uploaded_clean:
+                try:
+                    st.session_state.cleaned_data = pd.read_csv(uploaded_clean)
+                    st.success("Loaded clean data!")
+                    st.experimental_rerun()
+                except Exception as e:
+                    st.error(f"Invalid file: {str(e)}")
+
+        st.stop()  # Halt execution until resolved

+    # Only reaches here if cleaned_data exists
+    df = st.session_state.cleaned_data.copy()
+
     # Model Setup
     col1, col2, col3 = st.columns(3)
     with col1:
@@ -996,16 +1039,21 @@ elif app_mode == "Model Training":

     use_grid_search = st.checkbox("Use Grid Search for Hyperparameter Tuning")

+    # In Model Training section - Fix indentation for training logic
     if st.button("Train Model"):
         if not features:
             st.error("Please select at least one feature.")
             st.stop()
-
+
+        # INDENT ALL THIS CODE UNDER THE BUTTON CLICK
         # Call the training function
-        model, scaler, label_encoder, imputer_numerical, metrics, column_order, importance
+        model, scaler, label_encoder, imputer_numerical, metrics, column_order, importance, X_train, y_train = train_model(
+            df.copy(), target, features, problem_type, test_size, model_type, model_params, use_grid_search
+        )

-
-
+        if model:  # Only proceed if training was successful
+            st.success("Model trained successfully!")
+            # ... rest of model display code ...

         # Display Metrics
         st.subheader("Model Evaluation Metrics")
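
`train_model` itself is defined earlier in app.py and does not appear in this diff, but the new call site fixes its contract. A skeleton consistent with that call, with the signature taken from the diff and the body left as placeholders (an assumption, not the commit's implementation):

    from sklearn.model_selection import train_test_split

    def train_model(df, target, features, problem_type, test_size,
                    model_type, model_params, use_grid_search):
        X_train, X_test, y_train, y_test = train_test_split(
            df[features], df[target], test_size=test_size, random_state=42)
        # ... impute, scale, build `model_type` with `model_params`,
        # optionally wrap it in GridSearchCV, fit, and score ...
        model = scaler = label_encoder = imputer_numerical = None  # placeholders
        metrics, column_order, importance = {}, list(X_train.columns), None
        return (model, scaler, label_encoder, imputer_numerical,
                metrics, column_order, importance, X_train, y_train)
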
@@ -1109,10 +1157,22 @@ elif app_mode == "Model Training":
 # Predictions Section (Fixed)
 if app_mode == "Predictions":
     st.title("�� Predictive Analytics - Informed Business Decisions")
+    st.warning("Note: SHAP explanations currently work best with tree-based models like Random Forest")
+
+    # Add model upload section
+    uploaded_model = st.file_uploader("Upload trained model", type="joblib")
+    if uploaded_model:
+        try:
+            st.session_state.model = joblib.load(uploaded_model)
+            st.success("Model loaded successfully!")
+        except:
+            st.error("Invalid model file")

-    if st.session_state
-    st.warning("Please
+    if 'model' not in st.session_state:
+        st.warning("Please load a trained model first")
         st.stop()
+
+    # Rest of your predictions code...

     model_data = st.session_state.model  # Get the entire dictionary
     model = model_data['model']  # Access model
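
Two hardening notes on the upload block just added: `type="joblib"` only filters by file extension, and the bare `except:` also swallows `KeyboardInterrupt` and `SystemExit`. A slightly safer loader that also validates the dictionary shape the code below expects (a suggested variant, not the commit's code):

    uploaded_model = st.file_uploader("Upload trained model", type="joblib")
    if uploaded_model:
        try:
            model_data = joblib.load(uploaded_model)
            # The Predictions page indexes model_data['model'], so check that here
            if not isinstance(model_data, dict) or 'model' not in model_data:
                raise ValueError("expected a dict with a 'model' key")
            st.session_state.model = model_data
            st.success("Model loaded successfully!")
        except Exception as e:  # narrower than a bare except
            st.error(f"Invalid model file: {e}")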