CosmickVisions committed
Commit 9c080d6 · verified · 1 Parent(s): 756b59d

Update app.py

Files changed (1)
  1. app.py +255 -157
app.py CHANGED
@@ -12,12 +12,45 @@ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
12
  from sklearn.metrics import accuracy_score, mean_squared_error
13
  from ydata_profiling import ProfileReport
14
  from streamlit_pandas_profiling import st_profile_report
15
- import joblib # For saving and loading models
16
- import os # For file directory
17
  import shap
18
  from datetime import datetime
19
- from stqdm import stqdm
20
21
 
22
  # --------------------------
23
  # Helper Functions
@@ -25,8 +58,7 @@ from stqdm import stqdm
25
  def enhance_section_title(title, icon="✨"):
26
  """Helper function to create a styled section title with an icon."""
27
  st.markdown(f"<h2 style='border-bottom: 2px solid #ccc; padding-bottom: 5px;'>{icon} {title}</h2>", unsafe_allow_html=True)
28
-
29
- @st.cache_data
30
  def update_cleaned_data(df):
31
  """Updates the cleaned data in session state."""
32
  st.session_state.cleaned_data = df
@@ -34,7 +66,6 @@ def update_cleaned_data(df):
34
  st.session_state.data_versions.append(df.copy())
35
  st.success("Action completed successfully!")
36
 
37
- @st.cache_data
38
  def generate_quality_report(df):
39
  """Generate comprehensive data quality report"""
40
  report = {
@@ -60,8 +91,8 @@ def generate_quality_report(df):
60
  })
61
  report['columns'][col] = col_report
62
  return report
63
-
64
- @st.cache_data
65
  def train_model(df, target, features, problem_type, test_size, model_type, model_params, use_grid_search=False):
66
  """Trains a model with hyperparameter tuning, cross-validation, and customizable model architecture."""
67
 
@@ -69,36 +100,42 @@ def train_model(df, target, features, problem_type, test_size, model_type, model
69
  X = df[features]
70
  y = df[target]
71
 
72
- # Input Validation (rest of the input validation code remains the same)
73
  if target not in df.columns:
74
  raise ValueError(f"Target variable '{target}' not found in DataFrame.")
75
  for feature in features:
76
  if feature not in df.columns:
77
  raise ValueError(f"Feature '{feature}' not found in DataFrame.")
78
 
79
- # Preprocessing Pipeline (rest of preprocessing code remains the same)
 
80
  numerical_features = X.select_dtypes(include=np.number).columns
81
  categorical_features = X.select_dtypes(exclude=np.number).columns
82
 
83
  imputer_numerical = SimpleImputer(strategy='mean') # Or 'median', 'most_frequent', 'constant'
84
  X[numerical_features] = imputer_numerical.fit_transform(X[numerical_features])
85
 
86
- X = pd.get_dummies(X, columns=categorical_features, dummy_na=False)
 
87
 
88
- label_encoder = None # Initialize label_encoder
 
89
  if problem_type == "Classification" or problem_type == "Multiclass":
90
  label_encoder = LabelEncoder()
91
  y = label_encoder.fit_transform(y)
92
 
 
 
93
  X_train, X_test, y_train, y_test = train_test_split(
94
  X, y, test_size=test_size, random_state=42
95
  )
96
 
97
- scaler = StandardScaler()
98
- X_train = scaler.fit_transform(X_train)
99
- X_test = scaler.transform(X_test)
 
100
 
101
- # Model Selection and Hyperparameter Tuning (rest of model selection code remains the same)
102
  if problem_type == "Regression":
103
  if model_type == "Random Forest":
104
  model = RandomForestRegressor(random_state=42)
@@ -115,9 +152,9 @@ def train_model(df, target, features, problem_type, test_size, model_type, model
115
  'max_depth': [3, 5]
116
  }
117
  elif model_type == "Neural Network":
118
- model = MLPRegressor(random_state=42, max_iter=500)
119
  param_grid = {
120
- 'hidden_layer_sizes': [(50,), (100,), (50, 50)],
121
  'activation': ['relu', 'tanh'],
122
  'alpha': [0.0001, 0.001]
123
  }
@@ -141,9 +178,9 @@ def train_model(df, target, features, problem_type, test_size, model_type, model
141
  'max_depth': [3, 5]
142
  }
143
  elif model_type == "Neural Network":
144
- model = MLPClassifier(random_state=42, max_iter=500)
145
  param_grid = {
146
- 'hidden_layer_sizes': [(50,), (100,), (50, 50)],
147
  'activation': ['relu', 'tanh'],
148
  'alpha': [0.0001, 0.001]
149
  }
@@ -153,11 +190,11 @@ def train_model(df, target, features, problem_type, test_size, model_type, model
153
  elif problem_type == "Multiclass": #Multiclass
154
 
155
  if model_type == "Logistic Regression":
156
- model = LogisticRegression(random_state=42, solver='liblinear', multi_class='ovr')
157
- param_grid = {'C': [0.1, 1.0, 10.0]}
158
 
159
  elif model_type == "Support Vector Machine":
160
- model = SVC(random_state=42, probability=True)
161
  param_grid = {'C': [0.1, 1.0, 10.0], 'kernel': ['rbf', 'linear']}
162
 
163
  elif model_type == "Random Forest":
@@ -166,7 +203,7 @@ def train_model(df, target, features, problem_type, test_size, model_type, model
166
  'n_estimators': [100, 200],
167
  'max_depth': [None, 5, 10],
168
  'min_samples_split': [2, 5],
169
- 'criterion': ['gini', 'entropy']
170
  }
171
 
172
  else:
@@ -174,47 +211,51 @@ def train_model(df, target, features, problem_type, test_size, model_type, model
174
  else:
175
  raise ValueError(f"Invalid problem type: {problem_type}")
176
 
177
- param_grid.update(model_params)
 
178
 
179
  if use_grid_search:
180
  grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy' if problem_type in ['Classification', 'Multiclass'] else 'neg_mean_squared_error', verbose=1, n_jobs=-1)
181
- grid_search.fit(X_train, y_train)
182
- model = grid_search.best_estimator_
183
- st.write("Best hyperparameters found by Grid Search:", grid_search.best_params_)
184
 
185
  else:
186
- model.fit(X_train, y_train)
187
 
188
- cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy' if problem_type in ['Classification', 'Multiclass'] else 'neg_mean_squared_error')
 
189
  st.write("Cross-validation scores:", cv_scores)
190
  st.write("Mean cross-validation score:", cv_scores.mean())
191
 
192
- # Evaluation (rest of evaluation code remains the same)
193
- y_pred = model.predict(X_test)
194
- metrics = {}
195
 
196
  if problem_type == "Classification":
197
  metrics['accuracy'] = accuracy_score(y_test, y_pred)
198
  metrics['confusion_matrix'] = confusion_matrix(y_test, y_pred)
199
- metrics['classification_report'] = classification_report(y_test, y_pred, output_dict=True)
200
 
201
  elif problem_type == "Multiclass":
 
202
  metrics['accuracy'] = accuracy_score(y_test, y_pred)
203
  metrics['confusion_matrix'] = confusion_matrix(y_test, y_pred)
204
- metrics['classification_report'] = classification_report(y_test, y_pred, output_dict=True)
205
  else:
206
  metrics['mse'] = mean_squared_error(y_test, y_pred)
207
  metrics['r2'] = r2_score(y_test, y_pred)
208
 
209
- # Feature Importance (rest of feature importance code remains the same)
210
  try:
211
- result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42)
212
  importance = result.importances_mean
213
 
214
  except Exception as e:
215
  st.warning(f"Could not calculate feature importance: {e}")
216
  importance = None
217
 
 
218
  column_order = X.columns
219
 
220
  return model, scaler, label_encoder, imputer_numerical, metrics, column_order, importance, X_train, y_train # Return X_train and y_train
@@ -222,7 +263,8 @@ def train_model(df, target, features, problem_type, test_size, model_type, model
222
  except Exception as e:
223
  st.error(f"Training failed: {str(e)}")
224
  return None, None, None, None, None, None, None, None, None
225
-
 
226
  def validate_model(model_path, df, target, features, test_size):
227
  """Loads a model, preprocesses data, and evaluates the model on a validation set."""
228
  try:
@@ -304,13 +346,6 @@ def prediction_input_form(features, default_values=None):
304
  input_data[feature] = st.number_input(f"{feature}:", value=default_value)
305
  return input_data
306
 
307
- if 'raw_data' not in st.session_state:
308
- st.session_state.raw_data = None
309
- if 'cleaned_data' not in st.session_state:
310
- st.session_state.cleaned_data = None
311
- if 'data_versions' not in st.session_state:
312
- st.session_state.data_versions = []
313
-
314
  # --------------------------
315
  # Sidebar Navigation
316
  # --------------------------
@@ -330,20 +365,18 @@ with st.sidebar:
330
  # --------------------------
331
  if app_mode == "Data Upload":
332
  st.title("📤 Data Upload & Profiling")
333
-
334
- uploaded_file = st.file_uploader("Upload your dataset (CSV/XLSX)", type=["csv", "xlsx"], key="file_uploader")
335
-
336
  if uploaded_file:
337
  try:
338
  if uploaded_file.name.endswith('.csv'):
339
  df = pd.read_csv(uploaded_file)
340
  else:
341
  df = pd.read_excel(uploaded_file)
342
-
343
  st.session_state.raw_data = df
344
- st.session_state.cleaned_data = df.copy() # Set initial cleaned data
345
- st.session_state.data_versions = [df.copy()] # Initialize data versions
346
-
347
  col1, col2, col3 = st.columns(3)
348
  with col1:
349
  st.metric("Rows", df.shape[0])
@@ -351,19 +384,57 @@ if app_mode == "Data Upload":
351
  st.metric("Columns", df.shape[1])
352
  with col3:
353
  st.metric("Missing Values", df.isna().sum().sum())
354
-
355
  with st.expander("Data Preview", expanded=True):
356
  st.dataframe(df.head(10), use_container_width=True)
357
-
358
  if st.button("Generate Full Profile Report"):
359
  with st.spinner("Generating comprehensive analysis..."):
360
  pr = ProfileReport(df, explorative=True)
361
  st_profile_report(pr)
362
363
  except Exception as e:
364
  st.error(f"Error loading file: {str(e)}")
365
 
366
- # Data Cleaning Section
 
 
367
  elif app_mode == "Data Cleaning":
368
  st.title("🧹 Smart Data Cleaning")
369
 
@@ -394,6 +465,14 @@ elif app_mode == "Data Cleaning":
394
  profile = ProfileReport(df, minimal=True)
395
  st_profile_report(profile)
396
397
  # Missing Value Handling
398
  enhance_section_title("Missing Values Treatment", "🔍")
399
  with st.expander("🔍 Missing Values Treatment", expanded=True):
@@ -430,7 +509,7 @@ elif app_mode == "Data Cleaning":
430
  new_df[cols] = new_df[cols].bfill()
431
 
432
  update_cleaned_data(new_df)
433
- st.run() # Force re-run after apply
434
 
435
  # Data Type Conversion
436
  enhance_section_title("Data Type Conversion", "🔄")
@@ -465,7 +544,7 @@ elif app_mode == "Data Cleaning":
465
  new_df[col_to_convert] = pd.to_datetime(new_df[col_to_convert], format=date_format, errors='coerce')
466
 
467
  update_cleaned_data(new_df)
468
- st.run() # Force re-run after apply
469
  except Exception as e:
470
  st.error(f"Error: {str(e)}")
471
 
@@ -478,7 +557,7 @@ elif app_mode == "Data Cleaning":
478
  if st.button("Confirm Drop (Columns)"):
479
  new_df = df.drop(columns=columns_to_drop)
480
  update_cleaned_data(new_df)
481
- st.run() # Force re-run after apply
482
 
483
  # Label Encoding
484
  enhance_section_title("Label Encoding", "🔢")
@@ -491,7 +570,7 @@ elif app_mode == "Data Cleaning":
491
  le = LabelEncoder()
492
  new_df[col] = le.fit_transform(new_df[col].astype(str))
493
  update_cleaned_data(new_df)
494
- st.run() # Force re-run after apply
495
 
496
  # StandardScaler
497
  enhance_section_title("StandardScaler", "📏")
@@ -503,7 +582,7 @@ elif app_mode == "Data Cleaning":
503
  scaler = StandardScaler()
504
  new_df[scale_cols] = scaler.fit_transform(new_df[scale_cols])
505
  update_cleaned_data(new_df)
506
- st.run() # Force re-run after apply
507
 
508
  # Pattern-Based Cleaning
509
  enhance_section_title("Pattern-Based Cleaning", "🕵️")
@@ -516,13 +595,24 @@ elif app_mode == "Data Cleaning":
516
  new_df = df.copy()
517
  new_df[selected_col] = new_df[selected_col].str.replace(pattern, replacement, regex=True)
518
  update_cleaned_data(new_df)
519
- st.run() # Force re-run after apply
520
521
  enhance_section_title("✨ Cleaned Data Preview", "✨")
522
  with st.expander("✨ Cleaned Data Preview"):
523
- st.dataframe(new_df.head(), use_container_width=True)
524
-
525
-
526
 
527
 
528
  # --------------------------
@@ -531,16 +621,11 @@ elif app_mode == "Data Cleaning":
531
  elif app_mode == "EDA":
532
  st.title("🔍 Interactive Data Explorer")
533
 
534
- if st.session_state.raw_data is None:
535
- st.warning("Please upload data first")
536
- st.stop() # Stop execution if no data uploaded
537
-
538
- if 'cleaned_data' in st.session_state and st.session_state.cleaned_data is not None:
539
- df = st.session_state.cleaned_data.copy() # Work on the latest cleaned data
540
- else:
541
- st.warning("No cleaned data available. Please clean your data first.")
542
- st.stop()
543
 
 
544
 
545
  # --------------------------
546
  # Enhanced Data Overview
@@ -640,7 +725,7 @@ elif app_mode == "EDA":
640
  try:
641
  fig = None # Initialize fig to None
642
  if st.session_state.cleaned_data is None:
643
- st.warning("Please upload data first")
644
  st.stop()
645
 
646
  # Generate appropriate visualization with input validation
@@ -869,18 +954,23 @@ elif app_mode == "EDA":
869
  elif app_mode == "Model Training":
870
  st.title("🤖 Intelligent Model Training")
871
 
872
- if st.session_state.raw_data is None:
873
- st.warning("Please upload data first")
874
- st.stop() # Stop execution if no data uploaded
875
-
876
- if 'cleaned_data' in st.session_state and st.session_state.cleaned_data is not None:
877
- df = st.session_state.cleaned_data.copy() # Work on the latest cleaned data
878
- else:
879
- st.warning("No cleaned data available. Please clean your data first.")
880
- st.stop() # Stop execution if no cleaned data is available
881
-
882
- # Rest of the model training code...
883
 
 
884
 
885
  # Model Setup
886
  col1, col2, col3 = st.columns(3)
@@ -950,61 +1040,18 @@ elif app_mode == "Model Training":
950
 
951
  use_grid_search = st.checkbox("Use Grid Search for Hyperparameter Tuning")
952
 
953
- if st.button("Train Model"):
954
  if not features:
955
  st.error("Please select at least one feature.")
956
  st.stop()
957
 
958
  # Call the training function
959
- model, scaler, label_encoder, imputer_numerical, metrics, column_order, importance = train_model(df.copy(), target, features, problem_type, test_size, model_type, model_params, use_grid_search) # Pass a copy to avoid modifying the original
960
 
961
  if model: # Only proceed if training was successful
962
  st.success("Model trained successfully!")
963
 
964
- # Display Metrics
965
- st.subheader("Model Evaluation Metrics")
966
- if problem_type in ["Classification", "Multiclass"]: #Combined here
967
- st.metric("Accuracy", f"{metrics['accuracy']:.2%}")
968
-
969
- # Confusion Matrix Visualization
970
- st.subheader("Confusion Matrix")
971
- cm = metrics['confusion_matrix']
972
- class_names = [str(i) for i in np.unique(df[target])] #Get original class names
973
- fig_cm = px.imshow(cm,
974
- labels=dict(x="Predicted", y="Actual"),
975
- x=class_names,
976
- y=class_names,
977
- color_continuous_scale="Viridis")
978
- st.plotly_chart(fig_cm, use_container_width=True)
979
-
980
- # Classification Report
981
- st.subheader("Classification Report")
982
- report = metrics['classification_report']
983
- report_df = pd.DataFrame(report).transpose()
984
- st.dataframe(report_df)
985
-
986
- else:
987
- st.metric("MSE", f"{metrics['mse']:.2f}")
988
- st.metric("R2", f"{metrics['r2']:.2f}")
989
-
990
- # Feature Importance
991
- st.subheader("Feature Importance")
992
- try:
993
- fig_importance = px.bar(
994
- x=importance,
995
- y=column_order, #Use stored column order
996
- orientation='h',
997
- title="Feature Importance"
998
- )
999
- st.plotly_chart(fig_importance, use_container_width=True)
1000
- except Exception as e:
1001
- st.warning(f"Could not display feature importance: {e}")
1002
-
1003
- # Explainable AI (Placeholder)
1004
- st.subheader("Explainable AI (XAI)")
1005
- st.write("Future implementation will include model explanations using techniques like SHAP or LIME.") #To be implemented
1006
- if st.checkbox("Show a random model explanation (example)"): #Example of a feature, to be implemented
1007
- st.write("This feature is important because...")
1008
 
1009
  # Save Model
1010
  st.subheader("Save Model")
@@ -1060,22 +1107,12 @@ elif app_mode == "Model Training":
1060
  st.metric("MSE", f"{validation_metrics['mse']:.2f}")
1061
  st.metric("R2", f"{validation_metrics['r2']:.2f}")
1062
 
1063
- # Predictions Section (Fixed)
1064
  elif app_mode == "Predictions":
1065
- st.title("🔮 Predictive Analytics")
1066
-
1067
- if st.session_state.raw_data is None:
1068
- st.warning("Please upload data first")
1069
- st.stop() # Stop execution if no data uploaded
1070
-
1071
- if 'cleaned_data' in st.session_state and st.session_state.cleaned_data is not None:
1072
- df = st.session_state.cleaned_data.copy() # Work on the latest cleaned data
1073
- else:
1074
- st.warning("No cleaned data available. Please clean your data first.")
1075
- st.stop() # Stop execution if no cleaned data is available
1076
-
1077
- # Rest of the predictions code...
1078
1079
 
1080
  model_data = st.session_state.model # Get the entire dictionary
1081
  model = model_data['model'] # Access model
@@ -1095,8 +1132,8 @@ elif app_mode == "Predictions":
1095
 
1096
  with col2:
1097
  st.subheader("Data Overview")
1098
- input_df = pd.DataFrame([input_data]) # Make DataFrame
1099
- st.dataframe(input_df, use_container_width=True) # DataFrame of the input to see it
1100
 
1101
  # Predicts Function and Displays Result
1102
  if st.button("Generate Prediction & Insights"):
@@ -1110,12 +1147,14 @@ elif app_mode == "Predictions":
1110
 
1111
  # 3. One-hot encode (handle unseen categories)
1112
  categorical_features = input_df.select_dtypes(exclude=np.number).columns
1113
- input_df = pd.get_dummies(input_df, columns=categorical_features, dummy_na=False)
1114
 
1115
  # 4. Ensure correct column order
 
1116
  for col in column_order:
1117
  if col not in input_df.columns:
1118
  input_df[col] = 0
 
1119
  input_df = input_df[column_order]
1120
 
1121
  # 5. Scale the input
@@ -1138,21 +1177,29 @@ elif app_mode == "Predictions":
1138
 
1139
  if problem_type == "Classification":
1140
  explainer = shap.TreeExplainer(model)
1141
- shap_values = explainer.shap_values(scaled_input)
1142
- fig = shap.force_plot(explainer.expected_value[1], shap_values[1], input_df, matplotlib=False, link="logit")
1143
- st.components.v1.html(shap.getjs() + fig.html(), height=400, width=900)
 
 
 
1144
  else:
1145
- explainer = shap.TreeExplainer(model)
1146
- shap_values = explainer.shap_values(scaled_input)
1147
- fig = shap.force_plot(explainer.expected_value, shap_values, input_df, matplotlib=False)
1148
- st.components.v1.html(shap.getjs() + fig.html(), height=400, width=900)
 
1149
 
1150
  st.write("The visualization above explains how each feature contributed to the final prediction.")
1151
 
1152
  # 9. Add Permutation Feature Importance (for more global understanding)
1153
  try:
1154
  enhance_section_title("Global Feature Importance", "🌍")
1155
- X = pd.DataFrame(scaler.transform(input_df), columns=input_df.columns)
1156
  result = permutation_importance(model, X, input_df, n_repeats=10, random_state=42)
1157
  importance = result.importances_mean
1158
 
@@ -1163,4 +1210,55 @@ elif app_mode == "Predictions":
1163
  st.warning(f"Could not calculate permutation feature importance: {e}")
1164
 
1165
  except Exception as e:
1166
- st.error(f"Prediction failed: {str(e)}")
 
12
  from sklearn.metrics import accuracy_score, mean_squared_error
13
  from ydata_profiling import ProfileReport
14
  from streamlit_pandas_profiling import st_profile_report
15
+ import joblib
 
16
  import shap
17
  from datetime import datetime
 
18
 
19
+ # --------------------------
20
+ # Page Configuration
21
+ # --------------------------
22
+ st.set_page_config(
23
+ page_title="DataInsight Pro",
24
+ page_icon="🔮",
25
+ layout="wide",
26
+ initial_sidebar_state="expanded"
27
+ )
28
+
29
+
30
+ # --------------------------
31
+ # Custom Styling
32
+ # --------------------------
33
+ st.markdown("""
34
+ <style>
35
+ .main {background-color: #f8f9fa;}
36
+ .sidebar .sidebar-content {background-color: #2c3e50;}
37
+ .stButton>button {background-color: #3498db; color: white;}
38
+ .stTextInput>div>div>input {border: 1px solid #3498db;}
39
+ .stSelectbox>div>div>select {border: 1px solid #3498db;}
40
+ .stSlider>div>div>div>div {background-color: #3498db;}
41
+ .metric {padding: 15px; background-color: white; border-radius: 10px; box-shadow: 0 2px 5px rgba(0,0,0,0.1);}
42
+ </style>
43
+ """, unsafe_allow_html=True)
44
+
45
+ # --------------------------
46
+ # Session State Initialization
47
+ # --------------------------
48
+ if 'raw_data' not in st.session_state:
49
+ st.session_state.raw_data = None
50
+ if 'cleaned_data' not in st.session_state:
51
+ st.session_state.cleaned_data = None
52
+ if 'model' not in st.session_state:
53
+ st.session_state.model = None
54
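This initialization block covers raw_data, cleaned_data, and model, but update_cleaned_data() and the undo button below also read and append to st.session_state.data_versions, which is only initialized later in the __main__ guard. A minimal sketch of a complete first-run initialization (the data_versions default is an assumption drawn from how the helpers use it, not code from this commit):

import streamlit as st

# Give every session-state key the app reads a first-run default, so page code never hits a KeyError.
_DEFAULTS = {
    "raw_data": None,        # original upload
    "cleaned_data": None,    # latest cleaned DataFrame
    "model": None,           # trained model bundle
    "data_versions": [],     # history stack used by update_cleaned_data() and the undo button (assumed)
}
for key, default in _DEFAULTS.items():
    if key not in st.session_state:
        st.session_state[key] = default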
 
55
  # --------------------------
56
  # Helper Functions
 
58
  def enhance_section_title(title, icon="✨"):
59
  """Helper function to create a styled section title with an icon."""
60
  st.markdown(f"<h2 style='border-bottom: 2px solid #ccc; padding-bottom: 5px;'>{icon} {title}</h2>", unsafe_allow_html=True)
61
+
 
62
  def update_cleaned_data(df):
63
  """Updates the cleaned data in session state."""
64
  st.session_state.cleaned_data = df
 
66
  st.session_state.data_versions.append(df.copy())
67
  st.success("Action completed successfully!")
68
 
 
69
  def generate_quality_report(df):
70
  """Generate comprehensive data quality report"""
71
  report = {
 
91
  })
92
  report['columns'][col] = col_report
93
  return report
94
+
95
+ # Function to train the model (Separated for clarity and reusability)
96
  def train_model(df, target, features, problem_type, test_size, model_type, model_params, use_grid_search=False):
97
  """Trains a model with hyperparameter tuning, cross-validation, and customizable model architecture."""
98
 
 
100
  X = df[features]
101
  y = df[target]
102
 
103
+ # Input Validation
104
  if target not in df.columns:
105
  raise ValueError(f"Target variable '{target}' not found in DataFrame.")
106
  for feature in features:
107
  if feature not in df.columns:
108
  raise ValueError(f"Feature '{feature}' not found in DataFrame.")
109
 
110
+ # Preprocessing Pipeline: Handles missing values, encoding, scaling
111
+ # Imputation: Handle missing values BEFORE encoding (numerical only for SimpleImputer)
112
  numerical_features = X.select_dtypes(include=np.number).columns
113
  categorical_features = X.select_dtypes(exclude=np.number).columns
114
 
115
  imputer_numerical = SimpleImputer(strategy='mean') # Or 'median', 'most_frequent', 'constant'
116
  X[numerical_features] = imputer_numerical.fit_transform(X[numerical_features])
117
 
118
+ # Encoding (One-Hot Encode Categorical Features)
119
+ X = pd.get_dummies(X, columns=categorical_features, dummy_na=False) # dummy_na = False. We imputed already.
120
 
121
+ # Target Encoding (if classification)
122
+ label_encoder = None #Initialize label_encoder
123
  if problem_type == "Classification" or problem_type == "Multiclass":
124
  label_encoder = LabelEncoder()
125
  y = label_encoder.fit_transform(y)
126
 
127
+
128
+ # Split the data
129
  X_train, X_test, y_train, y_test = train_test_split(
130
  X, y, test_size=test_size, random_state=42
131
  )
132
 
133
+ # Scaling (AFTER splitting!)
134
+ scaler = StandardScaler() # Or try MinMaxScaler, RobustScaler, QuantileTransformer
135
+ X_train_scaled = scaler.fit_transform(X_train) #Fit to the training data ONLY
136
+ X_test_scaled = scaler.transform(X_test) #Transform the test data using the fitted scaler
137
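Fitting the scaler on the training split only is the key leakage fix in this hunk. The same impute, encode, and scale sequence can also be expressed as one scikit-learn Pipeline around a ColumnTransformer, which keeps preprocessing and model fitting bound together; this is only a sketch of that alternative, not what the commit implements:

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

def build_pipeline(numerical_features, categorical_features):
    """Bundle imputation, encoding, and scaling so fit() only ever sees training data."""
    numeric = Pipeline([
        ("impute", SimpleImputer(strategy="mean")),
        ("scale", StandardScaler()),
    ])
    categorical = OneHotEncoder(handle_unknown="ignore")  # unseen categories become all-zero columns
    preprocess = ColumnTransformer([
        ("num", numeric, list(numerical_features)),
        ("cat", categorical, list(categorical_features)),
    ])
    return Pipeline([("preprocess", preprocess), ("model", RandomForestClassifier(random_state=42))])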
 
138
+ # Model Selection and Hyperparameter Tuning
139
  if problem_type == "Regression":
140
  if model_type == "Random Forest":
141
  model = RandomForestRegressor(random_state=42)
 
152
  'max_depth': [3, 5]
153
  }
154
  elif model_type == "Neural Network":
155
+ model = MLPRegressor(random_state=42, max_iter=500) #set max_iter to 500
156
  param_grid = {
157
+ 'hidden_layer_sizes': [(50,), (100,), (50, 50)], #example sizes for depth
158
  'activation': ['relu', 'tanh'],
159
  'alpha': [0.0001, 0.001]
160
  }
 
178
  'max_depth': [3, 5]
179
  }
180
  elif model_type == "Neural Network":
181
+ model = MLPClassifier(random_state=42, max_iter=500) #set max_iter to 500
182
  param_grid = {
183
+ 'hidden_layer_sizes': [(50,), (100,), (50, 50)], #example sizes for depth
184
  'activation': ['relu', 'tanh'],
185
  'alpha': [0.0001, 0.001]
186
  }
 
190
  elif problem_type == "Multiclass": #Multiclass
191
 
192
  if model_type == "Logistic Regression":
193
+ model = LogisticRegression(random_state=42, solver='liblinear', multi_class='ovr') # 'ovr' for one-vs-rest
194
+ param_grid = {'C': [0.1, 1.0, 10.0]} # Regularization parameter
195
 
196
  elif model_type == "Support Vector Machine":
197
+ model = SVC(random_state=42, probability=True) # probability=True for probabilities
198
  param_grid = {'C': [0.1, 1.0, 10.0], 'kernel': ['rbf', 'linear']}
199
 
200
  elif model_type == "Random Forest":
 
203
  'n_estimators': [100, 200],
204
  'max_depth': [None, 5, 10],
205
  'min_samples_split': [2, 5],
206
+ 'criterion': ['gini', 'entropy'] #criterion for decision
207
  }
208
 
209
  else:
 
211
  else:
212
  raise ValueError(f"Invalid problem type: {problem_type}")
213
 
214
+ # Update param_grid with user-defined parameters
215
+ param_grid.update(model_params) #This is key to use the model_params provided by user
216
 
217
  if use_grid_search:
218
  grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy' if problem_type in ['Classification', 'Multiclass'] else 'neg_mean_squared_error', verbose=1, n_jobs=-1)
219
+ grid_search.fit(X_train_scaled, y_train) # Use scaled training data
220
+ model = grid_search.best_estimator_ # Use the best model found
221
+ st.write("Best hyperparameters found by Grid Search:", grid_search.best_params_) #Print best parameters
222
 
223
  else:
224
+ model.fit(X_train_scaled, y_train) # Use scaled training data
225
 
226
+ # Cross-Validation (after hyperparameter tuning, if applicable)
227
+ cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='accuracy' if problem_type in ['Classification', 'Multiclass'] else 'neg_mean_squared_error') # Use scaled training data
228
  st.write("Cross-validation scores:", cv_scores)
229
  st.write("Mean cross-validation score:", cv_scores.mean())
230
 
231
+ # Evaluation
232
+ y_pred = model.predict(X_test_scaled) # Use scaled test data
233
+ metrics = {} #Store metrics in a dictionary
234
 
235
  if problem_type == "Classification":
236
  metrics['accuracy'] = accuracy_score(y_test, y_pred)
237
  metrics['confusion_matrix'] = confusion_matrix(y_test, y_pred)
238
+ metrics['classification_report'] = classification_report(y_test, y_pred, output_dict=True) #Get report as dictionary
239
 
240
  elif problem_type == "Multiclass":
241
+
242
  metrics['accuracy'] = accuracy_score(y_test, y_pred)
243
  metrics['confusion_matrix'] = confusion_matrix(y_test, y_pred)
244
+ metrics['classification_report'] = classification_report(y_test, y_pred, output_dict=True) #Get report as dictionary
245
  else:
246
  metrics['mse'] = mean_squared_error(y_test, y_pred)
247
  metrics['r2'] = r2_score(y_test, y_pred)
248
 
249
+ # Feature Importance (Permutation Importance for potentially better handling of correlated features)
250
  try:
251
+ result = permutation_importance(model, X_test_scaled, y_test, n_repeats=10, random_state=42) #Permutation Feature Importance # Use scaled test data
252
  importance = result.importances_mean
253
 
254
  except Exception as e:
255
  st.warning(f"Could not calculate feature importance: {e}")
256
  importance = None
257
 
258
+ # Store the column order for prediction purposes
259
  column_order = X.columns
260
 
261
  return model, scaler, label_encoder, imputer_numerical, metrics, column_order, importance, X_train, y_train # Return X_train and y_train
 
263
  except Exception as e:
264
  st.error(f"Training failed: {str(e)}")
265
  return None, None, None, None, None, None, None, None, None
266
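train_model() returns importances_mean as a bare array next to column_order; pairing the two makes the result readable before plotting. A small helper sketch (the function name is illustrative, not taken from this commit):

import pandas as pd

def importance_table(importance, column_order):
    """Pair permutation importances with the encoded column names, largest first."""
    if importance is None:
        return pd.DataFrame(columns=["feature", "importance"])
    table = pd.DataFrame({"feature": list(column_order), "importance": importance})
    return table.sort_values("importance", ascending=False).reset_index(drop=True)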
+
267
+ # Model Validation Function
268
  def validate_model(model_path, df, target, features, test_size):
269
  """Loads a model, preprocesses data, and evaluates the model on a validation set."""
270
  try:
 
346
  input_data[feature] = st.number_input(f"{feature}:", value=default_value)
347
  return input_data
348
 
349
  # --------------------------
350
  # Sidebar Navigation
351
  # --------------------------
 
365
  # --------------------------
366
  if app_mode == "Data Upload":
367
  st.title("📤 Data Upload & Profiling")
368
+
369
+ uploaded_file = st.file_uploader("Upload your dataset (CSV/XLSX)", type=["csv", "xlsx"])
370
+
371
  if uploaded_file:
372
  try:
373
  if uploaded_file.name.endswith('.csv'):
374
  df = pd.read_csv(uploaded_file)
375
  else:
376
  df = pd.read_excel(uploaded_file)
377
+
378
  st.session_state.raw_data = df
379
+
 
 
380
  col1, col2, col3 = st.columns(3)
381
  with col1:
382
  st.metric("Rows", df.shape[0])
 
384
  st.metric("Columns", df.shape[1])
385
  with col3:
386
  st.metric("Missing Values", df.isna().sum().sum())
387
+
388
  with st.expander("Data Preview", expanded=True):
389
  st.dataframe(df.head(10), use_container_width=True)
390
+
391
  if st.button("Generate Full Profile Report"):
392
  with st.spinner("Generating comprehensive analysis..."):
393
  pr = ProfileReport(df, explorative=True)
394
  st_profile_report(pr)
395
+
396
+ except Exception as e:
397
+ st.error(f"Error loading file: {str(e)}")
398
 
399
+ # --------------------------
400
+ # Page Content
401
+ # --------------------------
402
+ if app_mode == "Data Upload":
403
+ st.title("📤 Data Upload & Profiling")
404
+
405
+ uploaded_file = st.file_uploader("Upload your dataset (CSV/XLSX)", type=["csv", "xlsx"])
406
+
407
+ if uploaded_file:
408
+ try:
409
+ if uploaded_file.name.endswith('.csv'):
410
+ df = pd.read_csv(uploaded_file)
411
+ else:
412
+ df = pd.read_excel(uploaded_file)
413
+
414
+ st.session_state.raw_data = df
415
+
416
+ col1, col2, col3 = st.columns(3)
417
+ with col1:
418
+ st.metric("Rows", df.shape[0])
419
+ with col2:
420
+ st.metric("Columns", df.shape[1])
421
+ with col3:
422
+ st.metric("Missing Values", df.isna().sum().sum())
423
+
424
+ with st.expander("Data Preview", expanded=True):
425
+ st.dataframe(df.head(10), use_container_width=True)
426
+
427
+ if st.button("Generate Full Profile Report"):
428
+ with st.spinner("Generating comprehensive analysis..."):
429
+ pr = ProfileReport(df, explorative=True)
430
+ st_profile_report(pr)
431
+
432
  except Exception as e:
433
  st.error(f"Error loading file: {str(e)}")
434
 
435
+ # --------------------------
436
+ # Page Content
437
+ # --------------------------
438
  elif app_mode == "Data Cleaning":
439
  st.title("🧹 Smart Data Cleaning")
440
 
 
465
  profile = ProfileReport(df, minimal=True)
466
  st_profile_report(profile)
467
 
468
+ # Undo Functionality
469
+ if len(st.session_state.data_versions) > 1:
470
+ if st.button("⏮️ Undo Last Action"):
471
+ st.session_state.data_versions.pop() # Remove current version
472
+ st.session_state.cleaned_data = st.session_state.data_versions[-1].copy() # Set data
473
+ st.success("Last action undone!")
474
+ st.experimental_rerun() #Force re-run after undo
475
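The undo button works because update_cleaned_data() pushes a copy of the DataFrame onto data_versions after every cleaning step, so undo only has to pop the newest copy and restore the one before it. Stripped of Streamlit, the pattern is a plain version stack, sketched here for clarity (names are illustrative):

history = []  # stack of snapshots (e.g. DataFrame copies), oldest first

def record_version(df):
    """Push a copy so the current state can be restored later."""
    history.append(df.copy())

def undo():
    """Discard the newest snapshot and return the previous one, or None if nothing to undo."""
    if len(history) > 1:
        history.pop()
        return history[-1].copy()
    return None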
+
476
  # Missing Value Handling
477
  enhance_section_title("Missing Values Treatment", "🔍")
478
  with st.expander("🔍 Missing Values Treatment", expanded=True):
 
509
  new_df[cols] = new_df[cols].bfill()
510
 
511
  update_cleaned_data(new_df)
512
+ st.experimental_rerun() # Force re-run after apply
513
 
514
  # Data Type Conversion
515
  enhance_section_title("Data Type Conversion", "🔄")
 
544
  new_df[col_to_convert] = pd.to_datetime(new_df[col_to_convert], format=date_format, errors='coerce')
545
 
546
  update_cleaned_data(new_df)
547
+ st.experimental_rerun() # Force re-run after apply
548
  except Exception as e:
549
  st.error(f"Error: {str(e)}")
550
 
 
557
  if st.button("Confirm Drop (Columns)"):
558
  new_df = df.drop(columns=columns_to_drop)
559
  update_cleaned_data(new_df)
560
+ st.experimental_rerun() # Force re-run after apply
561
 
562
  # Label Encoding
563
  enhance_section_title("Label Encoding", "🔢")
 
570
  le = LabelEncoder()
571
  new_df[col] = le.fit_transform(new_df[col].astype(str))
572
  update_cleaned_data(new_df)
573
+ st.experimental_rerun() # Force re-run after apply
574
 
575
  # StandardScaler
576
  enhance_section_title("StandardScaler", "📏")
 
582
  scaler = StandardScaler()
583
  new_df[scale_cols] = scaler.fit_transform(new_df[scale_cols])
584
  update_cleaned_data(new_df)
585
+ st.experimental_rerun() # Force re-run after apply
586
 
587
  # Pattern-Based Cleaning
588
  enhance_section_title("Pattern-Based Cleaning", "🕵️")
 
595
  new_df = df.copy()
596
  new_df[selected_col] = new_df[selected_col].str.replace(pattern, replacement, regex=True)
597
  update_cleaned_data(new_df)
598
+ st.experimental_rerun() # Force re-run after apply
599
 
600
+ # Bulk Operations
601
+ enhance_section_title("Bulk Actions", "🚀")
602
+ with st.expander("🚀 Bulk Actions"):
603
+ if st.button("Auto-Clean Common Issues (Cleaning)"):
604
+ new_df = df.copy()
605
+ new_df = new_df.dropna(axis=1, how='all') # Remove empty cols
606
+ new_df = new_df.convert_dtypes() # Better type inference
607
+ text_cols = new_df.select_dtypes(include='object').columns
608
+ new_df[text_cols] = new_df[text_cols].apply(lambda x: x.str.strip())
609
+ update_cleaned_data(new_df)
610
+ st.experimental_rerun() # Force re-run after apply
611
+
612
+ # Cleaned Data Preview
613
  enhance_section_title("✨ Cleaned Data Preview", "✨")
614
  with st.expander("✨ Cleaned Data Preview"):
615
+ st.dataframe(st.session_state.cleaned_data.head(), use_container_width=True)
 
 
616
 
617
 
618
  # --------------------------
 
621
  elif app_mode == "EDA":
622
  st.title("🔍 Interactive Data Explorer")
623
 
624
+ if st.session_state.cleaned_data is None:
625
+ st.warning("Please clean your data first")
626
+ st.stop()
627
 
628
+ df = st.session_state.cleaned_data
629
 
630
  # --------------------------
631
  # Enhanced Data Overview
 
725
  try:
726
  fig = None # Initialize fig to None
727
  if st.session_state.cleaned_data is None:
728
+ st.warning("Please clean your data first")
729
  st.stop()
730
 
731
  # Generate appropriate visualization with input validation
 
954
  elif app_mode == "Model Training":
955
  st.title("🤖 Intelligent Model Training")
956
 
957
+ if st.session_state.get("cleaned_data") is None:
958
+ st.warning("Please clean your data first")
959
+ # Show Upload Clean Data button
960
+ uploaded_clean_file = st.file_uploader("Upload your cleaned dataset (CSV/XLSX)", type=["csv", "xlsx"])
961
+ if uploaded_clean_file:
962
+ try:
963
+ if uploaded_clean_file.name.endswith('.csv'):
964
+ df = pd.read_csv(uploaded_clean_file)
965
+ else:
966
+ df = pd.read_excel(uploaded_clean_file)
967
+ st.session_state.cleaned_data = df
968
+ st.success("Cleaned data uploaded successfully!")
969
+ except Exception as e:
970
+ st.error(f"Error loading file: {str(e)}")
971
+ st.stop()
972
 
973
+ df = st.session_state.cleaned_data
974
 
975
  # Model Setup
976
  col1, col2, col3 = st.columns(3)
 
1040
 
1041
  use_grid_search = st.checkbox("Use Grid Search for Hyperparameter Tuning")
1042
 
1043
+ if st.button("Train Model"):
1044
  if not features:
1045
  st.error("Please select at least one feature.")
1046
  st.stop()
1047
 
1048
  # Call the training function
1049
+ model, scaler, label_encoder, imputer_numerical, metrics, column_order, importance, X_train, y_train = train_model(df.copy(), target, features, problem_type, test_size, model_type, model_params, use_grid_search) # Pass a copy to avoid modifying the original # Capture X_train and y_train
1050
 
1051
  if model: # Only proceed if training was successful
1052
  st.success("Model trained successfully!")
1053
 
1054
+ # ... (rest of the Model Training code - metrics display, feature importance, saving model) ...
1055
 
1056
  # Save Model
1057
  st.subheader("Save Model")
 
1107
  st.metric("MSE", f"{validation_metrics['mse']:.2f}")
1108
  st.metric("R2", f"{validation_metrics['r2']:.2f}")
1109
 
 
1110
  elif app_mode == "Predictions":
1111
+ st.title("🔮 Predictive Analytics - Informed Business Decisions")
1112
 
1113
+ if st.session_state.get("model") is None:
1114
+ st.warning("Please train a model first")
1115
+ st.stop()
1116
 
1117
  model_data = st.session_state.model # Get the entire dictionary
1118
  model = model_data['model'] # Access model
 
1132
 
1133
  with col2:
1134
  st.subheader("Data Overview")
1135
+ input_df = pd.DataFrame([input_data]) #Make DataFrame
1136
+ st.dataframe(input_df,use_container_width=True) #DataFrame of the input to see it
1137
 
1138
  # Predicts Function and Displays Result
1139
  if st.button("Generate Prediction & Insights"):
 
1147
 
1148
  # 3. One-hot encode (handle unseen categories)
1149
  categorical_features = input_df.select_dtypes(exclude=np.number).columns
1150
+ input_df = pd.get_dummies(input_df, columns=categorical_features, dummy_na=False) # dummy_na = False. We imputed already.
1151
 
1152
  # 4. Ensure correct column order
1153
+ # Add missing columns with 0 values
1154
  for col in column_order:
1155
  if col not in input_df.columns:
1156
  input_df[col] = 0
1157
+ # Reorder Columns
1158
  input_df = input_df[column_order]
1159
 
1160
  # 5. Scale the input
 
1177
 
1178
  if problem_type == "Classification":
1179
  explainer = shap.TreeExplainer(model)
1180
+ shap_values = explainer.shap_values(scaled_input) # Use the scaled input
1181
+ # class_names = [str(i) for i in range(len(shap_values))] # Dynamic class names - not needed for force plot
1182
+
1183
+ fig = shap.force_plot(explainer.expected_value[1], shap_values[1], input_df, matplotlib=False,link="logit") # shap_values[1] for class 1 - force plot
1184
+ st.components.v1.html(shap.getjs() + fig.html(), height=400, width=900) # Adjust height and width as needed.
1185
+
1186
  else:
1187
+ explainer = shap.TreeExplainer(model) # Regression
1188
+ shap_values = explainer.shap_values(scaled_input) # Use the scaled input
1189
+
1190
+ fig = shap.force_plot(explainer.expected_value, shap_values, input_df, matplotlib=False) # shap_values single array for regression
1191
+ st.components.v1.html(shap.getjs() + fig.html(), height=400, width=900) # Adjust height and width as needed.
1192
 
1193
  st.write("The visualization above explains how each feature contributed to the final prediction.")
1194
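shap.TreeExplainer only supports tree ensembles (the Random Forest and Gradient Boosting options here); if the stored model is a logistic regression, SVM, or MLP, constructing it will raise. A hedged fallback sketch using the generic shap.Explainer with a background sample (the background data is an assumption; this commit does not pass one):

import shap

def make_explainer(model, background_X):
    """Use the fast TreeExplainer when the model supports it, else the model-agnostic Explainer."""
    try:
        return shap.TreeExplainer(model)
    except Exception:
        # Model-agnostic path: needs background data to estimate expected values.
        return shap.Explainer(model.predict, shap.sample(background_X, 100))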
 
1195
  # 9. Add Permutation Feature Importance (for more global understanding)
1196
  try:
1197
  enhance_section_title("Global Feature Importance", "🌍")
1198
+ X = pd.DataFrame(scaler.transform(pd.get_dummies(pd.DataFrame(imputer_numerical.transform(input_df), columns=input_df.columns))), columns=input_df.columns) # Apply preprocessing for permutation
1199
+ #X = pd.DataFrame(scaler.transform(input_df), columns = input_df.columns)
1200
+ #X = input_df[input_df.columns]
1201
+ X_train = model_data['X_train'] #Get X train
1202
+ y_train = model_data['y_train'] #Get Y train
1203
  result = permutation_importance(model, X, input_df, n_repeats=10, random_state=42)
1204
  importance = result.importances_mean
1205
 
 
1210
  st.warning(f"Could not calculate permutation feature importance: {e}")
1211
 
1212
  except Exception as e:
1213
+ st.error(f"Prediction failed: {str(e)}")
1214
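The permutation-importance call above passes input_df where permutation_importance expects the target y, and a single prediction row cannot support a meaningful global ranking in any case. Since X_train and y_train are already pulled from the model bundle a few lines earlier, a hedged sketch of computing the ranking from the training split instead (assuming those arrays are the scaled split returned by train_model):

import pandas as pd
from sklearn.inspection import permutation_importance

def global_importance(model, X_train_scaled, y_train, feature_names, n_repeats=10):
    """Rank features by how much shuffling each one degrades the model on labelled data."""
    result = permutation_importance(
        model, X_train_scaled, y_train, n_repeats=n_repeats, random_state=42
    )
    table = pd.DataFrame({"feature": list(feature_names), "importance": result.importances_mean})
    return table.sort_values("importance", ascending=False).reset_index(drop=True)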
+
1215
+ # Force rerun Streamlit app after data cleaning operations
1216
+ st.experimental_rerun()
1217
+
1218
+ if __name__ == "__main__":
1219
+ # Session State Initialization
1220
+ if 'raw_data' not in st.session_state:
1221
+ st.session_state.raw_data = None
1222
+ if 'cleaned_data' not in st.session_state:
1223
+ st.session_state.cleaned_data = None
1224
+ if 'model' not in st.session_state:
1225
+ st.session_state.model = None
1226
+ if 'data_versions' not in st.session_state:
1227
+ st.session_state.data_versions = []
1228
+
1229
+ # Custom Styling (Keep it in main if needed)
1230
+ st.markdown("""
1231
+ <style>
1232
+ .main {background-color: #f8f9fa;}
1233
+ .sidebar .sidebar-content {background-color: #2c3e50;}
1234
+ .stButton>button {background-color: #3498db; color: white;}
1235
+ .stTextInput>div>div>input {border: 1px solid #3498db;}
1236
+ .stSelectbox>div>div>select {border: 1px solid #3498db;}
1237
+ .stSlider>div>div>div>div {background-color: #3498db;}
1238
+ .metric {padding: 15px; background-color: white; border-radius: 10px; box-shadow: 0 2px 5px rgba(0,0,0,0.1);}
1239
+ </style>
1240
+ """, unsafe_allow_html=True)
1241
+
1242
+ # Sidebar Navigation (Keep it in main)
1243
+ with st.sidebar:
1244
+ st.title("🔮 DataInsight Pro")
1245
+ app_mode = st.selectbox(
1246
+ "Navigation",
1247
+ ["Data Upload", "Data Cleaning", "EDA", "Model Training", "Predictions"],
1248
+ format_func=lambda x: f"📌 {x}"
1249
+ )
1250
+ st.markdown("---")
1251
+ st.markdown("Created by Calvin Allen-Crawford")
1252
+ st.markdown("v1.0 | © 2025")
1253
+
1254
+ # Call app mode function based on selection
1255
+ if app_mode == "Data Upload":
1256
+ app_mode_data_upload()
1257
+ elif app_mode == "Data Cleaning":
1258
+ app_mode_data_cleaning()
1259
+ elif app_mode == "EDA":
1260
+ app_mode_eda()
1261
+ elif app_mode == "Model Training":
1262
+ app_mode_model_training()
1263
+ elif app_mode == "Predictions":
1264
+ app_mode_predictions()
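The __main__ block above dispatches to app_mode_data_upload(), app_mode_data_cleaning(), and so on, but no functions with those names are defined anywhere in this diff; the page logic still lives in the module-level `if app_mode == ...` branches. A hedged sketch of the refactor those calls imply (the function bodies are placeholders, not code from this commit):

import streamlit as st

def app_mode_data_upload():
    """Would wrap the existing module-level 'Data Upload' branch."""
    st.title("📤 Data Upload & Profiling")
    # move the upload / profiling code here

def app_mode_data_cleaning():
    """Would wrap the existing 'Data Cleaning' branch."""
    st.title("🧹 Smart Data Cleaning")
    # move the cleaning code here

PAGES = {
    "Data Upload": app_mode_data_upload,
    "Data Cleaning": app_mode_data_cleaning,
    # "EDA", "Model Training", and "Predictions" would follow the same pattern
}

def main():
    with st.sidebar:
        choice = st.selectbox("Navigation", list(PAGES.keys()), format_func=lambda x: f"📌 {x}")
    PAGES[choice]()

if __name__ == "__main__":
    main()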