CosmickVisions committed on
Commit e5b765a · verified · 1 Parent(s): 4438fd1

Update app.py

Files changed (1)
  1. app.py +186 -243
app.py CHANGED
@@ -12,45 +12,12 @@ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
12
  from sklearn.metrics import accuracy_score, mean_squared_error
13
  from ydata_profiling import ProfileReport
14
  from streamlit_pandas_profiling import st_profile_report
15
- import joblib
 
16
  import shap
17
  from datetime import datetime
 
18
 
19
- # --------------------------
20
- # Page Configuration
21
- # --------------------------
22
- st.set_page_config(
23
- page_title="DataInsight Pro",
24
- page_icon="🔮",
25
- layout="wide",
26
- initial_sidebar_state="expanded"
27
- )
28
-
29
-
30
- # --------------------------
31
- # Custom Styling
32
- # --------------------------
33
- st.markdown("""
34
- <style>
35
- .main {background-color: #f8f9fa;}
36
- .sidebar .sidebar-content {background-color: #2c3e50;}
37
- .stButton>button {background-color: #3498db; color: white;}
38
- .stTextInput>div>div>input {border: 1px solid #3498db;}
39
- .stSelectbox>div>div>select {border: 1px solid #3498db;}
40
- .stSlider>div>div>div>div {background-color: #3498db;}
41
- .metric {padding: 15px; background-color: white; border-radius: 10px; box-shadow: 0 2px 5px rgba(0,0,0,0.1);}
42
- </style>
43
- """, unsafe_allow_html=True)
44
-
45
- # --------------------------
46
- # Session State Initialization
47
- # --------------------------
48
- if 'raw_data' not in st.session_state:
49
- st.session_state.raw_data = None
50
- if 'cleaned_data' not in st.session_state:
51
- st.session_state.cleaned_data = None
52
- if 'model' not in st.session_state:
53
- st.session_state.model = None
54
 
55
  # --------------------------
56
  # Helper Functions
@@ -66,6 +33,7 @@ def update_cleaned_data(df):
66
  st.session_state.data_versions.append(df.copy())
67
  st.success("Action completed successfully!")
68
 
 
69
  def generate_quality_report(df):
70
  """Generate comprehensive data quality report"""
71
  report = {
@@ -91,8 +59,8 @@ def generate_quality_report(df):
91
  })
92
  report['columns'][col] = col_report
93
  return report
94
-
95
- # Function to train the model (Separated for clarity and reusability)
96
  def train_model(df, target, features, problem_type, test_size, model_type, model_params, use_grid_search=False):
97
  """Trains a model with hyperparameter tuning, cross-validation, and customizable model architecture."""
98
 
@@ -100,42 +68,36 @@ def train_model(df, target, features, problem_type, test_size, model_type, model
100
  X = df[features]
101
  y = df[target]
102
 
103
- # Input Validation
104
  if target not in df.columns:
105
  raise ValueError(f"Target variable '{target}' not found in DataFrame.")
106
  for feature in features:
107
  if feature not in df.columns:
108
  raise ValueError(f"Feature '{feature}' not found in DataFrame.")
109
 
110
- # Preprocessing Pipeline: Handles missing values, encoding, scaling
111
- # Imputation: Handle missing values BEFORE encoding (numerical only for SimpleImputer)
112
  numerical_features = X.select_dtypes(include=np.number).columns
113
  categorical_features = X.select_dtypes(exclude=np.number).columns
114
 
115
  imputer_numerical = SimpleImputer(strategy='mean') # Or 'median', 'most_frequent', 'constant'
116
  X[numerical_features] = imputer_numerical.fit_transform(X[numerical_features])
117
 
118
- # Encoding (One-Hot Encode Categorical Features)
119
- X = pd.get_dummies(X, columns=categorical_features, dummy_na=False) # dummy_na = False. We imputed already.
120
 
121
- # Target Encoding (if classification)
122
- label_encoder = None #Initialize label_encoder
123
  if problem_type == "Classification" or problem_type == "Multiclass":
124
  label_encoder = LabelEncoder()
125
  y = label_encoder.fit_transform(y)
126
 
127
-
128
- # Split the data
129
  X_train, X_test, y_train, y_test = train_test_split(
130
  X, y, test_size=test_size, random_state=42
131
  )
132
 
133
- # Scaling (AFTER splitting!)
134
- scaler = StandardScaler() # Or try MinMaxScaler, RobustScaler, QuantileTransformer
135
- X_train_scaled = scaler.fit_transform(X_train) #Fit to the training data ONLY
136
- X_test_scaled = scaler.transform(X_test) #Transform the test data using the fitted scaler
137
 
138
- # Model Selection and Hyperparameter Tuning
139
  if problem_type == "Regression":
140
  if model_type == "Random Forest":
141
  model = RandomForestRegressor(random_state=42)
@@ -152,9 +114,9 @@ def train_model(df, target, features, problem_type, test_size, model_type, model
152
  'max_depth': [3, 5]
153
  }
154
  elif model_type == "Neural Network":
155
- model = MLPRegressor(random_state=42, max_iter=500) #set max_iter to 500
156
  param_grid = {
157
- 'hidden_layer_sizes': [(50,), (100,), (50, 50)], #example sizes for depth
158
  'activation': ['relu', 'tanh'],
159
  'alpha': [0.0001, 0.001]
160
  }
@@ -178,9 +140,9 @@ def train_model(df, target, features, problem_type, test_size, model_type, model
178
  'max_depth': [3, 5]
179
  }
180
  elif model_type == "Neural Network":
181
- model = MLPClassifier(random_state=42, max_iter=500) #set max_iter to 500
182
  param_grid = {
183
- 'hidden_layer_sizes': [(50,), (100,), (50, 50)], #example sizes for depth
184
  'activation': ['relu', 'tanh'],
185
  'alpha': [0.0001, 0.001]
186
  }
@@ -190,11 +152,11 @@ def train_model(df, target, features, problem_type, test_size, model_type, model
190
  elif problem_type == "Multiclass": #Multiclass
191
 
192
  if model_type == "Logistic Regression":
193
- model = LogisticRegression(random_state=42, solver='liblinear', multi_class='ovr') # 'ovr' for one-vs-rest
194
- param_grid = {'C': [0.1, 1.0, 10.0]} # Regularization parameter
195
 
196
  elif model_type == "Support Vector Machine":
197
- model = SVC(random_state=42, probability=True) # probability=True for probabilities
198
  param_grid = {'C': [0.1, 1.0, 10.0], 'kernel': ['rbf', 'linear']}
199
 
200
  elif model_type == "Random Forest":
@@ -203,7 +165,7 @@ def train_model(df, target, features, problem_type, test_size, model_type, model
203
  'n_estimators': [100, 200],
204
  'max_depth': [None, 5, 10],
205
  'min_samples_split': [2, 5],
206
- 'criterion': ['gini', 'entropy'] #criterion for decision
207
  }
208
 
209
  else:
@@ -211,51 +173,47 @@ def train_model(df, target, features, problem_type, test_size, model_type, model
211
  else:
212
  raise ValueError(f"Invalid problem type: {problem_type}")
213
 
214
- # Update param_grid with user-defined parameters
215
- param_grid.update(model_params) #This is key to use the model_params provided by user
216
 
217
  if use_grid_search:
218
  grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy' if problem_type in ['Classification', 'Multiclass'] else 'neg_mean_squared_error', verbose=1, n_jobs=-1)
219
- grid_search.fit(X_train_scaled, y_train) # Use scaled training data
220
- model = grid_search.best_estimator_ # Use the best model found
221
- st.write("Best hyperparameters found by Grid Search:", grid_search.best_params_) #Print best parameters
222
 
223
  else:
224
- model.fit(X_train_scaled, y_train) # Use scaled training data
225
 
226
- # Cross-Validation (after hyperparameter tuning, if applicable)
227
- cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='accuracy' if problem_type in ['Classification', 'Multiclass'] else 'neg_mean_squared_error') # Use scaled training data
228
  st.write("Cross-validation scores:", cv_scores)
229
  st.write("Mean cross-validation score:", cv_scores.mean())
230
 
231
- # Evaluation
232
- y_pred = model.predict(X_test_scaled) # Use scaled test data
233
- metrics = {} #Store metrics in a dictionary
234
 
235
  if problem_type == "Classification":
236
  metrics['accuracy'] = accuracy_score(y_test, y_pred)
237
  metrics['confusion_matrix'] = confusion_matrix(y_test, y_pred)
238
- metrics['classification_report'] = classification_report(y_test, y_pred, output_dict=True) #Get report as dictionary
239
 
240
  elif problem_type == "Multiclass":
241
-
242
  metrics['accuracy'] = accuracy_score(y_test, y_pred)
243
  metrics['confusion_matrix'] = confusion_matrix(y_test, y_pred)
244
- metrics['classification_report'] = classification_report(y_test, y_pred, output_dict=True) #Get report as dictionary
245
  else:
246
  metrics['mse'] = mean_squared_error(y_test, y_pred)
247
  metrics['r2'] = r2_score(y_test, y_pred)
248
 
249
- # Feature Importance (Permutation Importance for potentially better handling of correlated features)
250
  try:
251
- result = permutation_importance(model, X_test_scaled, y_test, n_repeats=10, random_state=42) #Permutation Feature Importance # Use scaled test data
252
  importance = result.importances_mean
253
 
254
  except Exception as e:
255
  st.warning(f"Could not calculate feature importance: {e}")
256
  importance = None
257
 
258
- # Store the column order for prediction purposes
259
  column_order = X.columns
260
 
261
  return model, scaler, label_encoder, imputer_numerical, metrics, column_order, importance, X_train, y_train # Return X_train and y_train
@@ -263,8 +221,7 @@ def train_model(df, target, features, problem_type, test_size, model_type, model
263
  except Exception as e:
264
  st.error(f"Training failed: {str(e)}")
265
  return None, None, None, None, None, None, None, None, None
266
-
267
- # Model Validation Function
268
  def validate_model(model_path, df, target, features, test_size):
269
  """Loads a model, preprocesses data, and evaluates the model on a validation set."""
270
  try:
@@ -328,11 +285,9 @@ def validate_model(model_path, df, target, features, test_size):
328
  # Prediction helper Function
329
  def prediction_input_form(features, default_values=None):
330
  """Generates input forms for each feature and returns a dictionary of inputs.
331
-
332
  Args:
333
  features (list): List of feature names.
334
  default_values (dict, optional): Default values for each feature. Defaults to None.
335
-
336
  Returns:
337
  dict: Dictionary where keys are feature names and values are user inputs.
338
  """
@@ -365,54 +320,18 @@ with st.sidebar:
365
  # --------------------------
366
  if app_mode == "Data Upload":
367
  st.title("📤 Data Upload & Profiling")
368
-
369
- uploaded_file = st.file_uploader("Upload your dataset (CSV/XLSX)", type=["csv", "xlsx"])
370
-
371
- if uploaded_file:
372
- try:
373
- if uploaded_file.name.endswith('.csv'):
374
- df = pd.read_csv(uploaded_file)
375
- else:
376
- df = pd.read_excel(uploaded_file)
377
-
378
- st.session_state.raw_data = df
379
-
380
- col1, col2, col3 = st.columns(3)
381
- with col1:
382
- st.metric("Rows", df.shape[0])
383
- with col2:
384
- st.metric("Columns", df.shape[1])
385
- with col3:
386
- st.metric("Missing Values", df.isna().sum().sum())
387
-
388
- with st.expander("Data Preview", expanded=True):
389
- st.dataframe(df.head(10), use_container_width=True)
390
-
391
- if st.button("Generate Full Profile Report"):
392
- with st.spinner("Generating comprehensive analysis..."):
393
- pr = ProfileReport(df, explorative=True)
394
- st_profile_report(pr)
395
-
396
- except Exception as e:
397
- st.error(f"Error loading file: {str(e)}")
398
 
399
- # --------------------------
400
- # Page Content
401
- # --------------------------
402
- if app_mode == "Data Upload":
403
- st.title("📤 Data Upload & Profiling")
404
-
405
  uploaded_file = st.file_uploader("Upload your dataset (CSV/XLSX)", type=["csv", "xlsx"])
406
-
407
  if uploaded_file:
408
  try:
409
  if uploaded_file.name.endswith('.csv'):
410
  df = pd.read_csv(uploaded_file)
411
  else:
412
  df = pd.read_excel(uploaded_file)
413
-
414
  st.session_state.raw_data = df
415
-
416
  col1, col2, col3 = st.columns(3)
417
  with col1:
418
  st.metric("Rows", df.shape[0])
@@ -420,15 +339,15 @@ if app_mode == "Data Upload":
420
  st.metric("Columns", df.shape[1])
421
  with col3:
422
  st.metric("Missing Values", df.isna().sum().sum())
423
-
424
  with st.expander("Data Preview", expanded=True):
425
  st.dataframe(df.head(10), use_container_width=True)
426
-
427
  if st.button("Generate Full Profile Report"):
428
  with st.spinner("Generating comprehensive analysis..."):
429
  pr = ProfileReport(df, explorative=True)
430
  st_profile_report(pr)
431
-
432
  except Exception as e:
433
  st.error(f"Error loading file: {str(e)}")
434
 
@@ -440,16 +359,21 @@ elif app_mode == "Data Cleaning":
440
 
441
  if st.session_state.raw_data is None:
442
  st.warning("Please upload data first")
443
- st.stop() # Stop execution if no data uploaded
444
 
445
- if 'cleaned_data' in st.session_state and st.session_state.cleaned_data is not None:
446
- df = st.session_state.cleaned_data.copy() # Work on the latest cleaned data
447
- else:
448
- st.warning("No cleaned data available. Please clean your data first.")
449
- st.stop() # Stop execution if no cleaned data is available
 
 
450
 
 
451
  # Data Health Dashboard
 
452
  enhance_section_title("Data Health Dashboard", "📊")
 
453
  with st.expander("📊 Data Health Dashboard", expanded=True):
454
  col1, col2, col3 = st.columns(3)
455
  with col1:
@@ -465,15 +389,19 @@ elif app_mode == "Data Cleaning":
465
  profile = ProfileReport(df, minimal=True)
466
  st_profile_report(profile)
467
 
 
468
  # Undo Functionality
 
469
  if len(st.session_state.data_versions) > 1:
470
  if st.button("⏮️ Undo Last Action"):
471
  st.session_state.data_versions.pop() # Remove current version
472
  st.session_state.cleaned_data = st.session_state.data_versions[-1].copy() # Set data
473
  st.success("Last action undone!")
474
- st.experimental_rerun() #Force re-run after undo
475
 
 
476
  # Missing Value Handling
 
477
  enhance_section_title("Missing Values Treatment", "🔍")
478
  with st.expander("🔍 Missing Values Treatment", expanded=True):
479
  missing_cols = df.columns[df.isna().any()].tolist()
@@ -491,27 +419,35 @@ elif app_mode == "Data Cleaning":
491
  custom_val = st.text_input("Enter custom value")
492
 
493
  if st.button("Apply Treatment (Missing)"):
494
- new_df = df.copy()
495
- if method == "Drop Missing":
496
- new_df = new_df.dropna(subset=cols)
497
- elif method == "Mean/Median":
498
- for col in cols:
499
- if pd.api.types.is_numeric_dtype(new_df[col]):
500
- new_df[col] = new_df[col].fillna(new_df[col].median())
501
- else:
502
- new_df[col] = new_df[col].fillna(new_df[col].mode()[0])
503
- elif method == "Custom Value" and custom_val:
504
- for col in cols:
505
- new_df[col] = new_df[col].fillna(custom_val)
506
- elif method == "Forward Fill":
507
- new_df[cols] = new_df[cols].ffill()
508
- elif method == "Backward Fill":
509
- new_df[cols] = new_df[cols].bfill()
 
 
 
 
510
 
511
- update_cleaned_data(new_df)
512
- st.experimental_rerun() # Force re-run after apply
 
 
513
 
 
514
  # Data Type Conversion
 
515
  enhance_section_title("Data Type Conversion", "🔄")
516
  with st.expander("🔄 Data Type Conversion"):
517
  col_to_convert = st.selectbox("Select column", df.columns)
@@ -524,8 +460,8 @@ elif app_mode == "Data Cleaning":
524
  date_format = st.text_input("Date format (e.g. %Y-%m-%d)", "%Y-%m-%d")
525
 
526
  if st.button("Convert (Data Type)"):
527
- new_df = df.copy()
528
  try:
 
529
  if new_type == "String":
530
  new_df[col_to_convert] = new_df[col_to_convert].astype(str)
531
  elif new_type == "Integer":
@@ -544,47 +480,61 @@ elif app_mode == "Data Cleaning":
544
  new_df[col_to_convert] = pd.to_datetime(new_df[col_to_convert], format=date_format, errors='coerce')
545
 
546
  update_cleaned_data(new_df)
547
- st.experimental_rerun() # Force re-run after apply
548
  except Exception as e:
549
  st.error(f"Error: {str(e)}")
550
 
 
551
  # Drop Columns
 
552
  enhance_section_title("Drop Columns", "🗑️")
553
  with st.expander("🗑️ Drop Columns"):
554
  columns_to_drop = st.multiselect("Select columns to drop", df.columns)
555
  if columns_to_drop:
556
  st.warning(f"Will drop: {', '.join(columns_to_drop)}")
557
  if st.button("Confirm Drop (Columns)"):
558
- new_df = df.drop(columns=columns_to_drop)
 
559
  update_cleaned_data(new_df)
560
- st.experimental_rerun() # Force re-run after apply
561
 
 
562
  # Label Encoding
 
563
  enhance_section_title("Label Encoding", "🔢")
564
  with st.expander("🔢 Label Encoding"):
565
  data_to_encode = st.multiselect("Select categorical columns to encode", df.select_dtypes(include='object').columns)
566
  if data_to_encode:
567
  if st.button("Apply Label Encoding (Encoding)"):
568
  new_df = df.copy()
 
569
  for col in data_to_encode:
570
  le = LabelEncoder()
571
  new_df[col] = le.fit_transform(new_df[col].astype(str))
 
572
  update_cleaned_data(new_df)
573
- st.experimental_rerun() # Force re-run after apply
574
 
 
575
  # StandardScaler
 
576
  enhance_section_title("StandardScaler", "📏")
577
  with st.expander("📏 StandardScaler"):
578
  scale_cols = st.multiselect("Select numeric columns to scale", df.select_dtypes(include=np.number).columns)
579
  if scale_cols:
580
  if st.button("Apply StandardScaler (Scaling)"):
581
- new_df = df.copy()
582
- scaler = StandardScaler()
583
- new_df[scale_cols] = scaler.fit_transform(new_df[scale_cols])
584
- update_cleaned_data(new_df)
585
- st.experimental_rerun() # Force re-run after apply
 
 
 
586
 
 
587
  # Pattern-Based Cleaning
 
588
  enhance_section_title("Pattern-Based Cleaning", "🕵️")
589
  with st.expander("🕵️ Pattern-Based Cleaning"):
590
  selected_col = st.selectbox("Select text column", df.select_dtypes(include='object').columns)
@@ -592,12 +542,17 @@ elif app_mode == "Data Cleaning":
592
  replacement = st.text_input("Replacement value")
593
 
594
  if st.button("Apply Pattern Replacement (Replace)"):
595
- new_df = df.copy()
596
- new_df[selected_col] = new_df[selected_col].str.replace(pattern, replacement, regex=True)
597
- update_cleaned_data(new_df)
598
- st.experimental_rerun() # Force re-run after apply
 
 
 
599
 
 
600
  # Bulk Operations
 
601
  enhance_section_title("Bulk Actions", "🚀")
602
  with st.expander("🚀 Bulk Actions"):
603
  if st.button("Auto-Clean Common Issues (Cleaning)"):
@@ -607,12 +562,17 @@ elif app_mode == "Data Cleaning":
607
  text_cols = new_df.select_dtypes(include='object').columns
608
  new_df[text_cols] = new_df[text_cols].apply(lambda x: x.str.strip())
609
  update_cleaned_data(new_df)
610
- st.experimental_rerun() # Force re-run after apply
611
 
 
612
  # Cleaned Data Preview
613
- enhance_section_title("✨ Cleaned Data Preview", "✨")
614
- with st.expander(" Cleaned Data Preview"):
615
- st.dataframe(st.session_state.cleaned_data.head(), use_container_width=True)
 
 
 
 
616
 
617
 
618
  # --------------------------
@@ -725,7 +685,7 @@ elif app_mode == "EDA":
725
  try:
726
  fig = None # Initialize fig to None
727
  if st.session_state.cleaned_data is None:
728
- st.warning("Please clean your data first")
729
  st.stop()
730
 
731
  # Generate appropriate visualization with input validation
@@ -1046,12 +1006,55 @@ elif app_mode == "Model Training":
1046
  st.stop()
1047
 
1048
  # Call the training function
1049
- model, scaler, label_encoder, imputer_numerical, metrics, column_order, importance, X_train, y_train = train_model(df.copy(), target, features, problem_type, test_size, model_type, model_params, use_grid_search) # Pass a copy to avoid modifying the original # Capture X_train and y_train
1050
 
1051
  if model: # Only proceed if training was successful
1052
  st.success("Model trained successfully!")
1053
 
1054
- # ... (rest of the Model Training code - metrics display, feature importance, saving model) ...
1055
 
1056
  # Save Model
1057
  st.subheader("Save Model")
@@ -1107,7 +1110,8 @@ elif app_mode == "Model Training":
1107
  st.metric("MSE", f"{validation_metrics['mse']:.2f}")
1108
  st.metric("R2", f"{validation_metrics['r2']:.2f}")
1109
 
1110
- elif app_mode == "Predictions":
 
1111
  st.title("🔮 Predictive Analytics - Informed Business Decisions")
1112
 
1113
  if st.session_state.get("model") is None:
@@ -1132,8 +1136,8 @@ elif app_mode == "Predictions":
1132
 
1133
  with col2:
1134
  st.subheader("Data Overview")
1135
- input_df = pd.DataFrame([input_data]) #Make DataFrame
1136
- st.dataframe(input_df,use_container_width=True) #DataFrame of the input to see it
1137
 
1138
  # Prediction function and result display
1139
  if st.button("Generate Prediction & Insights"):
@@ -1147,14 +1151,12 @@ elif app_mode == "Predictions":
1147
 
1148
  # 3. One-hot encode (handle unseen categories)
1149
  categorical_features = input_df.select_dtypes(exclude=np.number).columns
1150
- input_df = pd.get_dummies(input_df, columns=categorical_features, dummy_na=False) # dummy_na = False. We imputed already.
1151
 
1152
  # 4. Ensure correct column order
1153
- # Add missing columns with 0 values
1154
  for col in column_order:
1155
  if col not in input_df.columns:
1156
  input_df[col] = 0
1157
- # Reorder Columns
1158
  input_df = input_df[column_order]
1159
 
1160
  # 5. Scale the input
@@ -1177,29 +1179,21 @@ elif app_mode == "Predictions":
1177
 
1178
  if problem_type == "Classification":
1179
  explainer = shap.TreeExplainer(model)
1180
- shap_values = explainer.shap_values(scaled_input) # Use the scaled input
1181
- # class_names = [str(i) for i in range(len(shap_values))] # Dynamic class names - not needed for force plot
1182
-
1183
- fig = shap.force_plot(explainer.expected_value[1], shap_values[1], input_df, matplotlib=False,link="logit") # shap_values[1] for class 1 - force plot
1184
- st.components.v1.html(shap.getjs() + fig.html(), height=400, width=900) # Adjust height and width as needed.
1185
-
1186
  else:
1187
- explainer = shap.TreeExplainer(model) # Regression
1188
- shap_values = explainer.shap_values(scaled_input) # Use the scaled input
1189
-
1190
- fig = shap.force_plot(explainer.expected_value, shap_values, input_df, matplotlib=False) # shap_values single array for regression
1191
- st.components.v1.html(shap.getjs() + fig.html(), height=400, width=900) # Adjust height and width as needed.
1192
 
1193
  st.write("The visualization above explains how each feature contributed to the final prediction.")
1194
 
1195
  # 9. Add Permutation Feature Importance (for more global understanding)
1196
  try:
1197
  enhance_section_title("Global Feature Importance", "🌍")
1198
- X = pd.DataFrame(scaler.transform(pd.get_dummies(pd.DataFrame(imputer_numerical.transform(input_df), columns=input_df.columns))), columns=input_df.columns) # Apply preprocessing for permutation
1199
- #X = pd.DataFrame(scaler.transform(input_df), columns = input_df.columns)
1200
- #X = input_df[input_df.columns]
1201
- X_train = model_data['X_train'] #Get X train
1202
- y_train = model_data['y_train'] #Get Y train
1203
  result = permutation_importance(model, X, input_df, n_repeats=10, random_state=42)
1204
  importance = result.importances_mean
1205
 
@@ -1210,55 +1204,4 @@ elif app_mode == "Predictions":
1210
  st.warning(f"Could not calculate permutation feature importance: {e}")
1211
 
1212
  except Exception as e:
1213
- st.error(f"Prediction failed: {str(e)}")
1214
-
1215
- # Force rerun Streamlit app after data cleaning operations
1216
- st.experimental_rerun()
1217
-
1218
- if __name__ == "__main__":
1219
- # Session State Initialization
1220
- if 'raw_data' not in st.session_state:
1221
- st.session_state.raw_data = None
1222
- if 'cleaned_data' not in st.session_state:
1223
- st.session_state.cleaned_data = None
1224
- if 'model' not in st.session_state:
1225
- st.session_state.model = None
1226
- if 'data_versions' not in st.session_state:
1227
- st.session_state.data_versions = []
1228
-
1229
- # Custom Styling (Keep it in main if needed)
1230
- st.markdown("""
1231
- <style>
1232
- .main {background-color: #f8f9fa;}
1233
- .sidebar .sidebar-content {background-color: #2c3e50;}
1234
- .stButton>button {background-color: #3498db; color: white;}
1235
- .stTextInput>div>div>input {border: 1px solid #3498db;}
1236
- .stSelectbox>div>div>select {border: 1px solid #3498db;}
1237
- .stSlider>div>div>div>div {background-color: #3498db;}
1238
- .metric {padding: 15px; background-color: white; border-radius: 10px; box-shadow: 0 2px 5px rgba(0,0,0,0.1);}
1239
- </style>
1240
- """, unsafe_allow_html=True)
1241
-
1242
- # Sidebar Navigation (Keep it in main)
1243
- with st.sidebar:
1244
- st.title("🔮 DataInsight Pro")
1245
- app_mode = st.selectbox(
1246
- "Navigation",
1247
- ["Data Upload", "Data Cleaning", "EDA", "Model Training", "Predictions"],
1248
- format_func=lambda x: f"📌 {x}"
1249
- )
1250
- st.markdown("---")
1251
- st.markdown("Created by Calvin Allen-Crawford")
1252
- st.markdown("v1.0 | © 2025")
1253
-
1254
- # Call app mode function based on selection
1255
- if app_mode == "Data Upload":
1256
- app_mode_data_upload()
1257
- elif app_mode == "Data Cleaning":
1258
- app_mode_data_cleaning()
1259
- elif app_mode == "EDA":
1260
- app_mode_eda()
1261
- elif app_mode == "Model Training":
1262
- app_mode_model_training()
1263
- elif app_mode == "Predictions":
1264
- app_mode_predictions()
 
12
  from sklearn.metrics import accuracy_score, mean_squared_error
13
  from ydata_profiling import ProfileReport
14
  from streamlit_pandas_profiling import st_profile_report
15
+ import joblib # For saving and loading models
16
+ import os # For file-system paths
17
  import shap
18
  from datetime import datetime
19
+ from stqdm import stqdm
20
21
 
22
  # --------------------------
23
  # Helper Functions
 
33
  st.session_state.data_versions.append(df.copy())
34
  st.success("Action completed successfully!")
35
 
36
+ @st.cache_data
37
  def generate_quality_report(df):
38
  """Generate comprehensive data quality report"""
39
  report = {
 
59
  })
60
  report['columns'][col] = col_report
61
  return report
62
+
63
+ @st.cache_data
64
  def train_model(df, target, features, problem_type, test_size, model_type, model_params, use_grid_search=False):
65
  """Trains a model with hyperparameter tuning, cross-validation, and customizable model architecture."""
66
 
 
68
  X = df[features]
69
  y = df[target]
70
 
71
+ # Input Validation
72
  if target not in df.columns:
73
  raise ValueError(f"Target variable '{target}' not found in DataFrame.")
74
  for feature in features:
75
  if feature not in df.columns:
76
  raise ValueError(f"Feature '{feature}' not found in DataFrame.")
77
 
78
+ # Preprocessing Pipeline: handle missing values, encoding, scaling
 
79
  numerical_features = X.select_dtypes(include=np.number).columns
80
  categorical_features = X.select_dtypes(exclude=np.number).columns
81
 
82
  imputer_numerical = SimpleImputer(strategy='mean') # Or 'median', 'most_frequent', 'constant'
83
  X[numerical_features] = imputer_numerical.fit_transform(X[numerical_features])
84
 
85
+ X = pd.get_dummies(X, columns=categorical_features, dummy_na=False)
 
86
 
87
+ label_encoder = None # Initialize label_encoder
 
88
  if problem_type == "Classification" or problem_type == "Multiclass":
89
  label_encoder = LabelEncoder()
90
  y = label_encoder.fit_transform(y)
91
 
 
 
92
  X_train, X_test, y_train, y_test = train_test_split(
93
  X, y, test_size=test_size, random_state=42
94
  )
95
 
96
+ scaler = StandardScaler()
97
+ X_train = scaler.fit_transform(X_train)
98
+ X_test = scaler.transform(X_test)
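A minimal standalone sketch of the fit-on-train / transform-on-test pattern used in these added lines, with synthetic data so it runs on its own:

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = np.random.rand(100, 3)          # toy feature matrix
y = np.random.randint(0, 2, 100)    # toy binary target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # fit scaling statistics on the training split only
X_test = scaler.transform(X_test)        # reuse those statistics; avoids test-set leakage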
 
99
 
100
+ # Model Selection and Hyperparameter Tuning
101
  if problem_type == "Regression":
102
  if model_type == "Random Forest":
103
  model = RandomForestRegressor(random_state=42)
 
114
  'max_depth': [3, 5]
115
  }
116
  elif model_type == "Neural Network":
117
+ model = MLPRegressor(random_state=42, max_iter=500)
118
  param_grid = {
119
+ 'hidden_layer_sizes': [(50,), (100,), (50, 50)],
120
  'activation': ['relu', 'tanh'],
121
  'alpha': [0.0001, 0.001]
122
  }
 
140
  'max_depth': [3, 5]
141
  }
142
  elif model_type == "Neural Network":
143
+ model = MLPClassifier(random_state=42, max_iter=500)
144
  param_grid = {
145
+ 'hidden_layer_sizes': [(50,), (100,), (50, 50)],
146
  'activation': ['relu', 'tanh'],
147
  'alpha': [0.0001, 0.001]
148
  }
 
152
  elif problem_type == "Multiclass": #Multiclass
153
 
154
  if model_type == "Logistic Regression":
155
+ model = LogisticRegression(random_state=42, solver='liblinear', multi_class='ovr')
156
+ param_grid = {'C': [0.1, 1.0, 10.0]}
157
 
158
  elif model_type == "Support Vector Machine":
159
+ model = SVC(random_state=42, probability=True)
160
  param_grid = {'C': [0.1, 1.0, 10.0], 'kernel': ['rbf', 'linear']}
161
 
162
  elif model_type == "Random Forest":
 
165
  'n_estimators': [100, 200],
166
  'max_depth': [None, 5, 10],
167
  'min_samples_split': [2, 5],
168
+ 'criterion': ['gini', 'entropy']
169
  }
170
 
171
  else:
 
173
  else:
174
  raise ValueError(f"Invalid problem type: {problem_type}")
175
 
176
+ param_grid.update(model_params)
 
177
 
178
  if use_grid_search:
179
  grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy' if problem_type in ['Classification', 'Multiclass'] else 'neg_mean_squared_error', verbose=1, n_jobs=-1)
180
+ grid_search.fit(X_train, y_train)
181
+ model = grid_search.best_estimator_
182
+ st.write("Best hyperparameters found by Grid Search:", grid_search.best_params_)
183
 
184
  else:
185
+ model.fit(X_train, y_train)
186
 
187
+ cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy' if problem_type in ['Classification', 'Multiclass'] else 'neg_mean_squared_error')
 
188
  st.write("Cross-validation scores:", cv_scores)
189
  st.write("Mean cross-validation score:", cv_scores.mean())
190
 
191
+ # Evaluation
192
+ y_pred = model.predict(X_test)
193
+ metrics = {}
194
 
195
  if problem_type == "Classification":
196
  metrics['accuracy'] = accuracy_score(y_test, y_pred)
197
  metrics['confusion_matrix'] = confusion_matrix(y_test, y_pred)
198
+ metrics['classification_report'] = classification_report(y_test, y_pred, output_dict=True)
199
 
200
  elif problem_type == "Multiclass":
 
201
  metrics['accuracy'] = accuracy_score(y_test, y_pred)
202
  metrics['confusion_matrix'] = confusion_matrix(y_test, y_pred)
203
+ metrics['classification_report'] = classification_report(y_test, y_pred, output_dict=True)
204
  else:
205
  metrics['mse'] = mean_squared_error(y_test, y_pred)
206
  metrics['r2'] = r2_score(y_test, y_pred)
207
 
208
+ # Feature Importance (permutation importance)
209
  try:
210
+ result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42)
211
  importance = result.importances_mean
212
 
213
  except Exception as e:
214
  st.warning(f"Could not calculate feature importance: {e}")
215
  importance = None
216
 
 
217
  column_order = X.columns
218
 
219
  return model, scaler, label_encoder, imputer_numerical, metrics, column_order, importance, X_train, y_train # Return X_train and y_train
 
221
  except Exception as e:
222
  st.error(f"Training failed: {str(e)}")
223
  return None, None, None, None, None, None, None, None, None
224
+
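The commit imports joblib for saving and loading models; a hedged sketch of bundling the fitted objects that train_model returns (the filename and dictionary keys are illustrative only, not the app's actual save format):

import joblib

# Hypothetical bundle; keys chosen here for illustration only
bundle = {"model": model, "scaler": scaler, "label_encoder": label_encoder,
          "imputer": imputer_numerical, "column_order": list(column_order)}
joblib.dump(bundle, "model_bundle.joblib")

loaded = joblib.load("model_bundle.joblib")
prediction = loaded["model"].predict(X_test)   # X_test assumed already imputed, encoded, and scaled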
 
225
  def validate_model(model_path, df, target, features, test_size):
226
  """Loads a model, preprocesses data, and evaluates the model on a validation set."""
227
  try:
 
285
  # Prediction helper Function
286
  def prediction_input_form(features, default_values=None):
287
  """Generates input forms for each feature and returns a dictionary of inputs.
 
288
  Args:
289
  features (list): List of feature names.
290
  default_values (dict, optional): Default values for each feature. Defaults to None.
 
291
  Returns:
292
  dict: Dictionary where keys are feature names and values are user inputs.
293
  """
 
320
  # --------------------------
321
  if app_mode == "Data Upload":
322
  st.title("📤 Data Upload & Profiling")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
323
 
 
 
 
 
 
 
324
  uploaded_file = st.file_uploader("Upload your dataset (CSV/XLSX)", type=["csv", "xlsx"])
325
+
326
  if uploaded_file:
327
  try:
328
  if uploaded_file.name.endswith('.csv'):
329
  df = pd.read_csv(uploaded_file)
330
  else:
331
  df = pd.read_excel(uploaded_file)
332
+
333
  st.session_state.raw_data = df
334
+
335
  col1, col2, col3 = st.columns(3)
336
  with col1:
337
  st.metric("Rows", df.shape[0])
 
339
  st.metric("Columns", df.shape[1])
340
  with col3:
341
  st.metric("Missing Values", df.isna().sum().sum())
342
+
343
  with st.expander("Data Preview", expanded=True):
344
  st.dataframe(df.head(10), use_container_width=True)
345
+
346
  if st.button("Generate Full Profile Report"):
347
  with st.spinner("Generating comprehensive analysis..."):
348
  pr = ProfileReport(df, explorative=True)
349
  st_profile_report(pr)
350
+
351
  except Exception as e:
352
  st.error(f"Error loading file: {str(e)}")
353
 
 
359
 
360
  if st.session_state.raw_data is None:
361
  st.warning("Please upload data first")
362
+ st.stop()
363
 
364
+ df = st.session_state.raw_data.copy() # Ensure df is defined in this section
365
+
366
+ # Initialize session state (only if it's not already there)
367
+ if 'data_versions' not in st.session_state:
368
+ st.session_state.data_versions = [st.session_state.raw_data.copy()]
369
+ if 'cleaned_data' not in st.session_state: # Initialize cleaned_data from the raw upload if missing
370
+ st.session_state.cleaned_data = st.session_state.raw_data.copy()
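A minimal sketch of the version-stack pattern these lines set up: each cleaning action appends a copy to st.session_state.data_versions, and undo pops back to the previous copy (mirroring update_cleaned_data and the undo button below); df stands in for the uploaded DataFrame:

import streamlit as st
import pandas as pd

if "data_versions" not in st.session_state:
    st.session_state.data_versions = [df.copy()]           # df: the uploaded DataFrame (assumed)

def update_cleaned_data(new_df: pd.DataFrame) -> None:
    st.session_state.cleaned_data = new_df.copy()           # current working copy
    st.session_state.data_versions.append(new_df.copy())    # push a new version onto the stack

if len(st.session_state.data_versions) > 1 and st.button("Undo"):
    st.session_state.data_versions.pop()                     # discard the current version
    st.session_state.cleaned_data = st.session_state.data_versions[-1].copy()
    st.rerun()                                               # refresh the UI with the restored data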
371
 
372
+ # --------------------------
373
  # Data Health Dashboard
374
+ # --------------------------
375
  enhance_section_title("Data Health Dashboard", "📊")
376
+
377
  with st.expander("📊 Data Health Dashboard", expanded=True):
378
  col1, col2, col3 = st.columns(3)
379
  with col1:
 
389
  profile = ProfileReport(df, minimal=True)
390
  st_profile_report(profile)
391
 
392
+ # --------------------------
393
  # Undo Functionality
394
+ # --------------------------
395
  if len(st.session_state.data_versions) > 1:
396
  if st.button("⏮️ Undo Last Action"):
397
  st.session_state.data_versions.pop() # Remove current version
398
  st.session_state.cleaned_data = st.session_state.data_versions[-1].copy() # Set data
399
  st.success("Last action undone!")
400
+ st.rerun() #Force re-run after undo
401
 
402
+ # --------------------------
403
  # Missing Value Handling
404
+ # --------------------------
405
  enhance_section_title("Missing Values Treatment", "🔍")
406
  with st.expander("🔍 Missing Values Treatment", expanded=True):
407
  missing_cols = df.columns[df.isna().any()].tolist()
 
419
  custom_val = st.text_input("Enter custom value")
420
 
421
  if st.button("Apply Treatment (Missing)"):
422
+ try:
423
+ new_df = df.copy() # Create a copy to modify
424
+ if method == "Drop Missing":
425
+ new_df = new_df.dropna(subset=cols)
426
+ elif method == "Mean/Median":
427
+ for col in cols:
428
+ if pd.api.types.is_numeric_dtype(new_df[col]):
429
+ new_df[col] = new_df[col].fillna(new_df[col].median())
430
+ else:
431
+ new_df[col] = new_df[col].fillna(new_df[col].mode()[0])
432
+ elif method == "Custom Value" and custom_val:
433
+ for col in cols:
434
+ new_df[col] = new_df[col].fillna(custom_val)
435
+ elif method == "Forward Fill":
436
+ new_df[cols] = new_df[cols].ffill()
437
+ elif method == "Backward Fill":
438
+ new_df[cols] = new_df[cols].bfill()
439
+
440
+ update_cleaned_data(new_df)
441
+ st.rerun() #Force re-run after apply
442
 
443
+ except Exception as e:
444
+ st.error(f"Error: {str(e)}")
445
+ else:
446
+ st.success("✨ No missing values found!")
447
 
448
+ # --------------------------
449
  # Data Type Conversion
450
+ # --------------------------
451
  enhance_section_title("Data Type Conversion", "🔄")
452
  with st.expander("🔄 Data Type Conversion"):
453
  col_to_convert = st.selectbox("Select column", df.columns)
 
460
  date_format = st.text_input("Date format (e.g. %Y-%m-%d)", "%Y-%m-%d")
461
 
462
  if st.button("Convert (Data Type)"):
 
463
  try:
464
+ new_df = df.copy()
465
  if new_type == "String":
466
  new_df[col_to_convert] = new_df[col_to_convert].astype(str)
467
  elif new_type == "Integer":
 
480
  new_df[col_to_convert] = pd.to_datetime(new_df[col_to_convert], format=date_format, errors='coerce')
481
 
482
  update_cleaned_data(new_df)
483
+ st.rerun() #Force re-run after apply
484
  except Exception as e:
485
  st.error(f"Error: {str(e)}")
486
 
487
+ # --------------------------
488
  # Drop Columns
489
+ # --------------------------
490
  enhance_section_title("Drop Columns", "🗑️")
491
  with st.expander("🗑️ Drop Columns"):
492
  columns_to_drop = st.multiselect("Select columns to drop", df.columns)
493
  if columns_to_drop:
494
  st.warning(f"Will drop: {', '.join(columns_to_drop)}")
495
  if st.button("Confirm Drop (Columns)"):
496
+ new_df = df.copy()
497
+ new_df = new_df.drop(columns=columns_to_drop)
498
  update_cleaned_data(new_df)
499
+ st.rerun() #Force re-run after apply
500
 
501
+ # --------------------------
502
  # Label Encoding
503
+ # --------------------------
504
  enhance_section_title("Label Encoding", "🔢")
505
  with st.expander("🔢 Label Encoding"):
506
  data_to_encode = st.multiselect("Select categorical columns to encode", df.select_dtypes(include='object').columns)
507
  if data_to_encode:
508
  if st.button("Apply Label Encoding (Encoding)"):
509
  new_df = df.copy()
510
+ label_encoders = {}
511
  for col in data_to_encode:
512
  le = LabelEncoder()
513
  new_df[col] = le.fit_transform(new_df[col].astype(str))
514
+ label_encoders[col] = le
515
  update_cleaned_data(new_df)
516
+ st.rerun() #Force re-run after apply
517
 
518
+ # --------------------------
519
  # StandardScaler
520
+ # --------------------------
521
  enhance_section_title("StandardScaler", "📏")
522
  with st.expander("📏 StandardScaler"):
523
  scale_cols = st.multiselect("Select numeric columns to scale", df.select_dtypes(include=np.number).columns)
524
  if scale_cols:
525
  if st.button("Apply StandardScaler (Scaling)"):
526
+ try:
527
+ new_df = df.copy()
528
+ scaler = StandardScaler()
529
+ new_df[scale_cols] = scaler.fit_transform(new_df[scale_cols])
530
+ update_cleaned_data(new_df)
531
+ st.rerun()#Force re-run after apply
532
+ except Exception as e:
533
+ st.error(f"Error: {str(e)}")
534
 
535
+ # --------------------------
536
  # Pattern-Based Cleaning
537
+ # --------------------------
538
  enhance_section_title("Pattern-Based Cleaning", "🕵️")
539
  with st.expander("🕵️ Pattern-Based Cleaning"):
540
  selected_col = st.selectbox("Select text column", df.select_dtypes(include='object').columns)
 
542
  replacement = st.text_input("Replacement value")
543
 
544
  if st.button("Apply Pattern Replacement (Replace)"):
545
+ try:
546
+ new_df = df.copy()
547
+ new_df[selected_col] = new_df[selected_col].str.replace(pattern, replacement, regex=True)
548
+ update_cleaned_data(new_df)
549
+ st.rerun() #Force re-run after apply
550
+ except Exception as e:
551
+ st.error(f"Error: {str(e)}")
552
 
553
+ # --------------------------
554
  # Bulk Operations
555
+ # --------------------------
556
  enhance_section_title("Bulk Actions", "🚀")
557
  with st.expander("🚀 Bulk Actions"):
558
  if st.button("Auto-Clean Common Issues (Cleaning)"):
 
562
  text_cols = new_df.select_dtypes(include='object').columns
563
  new_df[text_cols] = new_df[text_cols].apply(lambda x: x.str.strip())
564
  update_cleaned_data(new_df)
565
+ st.rerun() #Force re-run after apply
566
 
567
+ # --------------------------
568
  # Cleaned Data Preview
569
+ # --------------------------
570
+ if st.session_state.get("cleaned_data") is not None:
571
+ enhance_section_title("Cleaned Data Preview", "✨")
572
+ with st.expander("✨ Cleaned Data Preview", expanded=True):
573
+ st.dataframe(st.session_state.cleaned_data.head(), use_container_width=True)
574
+
575
+
576
 
577
 
578
  # --------------------------
 
685
  try:
686
  fig = None # Initialize fig to None
687
  if st.session_state.cleaned_data is None:
688
+ st.warning("Please upload data first")
689
  st.stop()
690
 
691
  # Generate appropriate visualization with input validation
 
1006
  st.stop()
1007
 
1008
  # Call the training function
1009
+ model, scaler, label_encoder, imputer_numerical, metrics, column_order, importance, X_train, y_train = train_model(df.copy(), target, features, problem_type, test_size, model_type, model_params, use_grid_search) # Pass a copy to avoid modifying the original; train_model returns nine values
1010
 
1011
  if model: # Only proceed if training was successful
1012
  st.success("Model trained successfully!")
1013
 
1014
+ # Display Metrics
1015
+ st.subheader("Model Evaluation Metrics")
1016
+ if problem_type in ["Classification", "Multiclass"]: #Combined here
1017
+ st.metric("Accuracy", f"{metrics['accuracy']:.2%}")
1018
+
1019
+ # Confusion Matrix Visualization
1020
+ st.subheader("Confusion Matrix")
1021
+ cm = metrics['confusion_matrix']
1022
+ class_names = [str(i) for i in np.unique(df[target])] #Get original class names
1023
+ fig_cm = px.imshow(cm,
1024
+ labels=dict(x="Predicted", y="Actual"),
1025
+ x=class_names,
1026
+ y=class_names,
1027
+ color_continuous_scale="Viridis")
1028
+ st.plotly_chart(fig_cm, use_container_width=True)
1029
+
1030
+ # Classification Report
1031
+ st.subheader("Classification Report")
1032
+ report = metrics['classification_report']
1033
+ report_df = pd.DataFrame(report).transpose()
1034
+ st.dataframe(report_df)
1035
+
1036
+ else:
1037
+ st.metric("MSE", f"{metrics['mse']:.2f}")
1038
+ st.metric("R2", f"{metrics['r2']:.2f}")
1039
+
1040
+ # Feature Importance
1041
+ st.subheader("Feature Importance")
1042
+ try:
1043
+ fig_importance = px.bar(
1044
+ x=importance,
1045
+ y=column_order, #Use stored column order
1046
+ orientation='h',
1047
+ title="Feature Importance"
1048
+ )
1049
+ st.plotly_chart(fig_importance, use_container_width=True)
1050
+ except Exception as e:
1051
+ st.warning(f"Could not display feature importance: {e}")
1052
+
1053
+ # Explainable AI (Placeholder)
1054
+ st.subheader("Explainable AI (XAI)")
1055
+ st.write("Future implementation will include model explanations using techniques like SHAP or LIME.") #To be implemented
1056
+ if st.checkbox("Show a random model explanation (example)"): #Example of a feature, to be implemented
1057
+ st.write("This feature is important because...")
1058
 
1059
  # Save Model
1060
  st.subheader("Save Model")
 
1110
  st.metric("MSE", f"{validation_metrics['mse']:.2f}")
1111
  st.metric("R2", f"{validation_metrics['r2']:.2f}")
1112
 
1113
+ # Predictions Section (Fixed)
1114
+ if app_mode == "Predictions":
1115
  st.title("🔮 Predictive Analytics - Informed Business Decisions")
1116
 
1117
  if st.session_state.get("model") is None:
 
1136
 
1137
  with col2:
1138
  st.subheader("Data Overview")
1139
+ input_df = pd.DataFrame([input_data]) # Make DataFrame
1140
+ st.dataframe(input_df, use_container_width=True) # DataFrame of the input to see it
1141
 
1142
  # Prediction function and result display
1143
  if st.button("Generate Prediction & Insights"):
 
1151
 
1152
  # 3. One-hot encode (handle unseen categories)
1153
  categorical_features = input_df.select_dtypes(exclude=np.number).columns
1154
+ input_df = pd.get_dummies(input_df, columns=categorical_features, dummy_na=False)
1155
 
1156
  # 4. Ensure correct column order
 
1157
  for col in column_order:
1158
  if col not in input_df.columns:
1159
  input_df[col] = 0
 
1160
  input_df = input_df[column_order]
1161
 
1162
  # 5. Scale the input
 
1179
 
1180
  if problem_type == "Classification":
1181
  explainer = shap.TreeExplainer(model)
1182
+ shap_values = explainer.shap_values(scaled_input)
1183
+ fig = shap.force_plot(explainer.expected_value[1], shap_values[1], input_df, matplotlib=False, link="logit")
1184
+ st.components.v1.html(shap.getjs() + fig.html(), height=400, width=900)
 
 
 
1185
  else:
1186
+ explainer = shap.TreeExplainer(model)
1187
+ shap_values = explainer.shap_values(scaled_input)
1188
+ fig = shap.force_plot(explainer.expected_value, shap_values, input_df, matplotlib=False)
1189
+ st.components.v1.html(shap.getjs() + fig.html(), height=400, width=900)
 
1190
 
1191
  st.write("The visualization above explains how each feature contributed to the final prediction.")
1192
 
1193
  # 9. Add Permutation Feature Importance (for more global understanding)
1194
  try:
1195
  enhance_section_title("Global Feature Importance", "🌍")
1196
+ X = pd.DataFrame(scaler.transform(input_df), columns=input_df.columns)
 
 
 
 
1197
  result = permutation_importance(model, X, input_df, n_repeats=10, random_state=42)
1198
  importance = result.importances_mean
1199
 
 
1204
  st.warning(f"Could not calculate permutation feature importance: {e}")
1205
 
1206
  except Exception as e:
1207
+ st.error(f"Prediction failed: {str(e)}")