CosmickVisions committed on
Commit bd14dcd · verified · 1 Parent(s): 07ebc51

Update app.py

Files changed (1): app.py +679 -135
app.py CHANGED
@@ -3,6 +3,7 @@ import pandas as pd
 import numpy as np
 import plotly.express as px
 import plotly.graph_objects as go
+from scipy.stats import pearsonr, spearmanr
 from sklearn.preprocessing import StandardScaler, LabelEncoder
 from sklearn.model_selection import train_test_split
 from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
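Note: the training and validation code added later in this diff also calls SimpleImputer, GradientBoostingRegressor/Classifier, MLPRegressor/Classifier, LogisticRegression, SVC, GridSearchCV, cross_val_score, permutation_importance, confusion_matrix, classification_report, r2_score, joblib, and os. The hunks shown here do not include the corresponding imports, so the file presumably gains something like the following block elsewhere (a sketch reconstructed from the names the new code uses, not part of the visible diff):

    import os
    import joblib
    from sklearn.impute import SimpleImputer
    from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
    from sklearn.neural_network import MLPClassifier, MLPRegressor
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC
    from sklearn.model_selection import GridSearchCV, cross_val_score
    from sklearn.inspection import permutation_importance
    from sklearn.metrics import confusion_matrix, classification_report, r2_score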
@@ -192,39 +193,6 @@ with st.sidebar:
 # --------------------------
 # Main App Pages
 # --------------------------
-if app_mode == "Data Upload":
-    st.title("📤 Data Upload & Profiling")
-
-    uploaded_file = st.file_uploader("Upload your dataset (CSV/XLSX)", type=["csv", "xlsx"])
-
-    if uploaded_file:
-        try:
-            if uploaded_file.name.endswith('.csv'):
-                df = pd.read_csv(uploaded_file)
-            else:
-                df = pd.read_excel(uploaded_file)
-
-            st.session_state.raw_data = df
-
-            col1, col2, col3 = st.columns(3)
-            with col1:
-                st.metric("Rows", df.shape[0])
-            with col2:
-                st.metric("Columns", df.shape[1])
-            with col3:
-                st.metric("Missing Values", df.isna().sum().sum())
-
-            with st.expander("Data Preview", expanded=True):
-                st.dataframe(df.head(10), use_container_width=True)
-
-            if st.button("Generate Full Profile Report"):
-                with st.spinner("Generating comprehensive analysis..."):
-                    pr = ProfileReport(df, explorative=True)
-                    st_profile_report(pr)
-
-        except Exception as e:
-            st.error(f"Error loading file: {str(e)}")
-
 elif app_mode == "Data Cleaning":
     st.title("🧹 Smart Data Cleaning")
 
@@ -232,9 +200,43 @@ elif app_mode == "Data Cleaning":
         st.warning("Please upload data first")
         st.stop()
 
-    df = st.session_state.raw_data.copy()
+    # Initialize session state for undo functionality
+    if 'data_versions' not in st.session_state:
+        st.session_state.data_versions = [st.session_state.raw_data.copy()]
+
+    df = st.session_state.data_versions[-1].copy()
+
+    # --------------------------
+    # Data Health Dashboard
+    # --------------------------
+    with st.expander("📊 Data Health Dashboard", expanded=True):
+        col1, col2, col3 = st.columns(3)
+        with col1:
+            st.metric("Total Columns", len(df.columns))
+        with col2:
+            st.metric("Total Rows", len(df))
+        with col3:
+            st.metric("Missing Values", df.isna().sum().sum())
+
+        # Generate quick profile report
+        if st.button("Generate Data Health Report"):
+            with st.spinner("Analyzing data..."):
+                profile = ProfileReport(df, minimal=True)
+                st_profile_report(profile)
+
+    # --------------------------
+    # Undo Functionality
+    # --------------------------
+    if len(st.session_state.data_versions) > 1:
+        if st.button("⏮️ Undo Last Action"):
+            st.session_state.data_versions.pop()
+            df = st.session_state.data_versions[-1].copy()
+            st.session_state.cleaned_data = df
+            st.success("Last action undone!")
 
+    # --------------------------
     # Missing Value Handling
+    # --------------------------
     with st.expander("🔍 Missing Values Treatment", expanded=True):
         missing_cols = df.columns[df.isna().any()].tolist()
         if missing_cols:
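The undo mechanism above keeps a stack of DataFrame snapshots in st.session_state and pops one per click. As committed, each action appends the pre-mutation frame and writes the mutated result only to st.session_state.cleaned_data, so data_versions[-1] (which this page reloads on every rerun) appears to stay one step behind the latest treatment. A minimal standalone sketch of a variant that pushes the new version instead (plain Python, outside Streamlit, assuming nothing beyond pandas):

    import pandas as pd

    data_versions = [pd.DataFrame({"a": [1.0, None, 3.0]})]  # initial upload

    def apply_action(transform):
        # push the transformed frame so the stack always ends at the latest version
        data_versions.append(transform(data_versions[-1].copy()))

    def undo():
        if len(data_versions) > 1:  # never pop the original upload
            data_versions.pop()

    apply_action(lambda df: df.fillna(0))   # one cleaning step
    undo()                                  # back to the frame with the NaN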
@@ -242,22 +244,43 @@ elif app_mode == "Data Cleaning":
             method = st.selectbox("Imputation Method", [
                 "Drop Missing",
                 "Mean/Median",
-                "Custom Value"
+                "Custom Value",
+                "Forward Fill",
+                "Backward Fill"
             ])
 
+            if method == "Custom Value":
+                custom_val = st.text_input("Enter custom value")
+
             if st.button("Apply Treatment"):
-                if method == "Drop Missing":
-                    df = df.dropna(subset=cols)
-                elif method == "Mean/Median":
-                    for col in cols:
-                        if pd.api.types.is_numeric_dtype(df[col]):
-                            df[col] = df[col].fillna(df[col].median())
-                st.session_state.cleaned_data = df
-                st.success("Missing values handled successfully!")
+                st.session_state.data_versions.append(df.copy())
+                try:
+                    if method == "Drop Missing":
+                        df = df.dropna(subset=cols)
+                    elif method == "Mean/Median":
+                        for col in cols:
+                            if pd.api.types.is_numeric_dtype(df[col]):
+                                df[col] = df[col].fillna(df[col].median())
+                            else:
+                                df[col] = df[col].fillna(df[col].mode()[0])
+                    elif method == "Custom Value" and custom_val:
+                        for col in cols:
+                            df[col] = df[col].fillna(custom_val)
+                    elif method == "Forward Fill":
+                        df[cols] = df[cols].ffill()
+                    elif method == "Backward Fill":
+                        df[cols] = df[cols].bfill()
+
+                    st.session_state.cleaned_data = df
+                    st.success("Missing values handled successfully!")
+                except Exception as e:
+                    st.error(f"Error: {str(e)}")
         else:
-            st.success("No missing values found!")
+            st.success("No missing values found!")
 
+    # --------------------------
     # Data Type Conversion
+    # --------------------------
     with st.expander("🔄 Data Type Conversion"):
         col_to_convert = st.selectbox("Select column", df.columns)
         new_type = st.selectbox("New data type", [
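For reference, the five treatments map onto standard pandas idioms; a quick demonstration on a toy Series (median/mode for "Mean/Median", ffill/bfill for the two fill directions):

    import pandas as pd

    s = pd.Series([1.0, None, None, 4.0])
    s.fillna(s.median())   # "Mean/Median" on numeric data -> gaps become 2.5
    s.fillna(s.mode()[0])  # mode fallback used for non-numeric columns
    s.ffill()              # "Forward Fill" -> 1.0 propagates into the gaps
    s.bfill()              # "Backward Fill" -> 4.0 propagates back
    s.fillna("N/A")        # "Custom Value"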
@@ -265,64 +288,119 @@ elif app_mode == "Data Cleaning":
             "Boolean", "Datetime"
         ])
 
+        if new_type == "Datetime":
+            date_format = st.text_input("Date format (e.g. %Y-%m-%d)", "%Y-%m-%d")
+
         if st.button("Convert"):
+            st.session_state.data_versions.append(df.copy())
             try:
                 if new_type == "String":
                     df[col_to_convert] = df[col_to_convert].astype(str)
                 elif new_type == "Integer":
-                    df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce').astype('Int64')
+                    if df[col_to_convert].dtype == 'object':
+                        st.error("Cannot convert text column to integer!")
+                    else:
+                        df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce').astype('Int64')
+                elif new_type == "Float":
+                    if df[col_to_convert].dtype == 'object':
+                        st.error("Cannot convert text column to float!")
+                    else:
+                        df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce')
+                elif new_type == "Boolean":
+                    df[col_to_convert] = df[col_to_convert].astype(bool)
+                elif new_type == "Datetime":
+                    df[col_to_convert] = pd.to_datetime(df[col_to_convert], format=date_format, errors='coerce')
+
                 st.session_state.cleaned_data = df
                 st.success("Conversion successful!")
             except Exception as e:
                 st.error(f"Error: {str(e)}")
 
-    if st.session_state.cleaned_data is not None:
-        with st.expander("✨ Cleaned Data Preview"):
-            st.dataframe(st.session_state.cleaned_data.head(), use_container_width=True)
-
+    # --------------------------
     # Drop Columns
+    # --------------------------
     with st.expander("🗑️ Drop Columns"):
         columns_to_drop = st.multiselect("Select columns to drop", df.columns)
-        if st.button("Drop Columns"):
-            df = df.drop(columns=columns_to_drop)
-            st.session_state.cleaned_data = df
-            st.success("Selected columns dropped successfully!")
-
-    # Label Encoder
-    with st.expander("Label Encoder"):
-        data_to_encode = st.multiselect("Select columns to encode", df.columns)
+        if columns_to_drop:
+            st.warning(f"Will drop: {', '.join(columns_to_drop)}")
+            if st.button("Confirm Drop"):
+                st.session_state.data_versions.append(df.copy())
+                df = df.drop(columns=columns_to_drop)
+                st.session_state.cleaned_data = df
+                st.success("Selected columns dropped successfully!")
+
+    # --------------------------
+    # Label Encoding
+    # --------------------------
+    with st.expander("🔢 Label Encoding"):
+        data_to_encode = st.multiselect("Select categorical columns to encode", df.select_dtypes(include='object').columns)
         if data_to_encode:
-            label_encoders = {}
-            for col in data_to_encode:
-                le = LabelEncoder()
-                df[col] = le.fit_transform(df[col].astype(str))
-                label_encoders[col] = le  # Storing the encoders in case you need to inverse transform later
-            st.session_state.cleaned_data = df
-            st.success("Selected columns encoded successfully!")
-
-            # Optionally, display the encoded data
-            with st.expander("✨ Encoded Data Preview"):
-                st.dataframe(st.session_state.cleaned_data.head(), use_container_width=True)
+            if st.button("Apply Label Encoding"):
+                st.session_state.data_versions.append(df.copy())
+                label_encoders = {}
+                for col in data_to_encode:
+                    le = LabelEncoder()
+                    df[col] = le.fit_transform(df[col].astype(str))
+                    label_encoders[col] = le
+                st.session_state.cleaned_data = df
+                st.success("Label encoding applied successfully!")
 
+    # --------------------------
     # StandardScaler
+    # --------------------------
     with st.expander("📏 StandardScaler"):
-        scale_cols = st.multiselect("Select columns to scale", df.columns)
-
-        if st.button("Apply StandardScaler"):
-            try:
-                scaler = StandardScaler()
-                df[scale_cols] = scaler.fit_transform(df[scale_cols])
-                st.session_state.cleaned_data = df
-                st.success("Standard scaling applied successfully!")
-
-                # Optionally, display the scaled data
-                with st.expander("✨ Scaled Data Preview"):
-                    st.dataframe(st.session_state.cleaned_data.head(), use_container_width=True)
-            except Exception as e:
-                st.error(f"Error: {str(e)}")
+        scale_cols = st.multiselect("Select numeric columns to scale", df.select_dtypes(include=np.number).columns)
+        if scale_cols:
+            if st.button("Apply StandardScaler"):
+                st.session_state.data_versions.append(df.copy())
+                try:
+                    scaler = StandardScaler()
+                    df[scale_cols] = scaler.fit_transform(df[scale_cols])
+                    st.session_state.cleaned_data = df
+                    st.success("Standard scaling applied successfully!")
+                except Exception as e:
+                    st.error(f"Error: {str(e)}")
+
+    # --------------------------
+    # Pattern-Based Cleaning
+    # --------------------------
+    with st.expander("🕵️ Pattern-Based Cleaning"):
+        selected_col = st.selectbox("Select text column", df.select_dtypes(include='object').columns)
+        pattern = st.text_input("Regex pattern (e.g. \d+ for numbers)")
+        replacement = st.text_input("Replacement value")
+
+        if st.button("Apply Pattern Replacement"):
+            st.session_state.data_versions.append(df.copy())
+            try:
+                df[selected_col] = df[selected_col].str.replace(pattern, replacement, regex=True)
+                st.session_state.cleaned_data = df
+                st.success("Pattern replacement applied successfully!")
+            except Exception as e:
+                st.error(f"Error: {str(e)}")
+
+    # --------------------------
+    # Bulk Operations
+    # --------------------------
+    with st.expander("🚀 Bulk Actions"):
+        if st.button("Auto-Clean Common Issues"):
+            st.session_state.data_versions.append(df.copy())
+            df = df.dropna(axis=1, how='all')  # Remove empty cols
+            df = df.convert_dtypes()  # Better type inference
+            text_cols = df.select_dtypes(include='object').columns
+            df[text_cols] = df[text_cols].apply(lambda x: x.str.strip())
+            st.session_state.cleaned_data = df
+            st.success("Bulk cleaning completed!")
+
+    # --------------------------
+    # Cleaned Data Preview
+    # --------------------------
+    if st.session_state.cleaned_data is not None:
+        with st.expander("✨ Cleaned Data Preview", expanded=True):
+            st.dataframe(st.session_state.cleaned_data.head(), use_container_width=True)
 
-
-elif app_mode == "EDA":
+
+# Main function for EDA
+def eda():
     st.title("🔍 Exploratory Data Analysis")
 
     if st.session_state.cleaned_data is None:
@@ -331,96 +409,562 @@ elif app_mode == "EDA":
 
 
     df = st.session_state.cleaned_data
 
+    # --------------------------
+    # Data Overview
+    # --------------------------
+    with st.expander("📊 Data Overview", expanded=True):
+        col1, col2, col3 = st.columns(3)
+        with col1:
+            st.metric("Total Rows", df.shape[0])
+        with col2:
+            st.metric("Total Columns", df.shape[1])
+        with col3:
+            st.metric("Missing Values", df.isna().sum().sum())
+
+        if st.checkbox("Show Data Preview"):
+            st.dataframe(df.head(), use_container_width=True)
+
+    # --------------------------
     # Visualization Selector
+    # --------------------------
+    st.subheader("📈 Visualization Setup")
     col1, col2 = st.columns([1, 3])
     with col1:
-        st.subheader("Visualization Setup")
         plot_type = st.selectbox("Choose plot type", [
             "Scatter Plot", "Histogram",
-            "Box Plot", "Correlation Matrix"
+            "Box Plot", "Correlation Matrix",
+            "Line Chart", "Heatmap", "Violin Plot",
+            "3D Scatter Plot", "Parallel Coordinates",
+            "Pair Plot", "Density Contour"
         ])
 
         x_axis = st.selectbox("X-Axis", df.columns)
-        y_axis = st.selectbox("Y-Axis", df.columns) if plot_type in ["Scatter Plot", "Box Plot"] else None
+        y_axis = st.selectbox("Y-Axis", df.columns) if plot_type in ["Scatter Plot", "Box Plot", "Line Chart", "Violin Plot", "3D Scatter Plot", "Density Contour"] else None
+        z_axis = st.selectbox("Z-Axis", df.columns) if plot_type == "3D Scatter Plot" else None
         color_by = st.selectbox("Color By", [None] + df.columns.tolist())
+        facet_col = st.selectbox("Facet By", [None] + df.columns.tolist())
 
     with col2:
-        st.subheader("Visualization")
+        st.subheader("📊 Visualization")
         try:
             if plot_type == "Scatter Plot":
-                fig = px.scatter(df, x=x_axis, y=y_axis, color=color_by)
+                fig = px.scatter(df, x=x_axis, y=y_axis, color=color_by, facet_col=facet_col)
             elif plot_type == "Histogram":
-                fig = px.histogram(df, x=x_axis, color=color_by)
+                fig = px.histogram(df, x=x_axis, color=color_by, facet_col=facet_col)
             elif plot_type == "Box Plot":
-                fig = px.box(df, x=x_axis, y=y_axis, color=color_by)
+                fig = px.box(df, x=x_axis, y=y_axis, color=color_by, facet_col=facet_col)
             elif plot_type == "Correlation Matrix":
                 corr = df.select_dtypes(include=np.number).corr()
-                fig = px.imshow(corr, text_auto=True)
+                fig = px.imshow(corr, text_auto=True, color_continuous_scale='Viridis')
+            elif plot_type == "Line Chart":
+                fig = px.line(df, x=x_axis, y=y_axis, color=color_by, facet_col=facet_col)
+            elif plot_type == "Heatmap":
+                fig = go.Figure(data=go.Heatmap(
+                    z=df.corr().values,
+                    x=df.columns,
+                    y=df.columns,
+                    colorscale='Viridis'))
+            elif plot_type == "Violin Plot":
+                fig = px.violin(df, x=x_axis, y=y_axis, color=color_by, facet_col=facet_col)
+            elif plot_type == "3D Scatter Plot":
+                fig = px.scatter_3d(df, x=x_axis, y=y_axis, z=z_axis, color=color_by)
+            elif plot_type == "Parallel Coordinates":
+                fig = px.parallel_coordinates(df, color=color_by)
+            elif plot_type == "Pair Plot":
+                fig = px.scatter_matrix(df, color=color_by)
+            elif plot_type == "Density Contour":
+                fig = px.density_contour(df, x=x_axis, y=y_axis, color=color_by)
 
             st.plotly_chart(fig, use_container_width=True)
         except Exception as e:
             st.error(f"Visualization error: {str(e)}")
+
+    # --------------------------
+    # Relationship Diagnostics
+    # --------------------------
+    st.subheader("🔗 Relationship Diagnostics")
+    selected_columns = st.multiselect("Select columns to analyze relationships", df.columns)
+    if selected_columns:
+        if len(selected_columns) == 2:
+            col1, col2 = st.columns(2)
+            with col1:
+                st.write(f"**Scatter Plot: {selected_columns[0]} vs {selected_columns[1]}**")
+                fig = px.scatter(df, x=selected_columns[0], y=selected_columns[1], trendline="ols")
+                st.plotly_chart(fig, use_container_width=True)
+
+            with col2:
+                st.write("**Statistical Summary**")
+                st.write(df[selected_columns].describe())
+
+                # Correlation Analysis
+                pearson_corr, _ = pearsonr(df[selected_columns[0]], df[selected_columns[1]])
+                spearman_corr, _ = spearmanr(df[selected_columns[0]], df[selected_columns[1]])
+
+                st.metric("Pearson Correlation", f"{pearson_corr:.2f}")
+                st.metric("Spearman Correlation", f"{spearman_corr:.2f}")
+
+                st.write("**Regression Line**")
+                st.write(f"Equation: y = {fig.data[1].line.color} * x + {fig.data[1].line.dash}")
+        elif len(selected_columns) > 2:
+            st.warning("Please select only two columns for relationship analysis.")
+        else:
+            st.warning("Please select at least two columns for relationship analysis.")
+
+ # --------------------------
514
+ # Advanced Statistics
515
+ # --------------------------
516
+ with st.expander("📊 Advanced Statistics", expanded=False):
517
+ st.write("**Column-wise Statistics**")
518
+ selected_col = st.selectbox("Select a column for detailed analysis", df.columns)
519
+ if selected_col:
520
+ if pd.api.types.is_numeric_dtype(df[selected_col]):
521
+ st.write(f"**Distribution of {selected_col}**")
522
+ fig = px.histogram(df, x=selected_col, nbins=30)
523
+ st.plotly_chart(fig, use_container_width=True)
524
+
525
+ st.write("**Outlier Detection**")
526
+ Q1 = df[selected_col].quantile(0.25)
527
+ Q3 = df[selected_col].quantile(0.75)
528
+ IQR = Q3 - Q1
529
+ outliers = df[(df[selected_col] < (Q1 - 1.5 * IQR)) | (df[selected_col] > (Q3 + 1.5 * IQR))]
530
+ st.write(f"Number of outliers: {len(outliers)}")
531
+ st.dataframe(outliers.head(), use_container_width=True)
532
+ else:
533
+ st.write(f"**Value Counts for {selected_col}**")
534
+ value_counts = df[selected_col].value_counts()
535
+ st.bar_chart(value_counts)
536
+
537
+ # --------------------------
538
+ # Save Visualizations
539
+ # --------------------------
540
+ st.subheader("💾 Save Visualizations")
541
+ if st.button("Export Current Visualization as PNG"):
542
+ try:
543
+ fig.write_image("visualization.png")
544
+ st.success("Visualization saved as PNG!")
545
+ except Exception as e:
546
+ st.error(f"Error saving visualization: {str(e)}")
547
+
548
+ # Call the EDA function
549
+ eda()
550
+
+# Function to train the model (separated for clarity and reusability)
+def train_model(df, target, features, problem_type, test_size, model_type, model_params, use_grid_search=False):
+    """Trains a model with hyperparameter tuning, cross-validation, and customizable model architecture."""
+
+    try:
+        X = df[features]
+        y = df[target]
+
+        # Input Validation
+        if target not in df.columns:
+            raise ValueError(f"Target variable '{target}' not found in DataFrame.")
+        for feature in features:
+            if feature not in df.columns:
+                raise ValueError(f"Feature '{feature}' not found in DataFrame.")
+
+        # Preprocessing pipeline: handles missing values, encoding, scaling
+        # Imputation: handle missing values BEFORE encoding (numerical only for SimpleImputer)
+        numerical_features = X.select_dtypes(include=np.number).columns
+        categorical_features = X.select_dtypes(exclude=np.number).columns
+
+        imputer_numerical = SimpleImputer(strategy='mean')  # or 'median', 'most_frequent', 'constant'
+        X[numerical_features] = imputer_numerical.fit_transform(X[numerical_features])
+
+        # Encoding (one-hot encode categorical features)
+        X = pd.get_dummies(X, columns=categorical_features, dummy_na=False)  # dummy_na=False: we imputed already
+
+        # Target encoding (if classification)
+        label_encoder = None  # initialize label_encoder
+        if problem_type == "Classification" or problem_type == "Multiclass":
+            label_encoder = LabelEncoder()
+            y = label_encoder.fit_transform(y)
+
+        # Split the data
+        X_train, X_test, y_train, y_test = train_test_split(
+            X, y, test_size=test_size, random_state=42
+        )
+
+        # Scaling (AFTER splitting!)
+        scaler = StandardScaler()  # or try MinMaxScaler, RobustScaler, QuantileTransformer
+        X_train = scaler.fit_transform(X_train)  # fit to the training data ONLY
+        X_test = scaler.transform(X_test)  # transform the test data using the fitted scaler
+
+        # Model selection and hyperparameter tuning
+        if problem_type == "Regression":
+            if model_type == "Random Forest":
+                model = RandomForestRegressor(random_state=42)
+                param_grid = {
+                    'n_estimators': [100, 200],
+                    'max_depth': [None, 5, 10],
+                    'min_samples_split': [2, 5]
+                }
+            elif model_type == "Gradient Boosting":
+                model = GradientBoostingRegressor(random_state=42)
+                param_grid = {
+                    'n_estimators': [100, 200],
+                    'learning_rate': [0.01, 0.1],
+                    'max_depth': [3, 5]
+                }
+            elif model_type == "Neural Network":
+                model = MLPRegressor(random_state=42, max_iter=500)  # set max_iter to 500
+                param_grid = {
+                    'hidden_layer_sizes': [(50,), (100,), (50, 50)],  # example sizes for depth
+                    'activation': ['relu', 'tanh'],
+                    'alpha': [0.0001, 0.001]
+                }
+            else:
+                raise ValueError(f"Invalid model type: {model_type}")
+
+        elif problem_type == "Classification":  # binary
+            if model_type == "Random Forest":
+                model = RandomForestClassifier(random_state=42)
+                param_grid = {
+                    'n_estimators': [100, 200],
+                    'max_depth': [None, 5, 10],
+                    'min_samples_split': [2, 5]
+                }
+            elif model_type == "Gradient Boosting":
+                model = GradientBoostingClassifier(random_state=42)
+                param_grid = {
+                    'n_estimators': [100, 200],
+                    'learning_rate': [0.01, 0.1],
+                    'max_depth': [3, 5]
+                }
+            elif model_type == "Neural Network":
+                model = MLPClassifier(random_state=42, max_iter=500)  # set max_iter to 500
+                param_grid = {
+                    'hidden_layer_sizes': [(50,), (100,), (50, 50)],  # example sizes for depth
+                    'activation': ['relu', 'tanh'],
+                    'alpha': [0.0001, 0.001]
+                }
+            else:
+                raise ValueError(f"Invalid model type: {model_type}")
+
+        elif problem_type == "Multiclass":
+            if model_type == "Logistic Regression":
+                model = LogisticRegression(random_state=42, solver='liblinear', multi_class='ovr')  # 'ovr' for one-vs-rest
+                param_grid = {'C': [0.1, 1.0, 10.0]}  # regularization parameter
+            elif model_type == "Support Vector Machine":
+                model = SVC(random_state=42, probability=True)  # probability=True for probabilities
+                param_grid = {'C': [0.1, 1.0, 10.0], 'kernel': ['rbf', 'linear']}
+            elif model_type == "Random Forest":
+                model = RandomForestClassifier(random_state=42)
+                param_grid = {
+                    'n_estimators': [100, 200],
+                    'max_depth': [None, 5, 10],
+                    'min_samples_split': [2, 5],
+                    'criterion': ['gini', 'entropy']  # criterion for decision
+                }
+            else:
+                raise ValueError(f"Invalid model type: {model_type} for Multiclass")
+        else:
+            raise ValueError(f"Invalid problem type: {problem_type}")
+
+        # Update param_grid with user-defined parameters
+        param_grid.update(model_params)  # this is key to use the model_params provided by the user
+
+        if use_grid_search:
+            grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy' if problem_type in ['Classification', 'Multiclass'] else 'neg_mean_squared_error', verbose=1, n_jobs=-1)
+            grid_search.fit(X_train, y_train)
+            model = grid_search.best_estimator_  # use the best model found
+            st.write("Best hyperparameters found by Grid Search:", grid_search.best_params_)
+        else:
+            model.fit(X_train, y_train)
+
+        # Cross-validation (after hyperparameter tuning, if applicable)
+        cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy' if problem_type in ['Classification', 'Multiclass'] else 'neg_mean_squared_error')
+        st.write("Cross-validation scores:", cv_scores)
+        st.write("Mean cross-validation score:", cv_scores.mean())
+
+        # Evaluation
+        y_pred = model.predict(X_test)
+        metrics = {}  # store metrics in a dictionary
+
+        if problem_type in ("Classification", "Multiclass"):
+            metrics['accuracy'] = accuracy_score(y_test, y_pred)
+            metrics['confusion_matrix'] = confusion_matrix(y_test, y_pred)
+            metrics['classification_report'] = classification_report(y_test, y_pred, output_dict=True)  # get report as dictionary
+        else:
+            metrics['mse'] = mean_squared_error(y_test, y_pred)
+            metrics['r2'] = r2_score(y_test, y_pred)
+
+        # Feature importance (permutation importance for potentially better handling of correlated features)
+        try:
+            result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42)
+            importance = result.importances_mean
+        except Exception as e:
+            st.warning(f"Could not calculate feature importance: {e}")
+            importance = None
+
+        # Store the column order for prediction purposes
+        column_order = X.columns
+
+        return model, scaler, label_encoder, imputer_numerical, metrics, column_order, importance
+
+    except Exception as e:
+        st.error(f"Training failed: {str(e)}")
+        return None, None, None, None, None, None, None
+
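train_model reports permutation importance rather than a tree's built-in impurity importances: each feature is shuffled n_repeats times and the average drop in test-set score is recorded, which tends to behave better when features are correlated or on different scales. A self-contained illustration of the same call:

    import numpy as np
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.inspection import permutation_importance

    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 3))
    y = (X[:, 0] > 0).astype(int)          # only feature 0 carries signal

    clf = RandomForestClassifier(random_state=42).fit(X, y)
    result = permutation_importance(clf, X, y, n_repeats=10, random_state=42)
    print(result.importances_mean)         # feature 0 dominates, others near 0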
+# Model Validation Function
+def validate_model(model_path, df, target, features, test_size):
+    """Loads a model, preprocesses data, and evaluates the model on a validation set."""
+    try:
+        loaded_data = joblib.load(model_path)
+        model = loaded_data['model']
+        scaler = loaded_data['scaler']
+        label_encoder = loaded_data['label_encoder']
+        imputer_numerical = loaded_data['imputer_numerical']
+        column_order = loaded_data['column_order']
+        problem_type = loaded_data['problem_type']
+
+        X = df[features]
+        y = df[target]
+
+        # Imputation
+        numerical_features = X.select_dtypes(include=np.number).columns
+        X[numerical_features] = imputer_numerical.transform(X[numerical_features])
+
+        # Encoding
+        X = pd.get_dummies(X, columns=X.select_dtypes(exclude=np.number).columns, dummy_na=False)
+
+        # Ensure correct column order
+        X = X[column_order]  # reorder the columns
+
+        # Target encoding (if classification) - use the same encoder used during training,
+        # and encode BEFORE splitting so y_test is in the encoded space
+        if problem_type == "Classification" or problem_type == "Multiclass":
+            y = label_encoder.transform(y)
+
+        # Split the data
+        X_train, X_test, y_train, y_test = train_test_split(
+            X, y, test_size=test_size, random_state=42
+        )
+
+        # Scaling
+        X_train = scaler.transform(X_train)
+        X_test = scaler.transform(X_test)
+
+        y_pred = model.predict(X_test)
+
+        metrics = {}
+        if problem_type in ("Classification", "Multiclass"):
+            metrics['accuracy'] = accuracy_score(y_test, y_pred)
+            metrics['confusion_matrix'] = confusion_matrix(y_test, y_pred)
+            metrics['classification_report'] = classification_report(y_test, y_pred, output_dict=True)
+        else:
+            metrics['mse'] = mean_squared_error(y_test, y_pred)
+            metrics['r2'] = r2_score(y_test, y_pred)
+
+        return metrics, problem_type
+
+    except Exception as e:
+        st.error(f"Validation failed: {str(e)}")
+        return None, None
+
+ # Streamlit App
784
  elif app_mode == "Model Training":
785
  st.title("🤖 Intelligent Model Training")
786
+
787
+ if st.session_state.get("cleaned_data") is None:
788
  st.warning("Please clean your data first")
789
  st.stop()
790
+
791
  df = st.session_state.cleaned_data
792
+
793
  # Model Setup
794
+ col1, col2, col3 = st.columns(3)
795
  with col1:
796
  target = st.selectbox("Select Target Variable", df.columns)
797
+ problem_type = st.selectbox("Problem Type", ["Classification", "Regression", "Multiclass"]) #Added Multiclass
798
  with col2:
799
+ available_features = df.columns.drop(target)
800
+ features = st.multiselect("Select Features", available_features, default=list(available_features)) # Select all as default
801
+ with col3:
802
  test_size = st.slider("Test Size", 0.1, 0.5, 0.2)
803
+
804
+ # Model Type Selection
805
+ if problem_type == "Regression":
806
+ model_type = st.selectbox("Select Regression Model", ["Random Forest", "Gradient Boosting", "Neural Network"])
807
+ elif problem_type == "Classification":
808
+ model_type = st.selectbox("Select Classification Model", ["Random Forest", "Gradient Boosting", "Neural Network"])
809
+ elif problem_type == "Multiclass":
810
+ model_type = st.selectbox("Select Multiclass Model", ["Logistic Regression", "Support Vector Machine", "Random Forest"]) #Added SVM and Logistic Regression
811
+ else:
812
+ model_type = None #handle this
813
+
814
+ # Hyperparameter Configuration - Dynamic based on Model Type
815
+ st.subheader("Hyperparameter Configuration")
816
+ model_params = {}
817
+
818
+ if model_type == "Neural Network": #Add options for NN parameters
819
+ hidden_layers = st.text_input("Hidden Layer Sizes (e.g., 50,50 for two layers of 50 neurons)", "50,50")
820
+ activation = st.selectbox("Activation Function", ["relu", "tanh", "logistic"])
821
+ alpha = st.number_input("L2 Regularization (Alpha)", value=0.0001)
822
+
823
+ #Process the hidden layers string to a tuple of ints
824
  try:
825
+ hidden_layer_sizes = tuple(map(int, hidden_layers.split(',')))
826
+ model_params['hidden_layer_sizes'] = hidden_layer_sizes
827
+ except ValueError:
828
+ st.error("Invalid format for Hidden Layer Sizes. Use comma-separated integers (e.g., 50,50)")
829
+
830
+ model_params['activation'] = activation
831
+ model_params['alpha'] = alpha
832
+
833
+ elif model_type == "Gradient Boosting":
834
+ n_estimators = st.slider("Number of Estimators", 50, 300, 100)
835
+ learning_rate = st.number_input("Learning Rate", value=0.1)
836
+ max_depth = st.slider("Max Depth", 2, 10, 3)
837
+
838
+ model_params['n_estimators'] = n_estimators
839
+ model_params['learning_rate'] = learning_rate
840
+ model_params['max_depth'] = max_depth
841
+ elif model_type == "Logistic Regression":
842
+ c_value = st.number_input("C (Regularization)", value=1.0)
843
+ model_params['C'] = c_value
844
+
845
+ elif model_type == "Support Vector Machine":
846
+ c_value = st.number_input("C (Regularization)", value=1.0)
847
+ kernel_type = st.selectbox("Kernel Type", ['rbf', 'linear', 'poly', 'sigmoid'])
848
+ model_params['C'] = c_value
849
+ model_params['kernel'] = kernel_type
850
+
851
+ elif model_type == "Random Forest":
852
+ n_estimators = st.slider("Number of Estimators", 50, 300, 100)
853
+ max_depth = st.slider("Max Depth", 2, 10, 3)
854
+ model_params['n_estimators'] = n_estimators
855
+ model_params['max_depth'] = max_depth
856
+
857
+
858
+
859
+ use_grid_search = st.checkbox("Use Grid Search for Hyperparameter Tuning")
860
+
861
+ if st.button("Train Model"):
862
+ if not features:
863
+ st.error("Please select at least one feature.")
864
+ st.stop()
865
+
866
+ # Call the training function
867
+ model, scaler, label_encoder, imputer_numerical, metrics, column_order, importance = train_model(df.copy(), target, features, problem_type, test_size, model_type, model_params, use_grid_search) # Pass a copy to avoid modifying the original
868
+
869
+ if model: # Only proceed if training was successful
870
+ st.success("Model trained successfully!")
871
+
872
+ # Display Metrics
873
+ st.subheader("Model Evaluation Metrics")
874
+ if problem_type in ["Classification", "Multiclass"]: #Combined here
875
+ st.metric("Accuracy", f"{metrics['accuracy']:.2%}")
876
+
877
+ # Confusion Matrix Visualization
878
+ st.subheader("Confusion Matrix")
879
+ cm = metrics['confusion_matrix']
880
+ class_names = [str(i) for i in np.unique(df[target])] #Get original class names
881
+ fig_cm = px.imshow(cm,
882
+ labels=dict(x="Predicted", y="Actual"),
883
+ x=class_names,
884
+ y=class_names,
885
+ color_continuous_scale="Viridis")
886
+ st.plotly_chart(fig_cm, use_container_width=True)
887
+
888
+ # Classification Report
889
+ st.subheader("Classification Report")
890
+ report = metrics['classification_report']
891
+ report_df = pd.DataFrame(report).transpose()
892
+ st.dataframe(report_df)
893
+
894
  else:
895
+ st.metric("MSE", f"{metrics['mse']:.2f}")
896
+ st.metric("R2", f"{metrics['r2']:.2f}")
897
+
898
  # Feature Importance
899
+ st.subheader("Feature Importance")
900
+ try:
901
+ fig_importance = px.bar(
902
+ x=importance,
903
+ y=column_order, #Use stored column order
904
+ orientation='h',
905
+ title="Feature Importance"
906
+ )
907
+ st.plotly_chart(fig_importance, use_container_width=True)
908
+ except Exception as e:
909
+ st.warning(f"Could not display feature importance: {e}")
910
+
911
+ # Explainable AI (Placeholder)
912
+ st.subheader("Explainable AI (XAI)")
913
+ st.write("Future implementation will include model explanations using techniques like SHAP or LIME.") #To be implemented
914
+ if st.checkbox("Show a random model explanation (example)"): #Example of a feature, to be implemented
915
+ st.write("This feature is important because...")
916
+
917
+ # Save Model
918
+ st.subheader("Save Model")
919
+ model_name = st.text_input("Enter model name (without extension)", "my_model")
920
+ if st.button("Save Model"):
921
+ try:
922
+ model_path = f"{model_name}.joblib"
923
+ joblib.dump({
924
+ 'model': model,
925
+ 'scaler': scaler,
926
+ 'label_encoder': label_encoder,
927
+ 'imputer_numerical': imputer_numerical,
928
+ 'column_order': column_order,
929
+ 'features': features,
930
+ 'target': target,
931
+ 'problem_type': problem_type,
932
+ 'model_type': model_type,
933
+ 'model_params': model_params
934
+ }, model_path)
935
+ st.success(f"Model saved as {model_path}")
936
+ except Exception as e:
937
+ st.error(f"Error saving model: {e}")
938
+
939
+ # Model Validation Section
940
+ st.header("Model Validation")
941
+ model_path_validate = st.text_input("Enter path to saved model for validation", "my_model.joblib")
942
+ if st.button("Validate Model"):
943
+ if not os.path.exists(model_path_validate):
944
+ st.error("Model file not found.")
945
+ else:
946
+ validation_metrics, problem_type = validate_model(model_path_validate, df.copy(), target, features, test_size) #Pass a copy of the dataframe
947
+ if validation_metrics:
948
+ st.subheader("Validation Metrics")
949
+ if problem_type in ["Classification", "Multiclass"]: #Combined here
950
+ st.metric("Accuracy", f"{validation_metrics['accuracy']:.2%}")
951
+ st.subheader("Confusion Matrix")
952
+ cm = validation_metrics['confusion_matrix']
953
+ class_names = [str(i) for i in np.unique(df[target])] #Get original class names
954
+ fig_cm = px.imshow(cm,
955
+ labels=dict(x="Predicted", y="Actual"),
956
+ x=class_names,
957
+ y=class_names,
958
+ color_continuous_scale="Viridis")
959
+ st.plotly_chart(fig_cm, use_container_width=True)
960
+ st.subheader("Classification Report")
961
+ report = validation_metrics['classification_report']
962
+ report_df = pd.DataFrame(report).transpose()
963
+ st.dataframe(report_df)
964
+
965
+ else:
966
+ st.metric("MSE", f"{validation_metrics['mse']:.2f}")
967
+ st.metric("R2", f"{validation_metrics['r2']:.2f}")
968
 
969
  elif app_mode == "Predictions":
970
  st.title("🔮 Predictive Analytics")