Spaces:

CosmickVisions
/

Data-Vision

Running

App Files Community

CosmickVisions committed on Mar 2

Commit

213f577

verified ·

1 Parent(s): b9d21cf

Update app.py

Browse files

Files changed (1) hide show

app.py +255 -1293

app.py CHANGED Viewed

@@ -1,1360 +1,322 @@
-import gradio as gr
-import numpy as np
 import pandas as pd
-import matplotlib.pyplot as plt
-import seaborn as sns
-import io
-import os
-from tensorflow.keras.models import Sequential
-from tensorflow.keras.layers import Dense, Dropout
 from sklearn.preprocessing import StandardScaler, LabelEncoder
 from sklearn.model_selection import train_test_split
-import re
-# Pydantic is now in pydantic-settings, fixed
-from pydantic_settings import BaseSettings # Fix: import from pydantic_settings
-# pandas_profiling import and fix
 from ydata_profiling import ProfileReport
 from streamlit_pandas_profiling import st_profile_report
-import streamlit as st
-import numpy as np
-import pandas as pd
-import plotly.express as px
-from scipy import stats
-import plotly.colors as pc
 import joblib
-from io import StringIO
-import requests
-import asyncio
-from io import BytesIO
-import base64
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers, callbacks
-from tensorflow.keras.utils import to_categorical
-from keras.models import Sequential
-from keras.layers import Dense
-import mimetypes
-import tensorflow
-import matplotlib.pyplot as plt
-from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, learning_curve
-from sklearn.linear_model import LinearRegression, LogisticRegression
-from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
-from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
-from sklearn.svm import SVR, SVC
-from sklearn.feature_selection import SelectKBest
-from sklearn.experimental import enable_iterative_imputer
-from sklearn.impute import IterativeImputer
-from sklearn.neural_network import MLPRegressor, MLPClassifier
-from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
-from sklearn.impute import KNNImputer, SimpleImputer
-from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder, LabelEncoder
-from sklearn.compose import ColumnTransformer
-from sklearn.pipeline import Pipeline
-from datetime import datetime  # Import datetime
-# Enhanced configuration
 st.set_page_config(
-    page_title="Executive Insights Pro",
     layout="wide",
-    page_icon="📈",
     initial_sidebar_state="expanded"
 )
-# Initial session state setup (at the top of your script)
 if 'raw_data' not in st.session_state:
     st.session_state.raw_data = None
 if 'cleaned_data' not in st.session_state:
     st.session_state.cleaned_data = None
 if 'model' not in st.session_state:
     st.session_state.model = None
-if 'preprocessor' not in st.session_state:
-    st.session_state.preprocessor = None
-# Security: Set allowed file types
-ALLOWED_EXTENSIONS = {'csv', 'xlsx', 'parquet', 'feather'}
-MAX_FILE_SIZE_MB = 250  # 250MB limit
-def validate_file(file):
-    """Comprehensive file validation"""
-    if not file:
-        return False, "No file uploaded"
-    extension = file.name.split('.')[-1].lower()
-    if extension not in ALLOWED_EXTENSIONS:
-        return False, f"Unsupported file type: {extension}"
-    file_size_mb = file.size / (1024 * 1024)
-    if file_size_mb > MAX_FILE_SIZE_MB:
-        return False, f"File size exceeds {MAX_FILE_SIZE_MB}MB limit"
-    return True, ""
-@st.cache_data(ttl=3600, show_spinner="Analyzing data quality...")
-def enhanced_quality_report(df):
     """Generate comprehensive data quality report"""
     report = {
-        'basic_stats': {
             'rows': df.shape[0],
             'columns': df.shape[1],
-            'missing_values': df.isna().sum().sum(),
             'duplicates': df.duplicated().sum()
         },
-        'column_analysis': {},
-        'data_health_score': 100  # Starting score
     }
     for col in df.columns:
         col_report = {
             'type': str(df[col].dtype),
             'unique': df[col].nunique(),
             'missing': df[col].isna().sum(),
-            'samples': df[col].dropna().sample(3).tolist() if df[col].dtype == 'object' else []
         }
-        # Numeric specific checks
         if pd.api.types.is_numeric_dtype(df[col]):
             col_report.update({
                 'mean': df[col].mean(),
                 'std': df[col].std(),
-                'zeros': (df[col] == 0).sum(),
-                'negatives': (df[col] < 0).sum() if df[col].dtype != 'uint' else 0,
-                'outliers': detect_outliers(df[col])
-            })
-            report['data_health_score'] -= 2  # Deduct 2% per numeric column
-        # Categorical specific checks
-        if pd.api.types.is_string_dtype(df[col]):
-            col_report.update({
-                'top_value': df[col].mode()[0] if not df[col].empty else None,
-                'top_freq': df[col].value_counts().iloc[0]/len(df) if not df[col].empty else 0
             })
-            report['data_health_score'] -= 1  # Deduct 1% per string column
-        report['column_analysis'][col] = col_report
-        report['data_health_score'] = max(report['data_health_score'], 0)
     return report
-def detect_outliers(series):
-    """Detect outliers using IQR method"""
-    q1 = series.quantile(0.25)
-    q3 = series.quantile(0.75)
-    iqr = q3 - q1
-    return ((series < (q1 - 1.5 * iqr)) | (series > (q3 + 1.5 * iqr))).sum()
-# Define app_mode for navigation
-app_mode = st.sidebar.selectbox(
-    "Select Page",
-    ["Data Upload", "Smart Cleaning", "Advanced EDA", "Model Training", "Insights", "Predictions", "Neural Network Studio"],
-    help="Choose the section to navigate to."
-)
-# Initialize df globally
-df = pd.DataFrame()
-# --- Data Upload Page ---
-# Data Upload Page
 if app_mode == "Data Upload":
-    st.title("📥 Smart Data Hub")
-    st.markdown("""
-        **Upload your dataset** (CSV, Excel, Parquet) for comprehensive analysis. Get instant data health insights and quality assessment.
-    """)
-    # File upload
-    uploaded_file = st.file_uploader("Drag & drop or browse files", type=list(ALLOWED_EXTENSIONS))
     if uploaded_file:
-        # Validate file
-        is_valid, message = validate_file(uploaded_file)
-        if not is_valid:
-            st.error(f"Upload error: {message}")
-            st.stop()
-        # Load data with progress
-        with st.spinner(f"Loading {uploaded_file.name} ..."):
-            try:
-                if uploaded_file.name.endswith('.csv'):
-                    df = pd.read_csv(uploaded_file, low_memory=False)
-                elif uploaded_file.name.endswith(('.xlsx', '.xls')):
-                    df = pd.read_excel(uploaded_file)
-                elif uploaded_file.name.endswith('.parquet'):
-                    df = pd.read_parquet(uploaded_file)
-                elif uploaded_file.name.endswith('.feather'):
-                    df = pd.read_feather(uploaded_file)
-                st.session_state.raw_data = df
-                st.success("Dataset loaded successfully!")
-            except Exception as e:
-                st.error(f"Error loading file: {str(e)}")
-                st.stop()
-                # In your Data Upload section, add this when new data is uploaded
-    if uploaded_file is not None:
-        # Reset models when new data is uploaded
-        st.session_state.model = None
-        st.session_state.preprocessor = None
-        # Data Health Dashboard
-        st.subheader("📊 Data Health Dashboard")
-        report = enhanced_quality_report(df)
-        col1, col2, col3, col4 = st.columns(4)
-        col1.metric("Total Rows", report['basic_stats']['rows'])
-        col2.metric("Total Columns", report['basic_stats']['columns'])
-        col3.metric("Missing Values", report['basic_stats']['missing_values'])
-        col4.metric("Data Health Score", f"{report['data_health_score']}/100")
-        # Column Explorer
-        with st.expander("🔍 Deep Column Analysis", expanded=True):
-            selected_col = st.selectbox("Select column to inspect", df.columns)
-            col_info = report['column_analysis'][selected_col]
-            st.write(f"**Type:** {col_info['type']}")
-            st.write(f"**Unique Values:** {col_info['unique']}")
-            st.write(f"**Missing Values:** {col_info['missing']} ({col_info['missing']/len(df):.1%})")
-            if pd.api.types.is_numeric_dtype(df[selected_col]):
-                st.write("**Distribution:**")
-                st.line_chart(df[selected_col])
-                st.write(f"**Outliers Detected:** {col_info['outliers']}")
-            else:
-                st.write("**Most Common Values:**")
-                top_values = df[selected_col].value_counts().head(5)
-                st.bar_chart(top_values)
-        # Smart Recommendations
-        with st.expander("💡 Cleaning Recommendations"):
-            recommendations = []
-            if report['basic_stats']['duplicates'] > 0:
-                recommendations.append(f"🚨 Remove {report['basic_stats']['duplicates']} duplicate rows")
-            if report['basic_stats']['missing_values'] > 0:
-                recommendations.append("🔧 Apply advanced imputation strategies")
-            for col, data in report['column_analysis'].items():
-                if data['missing'] > 0.5 * len(df):
-                    recommendations.append(f"⚠️ Consider dropping {col} (>{50}% missing)")
-                if data['unique'] == len(df):
-                    recommendations.append(f"🔍 Investigate {col} - potential unique identifier")
-            if recommendations:
-                st.write("### Recommended Actions")
-                for rec in recommendations[:5]:  # Show top 5
-                    st.write(f"- {rec}")
             else:
-                st.success("No critical issues detected - your data looks healthy!")
-        # Data Preview
-        with st.expander("🔎 Data Preview", expanded=True):
-            preview_size = st.slider("Preview rows", 5, 100, 15)
-            st.dataframe(df.head(preview_size).style.highlight_null(color='#FF6666'))
-        # Advanced Profiling
-        if st.button("🚀 Generate Full Data Profile"):
-            with st.spinner("Generating comprehensive report..."):
-                pr = ProfileReport(df, explorative=True,title="Data Upload Report")  # Added title to pandas profiling
-                st_profile_report(pr)
-    # Cleaning Operations with Tabs
-st.subheader("🔧 Cleaning Operations")
-tab1, tab2, tab3, tab4 = st.tabs(["Missing Values", "Duplicates", "Data Types", "Outliers"])
-# 1. Missing Value Handling
-with tab1:
-    st.markdown("### 🕳️ Handle Missing Values")
-    missing_cols = df.columns[df.isna().any()].tolist()
-    if missing_cols:
-        st.write("Columns with missing values:")
-        cols = st.multiselect("Select columns to clean", missing_cols, default=missing_cols)
-        method = st.radio("Imputation Method", [
-            "Drop Missing",
-            "Mean/Median/Mode",
-            "KNN Imputation",
-            "MICE Imputation",
-            "Deep Learning Imputation"
-        ], horizontal=True)
-        if method == "Mean/Median/Mode":
-            imputation_choice = st.radio("Select Imputation Method", ["Mean", "Median", "Mode"], horizontal=True)
-        if st.button(f"Apply {method}"):
-            try:
-                original_df = df.copy()
-                if method == "Mean/Median/Mode":
                     for col in cols:
-                        if df[col].isnull().any():  # Check if missing values exist before imputing
-                            if pd.api.types.is_numeric_dtype(df[col]):
-                                if imputation_choice == "Mean":
-                                    df[col] = df[col].fillna(df[col].mean())
-                                elif imputation_choice == "Median":
-                                    df[col] = df[col].fillna(df[col].median())
-                                elif imputation_choice == "Mode":
-                                    df[col] = df[col].fillna(df[col].mode()[0])
-                            else:  # Impute strings with mode
-                                df[col] = df[col].fillna(df[col].mode()[0])
-                # Add logic for other methods here...
-                cleaning_actions.append(f"Applied {method} on {cols}")
-                update_version(df)
-                st.success(f"{method} applied successfully! ✅")
-            except Exception as e:
-                st.error(f"Error: {str(e)}")
-    else:
-        st.success("✨ No missing values found!")
-# 2. Duplicate Handling
-with tab2:
-    st.markdown("### 🔄 Handle Duplicates")
-    duplicates = df.duplicated().sum()
-    if duplicates > 0:
-        st.plotly_chart(px.histogram(df, x=df.duplicated(), title="Duplicate Distribution"))
-        dup_strategy = st.radio("Duplicate Strategy", [
-            "Remove All Duplicates",
-            "Keep First Occurrence",
-            "Keep Last Occurrence"
-        ])
-        if st.button("Handle Duplicates"):
-            original_count = len(df)
-            df = df.drop_duplicates(keep={
-                "Remove All Duplicates": False,
-                "Keep First Occurrence": 'first',
-                "Keep Last Occurrence": 'last'
-            }[dup_strategy])
-            cleaning_actions.append(f"Removed {original_count - len(df)} duplicates")
-            update_version(df)
-            st.success(f"Removed {original_count - len(df)} duplicates! ✅")
-    else:
-        st.success("✨ No duplicates found!")
-# 3. Data Type Conversion
-with tab3:
-    st.markdown("### 🔄 Convert Data Types")
-    col1, col2 = st.columns(2)
-    with col1:
-        st.dataframe(df.dtypes.reset_index().rename(columns={0: 'Type', 'index': 'Column'}))
-    with col2:
-        col_to_convert = st.selectbox("Select column to convert", df.columns)
-        new_type = st.selectbox("New Data Type", [
-            "String", "Integer", "Float",
-            "Boolean", "Datetime", "Category"
         ])
-        if st.button("Convert Data Type"):
             try:
                 if new_type == "String":
                     df[col_to_convert] = df[col_to_convert].astype(str)
                 elif new_type == "Integer":
                     df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce').astype('Int64')
-                elif new_type == "Float":
-                    df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce')
-                elif new_type == "Boolean":
-                    df[col_to_convert] = df[col_to_convert].astype(bool)
-                elif new_type == "Datetime":
-                    df[col_to_convert] = pd.to_datetime(df[col_to_convert], errors='coerce')
-                elif new_type == "Category":
-                    df[col_to_convert] = df[col_to_convert].astype('category')
-                cleaning_actions.append(f"Converted {col_to_convert} to {new_type}")
-                update_version(df)
-                st.success("Data type converted successfully! ✅")
             except Exception as e:
-                st.error(f"Conversion failed: {str(e)}")
-# 4. Outlier Handling
-with tab4:
-    st.markdown("### 📈 Handle Outliers")
-    numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
-    if numeric_cols:
-        outlier_col = st.selectbox("Select numeric column", numeric_cols)
-        st.plotly_chart(px.box(df, y=outlier_col, title="Outlier Distribution"))
-        if st.button("Remove Outliers"):
-            # Outlier removal logic here...
-            cleaning_actions.append(f"Removed outliers from {outlier_col}")
-            update_version(df)
-            st.success("Outliers removed successfully! ✅")
-    else:
-        st.info("ℹ️ No numeric columns found for outlier detection")
-    # Drop Column Functionality with Interface
-    st.subheader("🗑️ Drop Specific Columns")
-    cols_to_drop = st.multiselect("Select Columns to Drop", df.columns)
-    if st.button("Drop Selected Columns"):
-        try:
-            df = df.drop(columns=cols_to_drop)  # Drop the cols here.
-            cleaning_actions.append(f"Dropped columns: {', '.join(cols_to_drop)}")
-            update_version(df)
-            st.success(f"Columns dropped successfully! ✅")
-        except (KeyError, ValueError) as e:
-            st.error(f"Invalid column(s) selected or other error: {e}") # Handle ValueErrors
-        except Exception as e:
-            st.error(f"An unexpected error occurred: {e}")
-    # Label Encoding (Categorical to Numeric)
-    st.subheader("🔢 Label Encoding")
-    if st.button("Encode Categorical Columns"):
-        try:
-            le = LabelEncoder()
-            categorical_cols = df.select_dtypes(include=['object', 'category']).columns
-            for col in categorical_cols:
-                df[col] = df[col].astype(str)  # Ensure all cols are string
-                df[col] = le.fit_transform(df[col])
-            cleaning_actions.append("Applied Label Encoding to categorical columns")
-            update_version(df)
-            st.success("Label encoding applied successfully! ✅")
-        except Exception as e:
-            st.error(f"Label encoding failed: {str(e)}")
-    # Live Data Preview after every cleaning action
-    st.subheader("✨ Live Data Preview")
-    st.dataframe(df.head(10)) # show 10 rows
-    # 2. Duplicate Handling
-    with tab2:
-        st.markdown("### 🔄 Handle Duplicates")
-        duplicates = df.duplicated().sum()
-        if duplicates > 0:
-            st.plotly_chart(px.histogram(df, x=df.duplicated(), title="Duplicate Distribution"))
-            dup_strategy = st.radio("Duplicate Strategy", [
-                "Remove All Duplicates",
-                "Keep First Occurrence",
-                "Keep Last Occurrence"
-            ])
-            if st.button("Handle Duplicates"):
-                original_count = len(df)
-                df = df.drop_duplicates(keep={
-                    "Remove All Duplicates": False,
-                    "Keep First Occurrence": 'first',
-                    "Keep Last Occurrence": 'last'
-                }[dup_strategy])
-                cleaning_actions.append(f"Removed {original_count - len(df)} duplicates")
-                update_version(df)
-                st.success(f"Removed {original_count - len(df)} duplicates! ✅")
-        else:
-            st.success("✨ No duplicates found!")
-    # 3. Data Type Conversion
-    with tab3:
-        st.markdown("### 🔄 Convert Data Types")
-        col1, col2 = st.columns(2)
-        with col1:
-            st.dataframe(df.dtypes.reset_index().rename(columns={0: 'Type', 'index': 'Column'}))
-        with col2:
-            col_to_convert = st.selectbox("Select column to convert", df.columns)
-            new_type = st.selectbox("New Data Type", [
-                "String", "Integer", "Float",
-                "Boolean", "Datetime", "Category"
-            ])
-            if st.button("Convert Data Type"):
-                try:
-                    if new_type == "String":
-                        df[col_to_convert] = df[col_to_convert].astype(str)
-                    elif new_type == "Integer":
-                        df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce').astype('Int64')
-                    elif new_type == "Float":
-                        df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce')
-                    elif new_type == "Boolean":
-                        df[col_to_convert] = df[col_to_convert].astype(bool)
-                    elif new_type == "Datetime":
-                        df[col_to_convert] = pd.to_datetime(df[col_to_convert], errors='coerce')
-                    elif new_type == "Category":
-                        df[col_to_convert] = df[col_to_convert].astype('category')
-                    cleaning_actions.append(f"Converted {col_to_convert} to {new_type}")
-                    update_version(df)
-                    st.success("Data type converted successfully! ✅")
-                except Exception as e:
-                    st.error(f"Conversion failed: {str(e)}")
-    # 4. Outlier Handling
-    with tab4:
-        st.markdown("### 📈 Handle Outliers")
-        numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
-        if numeric_cols:
-            outlier_col = st.selectbox("Select numeric column", numeric_cols)
-            st.plotly_chart(px.box(df, y=outlier_col, title="Outlier Distribution"))
-            outlier_method = st.radio("Outlier Handling Method", ["Z-score", "IQR", "Manual"], horizontal=True)
-            if st.button("Remove Outliers"):
-                try:
-                    original_df = df.copy()
-                    if outlier_method == "Z-score":
-                        from scipy import stats
-                        z_scores = np.abs(stats.zscore(df[outlier_col]))
-                        df = df[(z_scores < 3)] # Keep only values with zscore less than 3
-                        cleaning_actions.append(f"Removed outliers from {outlier_col} using Z-score (threshold 3)")
-                    elif outlier_method == "IQR":
-                        Q1 = df[outlier_col].quantile(0.25)
-                        Q3 = df[outlier_col].quantile(0.75)
-                        IQR = Q3 - Q1
-                        df = df[~((df[outlier_col] < (Q1 - 1.5 * IQR)) |(df[outlier_col] > (Q3 + 1.5 * IQR)))]
-                        cleaning_actions.append(f"Removed outliers from {outlier_col} using IQR")
-                    elif outlier_method == "Manual":
-                        lower_bound = st.number_input("Lower Bound", value=df[outlier_col].min(), step=1.0)
-                        upper_bound = st.number_input("Upper Bound", value=df[outlier_col].max(), step=1.0)
-                        df = df[(df[outlier_col] >= lower_bound) & (df[outlier_col] <= upper_bound)]
-                        cleaning_actions.append(f"Removed outliers from {outlier_col} using manual bounds")
-                    update_version(df)
-                    st.success("Outliers removed successfully! ✅")
-                except Exception as e:
-                    st.error(f"Outlier removal failed: {str(e)}")
-        else:
-            st.info("ℹ️ No numeric columns found for outlier detection")
-    # Drop Column Functionality with Interface
-    st.subheader("🗑️ Drop Specific Columns")
-    cols_to_drop = st.multiselect("Select Columns to Drop", df.columns)
-    if st.button("Drop Selected Columns"):
-        try:
-            df = df.drop(columns=cols_to_drop) #Drop the cols here.
-            cleaning_actions.append(f"Dropped columns: {', '.join(cols_to_drop)}")
-            update_version(df)
-            st.success(f"Columns dropped successfully! ✅")
-        except (KeyError):
-            st.error("Invalid column(s) selected.")
-        except Exception as e:
-            st.error(f"An unexpected error occurred: {e}")
-    # Label Encoding (Categorical to Numeric)
-    st.subheader("🔢 Label Encoding")
-    if st.button("Encode Categorical Columns"):
-        try:
-            le = LabelEncoder()
-            categorical_cols = df.select_dtypes(include=['object', 'category']).columns
-            for col in categorical_cols:
-                df[col] = df[col].astype(str)  # Ensure all cols are string
-                df[col] = le.fit_transform(df[col])
-            cleaning_actions.append("Applied Label Encoding to categorical columns")
-            update_version(df)
-            st.success("Label encoding applied successfully! ✅")
-        except Exception as e:
-            st.error(f"Label encoding failed: {str(e)}")
-    # Live Data Preview after every cleaning action
-    st.subheader("✨ Live Data Preview")
-    st.dataframe(df.head(10)) # show 10 rows
-    # Save Cleaned Data with Enhanced Feedback
-    if st.button("💾 Save Cleaned Data"):
-        st.session_state.cleaned_data = df
-        st.balloons()
-        # Generate comprehensive report
-        from pandas_profiling import ProfileReport
-        pr = ProfileReport(df, title="Cleaned Data Report")
-        st_profile_report(pr)
-        # Show cleaning log with diffs
-        st.subheader("📝 Cleaning Log")
-        st.table(pd.DataFrame({
-            "Step": range(1, len(cleaning_actions)+1),
-            "Action": cleaning_actions
-        }))
-        # Show dataset comparison
-        col1, col2 = st.columns(2)
-        with col1:
-            st.write("Original Data Shape:", st.session_state.raw_data.shape)
-        with col2:
-            st.write("Cleaned Data Shape:", df.shape)
-        st.success("✅ Cleaned data saved successfully! You can now proceed to analysis.")
-elif app_mode == "Advanced EDA":
-    st.title("🔍 Advanced Exploratory Data Analysis")
-    st.markdown("""
-        **Interactive Data Exploration** with optimized visualizations for fast insights.
-        Uncover patterns and relationships in your data with beautiful, responsive plots.
-    """)
-    if 'cleaned_data' not in st.session_state or st.session_state.cleaned_data is None:
-        st.warning("Please clean your data in the Smart Cleaning section first.")
         st.stop()
-    df = st.session_state.cleaned_data.copy()
-    # Initialize session state for EDA configuration
-    if 'eda_config' not in st.session_state:
-        st.session_state.eda_config = {
-            'plot_type': "Histogram",
-            'x_col': df.columns[0] if len(df.columns) > 0 else None,
-            'y_col': df.columns[1] if len(df.columns) > 1 else None,
-            'z_col': df.columns[2] if len(df.columns) > 2 else None,
-            'color_col': None,
-            'facet_col': None,
-            'hover_data_cols': [],
-            'color_palette': "Viridis",
-            'filter_col': None,
-            'filter_options': []
-        }
-    # Main Layout Columns
     col1, col2 = st.columns([1, 3])
     with col1:
-        st.header("📊 Visualization Setup")
-        # Plot Type Selection
-        plot_types = {
-            "Distribution": ["Histogram", "Box Plot", "Violin Plot", "Density Plot"],
-            "Relationship": ["Scatter Plot", "Line Plot", "Heatmap", "Pair Plot"],
-            "Comparison": ["Bar Chart", "Pie Chart", "Parallel Coordinates"],
-            "3D": ["3D Scatter", "3D Surface"]
-        }
-        selected_category = st.selectbox("Plot Category", list(plot_types.keys()))
-        st.session_state.eda_config['plot_type'] = st.selectbox(
-            "Plot Type",
-            plot_types[selected_category]
-        )
-        # Dynamic Column Selectors
-        plot_type = st.session_state.eda_config['plot_type']
-        if plot_type in ["Histogram", "Box Plot", "Violin Plot", "Density Plot", "Bar Chart", "Pie Chart"]:
-            st.session_state.eda_config['x_col'] = st.selectbox(
-                "X Axis",
-                df.columns,
-                index=df.columns.get_loc(st.session_state.eda_config['x_col'])
-                if st.session_state.eda_config['x_col'] in df.columns else 0
-            )
-        if plot_type in ["Scatter Plot", "Line Plot", "Box Plot", "Violin Plot", "Density Plot"]:
-            st.session_state.eda_config['y_col'] = st.selectbox(
-                "Y Axis",
-                df.columns,
-                index=df.columns.get_loc(st.session_state.eda_config['y_col'])
-                if st.session_state.eda_config['y_col'] in df.columns else 0
-            )
-        if plot_type in ["3D Scatter", "3D Surface"]:
-            st.session_state.eda_config['z_col'] = st.selectbox(
-                "Z Axis",
-                df.columns,
-                index=df.columns.get_loc(st.session_state.eda_config['z_col'])
-                if st.session_state.eda_config['z_col'] in df.columns else 0
-            )
-        # Additional Options
-        with st.expander("🎨 Customization"):
-            st.session_state.eda_config['color_col'] = st.selectbox(
-                "Color By",
-                [None] + list(df.columns)
-            )
-            st.session_state.eda_config['facet_col'] = st.selectbox(
-                "Facet By",
-                [None] + list(df.columns)
-            )
-            st.session_state.eda_config['hover_data_cols'] = st.multiselect(
-                "Hover Data",
-                df.columns
-            )
-            st.session_state.eda_config['color_palette'] = st.selectbox(
-                "Color Palette",
-                px.colors.named_colorscales()
-            )
-        # Data Filtering
-        with st.expander("🔎 Data Filtering"):
-            filter_col = st.selectbox(
-                "Filter Column",
-                [None] + list(df.columns)
-            )
-            if filter_col:
-                unique_values = df[filter_col].unique()
-                selected_values = st.multiselect(
-                    f"Select {filter_col} values",
-                    unique_values,
-                    default=unique_values
-                )
-                df = df[df[filter_col].isin(selected_values)]
     with col2:
-        st.header("📈 Visualization")
-        config = st.session_state.eda_config
-        @st.cache_data(ttl=300)
-        def generate_plot(df, plot_type, config):
-            """Cached plot generation function for better performance"""
-            try:
-                if plot_type == "Histogram":
-                    return px.histogram(
-                        df, x=config['x_col'],
-                        color=config['color_col'],
-                        nbins=30,
-                        color_discrete_sequence=[config['color_palette']]
-                    )
-                elif plot_type == "Scatter Plot":
-                    return px.scatter(
-                        df, x=config['x_col'], y=config['y_col'],
-                        color=config['color_col'],
-                        hover_data=config['hover_data_cols']
-                    )
-                elif plot_type == "Box Plot":
-                    return px.box(
-                        df, x=config['x_col'], y=config['y_col'],
-                        color=config['color_col']
-                    )
-                elif plot_type == "Violin Plot":
-                    return px.violin(
-                        df, x=config['x_col'], y=config['y_col'],
-                        color=config['color_col'],
-                        box=True
-                    )
-                elif plot_type == "Heatmap":
-                    numeric_df = df.select_dtypes(include=np.number)
-                    corr = numeric_df.corr()
-                    return px.imshow(
-                        corr,
-                        text_auto=True,
-                        color_continuous_scale=config['color_palette']
-                    )
-                elif plot_type == "3D Scatter":
-                    return px.scatter_3d(
-                        df, x=config['x_col'], y=config['y_col'], z=config['z_col'],
-                        color=config['color_col']
-                    )
-                elif plot_type == "Bar Chart":
-                    return px.bar(
-                        df, x=config['x_col'], y=config['y_col'],
-                        color=config['color_col']
-                    )
-                elif plot_type == "Pie Chart":
-                    return px.pie(
-                        df, names=config['x_col'], values=config['y_col'],
-                        color_discrete_sequence=[config['color_palette']]
-                    )
-                elif plot_type == "Line Plot":
-                    return px.line(
-                        df, x=config['x_col'], y=config['y_col'],
-                        color=config['color_col']
-                    )
-                elif plot_type == "Pair Plot":
-                    numeric_cols = df.select_dtypes(include=np.number).columns
-                    return px.scatter_matrix(
-                        df[numeric_cols],
-                        color=config['color_col']
-                    )
-                elif plot_type == "Parallel Coordinates":
-                    numeric_df = df.select_dtypes(include=np.number)
-                    return px.parallel_coordinates(
-                        numeric_df,
-                        color_continuous_scale=config['color_palette']
-                    )
-                elif plot_type == "Density Plot":
-                    return px.density_contour(
-                        df, x=config['x_col'], y=config['y_col'],
-                        color=config['color_col']
-                    )
-            except Exception as e:
-                st.error(f"Plot generation error: {str(e)}")
-                return None
-        # Generate and display plot; generate_plot returns None after reporting an error, so nothing is drawn then
-        fig = generate_plot(df, plot_type, config)
-        if fig:
             st.plotly_chart(fig, use_container_width=True)
-            # Plot Statistics: extra numbers shown under the chart, chosen by plot type
-            with st.expander("📊 Plot Statistics"):
-                if plot_type in ["Histogram", "Box Plot", "Violin Plot"]:
-                    st.write(f"**{config['x_col']} Statistics**")
-                    st.table(df[config['x_col']].describe())  # summary of the x-axis column only
-                if plot_type in ["Scatter Plot", "Line Plot"]:
-                    st.write(f"**Correlation between {config['x_col']} and {config['y_col']}**")
-                    corr = df[[config['x_col'], config['y_col']]].corr().iloc[0,1]  # off-diagonal entry of the 2x2 matrix; NOTE(review): assumes both columns are numeric - confirm
-                    st.metric("Pearson Correlation", f"{corr:.2f}")
-                if plot_type == "Heatmap":
-                    st.write("**Correlation Matrix**")
-                    numeric_df = df.select_dtypes(include=np.number)  # same numeric-only selection the Heatmap plot uses
-                    st.dataframe(numeric_df.corr())
-    # Data Summary Section: shape, column dtypes and the first rows of df inside a collapsible panel
-    st.header("📝 Data Summary")
-    with st.expander("Show Data Summary"):
-        col1, col2 = st.columns(2)  # left: shape, right: dtypes
-        with col1:
-            st.write("**Data Shape**")
-            st.write(f"Rows: {df.shape[0]}")
-            st.write(f"Columns: {df.shape[1]}")
-        with col2:
-            st.write("**Data Types**")
-            st.dataframe(df.dtypes.reset_index().rename(columns={  # dtypes Series turned into a two-column table
-                'index': 'Column', 0: 'Type'
-            }))
-        st.write("**Sample Data**")
-        st.dataframe(df.head())  # head() with no argument, i.e. the leading rows only
-# Model Selection: the offered models depend on problem_type, which is set before this excerpt
-    st.subheader("🤖 Model Selection")
-    if problem_type == "Regression":
-        model_options = ["Linear Regression", "Decision Tree", "Random Forest", "Gradient Boosting", "SVM", "Neural Network"]
-    else:  # Classification
-        model_options = ["Logistic Regression", "Decision Tree", "Random Forest", "Gradient Boosting", "SVM", "Neural Network", "KNN", "Naive Bayes"]
-    model_name = st.selectbox("Select Model", model_options, help="Choose a model.")
-    # Hyperparameter Tuning: widgets exist only for Random Forest, Gradient Boosting and Neural Network; any other model gets an empty dict
-    st.subheader("🎛️ Hyperparameter Tuning")
-    with st.expander("Configure Hyperparameters", expanded=True):
-        if model_name == "Random Forest":
-            n_estimators = st.slider("Number of Estimators", 10, 200, 100)
-            max_depth = st.slider("Max Depth", 3, 20, 10)
-            min_samples_split = st.slider("Min Samples Split", 2, 10, 2)
-            min_samples_leaf = st.slider("Min Samples Leaf", 1, 10, 1)
-            hyperparams = {  # keys match the estimator keyword names because the dict is splatted into the constructor later
-                'n_estimators': n_estimators,
-                'max_depth': max_depth,
-                'min_samples_split': min_samples_split,
-                'min_samples_leaf': min_samples_leaf
-            }
-        elif model_name == "Gradient Boosting":  # Correct placement of elif
-            learning_rate = st.slider("Learning Rate", 0.01, 1.0, 0.1)
-            n_estimators = st.slider("Number of Estimators", 10, 200, 100)
-            max_depth = st.slider("Max Depth", 3, 20, 10)
-            hyperparams = {  # also splatted straight into the Gradient Boosting constructor
-                'learning_rate': learning_rate,
-                'n_estimators': n_estimators,
-                'max_depth': max_depth
-            }
-        elif model_name == "Neural Network":
-            from tensorflow.keras.models import Sequential  # NOTE(review): none of these three imports is used in this branch; the training step imports them again
-            from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
-            from tensorflow.keras.optimizers import Adam, Nadam, RMSprop, SGD
-            hidden_layers = st.slider("Number of Hidden Layers", 1, 5, 2)
-            neurons_per_layer = st.slider("Neurons per Layer", 10, 200, 50)  # one width shared by every hidden layer
-            activation = st.selectbox("Activation Function",
-                ["relu", "tanh", "sigmoid", "selu", "swish"])
-            dropout_rate = st.slider("Dropout Rate", 0.0, 0.5, 0.2)
-            initializer = st.selectbox("Weight Initializer",
-                ["glorot_uniform", "he_normal", "lecun_uniform"])
-            learning_rate = st.slider("Learning Rate", 0.0001, 0.1, 0.001, format="%.4f")
-            optimizer_choice = st.selectbox("Optimizer",
-                ["Adam", "Nadam", "RMSprop", "SGD"])
-            batch_norm = st.checkbox("Batch Normalization", value=True)
-            regularization = st.checkbox("L2 Regularization")  # NOTE(review): stored below but never read when the network is built
-            epochs = st.slider("Epochs", 10, 200, 50)
-            batch_size = st.slider("Batch Size", 16, 128, 32)
-            hyperparams = {  # read key by key when the network is built, not splatted
-                'hidden_layers': hidden_layers,
-                'neurons_per_layer': neurons_per_layer,
-                'activation': activation,
-                'dropout_rate': dropout_rate,
-                'initializer': initializer,
-                'learning_rate': learning_rate,
-                'optimizer_choice': optimizer_choice,
-                'batch_norm': batch_norm,
-                'regularization': regularization,
-                'epochs': epochs,
-                'batch_size': batch_size
-            }
-        else:
-            hyperparams = {}  # remaining models are constructed with library defaults
-    # Train-Test Split: test_size is the fraction of rows held out
-    st.subheader("✂️ Train-Test Split")
-    test_size = st.slider("Test Size", 0.1, 0.5, 0.2, help="Proportion of the dataset to include in the test split.")
-    # Model Training: everything below runs only in the rerun triggered by the button
-    if st.button("🚀 Train Model"):
-        with st.spinner("Training model..."):
-            try:
-                X = df[feature_columns]  # feature_columns / target_column are chosen before this excerpt
-                y = df[target_column]
-                # Check if X is empty
-                if X.empty:
-                    st.error("No features were selected. Please select feature columns.")
-                    st.stop()
-                # Train-Test Split (fixed random_state, so the same rows land in each split every run)
-                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
-                # Preprocessing Pipeline: median-impute + scale numeric columns; mode-impute + one-hot encode the rest
-                numeric_features = X.select_dtypes(include=np.number).columns
-                categorical_features = X.select_dtypes(exclude=np.number).columns  # everything that is not numeric
-                numeric_transformer = Pipeline(steps=[
-                    ('imputer', SimpleImputer(strategy='median')),
-                    ('scaler', StandardScaler())
-                ])
-                categorical_transformer = Pipeline(steps=[
-                    ('imputer', SimpleImputer(strategy='most_frequent')),
-                    ('onehot', OneHotEncoder(handle_unknown='ignore'))
-                ])
-                preprocessor = ColumnTransformer(
-                    transformers=[
-                        ('num', numeric_transformer, numeric_features),
-                        ('cat', categorical_transformer, categorical_features)
-                    ])
-                X_train_processed = preprocessor.fit_transform(X_train)  # fitted on the training split only
-                X_test_processed = preprocessor.transform(X_test)  # test split is transformed, never fitted on
-                # Model Training: hyperparams are passed only to Random Forest and Gradient Boosting; the network reads them key by key
-                if model_name == "Linear Regression":
-                    model = LinearRegression()
-                elif model_name == "Logistic Regression":
-                    model = LogisticRegression(max_iter=1000)
-                elif model_name == "Decision Tree":
-                    if problem_type == "Regression":
-                        model = DecisionTreeRegressor()
-                    else:
-                        model = DecisionTreeClassifier()
-                elif model_name == "Random Forest":
-                    if problem_type == "Regression":
-                        model = RandomForestRegressor(**hyperparams)
-                    else:
-                        model = RandomForestClassifier(**hyperparams)
-                elif model_name == "Gradient Boosting":
-                    if problem_type == "Regression":
-                        model = GradientBoostingRegressor(**hyperparams)
-                    else:
-                        model = GradientBoostingClassifier(**hyperparams)
-                elif model_name == "SVM":
-                    if problem_type == "Regression":
-                        model = SVR()
-                    else:
-                        model = SVC()
-                elif model_name == "Neural Network":
-                    from tensorflow.keras.models import Sequential
-                    from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
-                    from tensorflow.keras.optimizers import Adam, Nadam, RMSprop, SGD
-                    # Build a new model with the parameters
-                    model = Sequential()
-                    model.add(layers.Input(shape=(X_train_processed.shape[1],)))  # `layers` is not imported here; relies on the module-level tensorflow.keras import
-                    for i in range(hyperparams['hidden_layers']):  # identical Dense -> (BatchNorm) -> Dropout stack per hidden layer
-                        model.add(Dense(hyperparams['neurons_per_layer'],
-                                         activation=hyperparams['activation'],
-                                         kernel_initializer=hyperparams['initializer']))
-                        if hyperparams['batch_norm']:
-                            model.add(BatchNormalization())
-                        model.add(Dropout(hyperparams['dropout_rate']))
-                    # Output layer: one linear unit for regression, otherwise one softmax unit per distinct training label
-                    output_activation = 'linear' if problem_type == "Regression" else 'softmax'
-                    output_units = 1 if problem_type == "Regression" else len(np.unique(y_train))
-                    model.add(Dense(output_units, activation=output_activation))
-                    # Configure optimizer (all four are instantiated; only the selected one is used)
-                    optimizers = {
-                        "Adam": Adam(learning_rate=hyperparams['learning_rate']),
-                        "Nadam": Nadam(learning_rate=hyperparams['learning_rate']),
-                        "RMSprop": RMSprop(learning_rate=hyperparams['learning_rate']),
-                        "SGD": SGD(learning_rate=hyperparams['learning_rate'], momentum=0.9)
-                    }
-                    optimizer = optimizers[hyperparams['optimizer_choice']]
-                    model.compile(optimizer=optimizer,
-                                   loss='mse' if problem_type == "Regression" else 'sparse_categorical_crossentropy',  # NOTE(review): the sparse loss presumably needs integer class ids; y is not label-encoded in this block - confirm
-                                   metrics=['mae'] if problem_type == "Regression" else ['accuracy'])
-                elif model_name == "KNN":
-                    from sklearn.neighbors import KNeighborsClassifier
-                    model = KNeighborsClassifier()
-                elif model_name == "Naive Bayes":
-                    from sklearn.naive_bayes import GaussianNB
-                    model = GaussianNB()  # NOTE(review): receives the one-hot output as-is; presumably needs a dense array - confirm
-                # Train the model
-                if model_name == "Neural Network":  # Only for the neural network
-                    history = model.fit(X_train_processed, y_train,
-                                        epochs=hyperparams['epochs'],
-                                        batch_size=hyperparams['batch_size'],
-                                        validation_data=(X_test_processed, y_test),  # NOTE(review): the held-out test split doubles as validation data
-                                        verbose=0)
-                else:
-                    model.fit(X_train_processed, y_train)
-                # Store model and preprocessor; the Pipeline only wraps the two already-fitted objects and is never fitted itself
-                st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
-                st.session_state.preprocessor = preprocessor
-                # Store the test data for insights and predictions (despite the *_selected names these are the preprocessed matrices)
-                st.session_state.X_train_selected = X_train_processed
-                st.session_state.X_test_selected = X_test_processed
-                st.session_state.y_train = y_train
-                st.session_state.y_test = y_test
-                # Model Evaluation on the held-out split
-                if problem_type == "Regression":
-                    y_pred = model.predict(X_test_processed)
-                    mse = mean_squared_error(y_test, y_pred)
-                    rmse = np.sqrt(mse)
-                    mae = mean_absolute_error(y_test, y_pred)
-                    r2 = r2_score(y_test, y_pred)
-                    st.write(f"Mean Squared Error: {mse:.4f}")
-                    st.write(f"Root Mean Squared Error: {rmse:.4f}")
-                    st.write(f"Mean Absolute Error: {mae:.4f}")
-                    st.write(f"R-squared: {r2:.4f}")
-                else:  # Classification
-                    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
-                    y_pred = model.predict(X_test_processed)  # NOTE(review): for the network this result is discarded and predict runs again two lines down
-                    if model_name == "Neural Network":  # Neural network output probabilities
-                        y_pred = np.argmax(model.predict(X_test_processed), axis=1)  # class index with the highest softmax score
-                    accuracy = accuracy_score(y_test, y_pred)
-                    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
-                    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
-                    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
-                    st.write(f"Accuracy: {accuracy:.4f}")
-                    st.write(f"Precision: {precision:.4f}")
-                    st.write(f"Recall: {recall:.4f}")
-                    st.write(f"F1 Score: {f1:.4f}")
-                    st.write("Classification Report:")
-                    st.text(classification_report(y_test, y_pred))
-                    # confusion matrix (raw array; a heatmap of the same matrix is drawn further down for non-network classifiers)
-                    st.write("Confusion Matrix:")
-                    conf_matrix = confusion_matrix(y_test, y_pred)
-                    st.write(conf_matrix)
-                # Visualization: regression scatter, network training curves, or confusion-matrix heatmap
-                st.subheader("📊 Model Performance Visualization")
-                if problem_type == "Regression":
-                    fig, ax = plt.subplots()
-                    ax.scatter(y_test, y_pred)
-                    ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)  # dashed y = x reference line
-                    ax.set_xlabel('Actual')
-                    ax.set_ylabel('Predicted')
-                    ax.set_title('Actual vs Predicted')
-                    st.pyplot(fig)
-                elif model_name == "Neural Network":  # regression was handled above, so only classification networks reach this branch
-                    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
-                    ax1.plot(history.history['loss'], label='Train Loss')
-                    ax1.plot(history.history['val_loss'], label='Validation Loss')
-                    ax1.set_title('Loss Evolution')
-                    ax1.set_xlabel('Epoch')
-                    ax1.set_ylabel('Loss')
-                    ax1.legend()
-                    # Plot accuracy/metric
-                    if problem_type == "Classification":
-                        ax2.plot(history.history['accuracy'], label='Train Accuracy')
-                        ax2.plot(history.history['val_accuracy'], label='Validation Accuracy')
-                        ax2.set_title('Accuracy Evolution')
-                        ax2.set_ylabel('Accuracy')
-                    else:  # NOTE(review): appears unreachable, since regression never gets past the first branch of this chain
-                        ax2.plot(history.history['mae'], label='Train MAE')
-                        ax2.plot(history.history['val_mae'], label='Validation MAE')
-                        ax2.set_title('MAE Evolution')
-                        ax2.set_ylabel('MAE')
-                    ax2.set_xlabel('Epoch')
-                    ax2.legend()
-                    st.pyplot(fig)
-                else: # Classification confusion matrix
-                    from sklearn.metrics import confusion_matrix  # already imported in the evaluation step above
-                    conf_matrix = confusion_matrix(y_test, y_pred)  # recomputes the matrix built during evaluation
-                    fig, ax = plt.subplots()
-                    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
-                    ax.set_xlabel('Predicted Labels')
-                    ax.set_ylabel('True Labels')
-                    ax.set_title('Confusion Matrix')
-                    st.pyplot(fig)
-                st.success("Model trained successfully!")
-            except Exception as e:  # any failure in the whole train/evaluate/plot sequence ends up here
-                st.error(f"An error occurred during training: {e}")
-    if st.session_state.model is not None:  # expects the 'model' key to exist in session state already
-        st.subheader("💾 Save Model")
-        model_filename = st.text_input("Enter Model Filename (without extension)", "trained_model")
-        if st.button("Save Model"):
-            try:
-                joblib.dump(st.session_state.model, f"{model_filename}.joblib")  # NOTE(review): the text-box value is used as a path unchecked, relative to the process working directory
-                st.success(f"Model saved as {model_filename}.joblib")
-            except Exception as e:
-                st.error(f"Error saving model: {e}")
-    else:
-        st.warning("No trained model available. Train a model first to enable saving.")
-# Insights Section: explainability views for the model stored in session state by the training page
-elif app_mode == "Insights":
-    st.title("📊 Model Insights & Explainability")
-    st.markdown("""
-        **Understand and Interpret Your Model** with advanced explainability tools and visualizations.
-        Gain deeper insights into model behavior and predictions.
-    """)
-    if 'model' not in st.session_state or st.session_state.model is None:
-        st.warning("Please train a model in the Model Training section first.")
         st.stop()
-    model = st.session_state.model.steps[-1][1]  # Get the trained model
-    preprocessor = st.session_state.model.steps[0][1]  # Get the preprocessor
-    # Model Summary
-    st.subheader("📝 Model Summary")
-    st.write(f"**Model Type:** {type(model).__name__}")
-    st.write(f"**Problem Type:** {'Regression' if hasattr(model, 'predict') else 'Classification'}")  # NOTE(review): every model the training page builds has predict, so this always prints 'Regression'
-    st.write(f"**Training Date:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")  # NOTE(review): this is the time the page is rendered, not a stored training time
-    # Feature Importance: only for estimators that expose feature_importances_
-    st.subheader("🔍 Feature Importance")
-    if hasattr(model, 'feature_importances_'):
-        importances = model.feature_importances_
-        feature_names = preprocessor.get_feature_names_out()  # names of the transformed columns, not of the raw ones
-        importance_df = pd.DataFrame({
-            'Feature': feature_names,
-            'Importance': importances
-        }).sort_values('Importance', ascending=False)
-        fig, ax = plt.subplots()
-        sns.barplot(x='Importance', y='Feature', data=importance_df.head(10), ax=ax)  # ten largest importances
-        ax.set_title('Top 10 Feature Importances')
-        st.pyplot(fig)
-    else:
-        st.info("Feature importance not available for this model type.")
-    # SHAP Values (opt-in through the checkbox)
-    st.subheader("📊 SHAP Values")
-    if st.checkbox("Calculate SHAP Values (Warning: May be slow for large datasets)"):
         try:
-            import shap
-            # Use KernelExplainer for models that don't have a built-in explainer
-            if not hasattr(model, 'predict'):  # NOTE(review): false for every model built here, and the branch itself calls model.predict, so it looks unreachable
-                explainer = shap.KernelExplainer(model.predict, st.session_state.X_train_selected[:100, :])  # Use a sample of training data
-                shap_values = explainer.shap_values(st.session_state.X_test_selected)
-                feature_names = preprocessor.get_feature_names_out()
-                # Summary Plot
-                st.write("### Summary Plot")
-                fig, ax = plt.subplots()
-                shap.summary_plot(shap_values, features=st.session_state.X_test_selected, feature_names=feature_names, show=False, plot_type="bar")  # Change to bar for a cleaner visualization
-                st.pyplot(fig)
-                # Force Plot for Individual Predictions
-                st.write("### Individual Prediction Explanation")
-                sample_idx = st.slider("Select Sample Index", 0, len(st.session_state.X_test_selected) - 1, 0)
-                fig, ax = plt.subplots()
-                shap.force_plot(explainer.expected_value, shap_values[sample_idx], st.session_state.X_test_selected[sample_idx],
-                               feature_names=feature_names, matplotlib=True, show=False)
-                st.pyplot(fig)
             else:
-                explainer = shap.TreeExplainer(model)  # NOTE(review): reached for every model, including non-tree ones; presumably fails there - confirm
-                shap_values = explainer.shap_values(st.session_state.X_test_selected)
-                feature_names = preprocessor.get_feature_names_out()
-                # Summary Plot
-                st.write("### Summary Plot")
-                fig, ax = plt.subplots()
-                shap.summary_plot(shap_values, features=st.session_state.X_test_selected, feature_names=feature_names, show=False, plot_type="bar")  # Change to bar for a cleaner visualization
-                st.pyplot(fig)
-                # Force Plot for Individual Predictions
-                st.write("### Individual Prediction Explanation")
-                sample_idx = st.slider("Select Sample Index", 0, len(st.session_state.X_test_selected) - 1, 0)
-                fig, ax = plt.subplots()
-                shap.force_plot(explainer.expected_value, shap_values[sample_idx], st.session_state.X_test_selected[sample_idx],
-                               feature_names=feature_names, matplotlib=True, show=False)
-                st.pyplot(fig)
-        except Exception as e:
-            st.error(f"SHAP calculation failed: {e}")
-    # Partial Dependence Plots
-    st.subheader("📈 Partial Dependence Plots")
-    if hasattr(model, 'predict'):  # true for every model built by the training page
-        feature_to_plot = st.selectbox("Select Feature for PDP", preprocessor.get_feature_names_out())
-        if st.button("Generate PDP"):
-            from sklearn.inspection import PartialDependenceDisplay
-            fig, ax = plt.subplots()
-            PartialDependenceDisplay.from_estimator(
-                model, st.session_state.X_test_selected,
-                features=[feature_to_plot],
-                feature_names=preprocessor.get_feature_names_out(),
-                ax=ax
             )
-            st.pyplot(fig)
-    # Model Performance Over Time (the history dict is rebuilt on every run, so the chart gets at most one point)
-    st.subheader("⏳ Model Performance Over Time")
-    if st.checkbox("Track Performance Over Time"):
-        performance_history = {
-            'timestamp': [],
-            'metric': [],
-            'value': []
-        }
-        if hasattr(model, 'predict'):
-            y_pred = model.predict(st.session_state.X_test_selected)
-            mse = mean_squared_error(st.session_state.y_test, y_pred)  # NOTE(review): MSE is computed whatever the problem type, classifiers included
-            performance_history['timestamp'].append(datetime.now())
-            performance_history['metric'].append('MSE')
-            performance_history['value'].append(mse)
-        performance_df = pd.DataFrame(performance_history)
-        st.line_chart(performance_df.set_index('timestamp'))
-    # Model Debugging
-    st.subheader("🐛 Model Debugging")
-    if st.checkbox("Enable Debug Mode"):
-        st.write("### Model Parameters")
-        st.json(model.get_params())  # NOTE(review): get_params is the scikit-learn API; presumably unavailable on the Keras network - confirm
-        st.write("### Training Data Summary")
-        st.write(f"Number of Samples: {st.session_state.X_train_selected.shape[0]}")
-        st.write(f"Number of Features: {st.session_state.X_train_selected.shape[1]}")
-    # Export Insights: a two-line PDF written next to the app; no download is offered to the browser
-    st.subheader("💾 Export Insights")
-    if st.button("Export Insights as PDF"):
-        try:
-            from fpdf import FPDF
-            pdf = FPDF()
-            pdf.add_page()
-            pdf.set_font("Arial", size=12)
-            pdf.cell(200, 10, txt="Model Insights Report", ln=True, align='C')
-            pdf.cell(200, 10, txt=f"Model Type: {type(model).__name__}", ln=True)
-            pdf.cell(200, 10, txt=f"Problem Type: {'Regression' if hasattr(model, 'predict') else 'Classification'}", ln=True)  # same always-'Regression' check as in the summary above
-            pdf.output("model_insights.pdf")
-            st.success("Insights exported successfully!")
         except Exception as e:
-            st.error(f"Export failed: {e}")
-# Predictions Section: single-row and CSV batch prediction with the model stored in session state
 elif app_mode == "Predictions":
-    st.title("🔮 Prediction Studio")
-    st.markdown("""
-        **Make Predictions** with your trained model and explore prediction explanations.
-        Generate batch predictions and export results.
-    """)
-    if 'model' not in st.session_state or st.session_state.model is None:
-        st.warning("Please train a model in the Model Training section first.")
         st.stop()
-    model = st.session_state.model.steps[-1][1]  # Get the trained model
-    preprocessor = st.session_state.model.steps[0][1]  # Get the preprocessor
-    # Single Prediction: one input widget per feature, pre-filled from cleaned_data
-    st.subheader("🎯 Single Prediction")
-    input_data = {}
-    feature_names = preprocessor.get_feature_names_out()  # transformed column names
-    for feature in feature_names:
-        if feature in st.session_state.cleaned_data.columns:  # NOTE(review): transformed names are compared with raw column names; presumably they rarely match (prefixes, one-hot suffixes) - confirm
-            if pd.api.types.is_numeric_dtype(st.session_state.cleaned_data[feature]):
-                input_data[feature] = st.number_input(f"Enter {feature}", value=st.session_state.cleaned_data[feature].mean())  # column mean as the default value
-            else:
-                input_data[feature] = st.selectbox(f"Select {feature}", st.session_state.cleaned_data[feature].unique())
-    if st.button("Make Prediction"):
-        try:
-            input_df = pd.DataFrame([input_data])  # single-row frame
-            input_processed = preprocessor.transform(input_df)
-            prediction = model.predict(input_processed)[0]
-            st.write(f"**Prediction:** {prediction}")
-            if hasattr(model, 'predict_proba'):
-                probabilities = model.predict_proba(input_processed)[0]
-                st.write("**Prediction Probabilities:**")
-                st.bar_chart(probabilities)
-            # SHAP Explanation
-            if st.checkbox("Show SHAP Explanation"):  # NOTE(review): nested inside the button branch; ticking it presumably reruns the script with the button no longer pressed - confirm
-                try:
-                    import shap
-                    # Use KernelExplainer or TreeExplainer, checking if the model has the property first
-                    if hasattr(model, 'predict'):  # true for every model built by the training page, so TreeExplainer is always chosen
-                        explainer = shap.TreeExplainer(model)
-                        shap_values = explainer.shap_values(input_processed)
-                    else:
-                        explainer = shap.KernelExplainer(model.predict, st.session_state.X_train_selected[:100, :])
-                        shap_values = explainer.shap_values(input_processed)
-                    st.write("### SHAP Values")
-                    fig, ax = plt.subplots()
-                    shap.force_plot(explainer.expected_value, shap_values, input_processed,
-                                   feature_names=feature_names, matplotlib=True, show=False)
-                    st.pyplot(fig)
-                except Exception as e:
-                    st.error(f"SHAP calculation failed: {e}")
-        except Exception as e:
-            st.error(f"Prediction failed: {e}")
-    # Batch Predictions: the uploaded CSV goes through the same fitted preprocessor
-    st.subheader("📂 Batch Predictions")
-    batch_file = st.file_uploader("Upload CSV for Batch Predictions", type=["csv"])
-    if batch_file is not None:
-        try:
-            batch_df = pd.read_csv(batch_file)
-            batch_processed = preprocessor.transform(batch_df)
-            batch_predictions = model.predict(batch_processed)
-            batch_df['Prediction'] = batch_predictions  # appended to the uploaded frame itself
-            if hasattr(model, 'predict_proba'):
-                probabilities = model.predict_proba(batch_processed)
-                for i in range(probabilities.shape[1]):  # one probability column per class index
-                    batch_df[f'Probability_Class_{i}'] = probabilities[:, i]
-            st.write("### Predictions Preview")
-            st.dataframe(batch_df.head())
-            # Download Predictions: the CSV is embedded in the page as a base64 data URI behind an HTML link
-            csv = batch_df.to_csv(index=False)
-            b64 = base64.b64encode(csv.encode()).decode()
-            href = f'<a href="data:file/csv;base64,{b64}" download="predictions.csv">Download Predictions CSV</a>'
-            st.markdown(href, unsafe_allow_html=True)
-        except Exception as e:
-            st.error(f"Batch prediction failed: {e}")
-# Prediction Analysis. NOTE(review): from here the code is at top level, outside the app_mode chain, so it runs in every mode; `model` is only assigned inside branches above
-st.subheader("📊 Prediction Analysis")
-if st.checkbox("Analyze Predictions"):
-    try:
-        y_pred = model.predict(st.session_state.X_test_selected)
-        y_test = st.session_state.y_test
-        if hasattr(model, 'predict'):  # predict was already called two lines up, so this is always true and the else branch looks dead
-            fig, ax = plt.subplots()
-            ax.scatter(y_test, y_pred)
-            ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)  # dashed y = x reference line
-            ax.set_xlabel('Actual')
-            ax.set_ylabel('Predicted')
-            ax.set_title('Actual vs Predicted')
-            st.pyplot(fig)
-        else:
-            conf_matrix = confusion_matrix(y_test, y_pred)  # NOTE(review): in the visible code confusion_matrix is imported only inside the training step - confirm it is in scope here
-            fig, ax = plt.subplots()
-            sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
-            ax.set_xlabel('Predicted Labels')
-            ax.set_ylabel('True Labels')
-            ax.set_title('Confusion Matrix')
-            st.pyplot(fig)
-    except Exception as e:  # also catches a missing `model` or missing session-state keys
-        st.error(f"Prediction analysis failed: {e}")
-# Prediction Export: header-only PDF (title, model type, problem type); no prediction values are written
-st.subheader("💾 Export Predictions")
-if st.button("Export Predictions as PDF"):
-    try:
-        from fpdf import FPDF
-        pdf = FPDF()
-        pdf.add_page()
-        pdf.set_font("Arial", size=12)
-        pdf.cell(200, 10, txt="Predictions Report", ln=True, align='C')
-        pdf.cell(200, 10, txt=f"Model Type: {type(model).__name__}", ln=True)
-        pdf.cell(200, 10, txt=f"Problem Type: {'Regression' if hasattr(model, 'predict') else 'Classification'}", ln=True)  # NOTE(review): evaluates to 'Regression' for any model that has predict
-        pdf.output("predictions_report.pdf")  # written relative to the process working directory
-        st.success("Predictions exported successfully!")
-    except Exception as e:
-        st.error(f"An unexpected error occurred: {e}")

+import streamlit as st
 import pandas as pd
+import numpy as np
+import plotly.express as px
+import plotly.graph_objects as go
 from sklearn.preprocessing import StandardScaler, LabelEncoder
 from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+from sklearn.metrics import accuracy_score, mean_squared_error
 from ydata_profiling import ProfileReport
 from streamlit_pandas_profiling import st_profile_report
 import joblib
+import shap
+from datetime import datetime
+# --------------------------
+# Page Configuration (the first Streamlit call in the script)
+# --------------------------
 st.set_page_config(
+    page_title="DataInsight Pro",
+    page_icon="🔮",
     layout="wide",
     initial_sidebar_state="expanded"
 )
+# --------------------------
+# Custom Styling: raw CSS injected through st.markdown. NOTE(review): the selectors depend on Streamlit's generated markup - confirm they still match
+# --------------------------
+st.markdown("""
+    <style>
+    .main {background-color: #f8f9fa;}
+    .sidebar .sidebar-content {background-color: #2c3e50;}
+    .stButton>button {background-color: #3498db; color: white;}
+    .stTextInput>div>div>input {border: 1px solid #3498db;}
+    .stSelectbox>div>div>select {border: 1px solid #3498db;}
+    .stSlider>div>div>div>div {background-color: #3498db;}
+    .metric {padding: 15px; background-color: white; border-radius: 10px; box-shadow: 0 2px 5px rgba(0,0,0,0.1);}
+    </style>
+""", unsafe_allow_html=True)  # unsafe_allow_html is what lets the <style> tag through
+# --------------------------
+# Session State Initialization: each key gets a None default only when it is not already present
+# --------------------------
 if 'raw_data' not in st.session_state:
     st.session_state.raw_data = None  # set by the Data Upload page once a file is read
 if 'cleaned_data' not in st.session_state:
     st.session_state.cleaned_data = None
 if 'model' not in st.session_state:
     st.session_state.model = None
+# --------------------------
+# Helper Functions
+# --------------------------
def generate_quality_report(df):
    """Summarise the quality of a DataFrame.

    Returns a dict with two keys:

    * ``'basic'``   -- row count, column count, total number of missing
      cells and number of duplicated rows.
    * ``'columns'`` -- one entry per column holding the dtype name, the
      distinct-value count and the missing count; numeric columns also
      carry their mean, standard deviation and number of zeros.
    """
    def _describe(series):
        # Statistics every column gets, whatever its dtype.
        summary = dict(
            type=str(series.dtype),
            unique=series.nunique(),
            missing=series.isna().sum(),
        )
        # Extra statistics that only make sense for numeric data.
        if pd.api.types.is_numeric_dtype(series):
            summary['mean'] = series.mean()
            summary['std'] = series.std()
            summary['zeros'] = (series == 0).sum()
        return summary

    n_rows, n_cols = df.shape
    return {
        'basic': {
            'rows': n_rows,
            'columns': n_cols,
            'missing': df.isna().sum().sum(),
            'duplicates': df.duplicated().sum(),
        },
        'columns': {name: _describe(df[name]) for name in df.columns},
    }
+# --------------------------
+# Sidebar Navigation
+# --------------------------
# Pages offered by the navigation selector, in display order.
_PAGES = ["Data Upload", "Data Cleaning", "EDA", "Model Training", "Predictions"]


def _pin_label(page_name):
    """Return the label shown in the selector for *page_name*."""
    return f"📌 {page_name}"


with st.sidebar:
    st.title("🔮 DataInsight Pro")
    # The page branches below compare app_mode against the plain page names.
    app_mode = st.selectbox("Navigation", _PAGES, format_func=_pin_label)
    st.markdown("---")
    st.markdown("Created by [Your Name]")
    st.markdown("v1.2 | © 2024")
+# --------------------------
+# Main App Pages
+# --------------------------
 if app_mode == "Data Upload":
+    st.title("📤 Data Upload & Profiling")
+    uploaded_file = st.file_uploader("Upload your dataset (CSV/XLSX)", type=["csv", "xlsx"])
     if uploaded_file:
+        try:
+            if uploaded_file.name.endswith('.csv'):
+                df = pd.read_csv(uploaded_file)
             else:
+                df = pd.read_excel(uploaded_file)
+            st.session_state.raw_data = df
+            col1, col2, col3 = st.columns(3)
+            with col1:
+                st.metric("Rows", df.shape[0])
+            with col2:
+                st.metric("Columns", df.shape[1])
+            with col3:
+                st.metric("Missing Values", df.isna().sum().sum())
+            with st.expander("Data Preview", expanded=True):
+                st.dataframe(df.head(10), use_container_width=True)
+            if st.button("Generate Full Profile Report"):
+                with st.spinner("Generating comprehensive analysis..."):
+                    pr = ProfileReport(df, explorative=True)
+                    st_profile_report(pr)
+        except Exception as e:
+            st.error(f"Error loading file: {str(e)}")
+elif app_mode == "Data Cleaning":
+    st.title("🧹 Smart Data Cleaning")
+    if st.session_state.raw_data is None:
+        st.warning("Please upload data first")
+        st.stop()
+    df = st.session_state.raw_data.copy()
+    # Missing Value Handling
+    with st.expander("🔍 Missing Values Treatment", expanded=True):
+        missing_cols = df.columns[df.isna().any()].tolist()
+        if missing_cols:
+            cols = st.multiselect("Select columns to handle", missing_cols)
+            method = st.selectbox("Imputation Method", [
+                "Drop Missing",
+                "Mean/Median",
+                "Custom Value"
+            ])
+            if st.button("Apply Treatment"):
+                if method == "Drop Missing":
+                    df = df.dropna(subset=cols)
+                elif method == "Mean/Median":
                     for col in cols:
+                        if pd.api.types.is_numeric_dtype(df[col]):
+                            df[col] = df[col].fillna(df[col].median())
+                st.session_state.cleaned_data = df
+                st.success("Missing values handled successfully!")
+        else:
+            st.success("No missing values found!")
+    # Data Type Conversion
+    with st.expander("🔄 Data Type Conversion"):
+        col_to_convert = st.selectbox("Select column", df.columns)
+        new_type = st.selectbox("New data type", [
+            "String", "Integer", "Float",
+            "Boolean", "Datetime"
         ])
+        if st.button("Convert"):
             try:
                 if new_type == "String":
                     df[col_to_convert] = df[col_to_convert].astype(str)
                 elif new_type == "Integer":
                     df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce').astype('Int64')
+                st.session_state.cleaned_data = df
+                st.success("Conversion successful!")
             except Exception as e:
+                st.error(f"Error: {str(e)}")
+    if st.session_state.cleaned_data is not None:
+        with st.expander("✨ Cleaned Data Preview"):
+            st.dataframe(st.session_state.cleaned_data.head(), use_container_width=True)
+elif app_mode == "EDA":
+    st.title("🔍 Exploratory Data Analysis")
+    if st.session_state.cleaned_data is None:
+        st.warning("Please clean your data first")
         st.stop()
+    df = st.session_state.cleaned_data
+    # Visualization Selector
     col1, col2 = st.columns([1, 3])
     with col1:
+        st.subheader("Visualization Setup")
+        plot_type = st.selectbox("Choose plot type", [
+            "Scatter Plot", "Histogram",
+            "Box Plot", "Correlation Matrix"
+        ])
+        x_axis = st.selectbox("X-Axis", df.columns)
+        y_axis = st.selectbox("Y-Axis", df.columns) if plot_type in ["Scatter Plot", "Box Plot"] else None
+        color_by = st.selectbox("Color By", [None] + df.columns.tolist())
     with col2:
+        st.subheader("Visualization")
+        try:
+            if plot_type == "Scatter Plot":
+                fig = px.scatter(df, x=x_axis, y=y_axis, color=color_by)
+            elif plot_type == "Histogram":
+                fig = px.histogram(df, x=x_axis, color=color_by)
+            elif plot_type == "Box Plot":
+                fig = px.box(df, x=x_axis, y=y_axis, color=color_by)
+            elif plot_type == "Correlation Matrix":
+                corr = df.select_dtypes(include=np.number).corr()
+                fig = px.imshow(corr, text_auto=True)
             st.plotly_chart(fig, use_container_width=True)
+        except Exception as e:
+            st.error(f"Visualization error: {str(e)}")
+elif app_mode == "Model Training":
+    st.title("🤖 Intelligent Model Training")
+    if st.session_state.cleaned_data is None:
+        st.warning("Please clean your data first")
         st.stop()
+    df = st.session_state.cleaned_data
+    # Model Setup
+    col1, col2 = st.columns(2)
+    with col1:
+        target = st.selectbox("Select Target Variable", df.columns)
+        problem_type = st.selectbox("Problem Type", ["Classification", "Regression"])
+    with col2:
+        features = st.multiselect("Select Features", df.columns.drop(target))
+        test_size = st.slider("Test Size", 0.1, 0.5, 0.2)
+    if st.button("Train Model"):
         try:
+            X = df[features]
+            y = df[target]
+            # Preprocessing
+            X = pd.get_dummies(X)
+            y = LabelEncoder().fit_transform(y) if problem_type == "Classification" else y
+            X_train, X_test, y_train, y_test = train_test_split(
+                X, y, test_size=test_size, random_state=42
+            )
+            # Model Training
+            if problem_type == "Classification":
+                model = RandomForestClassifier()
             else:
+                model = RandomForestRegressor()
+            model.fit(X_train, y_train)
+            st.session_state.model = model
+            # Evaluation
+            y_pred = model.predict(X_test)
+            if problem_type == "Classification":
+                accuracy = accuracy_score(y_test, y_pred)
+                st.metric("Accuracy", f"{accuracy:.2%}")
+            else:
+                mse = mean_squared_error(y_test, y_pred)
+                st.metric("MSE", f"{mse:.2f}")
+            # Feature Importance
+            fig = px.bar(
+                x=model.feature_importances_,
+                y=X.columns,
+                orientation='h',
+                title="Feature Importance"
             )
+            st.plotly_chart(fig, use_container_width=True)
         except Exception as e:
+            st.error(f"Training failed: {str(e)}")
 elif app_mode == "Predictions":
+    st.title("🔮 Predictive Analytics")
+    if st.session_state.model is None:
+        st.warning("Please train a model first")
         st.stop()
+    model = st.session_state.model
+    # Prediction Interface
+    col1, col2 = st.columns(2)
+    with col1:
+        st.subheader("Input Parameters")
+        input_data = {}
+        for feature in model.feature_names_in_:
+            input_data[feature] = st.number_input(feature)
+    with col2:
+        st.subheader("Prediction Result")
+        if st.button("Generate Prediction"):
+            try:
+                input_df = pd.DataFrame([input_data])
+                prediction = model.predict(input_df)[0]
+                st.metric("Predicted Value", prediction)
+                # SHAP Explanation
+                explainer = shap.TreeExplainer(model)
+                shap_values = explainer.shap_values(input_df)
+                fig = shap.force_plot(
+                    explainer.expected_value[0],
+                    shap_values[0],
+                    input_df.iloc[0],
+                    matplotlib=False
+                )
+                st.components.v1.html(shap.getjs() + fig.html(), height=300)
+            except Exception as e:
+                st.error(f"Prediction failed: {str(e)}")