Spaces:

CosmickVisions
/

Data-Vision

Sleeping

App Files Community

CosmickVisions committed on Mar 2

Commit

f859c5c

verified ·

1 Parent(s): 3fddd77

Update app.py

Browse files

Files changed (1) hide show

app.py +261 -559

app.py CHANGED Viewed

@@ -1,6 +1,24 @@
-import streamlit as st
 import pandas as pd
 import numpy as np
 import plotly.express as px
 from scipy import stats
 import plotly.colors as pc
@@ -27,16 +45,14 @@ from sklearn.svm import SVR, SVC
 from sklearn.feature_selection import SelectKBest
 from sklearn.experimental import enable_iterative_imputer
 from sklearn.impute import IterativeImputer
-from sklearn.neural_network import MLPRegressor
-from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, precision_score, recall_score, f1_score
 from sklearn.impute import KNNImputer, SimpleImputer
 from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder, LabelEncoder
 from sklearn.compose import ColumnTransformer
 from sklearn.pipeline import Pipeline
-from ydata_profiling import ProfileReport
-from streamlit_pandas_profiling import st_profile_report
 # Enhanced configuration
@@ -51,7 +67,12 @@ if 'raw_data' not in st.session_state:
     st.session_state.raw_data = None
 if 'cleaned_data' not in st.session_state:
     st.session_state.cleaned_data = None
 # Security: Set allowed file types
 ALLOWED_EXTENSIONS = {'csv', 'xlsx', 'parquet', 'feather'}
 MAX_FILE_SIZE_MB = 250  # 250MB limit
@@ -60,15 +81,15 @@ def validate_file(file):
     """Comprehensive file validation"""
     if not file:
         return False, "No file uploaded"
     extension = file.name.split('.')[-1].lower()
     if extension not in ALLOWED_EXTENSIONS:
         return False, f"Unsupported file type: {extension}"
     file_size_mb = file.size / (1024 * 1024)
     if file_size_mb > MAX_FILE_SIZE_MB:
         return False, f"File size exceeds {MAX_FILE_SIZE_MB}MB limit"
     return True, ""
 @st.cache_data(ttl=3600, show_spinner="Analyzing data quality...")
@@ -164,10 +185,10 @@ if app_mode == "Data Upload":
                     df = pd.read_parquet(uploaded_file)
                 elif uploaded_file.name.endswith('.feather'):
                     df = pd.read_feather(uploaded_file)
                 st.session_state.raw_data = df
                 st.success("Dataset loaded successfully!")
             except Exception as e:
                 st.error(f"Error loading file: {str(e)}")
                 st.stop()
@@ -181,7 +202,7 @@ if app_mode == "Data Upload":
         # Data Health Dashboard
         st.subheader("📊 Data Health Dashboard")
         report = enhanced_quality_report(df)
         col1, col2, col3, col4 = st.columns(4)
         col1.metric("Total Rows", report['basic_stats']['rows'])
         col2.metric("Total Columns", report['basic_stats']['columns'])
@@ -192,11 +213,11 @@ if app_mode == "Data Upload":
         with st.expander("🔍 Deep Column Analysis", expanded=True):
             selected_col = st.selectbox("Select column to inspect", df.columns)
             col_info = report['column_analysis'][selected_col]
             st.write(f"**Type:** {col_info['type']}")
             st.write(f"**Unique Values:** {col_info['unique']}")
             st.write(f"**Missing Values:** {col_info['missing']} ({col_info['missing']/len(df):.1%})")
             if pd.api.types.is_numeric_dtype(df[selected_col]):
                 st.write("**Distribution:**")
                 st.line_chart(df[selected_col])
@@ -218,7 +239,7 @@ if app_mode == "Data Upload":
                     recommendations.append(f"⚠️ Consider dropping {col} (>{50}% missing)")
                 if data['unique'] == len(df):
                     recommendations.append(f"🔍 Investigate {col} - potential unique identifier")
             if recommendations:
                 st.write("### Recommended Actions")
                 for rec in recommendations[:5]:  # Show top 5
@@ -234,7 +255,7 @@ if app_mode == "Data Upload":
         # Advanced Profiling
         if st.button("🚀 Generate Full Data Profile"):
             with st.spinner("Generating comprehensive report..."):
-                pr = ProfileReport(df, explorative=True)
                 st_profile_report(pr)
 elif app_mode == "Smart Cleaning":
@@ -266,7 +287,7 @@ elif app_mode == "Smart Cleaning":
         st.caption(f"Current Version: {st.session_state.current_version+1}/{len(st.session_state.data_versions)}")
         progress = (st.session_state.current_version + 1) / len(st.session_state.data_versions)
         st.progress(progress)
         col1, col2 = st.columns(2)
         with col1:
             if st.button("⏮️ Undo Last Action", disabled=st.session_state.current_version == 0):
@@ -281,7 +302,7 @@ elif app_mode == "Smart Cleaning":
     st.subheader("📊 Data Health Dashboard")
     with st.expander("Show Comprehensive Data Report", expanded=True):
         from pandas_profiling import ProfileReport
-        pr = ProfileReport(df, explorative=True)
         st_profile_report(pr)
     # Enhanced Health Summary with Cards
@@ -301,11 +322,11 @@ elif app_mode == "Smart Cleaning":
     st.markdown("### 📈 Data Health Visualizations")
     col1, col2 = st.columns(2)
     with col1:
-        st.plotly_chart(px.bar(df.isna().sum(), title="Missing Values per Column",
-                         labels={'index': 'Column', 'value': 'Missing Count'},
                          color=df.isna().sum(), color_continuous_scale="Bluered"))
     with col2:
-        st.plotly_chart(px.pie(values=df.dtypes.value_counts(), names=df.dtypes.value_counts().index,
                               title="Data Type Distribution", hole=0.3))
     # Cleaning Operations with Tabs
@@ -319,15 +340,15 @@ elif app_mode == "Smart Cleaning":
         if missing_cols:
             st.write("Columns with missing values:")
             cols = st.multiselect("Select columns to clean", missing_cols, default=missing_cols)
             method = st.radio("Imputation Method", [
-                "Drop Missing",
-                "Mean/Median/Mode",
-                "KNN Imputation",
-                "MICE Imputation",
                 "Deep Learning Imputation"
             ], horizontal=True)
             if st.button(f"Apply {method}"):
                 try:
                     original_df = df.copy()
@@ -348,7 +369,7 @@ elif app_mode == "Smart Cleaning":
             st.plotly_chart(px.histogram(df, x=df.duplicated(), title="Duplicate Distribution"))
             dup_strategy = st.radio("Duplicate Strategy", [
                 "Remove All Duplicates",
-                "Keep First Occurrence",
                 "Keep Last Occurrence"
             ])
             if st.button("Handle Duplicates"):
@@ -373,7 +394,7 @@ elif app_mode == "Smart Cleaning":
         with col2:
             col_to_convert = st.selectbox("Select column to convert", df.columns)
             new_type = st.selectbox("New Data Type", [
-                "String", "Integer", "Float",
                 "Boolean", "Datetime", "Category"
             ])
             if st.button("Convert Data Type"):
@@ -404,27 +425,27 @@ elif app_mode == "Smart Cleaning":
     if st.button("💾 Save Cleaned Data"):
         st.session_state.cleaned_data = df
         st.balloons()
         # Generate comprehensive report
         from pandas_profiling import ProfileReport
         pr = ProfileReport(df, title="Cleaned Data Report")
         st_profile_report(pr)
         # Show cleaning log with diffs
         st.subheader("📝 Cleaning Log")
         st.table(pd.DataFrame({
             "Step": range(1, len(cleaning_actions)+1),
             "Action": cleaning_actions
         }))
         # Show dataset comparison
         col1, col2 = st.columns(2)
         with col1:
             st.write("Original Data Shape:", st.session_state.raw_data.shape)
         with col2:
             st.write("Cleaned Data Shape:", df.shape)
-        st.success("✅ Cleaned data saved successfully! You can now proceed to analysis.")
 elif app_mode == "Advanced EDA":
     st.title("🔍 Advanced Exploratory Data Analysis")
     st.markdown("""
@@ -458,7 +479,7 @@ elif app_mode == "Advanced EDA":
     with col1:
         st.header("📊 Visualization Setup")
         # Plot Type Selection
         plot_types = {
             "Distribution": ["Histogram", "Box Plot", "Violin Plot", "Density Plot"],
@@ -466,7 +487,7 @@ elif app_mode == "Advanced EDA":
             "Comparison": ["Bar Chart", "Pie Chart", "Parallel Coordinates"],
             "3D": ["3D Scatter", "3D Surface"]
         }
         selected_category = st.selectbox("Plot Category", list(plot_types.keys()))
         st.session_state.eda_config['plot_type'] = st.selectbox(
             "Plot Type",
@@ -475,28 +496,28 @@ elif app_mode == "Advanced EDA":
         # Dynamic Column Selectors
         plot_type = st.session_state.eda_config['plot_type']
         if plot_type in ["Histogram", "Box Plot", "Violin Plot", "Density Plot", "Bar Chart", "Pie Chart"]:
             st.session_state.eda_config['x_col'] = st.selectbox(
                 "X Axis",
                 df.columns,
-                index=df.columns.get_loc(st.session_state.eda_config['x_col'])
                 if st.session_state.eda_config['x_col'] in df.columns else 0
             )
         if plot_type in ["Scatter Plot", "Line Plot", "Box Plot", "Violin Plot", "Density Plot"]:
             st.session_state.eda_config['y_col'] = st.selectbox(
                 "Y Axis",
                 df.columns,
-                index=df.columns.get_loc(st.session_state.eda_config['y_col'])
                 if st.session_state.eda_config['y_col'] in df.columns else 0
             )
         if plot_type in ["3D Scatter", "3D Surface"]:
             st.session_state.eda_config['z_col'] = st.selectbox(
                 "Z Axis",
                 df.columns,
-                index=df.columns.get_loc(st.session_state.eda_config['z_col'])
                 if st.session_state.eda_config['z_col'] in df.columns else 0
             )
@@ -537,92 +558,92 @@ elif app_mode == "Advanced EDA":
     with col2:
         st.header("📈 Visualization")
         config = st.session_state.eda_config
         @st.cache_data(ttl=300)
         def generate_plot(df, plot_type, config):
             """Cached plot generation function for better performance"""
             try:
                 if plot_type == "Histogram":
                     return px.histogram(
-                        df, x=config['x_col'],
                         color=config['color_col'],
                         nbins=30,
                         color_discrete_sequence=[config['color_palette']]
                     )
                 elif plot_type == "Scatter Plot":
                     return px.scatter(
                         df, x=config['x_col'], y=config['y_col'],
                         color=config['color_col'],
                         hover_data=config['hover_data_cols']
                     )
                 elif plot_type == "Box Plot":
                     return px.box(
                         df, x=config['x_col'], y=config['y_col'],
                         color=config['color_col']
                     )
                 elif plot_type == "Violin Plot":
                     return px.violin(
                         df, x=config['x_col'], y=config['y_col'],
                         color=config['color_col'],
                         box=True
                     )
                 elif plot_type == "Heatmap":
                     numeric_df = df.select_dtypes(include=np.number)
                     corr = numeric_df.corr()
                     return px.imshow(
-                        corr,
                         text_auto=True,
                         color_continuous_scale=config['color_palette']
                     )
                 elif plot_type == "3D Scatter":
                     return px.scatter_3d(
                         df, x=config['x_col'], y=config['y_col'], z=config['z_col'],
                         color=config['color_col']
                     )
                 elif plot_type == "Bar Chart":
                     return px.bar(
                         df, x=config['x_col'], y=config['y_col'],
                         color=config['color_col']
                     )
                 elif plot_type == "Pie Chart":
                     return px.pie(
                         df, names=config['x_col'], values=config['y_col'],
                         color_discrete_sequence=[config['color_palette']]
                     )
                 elif plot_type == "Line Plot":
                     return px.line(
                         df, x=config['x_col'], y=config['y_col'],
                         color=config['color_col']
                     )
                 elif plot_type == "Pair Plot":
                     numeric_cols = df.select_dtypes(include=np.number).columns
                     return px.scatter_matrix(
                         df[numeric_cols],
                         color=config['color_col']
                     )
                 elif plot_type == "Parallel Coordinates":
                     numeric_df = df.select_dtypes(include=np.number)
                     return px.parallel_coordinates(
                         numeric_df,
                         color_continuous_scale=config['color_palette']
                     )
                 elif plot_type == "Density Plot":
                     return px.density_contour(
                         df, x=config['x_col'], y=config['y_col'],
                         color=config['color_col']
                     )
             except Exception as e:
                 st.error(f"Plot generation error: {str(e)}")
                 return None
@@ -631,18 +652,18 @@ elif app_mode == "Advanced EDA":
         fig = generate_plot(df, plot_type, config)
         if fig:
             st.plotly_chart(fig, use_container_width=True)
             # Plot Statistics
             with st.expander("📊 Plot Statistics"):
                 if plot_type in ["Histogram", "Box Plot", "Violin Plot"]:
                     st.write(f"**{config['x_col']} Statistics**")
                     st.table(df[config['x_col']].describe())
                 if plot_type in ["Scatter Plot", "Line Plot"]:
                     st.write(f"**Correlation between {config['x_col']} and {config['y_col']}**")
                     corr = df[[config['x_col'], config['y_col']]].corr().iloc[0,1]
                     st.metric("Pearson Correlation", f"{corr:.2f}")
                 if plot_type == "Heatmap":
                     st.write("**Correlation Matrix**")
                     numeric_df = df.select_dtypes(include=np.number)
@@ -656,16 +677,16 @@ elif app_mode == "Advanced EDA":
             st.write("**Data Shape**")
             st.write(f"Rows: {df.shape[0]}")
             st.write(f"Columns: {df.shape[1]}")
         with col2:
             st.write("**Data Types**")
             st.dataframe(df.dtypes.reset_index().rename(columns={
                 'index': 'Column', 0: 'Type'
             }))
         st.write("**Sample Data**")
         st.dataframe(df.head())
 # Model Training Section
 elif app_mode == "Model Training":
     st.title("🚂 Model Training Studio")
@@ -714,42 +735,45 @@ elif app_mode == "Model Training":
         model_options = ["Logistic Regression", "Decision Tree", "Random Forest", "Gradient Boosting", "SVM", "Neural Network", "KNN", "Naive Bayes"]
     model_name = st.selectbox("Select Model", model_options, help="Choose a model.")
-    # Hyperparameter Tuning
-    st.subheader("🎛️ Hyperparameter Tuning")
-    with st.expander("Configure Hyperparameters", expanded=True):
-        if model_name == "Random Forest":
-            n_estimators = st.slider("Number of Estimators", 10, 200, 100)
-            max_depth = st.slider("Max Depth", 3, 20, 10)
-            min_samples_split = st.slider("Min Samples Split", 2, 10, 2)
-            min_samples_leaf = st.slider("Min Samples Leaf", 1, 10, 1)
-            hyperparams = {
-                'n_estimators': n_estimators,
-                'max_depth': max_depth,
-                'min_samples_split': min_samples_split,
-                'min_samples_leaf': min_samples_leaf
-            }
-        elif model_name == "Gradient Boosting":
-            learning_rate = st.slider("Learning Rate", 0.01, 1.0, 0.1)
-            n_estimators = st.slider("Number of Estimators", 10, 200, 100)
-            max_depth = st.slider("Max Depth", 3, 20, 10)
-            hyperparams = {
-                'learning_rate': learning_rate,
-                'n_estimators': n_estimators,
-                'max_depth': max_depth
-            }
-        elif model_name == "Neural Network":
-            hidden_layers = st.slider("Number of Hidden Layers", 1, 5, 2)
-            neurons_per_layer = st.slider("Neurons per Layer", 10, 200, 50)
-            epochs = st.slider("Epochs", 10, 200, 50)
-            batch_size = st.slider("Batch Size", 16, 128, 32)
-            hyperparams = {
-                'hidden_layers': hidden_layers,
-                'neurons_per_layer': neurons_per_layer,
-                'epochs': epochs,
-                'batch_size': batch_size
-            }
-        else:
-            hyperparams = {}
     # Train-Test Split
     st.subheader("✂️ Train-Test Split")
@@ -819,39 +843,69 @@ elif app_mode == "Model Training":
                     else:
                         model = SVC()
                 elif model_name == "Neural Network":
-                    if problem_type == "Regression":
-                        model = MLPRegressor(
-                            hidden_layer_sizes=[hyperparams['neurons_per_layer']] * hyperparams['hidden_layers'],
-                            max_iter=hyperparams['epochs'],
-                            batch_size=hyperparams['batch_size']
-                        )
-                    else:
-                        model = MLPClassifier(
-                            hidden_layer_sizes=[hyperparams['neurons_per_layer']] * hyperparams['hidden_layers'],
-                            max_iter=hyperparams['epochs'],
-                            batch_size=hyperparams['batch_size']
-                        )
                 elif model_name == "KNN":
                     model = KNeighborsClassifier()
                 elif model_name == "Naive Bayes":
                     model = GaussianNB()
                 # Train the model
-                model.fit(X_train_processed, y_train)
                 # Store model and preprocessor
                 st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
                 st.session_state.preprocessor = preprocessor
-                # Store the test data
                 st.session_state.X_train_selected = X_train_processed
                 st.session_state.X_test_selected = X_test_processed
                 st.session_state.y_train = y_train
                 st.session_state.y_test = y_test
                 # Model Evaluation
-                y_pred = model.predict(X_test_processed)
                 if problem_type == "Regression":
                     mse = mean_squared_error(y_test, y_pred)
                     rmse = np.sqrt(mse)
                     mae = mean_absolute_error(y_test, y_pred)
@@ -860,7 +914,11 @@ elif app_mode == "Model Training":
                     st.write(f"Root Mean Squared Error: {rmse:.4f}")
                     st.write(f"Mean Absolute Error: {mae:.4f}")
                     st.write(f"R-squared: {r2:.4f}")
-                else:
                     accuracy = accuracy_score(y_test, y_pred)
                     precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
                     recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
@@ -871,6 +929,10 @@ elif app_mode == "Model Training":
                     st.write(f"F1 Score: {f1:.4f}")
                     st.write("Classification Report:")
                     st.text(classification_report(y_test, y_pred))
                 # Visualization
                 st.subheader("📊 Model Performance Visualization")
@@ -882,7 +944,33 @@ elif app_mode == "Model Training":
                     ax.set_ylabel('Predicted')
                     ax.set_title('Actual vs Predicted')
                     st.pyplot(fig)
-                else:
                     conf_matrix = confusion_matrix(y_test, y_pred)
                     fig, ax = plt.subplots()
                     sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
@@ -890,7 +978,6 @@ elif app_mode == "Model Training":
                     ax.set_ylabel('True Labels')
                     ax.set_title('Confusion Matrix')
                     st.pyplot(fig)
                 st.success("Model trained successfully!")
             except Exception as e:
                 st.error(f"An error occurred during training: {e}")
@@ -908,180 +995,6 @@ elif app_mode == "Model Training":
         st.warning("No trained model available. Train a model first to enable saving.")
-# Visualization Lab Section
-elif app_mode == "Visualization Lab":
-    st.title("🔬 Visualization Lab")
-    st.markdown("""
-        **Explore and Visualize Your Data** with advanced plotting tools and interactive visualizations.
-        Uncover hidden patterns and relationships in your data.
-    """)
-    if 'cleaned_data' not in st.session_state or st.session_state.cleaned_data is None:
-        st.warning("Please clean your data in the Smart Cleaning section first.")
-        st.stop()
-    df = st.session_state.cleaned_data.copy()
-    # Visualization Type Selection
-    st.subheader("📊 Choose Visualization Type")
-    plot_types = [
-        "Histogram", "Scatter Plot", "Box Plot", "Violin Plot",
-        "Correlation Heatmap", "Parallel Coordinates", "Andrews Curves",
-        "Pair Plot", "Density Contour", "3D Scatter", "Time Series",
-        "Sunburst Chart", "Funnel Chart", "Clustering Analysis"
-    ]
-    plot_type = st.selectbox("Select Visualization Type", plot_types)
-    # Dynamic Controls Based on Plot Type
-    if plot_type != "Correlation Heatmap":
-        x_col = st.selectbox("X Axis", df.columns)
-    if plot_type in ["Scatter Plot", "Box Plot", "Violin Plot", "Time Series", "3D Scatter", "Histogram"]:
-        y_col = st.selectbox("Y Axis", df.columns)
-    if plot_type == "3D Scatter":
-        z_col = st.selectbox("Z Axis", df.columns)
-        color_col = st.selectbox("Color by", [None] + list(df.columns))
-    # Advanced Plot Customization
-    with st.expander("🎨 Advanced Customization", expanded=False):
-        color_palette = st.selectbox("Color Palette", ["Viridis", "Plasma", "Magma", "Cividis", "RdBu", "Rainbow"])
-        hover_data_cols = st.multiselect("Hover Data", df.columns)
-    # Plot Generation
-    try:
-        fig = None
-        if plot_type == "Histogram":
-            fig = px.histogram(
-                df, x=x_col, y=y_col,
-                nbins=30, template="plotly_dark",
-                color_discrete_sequence=[color_palette]
-            )
-        elif plot_type == "Scatter Plot":
-            fig = px.scatter(
-                df, x=x_col, y=y_col,
-                color=color_col,
-                size=hover_data_cols,
-                hover_data=hover_data_cols
-            )
-        elif plot_type == "3D Scatter":
-            fig = px.scatter_3d(
-                df, x=x_col, y=y_col, z=z_col,
-                color=color_col,
-                color_discrete_sequence=[color_palette]
-            )
-        elif plot_type == "Correlation Heatmap":
-            numeric_df = df.select_dtypes(include=np.number)
-            if not numeric_df.empty:
-                corr = numeric_df.corr()
-                fig = px.imshow(
-                    corr, text_auto=True,
-                    color_continuous_scale=color_palette
-                )
-            else:
-                st.warning("No numerical columns found for correlation heatmap.")
-        elif plot_type == "Box Plot":
-            fig = px.box(
-                df, x=x_col, y=y_col,
-                color=color_col
-            )
-        elif plot_type == "Violin Plot":
-            fig = px.violin(
-                df, x=x_col, y=y_col,
-                box=True, points="all",
-                color=color_col
-            )
-        elif plot_type == "Time Series":
-            df = df.sort_values(by=x_col)
-            fig = px.line(
-                df, x=x_col, y=y_col,
-                color=color_col
-            )
-        elif plot_type == "Scatter Matrix":
-            fig = px.scatter_matrix(
-                df, dimensions=[x_col, y_col],
-                color=color_col
-            )
-        if fig:
-            st.plotly_chart(fig, use_container_width=True)
-    except Exception as e:
-        st.error(f"An error occurred while generating the plot: {e}")
-    # Statistical Analysis Section
-    with st.expander("📊 Statistical Analysis", expanded=True):
-        analysis_type = st.selectbox("Select Analysis Type", [
-            "Descriptive Statistics",
-            "Correlation Analysis",
-            "Hypothesis Testing",
-            "Distribution Fitting"
-        ])
-        if analysis_type == "Descriptive Statistics":
-            st.write(df.describe(include='all'))
-        elif analysis_type == "Correlation Analysis":
-            numeric_cols = df.select_dtypes(include=np.number).columns
-            if len(numeric_cols) >= 2:
-                corr_method = st.selectbox("Correlation Method", [
-                    "Pearson", "Kendall", "Spearman"
-                ])
-                corr_matrix = df[numeric_cols].corr(method=corr_method.lower())
-                st.write(corr_matrix)
-                st.heatmap(corr_matrix, annot=True, cmap=color_palette)
-            else:
-                st.warning("Need at least 2 numeric columns for correlation analysis")
-        elif analysis_type == "Hypothesis Testing":
-            test_type = st.selectbox("Select Test Type", [
-                "T-test", "Chi-Squared Test", "ANOVA", "Mann-Whitney U"
-            ])
-            if test_type == "T-test":
-                col1 = st.selectbox("Column 1 (Numeric)", df.select_dtypes(include=np.number).columns)
-                col2 = st.selectbox("Column 2 (Categorical)", df.select_dtypes(include='object').columns)
-                if st.button("Run T-test"):
-                    groups = df.groupby(col2)[col1].apply(list)
-                    if len(groups) == 2:
-                        t_stat, p_value = stats.ttest_ind(groups.iloc[0], groups.iloc[1])
-                        st.write(f"T-statistic: {t_stat:.4f}")
-                        st.write(f"P-value: {p_value:.4f}")
-                        if p_value < 0.05:
-                            st.write("Reject the null hypothesis.")
-                        else:
-                            st.write("Fail to reject the null hypothesis.")
-                    else:
-                        st.write("Select a categorical column with exactly two categories.")
-        elif analysis_type == "Distribution Fitting":
-            numeric_col = st.selectbox("Select Numeric Column", df.select_dtypes(include=np.number).columns)
-            dist_types = ["Normal", "Log-Normal", "Exponential", "Gamma"]
-            selected_dist = st.selectbox("Select Distribution Type", dist_types)
-            if st.button("Fit Distribution"):
-                from scipy.stats import norm, lognorm, expon, gamma
-                dist_functions = {
-                    "Normal": norm,
-                    "Log-Normal": lognorm,
-                    "Exponential": expon,
-                    "Gamma": gamma
-                }
-                params = dist_functions[selected_dist].fit(df[numeric_col].dropna())
-                st.write(f"Fitted Parameters: {params}")
-    # Data Profiling Section
-    with st.expander("📝 Generate Full Data Profile", expanded=False):
-        if st.button("🚀 Generate Comprehensive Report"):
-            with st.spinner("Generating report..."):
-                pr = ProfileReport(df, explorative=True)
-                st_profile_report(pr)
 # Insights Section
 elif app_mode == "Insights":
     st.title("📊 Model Insights & Explainability")
@@ -1112,7 +1025,7 @@ elif app_mode == "Insights":
             'Feature': feature_names,
             'Importance': importances
         }).sort_values('Importance', ascending=False)
         fig, ax = plt.subplots()
         sns.barplot(x='Importance', y='Feature', data=importance_df.head(10), ax=ax)
         ax.set_title('Top 10 Feature Importances')
@@ -1125,22 +1038,44 @@ elif app_mode == "Insights":
     if st.checkbox("Calculate SHAP Values (Warning: May be slow for large datasets)"):
         try:
             import shap
-            explainer = shap.TreeExplainer(model)
-            shap_values = explainer.shap_values(st.session_state.X_test_selected)
-            # Summary Plot
-            st.write("### Summary Plot")
-            fig, ax = plt.subplots()
-            shap.summary_plot(shap_values, st.session_state.X_test_selected, feature_names=preprocessor.get_feature_names_out())
-            st.pyplot(fig)
-            # Force Plot for Individual Predictions
-            st.write("### Individual Prediction Explanation")
-            sample_idx = st.slider("Select Sample Index", 0, len(st.session_state.X_test_selected)-1, 0)
-            fig, ax = plt.subplots()
-            shap.force_plot(explainer.expected_value, shap_values[sample_idx], st.session_state.X_test_selected[sample_idx],
-                           feature_names=preprocessor.get_feature_names_out(), matplotlib=True, show=False)
-            st.pyplot(fig)
         except Exception as e:
             st.error(f"SHAP calculation failed: {e}")
@@ -1152,8 +1087,8 @@ elif app_mode == "Insights":
             from sklearn.inspection import PartialDependenceDisplay
             fig, ax = plt.subplots()
             PartialDependenceDisplay.from_estimator(
-                model, st.session_state.X_test_selected,
-                features=[feature_to_plot],
                 feature_names=preprocessor.get_feature_names_out(),
                 ax=ax
             )
@@ -1167,14 +1102,14 @@ elif app_mode == "Insights":
             'metric': [],
             'value': []
         }
         if hasattr(model, 'predict'):
             y_pred = model.predict(st.session_state.X_test_selected)
             mse = mean_squared_error(st.session_state.y_test, y_pred)
             performance_history['timestamp'].append(datetime.now())
             performance_history['metric'].append('MSE')
             performance_history['value'].append(mse)
         performance_df = pd.DataFrame(performance_history)
         st.line_chart(performance_df.set_index('timestamp'))
@@ -1203,7 +1138,6 @@ elif app_mode == "Insights":
             st.success("Insights exported successfully!")
         except Exception as e:
             st.error(f"Export failed: {e}")
 # Predictions Section
 elif app_mode == "Predictions":
@@ -1236,9 +1170,9 @@ elif app_mode == "Predictions":
             input_df = pd.DataFrame([input_data])
             input_processed = preprocessor.transform(input_df)
             prediction = model.predict(input_processed)[0]
             st.write(f"**Prediction:** {prediction}")
             if hasattr(model, 'predict_proba'):
                 probabilities = model.predict_proba(input_processed)[0]
                 st.write("**Prediction Probabilities:**")
@@ -1248,14 +1182,20 @@ elif app_mode == "Predictions":
             if st.checkbox("Show SHAP Explanation"):
                 try:
                     import shap
-                    explainer = shap.TreeExplainer(model)
-                    shap_values = explainer.shap_values(input_processed)
                     st.write("### SHAP Values")
                     fig, ax = plt.subplots()
-                    shap.force_plot(explainer.expected_value, shap_values, input_processed,
                                    feature_names=feature_names, matplotlib=True, show=False)
                     st.pyplot(fig)
                 except Exception as e:
                     st.error(f"SHAP calculation failed: {e}")
@@ -1328,243 +1268,5 @@ elif app_mode == "Predictions":
             pdf.cell(200, 10, txt=f"Problem Type: {'Regression' if hasattr(model, 'predict') else 'Classification'}", ln=True)
             pdf.output("predictions_report.pdf")
             st.success("Predictions exported successfully!")
-        except Exception as e:
-            st.error(f"Export failed: {e}")
-# Neural Network Studio Section
-elif app_mode == "Neural Network Studio":
-    st.title("🧠 Neural Network Studio")
-    st.markdown("""
-        **Build and Train Neural Networks** with advanced configurations and visualizations.
-        Explore deep learning models with ease.
-    """)
-    if 'cleaned_data' not in st.session_state or st.session_state.cleaned_data is None:
-        st.warning("Please clean your data in the Smart Cleaning section first.")
-        st.stop()
-    df = st.session_state.cleaned_data.copy()
-    # Target Variable Selection
-    st.subheader("🎯 Target Variable")
-    target_column = st.selectbox("Select Target Variable", df.columns, help="Choose the column to predict.")
-    # Problem Type Selection
-    st.subheader("📝 Problem Type")
-    problem_type = st.radio("Select Problem Type", ["Regression", "Classification"], help="Choose the type of machine learning problem.")
-    # Feature Selection
-    st.subheader("🔧 Feature Selection")
-    use_all_features = st.checkbox("Use All Features", value=True, help="Select to use all features for training. Deselect to manually choose features.")
-    if use_all_features:
-        feature_columns = df.drop(columns=[target_column]).columns.tolist()
-    else:
-        feature_columns = st.multiselect("Select Feature Columns", df.drop(columns=[target_column]).columns, help="Choose the features you want to use for prediction.")
-    # Neural Network Configuration
-    st.subheader("⚙️ Neural Network Configuration")
-    with st.expander("Configure Neural Network", expanded=True):
-        col1, col2 = st.columns(2)
-        with col1:
-            hidden_layers = st.slider("Number of Hidden Layers", 1, 5, 2)
-            neurons_per_layer = st.slider("Neurons per Layer", 10, 200, 50)
-            activation = st.selectbox("Activation Function",
-                ["relu", "tanh", "sigmoid", "selu", "swish"])
-            dropout_rate = st.slider("Dropout Rate", 0.0, 0.5, 0.2)
-            initializer = st.selectbox("Weight Initializer",
-                ["glorot_uniform", "he_normal", "lecun_uniform"])
-        with col2:
-            learning_rate = st.slider("Learning Rate", 0.0001, 0.1, 0.001, format="%.4f")
-            optimizer_choice = st.selectbox("Optimizer",
-                ["Adam", "Nadam", "RMSprop", "SGD"])
-            batch_norm = st.checkbox("Batch Normalization", value=True)
-            regularization = st.checkbox("L2 Regularization")
-        epochs = st.slider("Epochs", 10, 200, 50)
-        batch_size = st.slider("Batch Size", 16, 128, 32)
-    # Train-Test Split
-    st.subheader("✂️ Train-Test Split")
-    test_size = st.slider("Test Size", 0.1, 0.5, 0.2, help="Proportion of the dataset to include in the test split.")
-    # Model Training
-    if st.button("🚀 Train Neural Network"):
-        with st.spinner("Training neural network..."):
-            try:
-                X = df[feature_columns]
-                y = df[target_column]
-                # Train-Test Split
-                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
-                # Preprocessing Pipeline
-                numeric_features = X.select_dtypes(include=np.number).columns
-                categorical_features = X.select_dtypes(exclude=np.number).columns
-                numeric_transformer = Pipeline(steps=[
-                    ('imputer', SimpleImputer(strategy='median')),
-                    ('scaler', StandardScaler())
-                ])
-                categorical_transformer = Pipeline(steps=[
-                    ('imputer', SimpleImputer(strategy='most_frequent')),
-                    ('onehot', OneHotEncoder(handle_unknown='ignore'))
-                ])
-                preprocessor = ColumnTransformer(
-                    transformers=[
-                        ('num', numeric_transformer, numeric_features),
-                        ('cat', categorical_transformer, categorical_features)
-                    ])
-                X_train_processed = preprocessor.fit_transform(X_train)
-                X_test_processed = preprocessor.transform(X_test)
-                # Build neural network with advanced features
-                model = keras.Sequential()
-                model.add(layers.Input(shape=(X_train_processed.shape[1],)))
-                for _ in range(hidden_layers):
-                    # Create configurable layers
-                    layer_config = {
-                        'units': neurons_per_layer,
-                        'activation': activation,
-                        'kernel_initializer': initializer
-                    }
-                    if regularization:
-                        layer_config['kernel_regularizer'] = keras.regularizers.l2(0.01)
-                    model.add(layers.Dense(**layer_config))
-                    if batch_norm:
-                        model.add(layers.BatchNormalization())
-                    if dropout_rate > 0:
-                        model.add(layers.Dropout(dropout_rate))
-                # Output layer
-                output_activation = 'linear' if problem_type == "Regression" else 'softmax'
-                output_units = 1 if problem_type == "Regression" else len(np.unique(y_train))
-                model.add(layers.Dense(output_units, activation=output_activation))
-                # Configure optimizer
-                optimizers = {
-                    "Adam": keras.optimizers.Adam(learning_rate=learning_rate),
-                    "Nadam": keras.optimizers.Nadam(learning_rate=learning_rate),
-                    "RMSprop": keras.optimizers.RMSprop(learning_rate=learning_rate),
-                    "SGD": keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9)
-                }
-                optimizer = optimizers[optimizer_choice]
-                # Compile the model
-                model.compile(optimizer=optimizer,
-                               loss='mse' if problem_type == "Regression" else 'sparse_categorical_crossentropy',
-                               metrics=['mae'] if problem_type == "Regression" else ['accuracy'])
-                # Add callbacks section
-                with st.expander("Advanced Training Options"):
-                    early_stopping = st.checkbox("Early Stopping", value=True)
-                    reduce_lr = st.checkbox("Reduce Learning Rate on Plateau")
-                    patience = st.slider("Patience Epochs", 5, 20, 10) if early_stopping else 0
-                callbacks_list = []
-                if early_stopping:
-                    callbacks_list.append(
-                        callbacks.EarlyStopping(patience=patience, restore_best_weights=True))
-                if reduce_lr:
-                    callbacks_list.append(
-                        callbacks.ReduceLROnPlateau(factor=0.2, patience=patience//2))
-                # Train the model with callbacks
-                history = model.fit(
-                    X_train_processed, y_train,
-                    epochs=epochs,
-                    batch_size=batch_size,
-                    validation_split=0.2,
-                    callbacks=callbacks_list,
-                    verbose=0
-                )
-                # Store model and preprocessor
-                st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
-                st.session_state.preprocessor = preprocessor
-                # Store the test data
-                st.session_state.X_train_selected = X_train_processed
-                st.session_state.X_test_selected = X_test_processed
-                st.session_state.y_train = y_train
-                st.session_state.y_test = y_test
-                # Model Evaluation
-                y_pred = model.predict(X_test_processed)
-                # Post-processing for classification
-                if problem_type == "Classification":
-                    y_pred = np.argmax(y_pred, axis=1)  # Convert probabilities to class labels
-                if problem_type == "Regression":
-                    mse = mean_squared_error(y_test, y_pred)
-                    rmse = np.sqrt(mse)
-                    mae = mean_absolute_error(y_test, y_pred)
-                    r2 = r2_score(y_test, y_pred)
-                    st.write(f"Mean Squared Error: {mse:.4f}")
-                    st.write(f"Root Mean Squared Error: {rmse:.4f}")
-                    st.write(f"Mean Absolute Error: {mae:.4f}")
-                    st.write(f"R-squared: {r2:.4f}")
-                else:
-                    accuracy = accuracy_score(y_test, y_pred)
-                    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
-                    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
-                    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
-                    st.write(f"Accuracy: {accuracy:.4f}")
-                    st.write(f"Precision: {precision:.4f}")
-                    st.write(f"Recall: {recall:.4f}")
-                    st.write(f"F1 Score: {f1:.4f}")
-                    st.write("Classification Report:")
-                    st.text(classification_report(y_test, y_pred))
-                # Visualization with multiple metrics
-                st.subheader("📊 Training History")
-                fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
-                # Plot loss
-                ax1.plot(history.history['loss'], label='Train Loss')
-                ax1.plot(history.history['val_loss'], label='Validation Loss')
-                ax1.set_title('Loss Evolution')
-                ax1.set_xlabel('Epoch')
-                ax1.set_ylabel('Loss')
-                ax1.legend()
-                # Plot accuracy/metric
-                if problem_type == "Classification":
-                    ax2.plot(history.history['accuracy'], label='Train Accuracy')
-                    ax2.plot(history.history['val_accuracy'], label='Validation Accuracy')
-                    ax2.set_title('Accuracy Evolution')
-                    ax2.set_ylabel('Accuracy')
-                else:
-                    ax2.plot(history.history['mae'], label='Train MAE')
-                    ax2.plot(history.history['val_mae'], label='Validation MAE')
-                    ax2.set_title('MAE Evolution')
-                    ax2.set_ylabel('MAE')
-                ax2.set_xlabel('Epoch')
-                ax2.legend()
-                st.pyplot(fig)
-                st.success("Neural network trained successfully!")
-            except Exception as e:
-                st.error(f"An error occurred during training: {e}")
-    # Model Saving
-    if st.session_state.model is not None:
-        st.subheader("💾 Save Model")
-        model_filename = st.text_input("Enter Model Filename (without extension)", "neural_network")
-        if st.button("Save Model"):
-            try:
-                # Save the entire Keras model including architecture and weights
-                st.session_state.model.named_steps['model'].save(f"{model_filename}.h5")  # Saves as a HDF5 file
-                st.success(f"Model saved as {model_filename}.h5")
-            except Exception as e:
-                st.error(f"Error saving model: {e}")

+import gradio as gr
+import numpy as np
 import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+import io
+import os
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import Dense, Dropout
+from sklearn.preprocessing import StandardScaler, LabelEncoder
+from sklearn.model_selection import train_test_split
+import re
+# BaseSettings lives in the separate pydantic-settings package since Pydantic v2
+from pydantic_settings import BaseSettings # Fix: import from pydantic_settings
+# ydata_profiling is the renamed successor of the pandas_profiling package
+from ydata_profiling import ProfileReport
+from streamlit_pandas_profiling import st_profile_report
+import streamlit as st
 import numpy as np
+import pandas as pd
 import plotly.express as px
 from scipy import stats
 import plotly.colors as pc
 from sklearn.feature_selection import SelectKBest
 from sklearn.experimental import enable_iterative_imputer
 from sklearn.impute import IterativeImputer
+from sklearn.neural_network import MLPRegressor, MLPClassifier
+from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
 from sklearn.impute import KNNImputer, SimpleImputer
 from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder, LabelEncoder
 from sklearn.compose import ColumnTransformer
 from sklearn.pipeline import Pipeline
+from datetime import datetime  # Import datetime
 # Enhanced configuration
     st.session_state.raw_data = None
 if 'cleaned_data' not in st.session_state:
     st.session_state.cleaned_data = None
+if 'model' not in st.session_state:
+    st.session_state.model = None
+if 'preprocessor' not in st.session_state:
+    st.session_state.preprocessor = None
 # Security: Set allowed file types
 ALLOWED_EXTENSIONS = {'csv', 'xlsx', 'parquet', 'feather'}
 MAX_FILE_SIZE_MB = 250  # 250MB limit
     """Comprehensive file validation"""
     if not file:
         return False, "No file uploaded"
     extension = file.name.split('.')[-1].lower()
     if extension not in ALLOWED_EXTENSIONS:
         return False, f"Unsupported file type: {extension}"
     file_size_mb = file.size / (1024 * 1024)
     if file_size_mb > MAX_FILE_SIZE_MB:
         return False, f"File size exceeds {MAX_FILE_SIZE_MB}MB limit"
     return True, ""
 @st.cache_data(ttl=3600, show_spinner="Analyzing data quality...")
                     df = pd.read_parquet(uploaded_file)
                 elif uploaded_file.name.endswith('.feather'):
                     df = pd.read_feather(uploaded_file)
                 st.session_state.raw_data = df
                 st.success("Dataset loaded successfully!")
             except Exception as e:
                 st.error(f"Error loading file: {str(e)}")
                 st.stop()
         # Data Health Dashboard
         st.subheader("📊 Data Health Dashboard")
         report = enhanced_quality_report(df)
         col1, col2, col3, col4 = st.columns(4)
         col1.metric("Total Rows", report['basic_stats']['rows'])
         col2.metric("Total Columns", report['basic_stats']['columns'])
         with st.expander("🔍 Deep Column Analysis", expanded=True):
             selected_col = st.selectbox("Select column to inspect", df.columns)
             col_info = report['column_analysis'][selected_col]
             st.write(f"**Type:** {col_info['type']}")
             st.write(f"**Unique Values:** {col_info['unique']}")
             st.write(f"**Missing Values:** {col_info['missing']} ({col_info['missing']/len(df):.1%})")
             if pd.api.types.is_numeric_dtype(df[selected_col]):
                 st.write("**Distribution:**")
                 st.line_chart(df[selected_col])
                     recommendations.append(f"⚠️ Consider dropping {col} (>{50}% missing)")
                 if data['unique'] == len(df):
                     recommendations.append(f"🔍 Investigate {col} - potential unique identifier")
             if recommendations:
                 st.write("### Recommended Actions")
                 for rec in recommendations[:5]:  # Show top 5
         # Advanced Profiling
         if st.button("🚀 Generate Full Data Profile"):
             with st.spinner("Generating comprehensive report..."):
+                pr = ProfileReport(df, explorative=True,title="Data Upload Report")  # Added title to pandas profiling
                 st_profile_report(pr)
 elif app_mode == "Smart Cleaning":
         st.caption(f"Current Version: {st.session_state.current_version+1}/{len(st.session_state.data_versions)}")
         progress = (st.session_state.current_version + 1) / len(st.session_state.data_versions)
         st.progress(progress)
         col1, col2 = st.columns(2)
         with col1:
             if st.button("⏮️ Undo Last Action", disabled=st.session_state.current_version == 0):
     st.subheader("📊 Data Health Dashboard")
     with st.expander("Show Comprehensive Data Report", expanded=True):
         from pandas_profiling import ProfileReport
+        pr = ProfileReport(df, title="Smart Cleaning Data Report")  # Add title to pandas profiling report
         st_profile_report(pr)
     # Enhanced Health Summary with Cards
     st.markdown("### 📈 Data Health Visualizations")
     col1, col2 = st.columns(2)
     with col1:
+        st.plotly_chart(px.bar(df.isna().sum(), title="Missing Values per Column",
+                         labels={'index': 'Column', 'value': 'Missing Count'},
                          color=df.isna().sum(), color_continuous_scale="Bluered"))
     with col2:
+        st.plotly_chart(px.pie(values=df.dtypes.value_counts(), names=df.dtypes.value_counts().index,
                               title="Data Type Distribution", hole=0.3))
     # Cleaning Operations with Tabs
         if missing_cols:
             st.write("Columns with missing values:")
             cols = st.multiselect("Select columns to clean", missing_cols, default=missing_cols)
             method = st.radio("Imputation Method", [
+                "Drop Missing",
+                "Mean/Median/Mode",
+                "KNN Imputation",
+                "MICE Imputation",
                 "Deep Learning Imputation"
             ], horizontal=True)
             if st.button(f"Apply {method}"):
                 try:
                     original_df = df.copy()
             st.plotly_chart(px.histogram(df, x=df.duplicated(), title="Duplicate Distribution"))
             dup_strategy = st.radio("Duplicate Strategy", [
                 "Remove All Duplicates",
+                "Keep First Occurrence",
                 "Keep Last Occurrence"
             ])
             if st.button("Handle Duplicates"):
         with col2:
             col_to_convert = st.selectbox("Select column to convert", df.columns)
             new_type = st.selectbox("New Data Type", [
+                "String", "Integer", "Float",
                 "Boolean", "Datetime", "Category"
             ])
             if st.button("Convert Data Type"):
     if st.button("💾 Save Cleaned Data"):
         st.session_state.cleaned_data = df
         st.balloons()
         # Generate comprehensive report
         from pandas_profiling import ProfileReport
         pr = ProfileReport(df, title="Cleaned Data Report")
         st_profile_report(pr)
         # Show cleaning log with diffs
         st.subheader("📝 Cleaning Log")
         st.table(pd.DataFrame({
             "Step": range(1, len(cleaning_actions)+1),
             "Action": cleaning_actions
         }))
         # Show dataset comparison
         col1, col2 = st.columns(2)
         with col1:
             st.write("Original Data Shape:", st.session_state.raw_data.shape)
         with col2:
             st.write("Cleaned Data Shape:", df.shape)
+        st.success("✅ Cleaned data saved successfully! You can now proceed to analysis.")
 elif app_mode == "Advanced EDA":
     st.title("🔍 Advanced Exploratory Data Analysis")
     st.markdown("""
     with col1:
         st.header("📊 Visualization Setup")
         # Plot Type Selection
         plot_types = {
             "Distribution": ["Histogram", "Box Plot", "Violin Plot", "Density Plot"],
             "Comparison": ["Bar Chart", "Pie Chart", "Parallel Coordinates"],
             "3D": ["3D Scatter", "3D Surface"]
         }
         selected_category = st.selectbox("Plot Category", list(plot_types.keys()))
         st.session_state.eda_config['plot_type'] = st.selectbox(
             "Plot Type",
         # Dynamic Column Selectors
         plot_type = st.session_state.eda_config['plot_type']
         if plot_type in ["Histogram", "Box Plot", "Violin Plot", "Density Plot", "Bar Chart", "Pie Chart"]:
             st.session_state.eda_config['x_col'] = st.selectbox(
                 "X Axis",
                 df.columns,
+                index=df.columns.get_loc(st.session_state.eda_config['x_col'])
                 if st.session_state.eda_config['x_col'] in df.columns else 0
             )
         if plot_type in ["Scatter Plot", "Line Plot", "Box Plot", "Violin Plot", "Density Plot"]:
             st.session_state.eda_config['y_col'] = st.selectbox(
                 "Y Axis",
                 df.columns,
+                index=df.columns.get_loc(st.session_state.eda_config['y_col'])
                 if st.session_state.eda_config['y_col'] in df.columns else 0
             )
         if plot_type in ["3D Scatter", "3D Surface"]:
             st.session_state.eda_config['z_col'] = st.selectbox(
                 "Z Axis",
                 df.columns,
+                index=df.columns.get_loc(st.session_state.eda_config['z_col'])
                 if st.session_state.eda_config['z_col'] in df.columns else 0
             )
     with col2:
         st.header("📈 Visualization")
         config = st.session_state.eda_config
         @st.cache_data(ttl=300)
         def generate_plot(df, plot_type, config):
             """Build the Plotly Express figure for the selected plot type.
             Results are cached by Streamlit for 300 seconds (see the
             ``st.cache_data`` decorator on this function).
             Args:
                 df: DataFrame to plot.
                 plot_type: Name of the plot to draw, e.g. "Histogram" or
                     "Scatter Plot"; compared against the string literals below.
                 config: Mapping of plot settings. Keys read here are
                     'x_col', 'y_col', 'z_col', 'color_col', 'color_palette'
                     and 'hover_data_cols'.
             Returns:
                 The figure returned by the matching ``px`` call, or None when
                 no branch matches ``plot_type`` or when an exception was
                 raised (the error is then reported through ``st.error``).
             """
             try:
                 if plot_type == "Histogram":
                     # Fixed 30 bins; the palette value is wrapped in a one-element list.
                     return px.histogram(
+                        df, x=config['x_col'],
                         color=config['color_col'],
                         nbins=30,
                         color_discrete_sequence=[config['color_palette']]
                     )
                 elif plot_type == "Scatter Plot":
                     return px.scatter(
                         df, x=config['x_col'], y=config['y_col'],
                         color=config['color_col'],
                         hover_data=config['hover_data_cols']
                     )
                 elif plot_type == "Box Plot":
                     return px.box(
                         df, x=config['x_col'], y=config['y_col'],
                         color=config['color_col']
                     )
                 elif plot_type == "Violin Plot":
                     return px.violin(
                         df, x=config['x_col'], y=config['y_col'],
                         color=config['color_col'],
                         box=True
                     )
                 elif plot_type == "Heatmap":
                     # Correlation matrix is computed over the numeric columns only.
                     numeric_df = df.select_dtypes(include=np.number)
                     corr = numeric_df.corr()
                     return px.imshow(
+                        corr,
                         text_auto=True,
                         color_continuous_scale=config['color_palette']
                     )
                 elif plot_type == "3D Scatter":
                     return px.scatter_3d(
                         df, x=config['x_col'], y=config['y_col'], z=config['z_col'],
                         color=config['color_col']
                     )
                 elif plot_type == "Bar Chart":
                     return px.bar(
                         df, x=config['x_col'], y=config['y_col'],
                         color=config['color_col']
                     )
                 elif plot_type == "Pie Chart":
                     # x_col supplies the slice names, y_col the slice values.
                     return px.pie(
                         df, names=config['x_col'], values=config['y_col'],
                         color_discrete_sequence=[config['color_palette']]
                     )
                 elif plot_type == "Line Plot":
                     return px.line(
                         df, x=config['x_col'], y=config['y_col'],
                         color=config['color_col']
                     )
                 elif plot_type == "Pair Plot":
                     # Only numeric columns are passed to the scatter matrix.
                     # NOTE(review): if color_col is non-numeric it is not in
                     # df[numeric_cols] -- confirm this case works as intended.
                     numeric_cols = df.select_dtypes(include=np.number).columns
                     return px.scatter_matrix(
                         df[numeric_cols],
                         color=config['color_col']
                     )
                 elif plot_type == "Parallel Coordinates":
                     # Only numeric columns are plotted; no color column is passed here.
                     numeric_df = df.select_dtypes(include=np.number)
                     return px.parallel_coordinates(
                         numeric_df,
                         color_continuous_scale=config['color_palette']
                     )
                 elif plot_type == "Density Plot":
                     return px.density_contour(
                         df, x=config['x_col'], y=config['y_col'],
                         color=config['color_col']
                     )
                 # A plot_type with no matching branch falls through and the
                 # function returns None implicitly.
             except Exception as e:
                 # Any failure while building the figure is shown in the UI
                 # instead of being raised to the caller.
                 st.error(f"Plot generation error: {str(e)}")
                 return None
         fig = generate_plot(df, plot_type, config)
         if fig:
             st.plotly_chart(fig, use_container_width=True)
             # Plot Statistics
             with st.expander("📊 Plot Statistics"):
                 if plot_type in ["Histogram", "Box Plot", "Violin Plot"]:
                     st.write(f"**{config['x_col']} Statistics**")
                     st.table(df[config['x_col']].describe())
                 if plot_type in ["Scatter Plot", "Line Plot"]:
                     st.write(f"**Correlation between {config['x_col']} and {config['y_col']}**")
                     corr = df[[config['x_col'], config['y_col']]].corr().iloc[0,1]
                     st.metric("Pearson Correlation", f"{corr:.2f}")
                 if plot_type == "Heatmap":
                     st.write("**Correlation Matrix**")
                     numeric_df = df.select_dtypes(include=np.number)
             st.write("**Data Shape**")
             st.write(f"Rows: {df.shape[0]}")
             st.write(f"Columns: {df.shape[1]}")
         with col2:
             st.write("**Data Types**")
             st.dataframe(df.dtypes.reset_index().rename(columns={
                 'index': 'Column', 0: 'Type'
             }))
         st.write("**Sample Data**")
         st.dataframe(df.head())
 # Model Training Section
 elif app_mode == "Model Training":
     st.title("🚂 Model Training Studio")
         model_options = ["Logistic Regression", "Decision Tree", "Random Forest", "Gradient Boosting", "SVM", "Neural Network", "KNN", "Naive Bayes"]
     model_name = st.selectbox("Select Model", model_options, help="Choose a model.")
+    elif model_name == "Gradient Boosting":
+                learning_rate = st.slider("Learning Rate", 0.01, 1.0, 0.1)
+                n_estimators = st.slider("Number of Estimators", 10, 200, 100)
+                max_depth = st.slider("Max Depth", 3, 20, 10)
+                hyperparams = {
+                    'learning_rate': learning_rate,
+                    'n_estimators': n_estimators,
+                    'max_depth': max_depth
+                }
+            elif model_name == "Neural Network":
+                hidden_layers = st.slider("Number of Hidden Layers", 1, 5, 2)
+                neurons_per_layer = st.slider("Neurons per Layer", 10, 200, 50)
+                activation = st.selectbox("Activation Function",
+                    ["relu", "tanh", "sigmoid", "selu", "swish"])
+                dropout_rate = st.slider("Dropout Rate", 0.0, 0.5, 0.2)
+                initializer = st.selectbox("Weight Initializer",
+                    ["glorot_uniform", "he_normal", "lecun_uniform"])
+                learning_rate = st.slider("Learning Rate", 0.0001, 0.1, 0.001, format="%.4f")
+                optimizer_choice = st.selectbox("Optimizer",
+                    ["Adam", "Nadam", "RMSprop", "SGD"])
+                batch_norm = st.checkbox("Batch Normalization", value=True)
+                regularization = st.checkbox("L2 Regularization")
+                epochs = st.slider("Epochs", 10, 200, 50)
+                batch_size = st.slider("Batch Size", 16, 128, 32)
+                hyperparams = {
+                    'hidden_layers': hidden_layers,
+                    'neurons_per_layer': neurons_per_layer,
+                    'activation': activation,
+                    'dropout_rate': dropout_rate,
+                    'initializer': initializer,
+                    'learning_rate': learning_rate,
+                    'optimizer_choice': optimizer_choice,
+                    'batch_norm': batch_norm,
+                    'regularization': regularization,
+                    'epochs': epochs,
+                    'batch_size': batch_size,
+                }
+            else:
+                hyperparams = {}
     # Train-Test Split
     st.subheader("✂️ Train-Test Split")
                     else:
                         model = SVC()
                 elif model_name == "Neural Network":
+                    from tensorflow.keras.models import Sequential
+                    from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
+                    from tensorflow.keras.optimizers import Adam, Nadam, RMSprop, SGD
+                    # Build a new model with the parameters
+                    model = Sequential()
+                    model.add(layers.Input(shape=(X_train_processed.shape[1],)))
+                    for i in range(hyperparams['hidden_layers']):
+                        model.add(Dense(hyperparams['neurons_per_layer'],
+                                         activation=hyperparams['activation'],
+                                         kernel_initializer=hyperparams['initializer']))
+                        if hyperparams['batch_norm']:
+                            model.add(BatchNormalization())
+                        model.add(Dropout(hyperparams['dropout_rate']))
+                    # Output layer
+                    output_activation = 'linear' if problem_type == "Regression" else 'softmax'
+                    output_units = 1 if problem_type == "Regression" else len(np.unique(y_train))
+                    model.add(Dense(output_units, activation=output_activation))
+                    # Configure optimizer
+                    optimizers = {
+                        "Adam": Adam(learning_rate=hyperparams['learning_rate']),
+                        "Nadam": Nadam(learning_rate=hyperparams['learning_rate']),
+                        "RMSprop": RMSprop(learning_rate=hyperparams['learning_rate']),
+                        "SGD": SGD(learning_rate=hyperparams['learning_rate'], momentum=0.9)
+                    }
+                    optimizer = optimizers[hyperparams['optimizer_choice']]
+                    model.compile(optimizer=optimizer,
+                                   loss='mse' if problem_type == "Regression" else 'sparse_categorical_crossentropy',
+                                   metrics=['mae'] if problem_type == "Regression" else ['accuracy'])
                 elif model_name == "KNN":
+                    from sklearn.neighbors import KNeighborsClassifier
                     model = KNeighborsClassifier()  # library default hyperparameters
                 elif model_name == "Naive Bayes":
+                    from sklearn.naive_bayes import GaussianNB
                     model = GaussianNB()  # library default hyperparameters
                 # Train the model: the neural network takes epochs/batch size and keeps the object returned by fit for plotting; every other model uses the plain fit(X, y) call
+                if model_name == "Neural Network":  # the returned history is read later when plotting the training curves
+                    history = model.fit(X_train_processed, y_train,
+                                        epochs=hyperparams['epochs'],
+                                        batch_size=hyperparams['batch_size'],
+                                        validation_data=(X_test_processed, y_test),  # NOTE(review): the test split doubles as validation data, so the validation curves are not independent of the reported test metrics
+                                        verbose=0)
+                else:
+                    model.fit(X_train_processed, y_train)
                 # Store model and preprocessor: the estimator fitted above is wrapped together with the preprocessor in a Pipeline (the Pipeline itself is not fit here)
                 st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
                 st.session_state.preprocessor = preprocessor
+                # Store the preprocessed train/test matrices and targets in session state for the Insights and Predictions sections
                 st.session_state.X_train_selected = X_train_processed
                 st.session_state.X_test_selected = X_test_processed
                 st.session_state.y_train = y_train
                 st.session_state.y_test = y_test
                 # Model Evaluation
                 if problem_type == "Regression":
+                    y_pred = model.predict(X_test_processed)
                     mse = mean_squared_error(y_test, y_pred)
                     rmse = np.sqrt(mse)
                     mae = mean_absolute_error(y_test, y_pred)
                     st.write(f"Root Mean Squared Error: {rmse:.4f}")
                     st.write(f"Mean Absolute Error: {mae:.4f}")
                     st.write(f"R-squared: {r2:.4f}")
+                else:  # Classification
+                    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
+                    y_pred = model.predict(X_test_processed)
+                    if model_name == "Neural Network":  # Neural network output probabilities
+                        y_pred = np.argmax(model.predict(X_test_processed), axis=1)
                     accuracy = accuracy_score(y_test, y_pred)
                     precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
                     recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
                     st.write(f"F1 Score: {f1:.4f}")
                     st.write("Classification Report:")
                     st.text(classification_report(y_test, y_pred))
+                    # confusion matrix
+                    st.write("Confusion Matrix:")
+                    conf_matrix = confusion_matrix(y_test, y_pred)
+                    st.write(conf_matrix)
                 # Visualization
                 st.subheader("📊 Model Performance Visualization")
                     ax.set_ylabel('Predicted')
                     ax.set_title('Actual vs Predicted')
                     st.pyplot(fig)
+                elif model_name == "Neural Network":  # NOTE(review): if the branch above this one (not visible here) handles problem_type == "Regression", the MAE branch below can never run - confirm
+                    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))  # ax1: loss curves, ax2: metric curves
+                    ax1.plot(history.history['loss'], label='Train Loss')
+                    ax1.plot(history.history['val_loss'], label='Validation Loss')
+                    ax1.set_title('Loss Evolution')
+                    ax1.set_xlabel('Epoch')
+                    ax1.set_ylabel('Loss')
+                    ax1.legend()
+                    # Right panel: per-epoch accuracy for classification, MAE otherwise; the history keys read here must match the metrics the model was compiled with
+                    if problem_type == "Classification":
+                        ax2.plot(history.history['accuracy'], label='Train Accuracy')
+                        ax2.plot(history.history['val_accuracy'], label='Validation Accuracy')
+                        ax2.set_title('Accuracy Evolution')
+                        ax2.set_ylabel('Accuracy')
+                    else:
+                        ax2.plot(history.history['mae'], label='Train MAE')
+                        ax2.plot(history.history['val_mae'], label='Validation MAE')
+                        ax2.set_title('MAE Evolution')
+                        ax2.set_ylabel('MAE')
+                    ax2.set_xlabel('Epoch')
+                    ax2.legend()
+                    st.pyplot(fig)
+                else: # Classification confusion matrix
+                    from sklearn.metrics import confusion_matrix
                     conf_matrix = confusion_matrix(y_test, y_pred)
                     fig, ax = plt.subplots()
                     sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
                     ax.set_ylabel('True Labels')
                     ax.set_title('Confusion Matrix')
                     st.pyplot(fig)
                 st.success("Model trained successfully!")
             except Exception as e:
                 st.error(f"An error occurred during training: {e}")
         st.warning("No trained model available. Train a model first to enable saving.")
 # Insights Section
 elif app_mode == "Insights":
     st.title("📊 Model Insights & Explainability")
             'Feature': feature_names,
             'Importance': importances
         }).sort_values('Importance', ascending=False)
         fig, ax = plt.subplots()
         sns.barplot(x='Importance', y='Feature', data=importance_df.head(10), ax=ax)
         ax.set_title('Top 10 Feature Importances')
     if st.checkbox("Calculate SHAP Values (Warning: May be slow for large datasets)"):
         try:
             import shap
+            # Use KernelExplainer for models that don't have a built-in explainer
+            if not hasattr(model, 'predict'):
+                explainer = shap.KernelExplainer(model.predict, st.session_state.X_train_selected[:100, :])  # Use a sample of training data
+                shap_values = explainer.shap_values(st.session_state.X_test_selected)
+                feature_names = preprocessor.get_feature_names_out()
+                # Summary Plot
+                st.write("### Summary Plot")
+                fig, ax = plt.subplots()
+                shap.summary_plot(shap_values, features=st.session_state.X_test_selected, feature_names=feature_names, show=False, plot_type="bar")  # Change to bar for a cleaner visualization
+                st.pyplot(fig)
+                # Force Plot for Individual Predictions
+                st.write("### Individual Prediction Explanation")
+                sample_idx = st.slider("Select Sample Index", 0, len(st.session_state.X_test_selected) - 1, 0)
+                fig, ax = plt.subplots()
+                shap.force_plot(explainer.expected_value, shap_values[sample_idx], st.session_state.X_test_selected[sample_idx],
+                               feature_names=feature_names, matplotlib=True, show=False)
+                st.pyplot(fig)
+            else:
+                explainer = shap.TreeExplainer(model)
+                shap_values = explainer.shap_values(st.session_state.X_test_selected)
+                feature_names = preprocessor.get_feature_names_out()
+                # Summary Plot
+                st.write("### Summary Plot")
+                fig, ax = plt.subplots()
+                shap.summary_plot(shap_values, features=st.session_state.X_test_selected, feature_names=feature_names, show=False, plot_type="bar")  # Change to bar for a cleaner visualization
+                st.pyplot(fig)
+                # Force Plot for Individual Predictions
+                st.write("### Individual Prediction Explanation")
+                sample_idx = st.slider("Select Sample Index", 0, len(st.session_state.X_test_selected) - 1, 0)
+                fig, ax = plt.subplots()
+                shap.force_plot(explainer.expected_value, shap_values[sample_idx], st.session_state.X_test_selected[sample_idx],
+                               feature_names=feature_names, matplotlib=True, show=False)
+                st.pyplot(fig)
         except Exception as e:
             st.error(f"SHAP calculation failed: {e}")
             from sklearn.inspection import PartialDependenceDisplay
             fig, ax = plt.subplots()
             PartialDependenceDisplay.from_estimator(
+                model, st.session_state.X_test_selected,
+                features=[feature_to_plot],
                 feature_names=preprocessor.get_feature_names_out(),
                 ax=ax
             )
             'metric': [],
             'value': []
         }
         if hasattr(model, 'predict'):
             y_pred = model.predict(st.session_state.X_test_selected)
             mse = mean_squared_error(st.session_state.y_test, y_pred)
             performance_history['timestamp'].append(datetime.now())
             performance_history['metric'].append('MSE')
             performance_history['value'].append(mse)
         performance_df = pd.DataFrame(performance_history)
         st.line_chart(performance_df.set_index('timestamp'))
             st.success("Insights exported successfully!")
         except Exception as e:
             st.error(f"Export failed: {e}")
 # Predictions Section
 elif app_mode == "Predictions":
             input_df = pd.DataFrame([input_data])
             input_processed = preprocessor.transform(input_df)
             prediction = model.predict(input_processed)[0]
             st.write(f"**Prediction:** {prediction}")
             if hasattr(model, 'predict_proba'):
                 probabilities = model.predict_proba(input_processed)[0]
                 st.write("**Prediction Probabilities:**")
             if st.checkbox("Show SHAP Explanation"):
                 try:
                     import shap
+                    # Use KernelExplainer or TreeExplainer, checking if the model has the property first
+                    if hasattr(model, 'predict'):
+                        explainer = shap.TreeExplainer(model)
+                        shap_values = explainer.shap_values(input_processed)
+                    else:
+                        explainer = shap.KernelExplainer(model.predict, st.session_state.X_train_selected[:100, :])
+                        shap_values = explainer.shap_values(input_processed)
                     st.write("### SHAP Values")
                     fig, ax = plt.subplots()
+                    shap.force_plot(explainer.expected_value, shap_values, input_processed,
                                    feature_names=feature_names, matplotlib=True, show=False)
                     st.pyplot(fig)
                 except Exception as e:
                     st.error(f"SHAP calculation failed: {e}")
             pdf.cell(200, 10, txt=f"Problem Type: {'Regression' if hasattr(model, 'predict') else 'Classification'}", ln=True)
             pdf.output("predictions_report.pdf")
             st.success("Predictions exported successfully!")
+except Exception as e:
+    st.error(f"An unexpected error occurred: {e}")