Update app.py
app.py
CHANGED
@@ -1,17 +1,14 @@
 import streamlit as st
 import pandas as pd
 import plotly.express as px
-import plotly.graph_objects as go
 import numpy as np
 from sklearn.model_selection import train_test_split
 from sklearn.neural_network import MLPClassifier, MLPRegressor
-from sklearn.cluster import KMeans
-from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
+from sklearn.cluster import KMeans
 from sklearn.metrics import accuracy_score, r2_score, silhouette_score, confusion_matrix, classification_report, mean_squared_error
 from sklearn.preprocessing import StandardScaler
 from ydata_profiling import ProfileReport
 from streamlit_pandas_profiling import st_profile_report
-from streamlit_lottie import st_lottie
 from groq import Groq
 from langchain_community.vectorstores import FAISS
 from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -21,11 +18,6 @@ from langchain_community.tools.tavily_search import TavilySearchResults
 import os
 from dotenv import load_dotenv
 import tempfile
-import datetime
-import time
-import matplotlib.pyplot as plt
-import shap
-import xgboost as xgb
 
 # Load environment variables
 load_dotenv()
@@ -149,102 +141,6 @@ st.markdown("""
     transform: translateY(-2px);
     box-shadow: 0 6px 16px rgba(59, 130, 246, 0.4);
 }
-/* Card styles */
-.card {
-    background: var(--white);
-    border-radius: 16px;
-    box-shadow: 0 4px 16px rgba(0,0,0,0.1);
-    padding: 20px;
-    margin-bottom: 25px;
-    transition: all 0.3s ease;
-}
-.card:hover {
-    box-shadow: 0 8px 24px rgba(0,0,0,0.15);
-    transform: translateY(-2px);
-}
-
-/* Step header styles */
-.step-header {
-    display: flex;
-    align-items: center;
-    margin-bottom: 15px;
-}
-.step-counter {
-    background: var(--primary-blue);
-    color: var(--white);
-    width: 36px;
-    height: 36px;
-    border-radius: 50%;
-    display: flex;
-    align-items: center;
-    justify-content: center;
-    font-weight: bold;
-    margin-right: 15px;
-    box-shadow: 0 4px 10px rgba(59, 130, 246, 0.3);
-}
-.step-title {
-    font-size: 1.5rem;
-    font-weight: 700;
-    color: var(--dark-blue);
-}
-
-/* Notification styles */
-.notification {
-    display: flex;
-    align-items: center;
-    background: #ECFDF5;
-    border-left: 4px solid #059669;
-    color: #065F46;
-    padding: 15px;
-    border-radius: 8px;
-    margin: 15px 0;
-    box-shadow: 0 2px 8px rgba(5, 150, 105, 0.1);
-    transition: transform 0.2s ease;
-}
-.notification:hover {
-    transform: translateY(-2px);
-}
-.notification-icon {
-    background: #059669;
-    color: white;
-    width: 24px;
-    height: 24px;
-    border-radius: 50%;
-    display: flex;
-    align-items: center;
-    justify-content: center;
-    margin-right: 15px;
-    font-weight: bold;
-}
-
-/* Metrics card styles */
-.metrics-card {
-    background: var(--white);
-    border-radius: 12px;
-    padding: 15px;
-    box-shadow: 0 2px 10px rgba(0,0,0,0.08);
-    text-align: center;
-    transition: all 0.3s ease;
-    height: 100%;
-    display: flex;
-    flex-direction: column;
-    justify-content: center;
-}
-.metrics-card:hover {
-    transform: translateY(-3px);
-    box-shadow: 0 6px 16px rgba(0,0,0,0.12);
-}
-.metrics-value {
-    font-size: 2rem;
-    font-weight: 700;
-    color: var(--primary-blue);
-    margin-bottom: 5px;
-}
-.metrics-label {
-    color: var(--medium-grey);
-    font-size: 0.9rem;
-    font-weight: 500;
-}
 </style>
 """, unsafe_allow_html=True)
 
@@ -338,932 +234,165 @@ def plot_clusters(X, labels):
     fig = px.scatter(X, x=X.columns[0], y=X.columns[1], color=labels, title="Cluster Visualization")
     return fig
 
-def plot_learning_curve(model, X, y, cv=5):
-    """Plot learning curve to show model performance with increasing data"""
-    from sklearn.model_selection import learning_curve
-
-    train_sizes, train_scores, test_scores = learning_curve(
-        model, X, y, cv=cv, scoring='accuracy' if hasattr(y, 'nunique') else 'r2',
-        n_jobs=-1, train_sizes=np.linspace(0.1, 1.0, 10))
-
-    train_mean = np.mean(train_scores, axis=1)
-    train_std = np.std(train_scores, axis=1)
-    test_mean = np.mean(test_scores, axis=1)
-    test_std = np.std(test_scores, axis=1)
-
-    # Create DataFrame for plotting
-    df_curve = pd.DataFrame({
-        'Training Size (%)': train_sizes / len(X) * 100,
-        'Training Score': train_mean,
-        'Training Upper': train_mean + train_std,
-        'Training Lower': train_mean - train_std,
-        'Testing Score': test_mean,
-        'Testing Upper': test_mean + test_std,
-        'Testing Lower': test_mean - test_std
-    })
-
-    # Create the plot
-    fig = go.Figure()
-
-    # Add training data with confidence interval
-    fig.add_trace(go.Scatter(
-        x=df_curve['Training Size (%)'],
-        y=df_curve['Training Score'],
-        mode='lines+markers',
-        name='Training Score',
-        line=dict(color='blue', width=2),
-        marker=dict(size=8)
-    ))
-    fig.add_trace(go.Scatter(
-        x=df_curve['Training Size (%)'],
-        y=df_curve['Training Upper'],
-        mode='lines',
-        line=dict(width=0),
-        showlegend=False
-    ))
-    fig.add_trace(go.Scatter(
-        x=df_curve['Training Size (%)'],
-        y=df_curve['Training Lower'],
-        mode='lines',
-        line=dict(width=0),
-        fill='tonexty',
-        fillcolor='rgba(0, 0, 255, 0.1)',
-        showlegend=False
-    ))
-
-    # Add testing data with confidence interval
-    fig.add_trace(go.Scatter(
-        x=df_curve['Training Size (%)'],
-        y=df_curve['Testing Score'],
-        mode='lines+markers',
-        name='Testing Score',
-        line=dict(color='red', width=2),
-        marker=dict(size=8)
-    ))
-    fig.add_trace(go.Scatter(
-        x=df_curve['Training Size (%)'],
-        y=df_curve['Testing Upper'],
-        mode='lines',
-        line=dict(width=0),
-        showlegend=False
-    ))
-    fig.add_trace(go.Scatter(
-        x=df_curve['Training Size (%)'],
-        y=df_curve['Testing Lower'],
-        mode='lines',
-        line=dict(width=0),
-        fill='tonexty',
-        fillcolor='rgba(255, 0, 0, 0.1)',
-        showlegend=False
-    ))
-
-    # Update layout
-    fig.update_layout(
-        title='Learning Curve',
-        xaxis_title='Training Set Size (%)',
-        yaxis_title='Score',
-        hovermode='x unified',
-        width=700,
-        height=400,
-        legend=dict(
-            orientation="h",
-            yanchor="bottom",
-            y=1.02,
-            xanchor="right",
-            x=1
-        )
-    )
-
-    return fig
-
-def plot_shap_summary(model, X):
-    """Create SHAP summary plot for model explainability"""
-    try:
-        # Create explainer based on model type
-        if hasattr(model, 'predict_proba'):
-            explainer = shap.Explainer(model)
-        else:
-            explainer = shap.Explainer(model)
-
-        # Calculate SHAP values
-        shap_values = explainer(X)
-
-        # Create the SHAP summary plot
-        plt.figure(figsize=(10, 8))
-        shap.summary_plot(shap_values, X, show=False)
-        fig = plt.gcf()
-        plt.tight_layout()
-
-        return fig
-    except Exception as e:
-        st.warning(f"Could not generate SHAP plot: {e}")
-        return None
-
-# Data Exploration and Insights Generation
-def generate_data_insights(df):
-    """Generate comprehensive insights about the dataset"""
-    insights = {}
-
-    # Basic statistics
-    insights['shape'] = df.shape
-    insights['missing_values'] = df.isna().sum().sum()
-    insights['duplicate_rows'] = df.duplicated().sum()
-
-    # Column types
-    insights['numeric_columns'] = list(df.select_dtypes(include=['number']).columns)
-    insights['categorical_columns'] = list(df.select_dtypes(include=['object', 'category', 'bool']).columns)
-    insights['datetime_columns'] = []
-    for col in df.columns:
-        try:
-            if pd.to_datetime(df[col], errors='coerce').notna().any():
-                insights['datetime_columns'].append(col)
-        except:
-            pass
-
-    # Distribution statistics
-    insights['skewed_columns'] = []
-    for col in insights['numeric_columns']:
-        if abs(df[col].skew()) > 1.0:
-            insights['skewed_columns'].append((col, df[col].skew()))
-
-    # Correlation analysis
-    if len(insights['numeric_columns']) > 1:
-        corr_matrix = df[insights['numeric_columns']].corr().abs()
-        corr_pairs = []
-        for i in range(len(corr_matrix.columns)):
-            for j in range(i):
-                if corr_matrix.iloc[i, j] > 0.7:  # Strong correlation threshold
-                    corr_pairs.append((corr_matrix.columns[i], corr_matrix.columns[j], corr_matrix.iloc[i, j]))
-        insights['correlated_features'] = sorted(corr_pairs, key=lambda x: x[2], reverse=True)
-
-    # Categorical feature analysis
-    insights['high_cardinality_features'] = []
-    for col in insights['categorical_columns']:
-        if df[col].nunique() > 10:
-            insights['high_cardinality_features'].append((col, df[col].nunique()))
-
-    # Missing value patterns
-    insights['missing_patterns'] = []
-    for col in df.columns:
-        missing_pct = df[col].isna().mean() * 100
-        if missing_pct > 0:
-            insights['missing_patterns'].append((col, missing_pct))
-
-    # Outlier detection
-    insights['outlier_columns'] = []
-    for col in insights['numeric_columns']:
-        Q1 = df[col].quantile(0.25)
-        Q3 = df[col].quantile(0.75)
-        IQR = Q3 - Q1
-        outliers_count = ((df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))).sum()
-        if outliers_count > 0:
-            insights['outlier_columns'].append((col, outliers_count, outliers_count/len(df)*100))
-
-    return insights
-
-# Enhanced Model Selection and Training Functions
-def get_model_options(problem_type):
-    """Get appropriate models for the selected problem type"""
-    if problem_type == "Classification":
-        return {
-            "Neural Network": MLPClassifier(max_iter=1000, random_state=42),
-            "Random Forest": RandomForestClassifier(random_state=42),
-            "Gradient Boosting": GradientBoostingClassifier(random_state=42),
-            "XGBoost": xgb.XGBClassifier(random_state=42)
-        }
-    elif problem_type == "Regression":
-        return {
-            "Neural Network": MLPRegressor(max_iter=1000, random_state=42),
-            "Random Forest": RandomForestRegressor(random_state=42),
-            "Gradient Boosting": GradientBoostingRegressor(random_state=42),
-            "XGBoost": xgb.XGBRegressor(random_state=42)
-        }
-    else:  # Clustering
-        return {
-            "K-Means": KMeans(random_state=42),
-            "DBSCAN": DBSCAN(),
-            "Agglomerative": AgglomerativeClustering()
-        }
-
-def train_model_with_optimization(model, X_train, X_test, y_train, y_test, problem_type, optimization_level="basic"):
-    """Train model with optional hyperparameter optimization"""
-    start_time = time.time()
-
-    if optimization_level == "none":
-        # Simple fit without optimization
-        model.fit(X_train, y_train)
-        best_model = model
-    elif optimization_level == "basic":
-        # Basic parameter grid
-        param_grid = {}
-
-        if problem_type in ["Classification", "Regression"]:
-            if isinstance(model, (RandomForestClassifier, RandomForestRegressor)):
-                param_grid = {
-                    'n_estimators': [100, 200],
-                    'max_depth': [None, 10, 20]
-                }
-            elif isinstance(model, (GradientBoostingClassifier, GradientBoostingRegressor)):
-                param_grid = {
-                    'n_estimators': [100, 200],
-                    'learning_rate': [0.01, 0.1]
-                }
-            elif isinstance(model, (MLPClassifier, MLPRegressor)):
-                param_grid = {
-                    'hidden_layer_sizes': [(100,), (100, 50)],
-                    'alpha': [0.0001, 0.001]
-                }
-            elif "XGB" in str(model.__class__):
-                param_grid = {
-                    'n_estimators': [100, 200],
-                    'learning_rate': [0.01, 0.1],
-                    'max_depth': [3, 6]
-                }
-        elif problem_type == "Clustering":
-            if isinstance(model, KMeans):
-                param_grid = {
-                    'n_clusters': [3, 4, 5, 6]
-                }
-            elif isinstance(model, DBSCAN):
-                param_grid = {
-                    'eps': [0.3, 0.5, 0.7],
-                    'min_samples': [5, 10, 15]
-                }
-            elif isinstance(model, AgglomerativeClustering):
-                param_grid = {
-                    'n_clusters': [3, 4, 5, 6],
-                    'linkage': ['ward', 'complete', 'average']
-                }
-
-        # Only run GridSearchCV if we have parameters to optimize
-        if param_grid:
-            if problem_type == "Clustering":
-                # For clustering, use silhouette score as the metric
-                from sklearn.metrics import make_scorer, silhouette_score
-                from sklearn.model_selection import GridSearchCV
-
-                # Custom scorer for clustering
-                def silhouette_scorer(estimator, X):
-                    labels = estimator.fit_predict(X)
-                    if len(set(labels)) <= 1:  # Check if all points are in one cluster
-                        return -1
-                    return silhouette_score(X, labels)
-
-                grid_search = GridSearchCV(
-                    estimator=model,
-                    param_grid=param_grid,
-                    scoring=make_scorer(silhouette_scorer),
-                    cv=3,
-                    n_jobs=-1
-                )
-                grid_search.fit(X_train)
-            else:
-                # For classification and regression
-                scoring = 'accuracy' if problem_type == "Classification" else 'r2'
-                grid_search = GridSearchCV(
-                    estimator=model,
-                    param_grid=param_grid,
-                    scoring=scoring,
-                    cv=5,
-                    n_jobs=-1
-                )
-                grid_search.fit(X_train, y_train)
-
-            best_model = grid_search.best_estimator_
-        else:
-            # If no param grid, just fit the model
-            model.fit(X_train, y_train)
-            best_model = model
-    else:  # Advanced optimization
-        # TODO: Implement advanced optimization with more parameters,
-        # RandomizedSearchCV or BayesianOptimization
-        pass
-
-    # Calculate training time
-    training_time = time.time() - start_time
-
-    # Get predictions for evaluation
-    if problem_type == "Clustering":
-        if hasattr(best_model, 'predict'):
-            y_pred = best_model.predict(X_test)
-        else:
-            y_pred = best_model.fit_predict(X_test)
-    else:
-        y_pred = best_model.predict(X_test)
-
-    return best_model, y_pred, training_time
-
 # Pages
 def data_upload_page():
-    "
-    st.
-
-    # Create a header with animation
-    col1, col2 = st.columns([1, 3])
-    with col1:
-        st_lottie(lottie_upload, height=150, key="upload_animation")
-    with col2:
-        st.markdown('<div class="step-header">', unsafe_allow_html=True)
-        st.markdown('<div class="step-counter">1</div>', unsafe_allow_html=True)
-        st.markdown('<div class="step-title">Data Upload & Exploratory Analysis</div>', unsafe_allow_html=True)
-        st.markdown('</div>', unsafe_allow_html=True)
-        st.markdown("Upload your dataset and get comprehensive insights before model training.")
-
-    # File uploader with enhanced UI
-    uploaded_file = st.file_uploader("Upload Dataset (CSV, Excel, or JSON)",
-                                     type=["csv", "xlsx", "json"],
-                                     help="Upload your data file to start analysis")
+    st.header("📤 Data Upload & Analysis")
+    uploaded_file = st.file_uploader("Upload Dataset", type=["csv"])
 
     if uploaded_file:
-
-
-
-
-            df = pd.read_csv(uploaded_file)
-        elif uploaded_file.name.endswith('xlsx'):
-            df = pd.read_excel(uploaded_file)
-        elif uploaded_file.name.endswith('json'):
-            df = pd.read_json(uploaded_file)
-
-        # Store in session state
-        st.session_state.df = df
-        st.session_state.vector_store = create_vector_store(convert_df_to_text(df))
-        st.session_state.metrics = {}
-
-        # Generate insights
-        st.session_state.dataset_insights = generate_data_insights(df)
-
-        # Success notification
-        if 'notification' not in st.session_state or st.session_state.notification is None:
-            st.session_state.notification = "Data successfully loaded! 🎉"
-
-        # Display a notification
-        st.markdown(f"""
-        <div class="notification">
-            <div class="notification-icon">✓</div>
-            <div>{st.session_state.notification}</div>
-        </div>
-        """, unsafe_allow_html=True)
-        st.session_state.notification = None
-
-        # Create tabs for different data views
-        data_tabs = st.tabs(["📊 Overview", "🔍 Data Explorer", "📈 Visualizations", "📋 Profile Report"])
-
-        with data_tabs[0]:
-            st.subheader("Dataset Overview")
-            col1, col2, col3 = st.columns(3)
-
-            with col1:
-                st.markdown('<div class="metrics-card">', unsafe_allow_html=True)
-                st.markdown(f'<div class="metrics-value">{df.shape[0]:,}</div>', unsafe_allow_html=True)
-                st.markdown('<div class="metrics-label">Total Samples</div>', unsafe_allow_html=True)
-                st.markdown('</div>', unsafe_allow_html=True)
-
-            with col2:
-                st.markdown('<div class="metrics-card">', unsafe_allow_html=True)
-                st.markdown(f'<div class="metrics-value">{df.shape[1]}</div>', unsafe_allow_html=True)
-                st.markdown('<div class="metrics-label">Features</div>', unsafe_allow_html=True)
-                st.markdown('</div>', unsafe_allow_html=True)
-
-            with col3:
-                missing_pct = df.isna().sum().sum() / (df.shape[0] * df.shape[1]) * 100
-                st.markdown('<div class="metrics-card">', unsafe_allow_html=True)
-                st.markdown(f'<div class="metrics-value">{missing_pct:.1f}%</div>', unsafe_allow_html=True)
-                st.markdown('<div class="metrics-label">Missing Values</div>', unsafe_allow_html=True)
-                st.markdown('</div>', unsafe_allow_html=True)
-
-            # Data Types Breakdown
-            st.subheader("Data Types")
-            dtype_counts = df.dtypes.value_counts().reset_index()
-            dtype_counts.columns = ['Data Type', 'Count']
-            fig = px.pie(dtype_counts, values='Count', names='Data Type', hole=0.4,
-                         color_discrete_sequence=px.colors.qualitative.Bold)
-            fig.update_layout(margin=dict(t=0, b=0, l=0, r=0), height=300)
-            st.plotly_chart(fig, use_container_width=True)
-
-            # Key Insights
-            st.subheader("Key Insights")
-            insights = st.session_state.dataset_insights
-
-            insight_cols = st.columns(2)
-
-            with insight_cols[0]:
-                st.markdown("**Data Quality Issues**")
-                if insights['missing_patterns']:
-                    st.markdown("🔹 **Missing Values:**")
-                    for col, pct in sorted(insights['missing_patterns'], key=lambda x: x[1], reverse=True)[:5]:
-                        st.markdown(f"  • *{col}*: {pct:.1f}% missing")
-
-                if insights['outlier_columns']:
-                    st.markdown("🔹 **Outliers Detected:**")
-                    for col, count, pct in sorted(insights['outlier_columns'], key=lambda x: x[2], reverse=True)[:5]:
-                        st.markdown(f"  • *{col}*: {count} outliers ({pct:.1f}%)")
-
-            with insight_cols[1]:
-                st.markdown("**Feature Relationships**")
-                if 'correlated_features' in insights and insights['correlated_features']:
-                    st.markdown("🔹 **Highly Correlated Features:**")
-                    for col1, col2, corr in insights['correlated_features'][:5]:
-                        st.markdown(f"  • *{col1}* & *{col2}*: {corr:.2f} correlation")
-
-                if insights['skewed_columns']:
-                    st.markdown("🔹 **Skewed Distributions:**")
-                    for col, skew in sorted(insights['skewed_columns'], key=lambda x: abs(x[1]), reverse=True)[:5]:
-                        direction = "right" if skew > 0 else "left"
-                        st.markdown(f"  • *{col}*: {direction}-skewed ({skew:.2f})")
-
-        with data_tabs[1]:
-            st.subheader("Interactive Data Explorer")
-
-            # Filter and search options
-            col1, col2 = st.columns([2, 3])
-            with col1:
-                search_term = st.text_input("Search columns", "")
-            with col2:
-                selected_dtypes = st.multiselect(
-                    "Filter by data type",
-                    options=['numeric', 'object', 'datetime', 'category', 'bool'],
-                    default=['numeric', 'object']
-                )
-
-            # Apply filters
-            filtered_cols = df.columns
-            if search_term:
-                filtered_cols = [col for col in filtered_cols if search_term.lower() in col.lower()]
-
-            if selected_dtypes:
-                dtype_map = {
-                    'numeric': 'number',
-                    'object': 'object',
-                    'datetime': 'datetime',
-                    'category': 'category',
-                    'bool': 'bool'
-                }
-                dtype_filtered = []
-                for dtype in selected_dtypes:
-                    dtype_filtered.extend(df.select_dtypes(include=[dtype_map[dtype]]).columns)
-                filtered_cols = [col for col in filtered_cols if col in dtype_filtered]
-
-            # Display filtered dataframe
-            if filtered_cols:
-                st.dataframe(df[filtered_cols], height=400)
-
-                # Column statistics
-                selected_column = st.selectbox("Select column for detailed statistics", options=filtered_cols)
-
-                col1, col2 = st.columns(2)
-                with col1:
-                    st.subheader(f"Statistics for: {selected_column}")
-                    if pd.api.types.is_numeric_dtype(df[selected_column]):
-                        stats = df[selected_column].describe()
-                        for stat, value in stats.items():
-                            st.markdown(f"**{stat}:** {value:.4f}")
-
-                        # Additional stats
-                        st.markdown(f"**Skewness:** {df[selected_column].skew():.4f}")
-                        st.markdown(f"**Kurtosis:** {df[selected_column].kurtosis():.4f}")
-                    else:
-                        st.markdown(f"**Unique Values:** {df[selected_column].nunique()}")
-                        st.markdown(f"**Most Common:** {df[selected_column].value_counts().index[0]}")
-                        st.markdown(f"**Least Common:** {df[selected_column].value_counts().index[-1]}")
-                        st.markdown(f"**Missing Values:** {df[selected_column].isna().sum()} ({df[selected_column].isna().mean()*100:.2f}%)")
-
-                with col2:
-                    st.subheader("Distribution")
-                    if pd.api.types.is_numeric_dtype(df[selected_column]):
-                        # Histogram for numeric
-                        fig = px.histogram(df, x=selected_column, histnorm='probability density',
-                                           marginal='box', color_discrete_sequence=['#3B82F6'])
-                        fig.update_layout(height=300, margin=dict(l=0, r=0, t=20, b=0))
-                    else:
-                        # Bar chart for categorical
-                        value_counts = df[selected_column].value_counts().reset_index()
-                        value_counts.columns = [selected_column, 'Count']
-                        value_counts = value_counts.head(15)  # Limit to top 15
-                        fig = px.bar(value_counts, x=selected_column, y='Count',
-                                     color_discrete_sequence=['#3B82F6'])
-                        fig.update_layout(height=300, margin=dict(l=0, r=0, t=20, b=0))
-
-                    st.plotly_chart(fig, use_container_width=True)
-            else:
-                st.warning("No columns match your filters.")
+        df = pd.read_csv(uploaded_file)
+        st.session_state.df = df
+        st.session_state.vector_store = create_vector_store(convert_df_to_text(df))
+        st.session_state.metrics = {}
 
-
-
-
-
-
-                options=["Scatter Plot", "Correlation Matrix", "Pair Plot", "Box Plot", "Violin Plot", "Line Chart"]
-            )
-
-            if viz_type == "Scatter Plot":
-                col1, col2, col3 = st.columns(3)
-                with col1:
-                    x_col = st.selectbox("X-axis", options=df.select_dtypes(include=['number']).columns)
-                with col2:
-                    y_col = st.selectbox("Y-axis", options=df.select_dtypes(include=['number']).columns,
-                                         index=min(1, len(df.select_dtypes(include=['number']).columns)-1))
-                with col3:
-                    color_col = st.selectbox("Color by", options=["None"] + list(df.columns), index=0)
-
-                # Create plot
-                if color_col == "None":
-                    fig = px.scatter(df, x=x_col, y=y_col, title=f"{x_col} vs {y_col}",
-                                     opacity=0.7, color_discrete_sequence=['#3B82F6'])
-                else:
-                    fig = px.scatter(df, x=x_col, y=y_col, color=color_col, title=f"{x_col} vs {y_col} by {color_col}",
-                                     opacity=0.7)
-
-                fig.update_layout(height=500)
-                st.plotly_chart(fig, use_container_width=True)
-
-                # Add regression line option
-                if st.checkbox("Add regression line"):
-                    fig = px.scatter(df, x=x_col, y=y_col, trendline="ols",
-                                     title=f"{x_col} vs {y_col} with Regression Line",
-                                     opacity=0.7, color_discrete_sequence=['#3B82F6'])
-                    fig.update_layout(height=500)
-                    st.plotly_chart(fig, use_container_width=True)
-
-            elif viz_type == "Correlation Matrix":
-                # Select columns for correlation
-                numeric_cols = df.select_dtypes(include=['number']).columns
-                selected_corr_cols = st.multiselect(
-                    "Select columns for correlation matrix",
-                    options=numeric_cols,
-                    default=list(numeric_cols)[:min(8, len(numeric_cols))]
-                )
-
-                if selected_corr_cols:
-                    # Correlation matrix
-                    corr = df[selected_corr_cols].corr()
-                    mask = np.triu(np.ones_like(corr, dtype=bool))
-
-                    # Create plot
-                    fig = px.imshow(corr, text_auto=True, color_continuous_scale='RdBu_r',
-                                    zmin=-1, zmax=1, aspect="auto")
-                    fig.update_layout(height=600)
-
-                    st.plotly_chart(fig, use_container_width=True)
-                else:
-                    st.warning("Please select at least one numeric column.")
-
-            elif viz_type == "Pair Plot":
-                # Select columns for pair plot
-                numeric_cols = df.select_dtypes(include=['number']).columns
-                selected_pair_cols = st.multiselect(
-                    "Select columns for pair plot (limit 4-5 for readability)",
-                    options=numeric_cols,
-                    default=list(numeric_cols)[:min(4, len(numeric_cols))]
-                )
-
-                if selected_pair_cols:
-                    if len(selected_pair_cols) > 6:
-                        st.warning("Too many columns may make the plot hard to read. Consider selecting 4-5 columns.")
-
-                    # Color option
-                    color_col = st.selectbox("Color by (categorical)",
-                                             options=["None"] + list(df.select_dtypes(exclude=['number']).columns),
-                                             index=0)
-
-                    # Create pair plot
-                    if color_col == "None":
-                        fig = px.scatter_matrix(df, dimensions=selected_pair_cols, opacity=0.7)
-                    else:
-                        fig = px.scatter_matrix(df, dimensions=selected_pair_cols, color=color_col, opacity=0.7)
-
-                    fig.update_layout(height=700)
-                    st.plotly_chart(fig, use_container_width=True)
-                else:
-                    st.warning("Please select at least one numeric column.")
-
-            elif viz_type == "Box Plot":
-                col1, col2 = st.columns(2)
-                with col1:
-                    y_col = st.selectbox("Value column", options=df.select_dtypes(include=['number']).columns)
-                with col2:
-                    x_col = st.selectbox("Category column",
-                                         options=["None"] + list(df.select_dtypes(exclude=['number']).columns),
-                                         index=0)
-
-                # Create plot
-                if x_col == "None":
-                    fig = px.box(df, y=y_col, title=f"Distribution of {y_col}",
-                                 color_discrete_sequence=['#3B82F6'])
-                else:
-                    fig = px.box(df, x=x_col, y=y_col, title=f"Distribution of {y_col} by {x_col}")
-
-                fig.update_layout(height=500)
-                st.plotly_chart(fig, use_container_width=True)
-
-            elif viz_type == "Violin Plot":
-                col1, col2 = st.columns(2)
-                with col1:
-                    y_col = st.selectbox("Value column", options=df.select_dtypes(include=['number']).columns, key="violin_y")
-                with col2:
-                    x_col = st.selectbox("Category column",
-                                         options=["None"] + list(df.select_dtypes(exclude=['number']).columns),
-                                         index=0, key="violin_x")
-
-                # Create plot
-                if x_col == "None":
-                    fig = px.violin(df, y=y_col, box=True, title=f"Distribution of {y_col}",
-                                    color_discrete_sequence=['#3B82F6'])
-                else:
-                    fig = px.violin(df, x=x_col, y=y_col, box=True, title=f"Distribution of {y_col} by {x_col}")
-
-                fig.update_layout(height=500)
-                st.plotly_chart(fig, use_container_width=True)
-
-            elif viz_type == "Line Chart":
-                # Identify potential date columns
-                date_cols = []
-                for col in df.columns:
-                    try:
-                        if pd.to_datetime(df[col], errors='coerce').notna().all():
-                            date_cols.append(col)
-                    except:
-                        pass
-
-                if date_cols:
-                    col1, col2 = st.columns(2)
-                    with col1:
-                        x_col = st.selectbox("Time axis", options=date_cols)
-                        # Convert to datetime if not already
-                        df[x_col] = pd.to_datetime(df[x_col])
-                    with col2:
-                        y_cols = st.multiselect("Value columns", options=df.select_dtypes(include=['number']).columns,
-                                                default=[df.select_dtypes(include=['number']).columns[0]])
-
-                    if y_cols:
-                        # Create line chart
-                        fig = go.Figure()
-                        for y_col in y_cols:
-                            fig.add_trace(go.Scatter(x=df[x_col], y=df[y_col], mode='lines', name=y_col))
-
-                        fig.update_layout(
-                            title=f"Time Series Plot",
-                            xaxis_title=x_col,
-                            yaxis_title="Values",
-                            height=500,
-                            legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
-                        )
-
-                        st.plotly_chart(fig, use_container_width=True)
-                    else:
-                        st.warning("Please select at least one value column.")
-                else:
-                    st.warning("No datetime columns detected. Please ensure you have columns with date/time values.")
+        st.subheader("Dataset Health Check")
+        col1, col2, col3 = st.columns(3)
+        col1.metric("Total Samples", df.shape[0])
+        col2.metric("Features", df.shape[1])
+        col3.metric("Missing Values", df.isna().sum().sum())
 
-
-        st.
-
-
-            with profile_options[0]:
-                minimal = st.checkbox("Minimal Report (Faster)", value=True)
-            with profile_options[1]:
-                sample_data = st.checkbox("Use Sample (Faster for large datasets)", value=True)
-            with profile_options[2]:
-                report_percent = st.slider("Sample Size %", min_value=10, max_value=100, value=50, step=10)
-
-            if st.button("Generate Profile Report"):
-                with st.spinner("Generating comprehensive profile report..."):
-                    if sample_data and len(df) > 1000:
-                        profile_df = df.sample(int(len(df) * report_percent/100))
-                    else:
-                        profile_df = df
-
-                    if minimal:
-                        profile = ProfileReport(profile_df, minimal=True, title="Dataset Profile Report")
-                    else:
-                        profile = ProfileReport(profile_df, explorative=True, title="Dataset Profile Report")
-
-                    st_profile_report(profile)
-
-    st.markdown('</div>', unsafe_allow_html=True)
+        if st.button("Generate Full EDA Report"):
+            with st.spinner("Generating comprehensive analysis..."):
+                profile = ProfileReport(df, explorative=True)
+                st_profile_report(profile)
 
 def model_training_page():
-    "
-    st.markdown('<div class="card">', unsafe_allow_html=True)
-
-    # Header with animation
-    col1, col2 = st.columns([1, 3])
-    with col1:
-        st_lottie(lottie_neural, height=150, key="neural_animation")
-    with col2:
-        st.markdown('<div class="step-header">', unsafe_allow_html=True)
-        st.markdown('<div class="step-counter">2</div>', unsafe_allow_html=True)
-        st.markdown('<div class="step-title">Neural Network Training Studio</div>', unsafe_allow_html=True)
-        st.markdown('</div>', unsafe_allow_html=True)
-        st.markdown("Train advanced models with automated optimization and hyperparameter tuning.")
+    st.header("🧠 Neural Network Training Studio")
 
     if 'df' not in st.session_state:
-        st.warning("
-        st.markdown('</div>', unsafe_allow_html=True)
+        st.warning("Upload data first!")
         return
 
     df = st.session_state.df
 
-
+    problem_type = st.selectbox("Select Problem Type", ["Classification", "Regression", "Clustering"])
+    mode = st.selectbox("Domain Specialization", ["Legal", "Financial", "Academic", "Technical"])
 
-
-    st.
-
-
-    problem_type = st.selectbox(
-        "Select Problem Type",
-        ["Classification", "Regression", "Clustering"],
-        help="Classification: predict categories, Regression: predict continuous values, Clustering: group similar data points"
-    )
-
-    # Domain specialization
-    domain_col1, domain_col2 = st.columns(2)
-    with domain_col1:
-        mode = st.selectbox(
-            "Domain Specialization",
-            ["General", "Legal", "Financial", "Medical", "Technical", "Academic"],
-            help="Optimize the model for your specific domain"
-        )
-
-    with domain_col2:
-        experiment_name = st.text_input(
-            "Experiment Name",
-            value=f"{problem_type}_{datetime.datetime.now().strftime('%Y%m%d_%H%M')}",
-            help="Name your experiment for reference"
-        )
-
-    # Target variable selection
-    if problem_type != "Clustering":
-        target_col1, target_col2 = st.columns(2)
-
-        with target_col1:
-            target = st.selectbox("Select Target Variable", df.columns)
-
-        with target_col2:
-            if problem_type == "Classification":
-                st.info(f"Class Distribution: {df[target].value_counts().to_dict()}")
-            else:
-                st.info(f"Target Range: {df[target].min()} to {df[target].max()}")
+    if problem_type != "Clustering":
+        target = st.selectbox("Select Target Variable", df.columns)
+        X = df.drop(columns=[target])
+        y = df[target]
+    else:
+        X = df
+        y = None
 
-
-
-
-
-
-
-        )
-        st.session_state.selected_columns = selected_features + [target]
-    else:
-        st.session_state.selected_columns = df.columns.tolist()
-    else:
-        # For clustering, all columns are features
-        available_features = df.columns.tolist()
-        selected_features = st.multiselect(
-            "Select features for clustering",
-            options=available_features,
-            default=available_features[:min(5, len(available_features))]
-        )
-        st.session_state.selected_columns = selected_features
-
-    # Rest of the model_training_page function would go here...
-    st.markdown('</div>', unsafe_allow_html=True)
+    if st.button("Train Neural Network"):
+        with st.spinner("Training in progress..."):
+            X_scaled = StandardScaler().fit_transform(X)
+            X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42) if y is not None else (X_scaled, None, None, None)
+
+            if problem_type == "Classification":
+                model = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
+                model.fit(X_train, y_train)
+                y_pred = model.predict(X_test)
+                st.session_state.metrics = {
+                    "Accuracy": accuracy_score(y_test, y_pred),
+                    "Classification Report": classification_report(y_test, y_pred, output_dict=True)
+                }
+            elif problem_type == "Regression":
+                model = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
+                model.fit(X_train, y_train)
+                y_pred = model.predict(X_test)
+                st.session_state.metrics = {
+                    "R2 Score": r2_score(y_test, y_pred),
+                    "Mean Squared Error": mean_squared_error(y_test, y_pred)
+                }
+            else:  # Clustering
+                model = KMeans(n_clusters=3, random_state=42)
+                labels = model.fit_predict(X_scaled)
+                st.session_state.metrics = {
+                    "Silhouette Score": silhouette_score(X_scaled, labels)
+                }
+
+            st.session_state.best_model = model
+            st.session_state.X_test = X_test
+            st.session_state.y_test = y_test
+            st.session_state.y_pred = y_pred if y is not None else labels
+            st.session_state.problem_type = problem_type
+            st.success(f"Model trained successfully in {mode} mode!")
 
 def visualization_page():
-    "
-    st.markdown('<div class="card">', unsafe_allow_html=True)
-
-    # Header with animation
-    col1, col2 = st.columns([1, 3])
-    with col1:
-        st_lottie(lottie_visualization, height=150, key="viz_animation")
-    with col2:
-        st.markdown('<div class="step-header">', unsafe_allow_html=True)
-        st.markdown('<div class="step-counter">3</div>', unsafe_allow_html=True)
-        st.markdown('<div class="step-title">Neural Network Evaluation Center</div>', unsafe_allow_html=True)
-        st.markdown('</div>', unsafe_allow_html=True)
-        st.markdown("Visualize, interpret, and validate your trained neural networks.")
+    st.header("🔍 Neural Network Evaluation Center")
 
     if 'best_model' not in st.session_state:
-        st.warning("
-        st.markdown('</div>', unsafe_allow_html=True)
+        st.warning("Train a model first!")
         return
 
-
-
-
-
+    st.subheader("Performance Analysis")
+    if st.session_state.problem_type == "Classification":
+        st.plotly_chart(plot_confusion_matrix(st.session_state.y_test, st.session_state.y_pred))
+        st.plotly_chart(plot_feature_importance(st.session_state.best_model, pd.DataFrame(st.session_state.X_test, columns=st.session_state.df.columns[:-1])))
+    elif st.session_state.problem_type == "Regression":
+        st.plotly_chart(plot_residuals(st.session_state.y_test, st.session_state.y_pred))
+        st.plotly_chart(plot_feature_importance(st.session_state.best_model, pd.DataFrame(st.session_state.X_test, columns=st.session_state.df.columns[:-1])))
+    else:  # Clustering
+        st.plotly_chart(plot_clusters(pd.DataFrame(st.session_state.X_test, columns=st.session_state.df.columns), st.session_state.y_pred))
 
-    st.
+    st.subheader("Metrics")
+    st.write(st.session_state.metrics)
 
+# Chatbot Interface
 def ai_assistant():
-    """AI Assistant for neural network development guidance"""
     st.markdown('<div class="chat-container">', unsafe_allow_html=True)
-    st.subheader("
-
-    user_input = st.text_area("Ask a question about your data or neural network development:", "")
-    use_web_search = st.checkbox("Enable web search for up-to-date information", value=False)
+    st.subheader("🧠 Neural Insight Assistant (RAG + Web Search)")
 
-
-
-            with st.spinner("Analyzing your question..."):
-                # Add user message to chat history
-                st.session_state.chat_history.append({"role": "user", "content": user_input})
-                for msg in st.session_state.chat_history:
-                    if msg["role"] == "user":
-                        st.markdown(f'<div class="user-message">{msg["content"]}</div>', unsafe_allow_html=True)
-                    else:
-                        st.markdown(f'<div class="bot-message">{msg["content"]}</div>', unsafe_allow_html=True)
-
-                # Generate response
-                try:
-                    ai_response = get_groq_response(user_input, st.session_state.get('mode', 'General'), use_web_search)
-                    st.session_state.chat_history.append({"role": "assistant", "content": ai_response})
-
-                    st.markdown(f'<div class="bot-message">{ai_response}</div>', unsafe_allow_html=True)
-                except Exception as e:
-                    st.error(f"Error getting AI response: {str(e)}")
-                    st.info("Falling back to alternative model...")
-                    try:
-                        # Fallback to OpenAI
-                        ai_response = "I'm sorry, I couldn't generate a proper response. Please try rephrasing your question."
-                        st.session_state.chat_history.append({"role": "assistant", "content": ai_response})
-                        st.markdown(f'<div class="bot-message">{ai_response}</div>', unsafe_allow_html=True)
-                    except:
-                        st.error("Both primary and fallback AI services failed. Please try again later.")
+    use_web_search = st.checkbox("Enable Tavily Web Search", value=False)
+    mode = st.selectbox("Domain Mode", ["Legal", "Financial", "Academic", "Technical"], key="chat_mode")
 
-    st.
-
-
-if 'notification' not in st.session_state:
-    st.session_state.notification = None
-
-# Import and initialize Lottie animations
-def load_lottie_url(url):
-    """Load Lottie animation from URL"""
-    try:
-        import json
-        import requests
-        r = requests.get(url)
-        if r.status_code != 200:
-            return None
-        return r.json()
-    except:
-        return None
-
-# Lottie animations
-lottie_upload = load_lottie_url("https://assets9.lottiefiles.com/packages/lf20_grdj1jti.json")
-lottie_neural = load_lottie_url("https://assets8.lottiefiles.com/private_files/lf30_8uvz2gcg.json")
-lottie_visualization = load_lottie_url("https://assets5.lottiefiles.com/packages/lf20_usmfx6bp.json")
-
-# Main function to run the app
-def main():
-    """Main function to run the app"""
-    # Main App Layout
-    st.markdown("""
-    <div class="header">
-        <h1 class="header-title">Neural-Vision Enhanced</h1>
-        <div class="header-subtitle">Neural Network Development for Domain-Specialized Analysis</div>
-    </div>
-    """, unsafe_allow_html=True)
-
-    st.
-
-        "Neural Network Training Studio",
-        "Neural Network Evaluation Center"
-    ])
-    st.session_state.active_page = page
-    st.markdown("---")
-    st.markdown("**Environment Setup**")
-
-
-        if tavily_api_input:
-            st.session_state.tavily_api_key = tavily_api_input
-            st.success("Tavily API Key submitted successfully!")
-        else:
-            st.warning("Please enter a valid API key.")
-
-    st.
-
-if __name__ == "__main__":
-    main()
+    for msg in st.session_state.chat_history:
+        with st.chat_message(msg["role"]):
+            st.markdown(f'<div class="{msg["role"]}-message">{msg["content"]}</div>', unsafe_allow_html=True)
+
+    if prompt := st.chat_input("Ask about data, models, or web insights..."):
+        st.session_state.chat_history.append({"role": "user", "content": prompt})
+        with st.chat_message("user"):
+            st.markdown(f'<div class="user-message">{prompt}</div>', unsafe_allow_html=True)
+
+        with st.spinner("Processing..."):
+            response = get_groq_response(prompt, mode, use_web_search)
+            st.session_state.chat_history.append({"role": "assistant", "content": response})
+
+        with st.chat_message("assistant"):
+            st.markdown(f'<div class="bot-message">{response}</div>', unsafe_allow_html=True)
+
+    st.markdown('</div>', unsafe_allow_html=True)
+
+# Main App Layout
+st.markdown("""
+<div class="header">
+    <h1 class="header-title">Neural-Vision Enhanced</h1>
+    <div class="header-subtitle">Neural Network Development for Domain-Specialized Analysis</div>
+</div>
+""", unsafe_allow_html=True)
+
+with st.sidebar:
+    st.title("🔮 Neural-Vision Enhanced")
+    page = st.selectbox("Navigation", [
+        "Data Upload & Analysis",
+        "Neural Network Training Studio",
+        "Neural Network Evaluation Center"
+    ])
+    st.session_state.active_page = page
+    st.markdown("---")
+    st.markdown("**Environment Setup**")
+
+    # Tavily API Key Input and Submit Button
+    tavily_api_input = st.text_input("Tavily API Key", type="password", help="Enter your Tavily API key for web search functionality")
+    if st.button("Submit API Key"):
+        if tavily_api_input:
+            st.session_state.tavily_api_key = tavily_api_input
+            st.success("Tavily API Key submitted successfully!")
+        else:
+            st.warning("Please enter a valid API key.")
+
+    st.markdown("---")
+    st.markdown("v5.0 | © 2025 Neural-Vision")
+
+# Page Routing
+if "Data Upload & Analysis" in page:
+    data_upload_page()
+elif "Neural Network Training Studio" in page:
+    model_training_page()
+else:
+    visualization_page()
+
+ai_assistant()
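Note on helpers: the new `data_upload_page` and `ai_assistant` call `create_vector_store`, `convert_df_to_text`, and `get_groq_response`, and `visualization_page` calls `plot_confusion_matrix`, `plot_residuals`, and `plot_feature_importance`. These are defined in unchanged parts of app.py and therefore do not appear in the hunks above. As orientation only, here is a minimal sketch of what the three RAG/chat helpers could look like — the names come from the diff, but the bodies, the HuggingFaceEmbeddings backend, and the llama-3.1-8b-instant model choice are assumptions, not the app's actual implementation:

import os
import pandas as pd
from groq import Groq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_community.vectorstores import FAISS

def convert_df_to_text(df: pd.DataFrame) -> str:
    # Serialize the dataframe so it can be chunked and embedded (assumed format).
    return df.to_csv(index=False)

def create_vector_store(text: str) -> FAISS:
    # Chunk the serialized data and index it for retrieval-augmented answers.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    embeddings = HuggingFaceEmbeddings()  # assumed embedding backend
    return FAISS.from_texts(splitter.split_text(text), embeddings)

def get_groq_response(prompt: str, mode: str, use_web_search: bool) -> str:
    # Optionally pull fresh snippets via Tavily (requires TAVILY_API_KEY).
    context = ""
    if use_web_search:
        context = str(TavilySearchResults(max_results=3).run(prompt))
    # Ask a Groq-hosted model, steering it with the selected domain mode.
    client = Groq(api_key=os.getenv("GROQ_API_KEY"))
    completion = client.chat.completions.create(
        model="llama-3.1-8b-instant",  # assumed model name
        messages=[
            {"role": "system", "content": f"You are a {mode} data-analysis assistant. Web context: {context}"},
            {"role": "user", "content": prompt},
        ],
    )
    return completion.choices[0].message.content

The vector store built in data_upload_page is not consulted in this sketch; in the app, st.session_state.vector_store would presumably be searched and its hits folded into the prompt the same way as the web context.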