Update app.py

app.py (CHANGED)

--- a/app.py
@@ -1,59 +1,203 @@
 import streamlit as st
 import pandas as pd
 import numpy as np
-from pycaret.classification import setup as classification_setup, compare_models as compare_classification_models, evaluate_model as evaluate_classification_model, save_model as save_classification_model, plot_model as plot_classification_model
-from pycaret.regression import setup as regression_setup, compare_models as compare_regression_models, evaluate_model as evaluate_regression_model, save_model as save_regression_model, plot_model as plot_regression_model
-from pycaret.clustering import setup as clustering_setup, evaluate_model as evaluate_clustering_model, save_model as save_clustering_model, plot_model as plot_clustering_model
-from ydata_profiling import ProfileReport
-from streamlit_pandas_profiling import st_profile_report
-from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
-from sklearn.decomposition import PCA
-from scipy import stats
 import plotly.express as px
 import plotly.graph_objects as go
 import os

-
-st.
-
 # Sidebar Navigation
 with st.sidebar:
-    st.title("🔮
-    st.markdown("Your AI-powered
-    st.markdown("---")
-    app_mode = st.selectbox("Navigation", ["Data Upload", "Data Cleaning", "EDA", "Model Training", "Validation & Exploration"])
-    data_type = st.selectbox("Data Type", ["Tabular"])
     st.markdown("---")
-    st.
-
-
-
-
-    if
-    st.
-
-
-
-
-    st.markdown(f"<h2 style='text-align: center; color: #1e3a8a;'>{title}</h2>", unsafe_allow_html=True)

-
 if app_mode == "Data Upload":
-    st.title("📤 Data Upload")
-
     if uploaded_file:
-
-        st.session_state
-        st.session_state
-
-
-
-
-
-
-
-

 elif app_mode == "Data Cleaning":
     st.title("🧹 Smart Data Cleaning")
@@ -79,165 +223,9 @@ elif app_mode == "Data Cleaning":
     if st.button("Undo Last Action"):
         st.session_state.data_versions.pop()
         st.session_state.cleaned_data = st.session_state.data_versions[-1].copy()
         st.rerun()

-    with st.expander("🛠️ Data Cleaning Operations", expanded=True):
-        enhance_section_title("🔍 Missing Values Treatment")
-        missing_cols = df.columns[df.isna().any()].tolist()
-        if missing_cols:
-            cols = st.multiselect("Select columns with missing values", missing_cols)
-            method = st.selectbox("Choose imputation method", [
-                "Drop Missing Values", "Fill with Mean/Median", "Fill with Custom Value", "Forward Fill", "Backward Fill"
-            ])
-            if method == "Fill with Custom Value":
-                custom_val = st.text_input("Enter custom value:")
-            if st.button("Apply Missing Value Treatment"):
-                new_df = df.copy()
-                if method == "Drop Missing Values":
-                    new_df = new_df.dropna(subset=cols)
-                elif method == "Fill with Mean/Median":
-                    for col in cols:
-                        if pd.api.types.is_numeric_dtype(new_df[col]):
-                            new_df[col] = new_df[col].fillna(new_df[col].median())
-                        else:
-                            new_df[col] = new_df[col].fillna(new_df[col].mode()[0])
-                elif method == "Fill with Custom Value" and custom_val:
-                    new_df[cols] = new_df[cols].fillna(custom_val)
-                elif method == "Forward Fill":
-                    new_df[cols] = new_df[cols].ffill()
-                elif method == "Backward Fill":
-                    new_df[cols] = new_df[cols].bfill()
-                update_cleaned_data(new_df)
-        else:
-            st.success("✨ No missing values detected!")
-
-        enhance_section_title("🔄 Data Type Conversion")
-        col_to_convert = st.selectbox("Select column to convert", df.columns)
-        new_type = st.selectbox("Select new data type", ["String", "Integer", "Float", "Boolean", "Datetime"])
-        if new_type == "Datetime":
-            date_format = st.text_input("Enter date format (e.g., %Y-%m-%d):", "%Y-%m-%d")
-        if st.button("Convert Data Type"):
-            new_df = df.copy()
-            if new_type == "String":
-                new_df[col_to_convert] = new_df[col_to_convert].astype(str)
-            elif new_type == "Integer":
-                new_df[col_to_convert] = pd.to_numeric(new_df[col_to_convert], errors='coerce').astype('Int64')
-            elif new_type == "Float":
-                new_df[col_to_convert] = pd.to_numeric(new_df[col_to_convert], errors='coerce')
-            elif new_type == "Boolean":
-                new_df[col_to_convert] = new_df[col_to_convert].astype(bool)
-            elif new_type == "Datetime":
-                new_df[col_to_convert] = pd.to_datetime(new_df[col_to_convert], format=date_format, errors='coerce')
-            update_cleaned_data(new_df)
-
-        enhance_section_title("🗑️ Drop Columns")
-        columns_to_drop = st.multiselect("Select columns to remove", df.columns)
-        if columns_to_drop and st.button("Confirm Column Removal"):
-            new_df = df.copy()
-            new_df = new_df.drop(columns=columns_to_drop)
-            update_cleaned_data(new_df)
-
-        enhance_section_title("🔢 Encoding Options")
-        encoding_method = st.radio("Choose encoding method", ("Label Encoding", "One-Hot Encoding"))
-        data_to_encode = st.multiselect("Select columns to encode", df.select_dtypes(include='object').columns)
-        if data_to_encode and st.button("Apply Encoding"):
-            new_df = df.copy()
-            if encoding_method == "Label Encoding":
-                for col in data_to_encode:
-                    le = LabelEncoder()
-                    new_df[col] = le.fit_transform(new_df[col].astype(str))
-            elif encoding_method == "One-Hot Encoding":
-                new_df = pd.get_dummies(new_df, columns=data_to_encode, drop_first=True, dtype=int)
-            update_cleaned_data(new_df)
-
-        enhance_section_title("📏 StandardScaler")
-        scale_cols = st.multiselect("Select numerical columns to scale", df.select_dtypes(include=np.number).columns)
-        if scale_cols and st.button("Apply StandardScaler"):
-            new_df = df.copy()
-            scaler = StandardScaler()
-            new_df[scale_cols] = scaler.fit_transform(new_df[scale_cols])
-            update_cleaned_data(new_df)
-
-        enhance_section_title("🕵️ Pattern-Based Cleaning")
-        selected_col = st.selectbox("Select text column for pattern cleaning", df.select_dtypes(include='object').columns)
-        pattern = st.text_input("Enter regex pattern:")
-        replacement = st.text_input("Enter replacement value:")
-        if st.button("Apply Pattern Replacement"):
-            new_df = df.copy()
-            new_df[selected_col] = new_df[selected_col].str.replace(pattern, replacement, regex=True)
-            update_cleaned_data(new_df)
-
-        enhance_section_title("🚀 Bulk Actions")
-        bulk_action = st.selectbox("Choose bulk action", [
-            "Auto-Clean Common Issues", "Drop All Missing Values", "Fill All Missing Values",
-            "One-Hot Encode All Categorical Columns", "Apply Min-Max Scaling to All Numeric Columns",
-            "Remove Outliers from All Numeric Columns", "Principal Component Analysis (PCA)"
-        ])
-        if bulk_action == "Auto-Clean Common Issues" and st.button("Run Auto-Clean"):
-            new_df = df.copy()
-            new_df = new_df.dropna(axis=1, how='all')
-            new_df = new_df.convert_dtypes()
-            text_cols = new_df.select_dtypes(include='object').columns
-            new_df[text_cols] = new_df[text_cols].apply(lambda x: x.str.strip())
-            update_cleaned_data(new_df)
-        elif bulk_action == "Drop All Missing Values" and st.button("Drop All Missing"):
-            new_df = df.copy()
-            new_df = new_df.dropna()
-            update_cleaned_data(new_df)
-        elif bulk_action == "Fill All Missing Values":
-            fill_value = st.text_input("Enter fill value:", "0")
-            if st.button("Fill Missing Values"):
-                new_df = df.copy()
-                new_df = new_df.fillna(fill_value)
-                update_cleaned_data(new_df)
-        elif bulk_action == "One-Hot Encode All Categorical Columns" and st.button("One-Hot Encode All"):
-            new_df = df.copy()
-            categorical_cols = new_df.select_dtypes(include='object').columns
-            new_df = pd.get_dummies(new_df, columns=categorical_cols, drop_first=True, dtype=int)
-            update_cleaned_data(new_df)
-        elif bulk_action == "Apply Min-Max Scaling to All Numeric Columns" and st.button("Apply Min-Max Scaling"):
-            new_df = df.copy()
-            scaler = MinMaxScaler()
-            numerical_cols = new_df.select_dtypes(include=np.number).columns
-            new_df[numerical_cols] = scaler.fit_transform(new_df[numerical_cols])
-            update_cleaned_data(new_df)
-        elif bulk_action == "Remove Outliers from All Numeric Columns" and st.button("Remove All Outliers"):
-            new_df = df.copy()
-            z_scores = np.abs(stats.zscore(new_df.select_dtypes(include=np.number)))
-            new_df = new_df[(z_scores < 3).all(axis=1)]
-            update_cleaned_data(new_df)
-        elif bulk_action == "Principal Component Analysis (PCA)":
-            n_components_bulk = st.slider("Number of components", 1, min(df.shape[1], 10), 2)
-            if st.button("Apply PCA (Bulk)"):
-                new_df = df.copy()
-                pca = PCA(n_components=n_components_bulk)
-                numerical_cols = new_df.select_dtypes(include=np.number).columns
-                pca_result = pca.fit_transform(new_df[numerical_cols])
-                new_df = pd.DataFrame(pca_result, columns=[f'PC{i+1}' for i in range(pca_result.shape[1])])
-                update_cleaned_data(new_df.reset_index(drop=True))
-
-        enhance_section_title("📊 Principal Component Analysis (PCA)")
-        numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
-        if numerical_cols:
-            pca_cols = st.multiselect("Select columns for PCA", numerical_cols, default=numerical_cols)
-            if pca_cols:
-                st.subheader("Covariance Matrix Heatmap")
-                cov_matrix = df[pca_cols].cov()
-                fig_cov = px.imshow(cov_matrix, labels=dict(x="Features", y="Features", color="Covariance"), color_continuous_scale='RdBu_r')
-                st.plotly_chart(fig_cov)
-                n_components = st.slider("Number of components", 1, min(len(pca_cols), 10), 2)
-                if st.button("Apply PCA"):
-                    new_df = df.copy()
-                    scaler = StandardScaler()
-                    scaled_data = scaler.fit_transform(new_df[pca_cols])
-                    pca = PCA(n_components=n_components)
-                    pca_result = pca.fit_transform(scaled_data)
-                    pca_df = pd.DataFrame(pca_result, columns=[f'PC{i+1}' for i in range(n_components)])
-                    update_cleaned_data(pca_df.reset_index(drop=True))
-                    st.write("Explained Variance Ratio:", pca.explained_variance_ratio_)
-        else:
-            st.warning("No numerical columns available for PCA.")
-
 elif app_mode == "EDA":
     st.title("🔍 Interactive Data Explorer")
     if 'cleaned_data' not in st.session_state:
@@ -245,10 +233,7 @@ elif app_mode == "EDA":
         st.stop()
     df = st.session_state.cleaned_data.copy()

-    # Enhanced Section Title
     enhance_section_title("Dataset Overview")
-
-    # Dataset Overview with More Visual Appeal
     with st.container():
         col1, col2, col3, col4 = st.columns(4)
         col1.metric("Total Rows", df.shape[0])
@@ -257,197 +242,26 @@ elif app_mode == "EDA":
         col3.metric("Missing Values", f"{df.isna().sum().sum()} ({missing_percentage:.1f}%)")
         col4.metric("Duplicates", df.duplicated().sum())

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    with st.
-
-        with col1:
-            plot_type = st.selectbox("Choose visualization type", [
-                "Scatter Plot", "Histogram", "Box Plot", "Violin Plot", "Line Chart", "Bar Chart",
-                "Correlation Matrix", "Pair Plot", "Heatmap", "3D Scatter", "Parallel Categories",
-                "Segmented Bar Chart", "Swarm Plot", "Ridge Plot", "Bubble Plot", "Density Plot",
-                "Count Plot", "Lollipop Chart"
-            ])
-            x_axis = st.selectbox("X-axis", df.columns) if plot_type not in ["Correlation Matrix", "Pair Plot"] else None
-            y_axis = st.selectbox("Y-axis", df.columns) if plot_type in ["Scatter Plot", "Box Plot", "Violin Plot", "Line Chart", "Heatmap", "Swarm Plot", "Ridge Plot", "Bubble Plot", "Density Plot", "Lollipop Chart"] else None
-            z_axis = st.selectbox("Z-axis", df.columns) if plot_type == "3D Scatter" else None
-            color_by = st.selectbox("Color encoding", ["None"] + df.columns.tolist(), format_func=lambda x: "No color" if x == "None" else x) if plot_type not in ["Correlation Matrix", "Pair Plot"] else None
-
-            if plot_type == "Parallel Categories":
-                dimensions = st.multiselect("Dimensions", df.columns.tolist(), default=df.columns[:3].tolist())
-            elif plot_type == "Segmented Bar Chart":
-                segment_col = st.selectbox("Segment Column (Categorical)", df.select_dtypes(exclude=np.number).columns)
-            elif plot_type == "Bubble Plot":
-                size_col = st.selectbox("Size Column", df.columns)
-            elif plot_type == "Pair Plot":
-                pair_cols = st.multiselect("Select columns for Pair Plot", df.columns, default=df.columns[:5].tolist())
-
-        with col2:
-            try:
-                fig = None
-                if plot_type == "Scatter Plot" and x_axis and y_axis:
-                    fig = px.scatter(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None, trendline="lowess", title=f'Scatter Plot of {x_axis} vs {y_axis}')
-                elif plot_type == "Histogram" and x_axis:
-                    fig = px.histogram(df, x=x_axis, color=color_by if color_by != "None" else None, nbins=30, marginal="box", title=f'Histogram of {x_axis}')
-                elif plot_type == "Box Plot" and x_axis and y_axis:
-                    fig = px.box(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None, title=f'Box Plot of {x_axis} vs {y_axis}')
-                elif plot_type == "Violin Plot" and x_axis and y_axis:
-                    fig = px.violin(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None, box=True, title=f'Violin Plot of {x_axis} vs {y_axis}')
-                elif plot_type == "Line Chart" and x_axis and y_axis:
-                    fig = px.line(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None, title=f'Line Chart of {x_axis} vs {y_axis}')
-                elif plot_type == "Bar Chart" and x_axis:
-                    fig = px.bar(df, x=x_axis, color=color_by if color_by != "None" else None, title=f'Bar Chart of {x_axis}')
-                elif plot_type == "Correlation Matrix":
-                    numeric_df = df.select_dtypes(include=np.number)
-                    if len(numeric_df.columns) > 1:
-                        corr = numeric_df.corr()
-                        fig = px.imshow(corr, text_auto=True, color_continuous_scale='RdBu_r', zmin=-1, zmax=1, title='Correlation Matrix')
-                elif plot_type == "Pair Plot":
-                    if pair_cols:
-                        fig = px.scatter_matrix(df[pair_cols], color=color_by if color_by != "None" else None, title='Pair Plot')
-                elif plot_type == "Heatmap" and x_axis and y_axis:
-                    fig = px.density_heatmap(df, x=x_axis, y=y_axis, facet_col=color_by if color_by != "None" else None, title=f'Heatmap of {x_axis} vs {y_axis}')
-                elif plot_type == "3D Scatter" and x_axis and y_axis and z_axis:
-                    fig = px.scatter_3d(df, x=x_axis, y=y_axis, z=z_axis, color=color_by if color_by != "None" else None, title=f'3D Scatter Plot of {x_axis} vs {y_axis} vs {z_axis}')
-                elif plot_type == "Parallel Categories" and dimensions:
-                    fig = px.parallel_categories(df, dimensions=dimensions, color=color_by if color_by != "None" else None, title='Parallel Categories Plot')
-                elif plot_type == "Segmented Bar Chart" and x_axis and segment_col:
-                    segment_counts = df.groupby([x_axis, segment_col]).size().reset_index(name='counts')
-                    fig = px.bar(segment_counts, x=x_axis, y='counts', color=segment_col, title=f'Segmented Bar Chart of {x_axis} by {segment_col}')
-                    fig.update_layout(yaxis_title="Count")
-                elif plot_type == "Swarm Plot" and x_axis and y_axis:
-                    fig = px.strip(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None, title=f'Swarm Plot of {x_axis} vs {y_axis}')
-                elif plot_type == "Ridge Plot" and x_axis and y_axis:
-                    fig = px.histogram(df, x=x_axis, color=y_axis, marginal="rug", title=f'Ridge Plot of {x_axis} by {y_axis}')
-                elif plot_type == "Bubble Plot" and x_axis and y_axis and size_col:
-                    fig = px.scatter(df, x=x_axis, y=y_axis, size=size_col, color=color_by if color_by != "None" else None, title=f'Bubble Plot of {x_axis} vs {y_axis}')
-                elif plot_type == "Density Plot" and x_axis and y_axis:
-                    fig = px.density_heatmap(df, x=x_axis, y=y_axis, color_continuous_scale="Viridis", title=f'Density Plot of {x_axis} vs {y_axis}')
-                elif plot_type == "Count Plot" and x_axis:
-                    fig = px.bar(df, x=x_axis, color=color_by if color_by != "None" else None, title=f'Count Plot of {x_axis}')
-                    fig.update_layout(yaxis_title="Count")
-                elif plot_type == "Lollipop Chart" and x_axis and y_axis:
-                    fig = go.Figure()
-                    fig.add_trace(go.Scatter(x=df[x_axis], y=df[y_axis], mode='markers', marker=dict(size=10)))
-                    for i in range(len(df)):
-                        fig.add_trace(go.Scatter(x=[df[x_axis].iloc[i], df[x_axis].iloc[i]], y=[0, df[y_axis].iloc[i]], mode='lines', line=dict(color='gray')))
-                    fig.update_layout(showlegend=False, title=f'Lollipop Chart of {x_axis} vs {y_axis}')
-
-                if fig:
-                    fig.update_layout(template="plotly_white")
-                    st.plotly_chart(fig, use_container_width=True)
-                else:
-                    st.error("Please provide required inputs for the selected plot type.")
-            except Exception as e:
-                st.error(f"Couldn't create visualization: {str(e)}")
-
-elif app_mode == "Model Training":
-    st.title("🧠 Model Training")
-    if 'cleaned_data' not in st.session_state:
-        st.warning("Please upload and clean data first.")
-        st.stop()
-    df = st.session_state.cleaned_data.copy()
-    problem_type = st.selectbox("Problem Type", ["Classification", "Regression", "Clustering"])
-    target = st.selectbox("Select Target Column", df.columns) if problem_type != "Clustering" else None
-
-    if st.button("Setup PyCaret"):
-        with st.spinner("Setting up PyCaret..."):
-            if problem_type == "Classification":
-                classification_setup(data=df, target=target, session_id=123, verbose=False)
-                st.session_state['problem_type'] = "Classification"
-                st.session_state['setup_complete'] = True
-            elif problem_type == "Regression":
-                regression_setup(data=df, target=target, session_id=123, verbose=False)
-                st.session_state['problem_type'] = "Regression"
-                st.session_state['setup_complete'] = True
-            elif problem_type == "Clustering":
-                clustering_setup(data=df, session_id=123, verbose=False)
-                st.session_state['problem_type'] = "Clustering"
-                st.session_state['setup_complete'] = True
-            st.success("PyCaret setup complete! You can now train models.")
-
-    if st.session_state.get('setup_complete', False):
-        st.subheader("Train Models")
-        if st.button("Compare Models"):
-            with st.spinner("Comparing models..."):
-                if st.session_state['problem_type'] == "Classification":
-                    best_model = compare_classification_models()
-                elif st.session_state['problem_type'] == "Regression":
-                    best_model = compare_regression_models()
-                elif st.session_state['problem_type'] == "Clustering":
-                    st.info("Model comparison is not available for clustering. Please proceed with evaluation or create a model manually.")
-                    best_model = None
-                else:
-                    best_model = None
-                if best_model is not None:
-                    st.session_state['best_model'] = best_model
-                    st.success(f"Best Model: {best_model}")
-
-    if 'best_model' in st.session_state and st.session_state['best_model'] is not None:
-        st.subheader("Model Evaluation")
-        if st.button("Evaluate Model"):
-            with st.spinner("Evaluating model..."):
-                if st.session_state['problem_type'] == "Classification":
-                    evaluate_classification_model(st.session_state['best_model'])
-                elif st.session_state['problem_type'] == "Regression":
-                    evaluate_regression_model(st.session_state['best_model'])
-                elif st.session_state['problem_type'] == "Clustering":
-                    evaluate_clustering_model(st.session_state['best_model'])
-                st.success("Model evaluation complete!")
-
-        if st.button("Save Model"):
-            if st.session_state['problem_type'] == "Classification":
-                save_classification_model(st.session_state['best_model'], "best_model")
-            elif st.session_state['problem_type'] == "Regression":
-                save_regression_model(st.session_state['best_model'], "best_model")
-            elif st.session_state['problem_type'] == "Clustering":
-                save_clustering_model(st.session_state['best_model'], "best_model")
-            st.success("Model saved as `best_model.pkl`!")
-            with open("best_model.pkl", "rb") as f:
-                st.download_button("Download Model", f, file_name="best_model.pkl")
-
-elif app_mode == "Validation & Exploration":
-    st.title("🔍 Validation & Exploration")
-    if 'best_model' not in st.session_state or st.session_state['best_model'] is None:
-        st.warning("Please train a model first. Note: Clustering does not support automatic model comparison.")
-        st.stop()
-
-    st.subheader("Model Performance")
-    if st.session_state['problem_type'] == "Classification":
-        st.write("Classification Report:")
-        plot_classification_model(st.session_state['best_model'], plot="confusion_matrix", display_format="streamlit")
-        plot_classification_model(st.session_state['best_model'], plot="auc", display_format="streamlit")
-    elif st.session_state['problem_type'] == "Regression":
-        st.write("Regression Metrics:")
-        plot_regression_model(st.session_state['best_model'], plot="residuals", display_format="streamlit")
-        plot_regression_model(st.session_state['best_model'], plot="error", display_format="streamlit")
-    elif st.session_state['problem_type'] == "Clustering":
-        st.write("Clustering Results:")
-        plot_clustering_model(st.session_state['best_model'], plot="cluster", display_format="streamlit")
-
-# Custom CSS
-st.markdown("""
-<style>
-.stButton>button {background-color: #4CAF50; color: white;}
-h1, h2 {color: #1e3a8a;}
-</style>
-""", unsafe_allow_html=True)
+++ b/app.py
 import streamlit as st
 import pandas as pd
 import numpy as np
 import plotly.express as px
 import plotly.graph_objects as go
+from ydata_profiling import ProfileReport
+from streamlit_pandas_profiling import st_profile_report
 import os
+import requests
+import json
+from datetime import datetime
+import re
+import tempfile
+from scipy import stats
+from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
+from sklearn.decomposition import PCA
+import streamlit.components.v1 as components
+from io import StringIO
+from dotenv import load_dotenv
+from flask import Flask, request, jsonify
+from openai import OpenAI
+import threading
+from sentence_transformers import SentenceTransformer
+
+# Load environment variables
+load_dotenv()
+
+# Initialize Flask app
+flask_app = Flask(__name__)
+FLASK_PORT = 5000  # Internal port for Flask, not exposed externally
+
+# Initialize OpenAI client
+api_key = os.getenv("OPENAI_API_KEY")
+if not api_key:
+    st.error("OPENAI_API_KEY not set. Please configure it in the Hugging Face Space secrets.")
+    st.stop()
+client = OpenAI(api_key=api_key)
+
+# Flask RAG Endpoint
+@flask_app.route('/rag_chat', methods=['POST'])
+def rag_chat():
+    data = request.get_json()
+    user_input = data.get('user_input', '')
+    app_mode = data.get('app_mode', 'Data Upload')
+    dataset_text = data.get('dataset_text', '')
+
+    # RAG Logic: Use dataset_text as retrieval context
+    system_prompt = (
+        "You are an AI assistant in Data-Vision Pro, a data analysis app with RAG capabilities. "
+        "The app has three pages:\n"
+        "- **Data Upload**: Upload CSV/XLSX files, view stats, or generate reports.\n"
+        "- **Data Cleaning**: Clean data (e.g., handle missing values, encode variables).\n"
+        "- **EDA**: Visualize data (e.g., scatter plots, histograms).\n"
+        f"The user is on the '{app_mode}' page.\n"
+    )
+
+    if dataset_text:
+        system_prompt += (
+            "Using the following dataset context, augment your response:\n"
+            f"{dataset_text}\n"
+            "Answer based on this data where relevant, otherwise provide general assistance."
+        )
+    else:
+        system_prompt += "No dataset is loaded. Assist based on app functionality."
+
+    try:
+        response = client.chat.completions.create(
+            model="gpt-3.5-turbo",
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_input}
+            ],
+            max_tokens=100,  # Increased for RAG context
+            temperature=0.7
+        )
+        return jsonify({"response": response.choices[0].message.content})
+    except Exception as e:
+        return jsonify({"error": str(e)}), 500
+
+# Run Flask in a background thread
+def run_flask():
+    flask_app.run(host='0.0.0.0', port=FLASK_PORT, debug=False, use_reloader=False)
+
+# Start Flask thread
+flask_thread = threading.Thread(target=run_flask, daemon=True)
+flask_thread.start()
+
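Everything above runs in one process: Streamlit keeps the main thread, while the daemon thread serves /rag_chat on localhost (FLASK_PORT is internal and never exposed externally). A minimal, self-contained sketch of the same pattern; the /echo route and port 5001 are invented for illustration, not part of this app:

import threading
import time

import requests
from flask import Flask, jsonify, request

demo_app = Flask(__name__)

@demo_app.route('/echo', methods=['POST'])
def echo():
    # Stand-in for the real RAG logic: echo the JSON body back
    return jsonify({"you_sent": request.get_json()})

def serve():
    # use_reloader=False is required when Flask runs off the main thread
    demo_app.run(host='127.0.0.1', port=5001, debug=False, use_reloader=False)

threading.Thread(target=serve, daemon=True).start()
time.sleep(1)  # give the server a moment to bind

print(requests.post("http://localhost:5001/echo", json={"hi": "there"}, timeout=5).json())
# {'you_sent': {'hi': 'there'}}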
+# Helper Functions
+def enhance_section_title(title):
+    st.markdown(f"<h2 style='border-bottom: 2px solid #ccc; padding-bottom: 5px;'>{title}</h2>", unsafe_allow_html=True)

+def update_cleaned_data(df):
+    st.session_state.cleaned_data = df
+    if 'data_versions' not in st.session_state:
+        st.session_state.data_versions = [st.session_state.raw_data.copy()]
+    st.session_state.data_versions.append(df.copy())
+    st.success("✅ Action completed successfully!")
+    st.rerun()
+
+def convert_csv_to_json_and_text(df):
+    """Convert DataFrame to JSON and then to plain text."""
+    json_data = df.to_json(orient="records")
+    data_dict = json.loads(json_data)
+    text_summary = f"Dataset Summary: {df.shape[0]} rows, {df.shape[1]} columns\n"
+    text_summary += f"Missing Values: {df.isna().sum().sum()}\n"
+    text_summary += "Columns:\n"
+    for col in df.columns:
+        text_summary += f"- {col} ({df[col].dtype}): "
+        if pd.api.types.is_numeric_dtype(df[col]):
+            text_summary += f"Mean={df[col].mean():.2f}, Min={df[col].min()}, Max={df[col].max()}"
+        else:
+            text_summary += f"Unique={df[col].nunique()}, Top={df[col].mode()[0] if not df[col].mode().empty else 'N/A'}"
+        text_summary += f", Missing={df[col].isna().sum()}\n"
+    return text_summary
+
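For a sense of what the chatbot receives as retrieval context, here is the same summary logic applied to a toy DataFrame (the data is invented; the format mirrors convert_csv_to_json_and_text above):

import pandas as pd

toy = pd.DataFrame({"age": [25, 32, None], "city": ["Oslo", "Oslo", "Bergen"]})

summary = f"Dataset Summary: {toy.shape[0]} rows, {toy.shape[1]} columns\n"
summary += f"Missing Values: {toy.isna().sum().sum()}\n"
summary += "Columns:\n"
for col in toy.columns:
    summary += f"- {col} ({toy[col].dtype}): "
    if pd.api.types.is_numeric_dtype(toy[col]):
        summary += f"Mean={toy[col].mean():.2f}, Min={toy[col].min()}, Max={toy[col].max()}"
    else:
        summary += f"Unique={toy[col].nunique()}, Top={toy[col].mode()[0]}"
    summary += f", Missing={toy[col].isna().sum()}\n"

print(summary)
# Dataset Summary: 3 rows, 2 columns
# Missing Values: 1
# Columns:
# - age (float64): Mean=28.50, Min=25.0, Max=32.0, Missing=1
# - city (object): Unique=2, Top=Oslo, Missing=0

Note that the JSON round-trip in the function (to_json followed by json.loads) produces data_dict, which is never used afterwards; only the plain-text summary reaches the model.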
+def get_chatbot_response(user_input, app_mode, dataset_text=""):
+    """Send request to internal Flask RAG endpoint."""
+    payload = {
+        "user_input": user_input,
+        "app_mode": app_mode,
+        "dataset_text": dataset_text
+    }
+    try:
+        response = requests.post(f"http://localhost:{FLASK_PORT}/rag_chat", json=payload, timeout=5)
+        response.raise_for_status()
+        return response.json().get("response", "Error: No response from server")
+    except requests.exceptions.RequestException as e:
+        return f"Error: Could not connect to RAG server. {str(e)}"
+
+# Streamlit App
 # Sidebar Navigation
 with st.sidebar:
+    st.title("🔮 Data-Vision Pro")
+    st.markdown("Your AI-powered data analysis suite with RAG.")
     st.markdown("---")
+    app_mode = st.selectbox(
+        "Navigation",
+        ["Data Upload", "Data Cleaning", "EDA"],
+        format_func=lambda x: f"📌 {x}"
+    )
+    if app_mode == "Data Upload":
+        st.info("⬆️ Upload your CSV or XLSX dataset to begin.")
+    elif app_mode == "Data Cleaning":
+        st.info("🧹 Clean and preprocess your data using various tools.")
+    elif app_mode == "EDA":
+        st.info("🔍 Explore your data visually and statistically.")

+    st.markdown("---")
+    st.markdown("**Note**: Requires dependencies in `requirements.txt`.")
+    if 'cleaned_data' in st.session_state:
+        csv = st.session_state.cleaned_data.to_csv(index=False)
+        st.download_button(
+            label="Download Cleaned Data as CSV",
+            data=csv,
+            file_name='cleaned_data.csv',
+            mime='text/csv',
+        )
+    st.markdown("Created by Calvin Allen-Crawford")
+    st.markdown("v1.0 | © 2025")
+
+# Main App Pages
 if app_mode == "Data Upload":
+    st.title("📤 Data Upload & Profiling")
+    st.header("Upload Your Dataset")
+    st.write("Supported formats: CSV, XLSX")
+
+    if 'raw_data' not in st.session_state:
+        st.info("It looks like no dataset has been uploaded yet. Would you like to upload a CSV or XLSX file?")
+
+    uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"], key="file_uploader")
     if uploaded_file:
+        st.session_state.pop('raw_data', None)
+        st.session_state.pop('cleaned_data', None)
+        st.session_state.pop('data_versions', None)
+        try:
+            if uploaded_file.name.endswith('.csv'):
+                df = pd.read_csv(uploaded_file)
+            else:
+                df = pd.read_excel(uploaded_file)
+            if df.empty:
+                st.error("Uploaded file is empty.")
+                st.stop()
+            st.session_state.raw_data = df
+            st.session_state.dataset_text = convert_csv_to_json_and_text(df)
+            if 'data_versions' not in st.session_state:
+                st.session_state.data_versions = [df.copy()]
+            col1, col2, col3 = st.columns(3)
+            with col1: st.metric("Rows", df.shape[0])
+            with col2: st.metric("Columns", df.shape[1])
+            with col3: st.metric("Missing Values", df.isna().sum().sum())
+            if st.checkbox("Show Data Preview"):
+                st.dataframe(df.head(10), use_container_width=True)
+            if st.button("Generate Full Profile Report"):
+                with st.spinner("Generating report..."):
+                    pr = ProfileReport(df, explorative=True)
+                    st_profile_report(pr)
+            st.success("✅ Data loaded successfully!")
+        except Exception as e:
+            st.error(f"An error occurred: {str(e)}")

 elif app_mode == "Data Cleaning":
     st.title("🧹 Smart Data Cleaning")
...
     if st.button("Undo Last Action"):
         st.session_state.data_versions.pop()
         st.session_state.cleaned_data = st.session_state.data_versions[-1].copy()
+        st.session_state.dataset_text = convert_csv_to_json_and_text(st.session_state.cleaned_data)
         st.rerun()

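The data_versions list acts as an undo stack: update_cleaned_data appends a copy after every successful action, and "Undo Last Action" pops back one version, now also regenerating dataset_text so the chatbot sees the reverted data. The pattern in isolation (toy data, outside Streamlit):

import pandas as pd

raw = pd.DataFrame({"x": [1.0, 2.0, None]})
data_versions = [raw.copy()]            # version 0: the raw upload

cleaned = raw.dropna()                  # some cleaning action
data_versions.append(cleaned.copy())    # record the new version

data_versions.pop()                     # "Undo Last Action"
current = data_versions[-1].copy()      # back to the raw upload
assert current.equals(raw)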
 elif app_mode == "EDA":
     st.title("🔍 Interactive Data Explorer")
     if 'cleaned_data' not in st.session_state:
...
         st.stop()
     df = st.session_state.cleaned_data.copy()

     enhance_section_title("Dataset Overview")
     with st.container():
         col1, col2, col3, col4 = st.columns(4)
         col1.metric("Total Rows", df.shape[0])
...
         col3.metric("Missing Values", f"{df.isna().sum().sum()} ({missing_percentage:.1f}%)")
         col4.metric("Duplicates", df.duplicated().sum())

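missing_percentage is defined on an unchanged line that this diff does not show; judging from its use in the metric above, it is presumably computed along these lines (an assumption, not the file's actual code):

# Hypothetical reconstruction: share of all cells that are missing
missing_percentage = (df.isna().sum().sum() / (df.shape[0] * df.shape[1])) * 100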
+    # Chatbot Section
+    st.markdown("---")
+    st.subheader("💬 AI Chatbot Assistant (RAG Enabled)")
+    st.info("Ask me about the app or your data! Try: 'What can I do here?' or 'What’s in the dataset?'")
+    if "chat_history" not in st.session_state:
+        st.session_state.chat_history = []
+
+    for message in st.session_state.chat_history:
+        with st.chat_message(message["role"]):
+            st.markdown(message["content"])
+
+    user_input = st.chat_input("Ask me anything about the app or your data...")
+    if user_input:
+        st.session_state.chat_history.append({"role": "user", "content": user_input})
+        with st.chat_message("user"):
+            st.markdown(user_input)
+
+        with st.spinner("Thinking with RAG..."):
+            dataset_text = st.session_state.get("dataset_text", "")
+            response = get_chatbot_response(user_input, app_mode, dataset_text)
+        st.session_state.chat_history.append({"role": "assistant", "content": response})
+        with st.chat_message("assistant"):
+            st.markdown(response)