Spaces:

CosmickVisions
/

Data-Vision

Running

App Files Community

CosmickVisions committed on Mar 19

Commit

42bab1b

verified ·

1 Parent(s): 07199b6

Update app.py

Browse files

Files changed (1) hide show

app.py +300 -576

app.py CHANGED Viewed

@@ -1,10 +1,8 @@
-import streamlit as st
 import pandas as pd
 import numpy as np
 import plotly.express as px
-import plotly.graph_objects as go
 from ydata_profiling import ProfileReport
-from streamlit_pandas_profiling import st_profile_report
 import os
 from dotenv import load_dotenv
 from groq import Groq
@@ -14,11 +12,9 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.embeddings import HuggingFaceEmbeddings
 import re
 from scipy import stats
-from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
 import tempfile
-# Set page config as the first Streamlit command
-st.set_page_config(page_title="Data-Vision Pro", layout="wide")
 # Load environment variables
 load_dotenv()
@@ -26,172 +22,16 @@ load_dotenv()
 # Initialize Groq client
 client = Groq(api_key=os.getenv("GROQ_API_KEY"))
-# Initialize HuggingFace embeddings for FAISS
 embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-# Custom CSS with Modernized Silver, Blue, and Gold Theme + Responsiveness
-st.markdown("""
-    <style>
-    :root {
-        --silver-light: #D8D8D8;
-        --silver-dark: #B8B8B8;
-        --blue: #5C89BC;
-        --blue-dark: #4E73A0;
-        --blue-light: #6EA8E0;
-        --gold: #A87E01;
-        --text-color: #333333;
-        --shadow-color: rgba(0,0,0,0.1);
-        --shadow-color-stronger: rgba(0,0,0,0.2);
-    }
-    .stApp {
-        background: linear-gradient(135deg, var(--silver-light) 0%, var(--silver-dark) 100%);
-        font-family: 'Inter', sans-serif;
-        max-width: 900px;
-        margin: 0 auto;
-        padding: 10px;
-        transition: all 0.3s ease;
-    }
-    .header {
-        background: linear-gradient(90deg, var(--blue) 80%, var(--blue-dark) 100%);
-        color: white;
-        padding: 20px;
-        border-radius: 16px 16px 0 0;
-        box-shadow: 0 4px 12px var(--shadow-color);
-        text-align: center;
-        transition: transform 0.2s ease;
-    }
-    .header:hover {
-        transform: translateY(-2px);
-        box-shadow: 0 4px 12px var(--shadow-color-stronger);
-    }
-    .header-title {
-        font-size: 1.5rem;
-        font-weight: 700;
-        margin: 0;
-    }
-    .header-subtitle {
-        font-size: 0.9rem;
-        margin-top: 8px;
-        opacity: 0.9;
-    }
-    .sidebar .sidebar-content {
-        background-color: white;
-        border-radius: 16px;
-        box-shadow: 0 6px 16px var(--shadow-color);
-        padding: 20px;
-        transition: box-shadow 0.3s ease;
-    }
-    .sidebar .sidebar-content:hover {
-        box-shadow: 0 8px 20px var(--shadow-color-stronger);
-    }
-    .chat-container {
-        background-color: white;
-        border-radius: 16px;
-        box-shadow: 0 6px 16px var(--shadow-color);
-        padding: 20px;
-        margin-top: 25px;
-        transition: box-shadow 0.3s ease;
-    }
-    .chat-container:hover {
-        box-shadow: 0 8px 20px var(--shadow-color-stronger);
-    }
-    .user-message {
-        background: linear-gradient(45deg, var(--blue), var(--blue-light));
-        color: white;
-        border-radius: 20px 20px 6px 20px;
-        padding: 14px 18px;
-        margin-left: auto;
-        max-width: 80%;
-        margin-bottom: 12px;
-        box-shadow: 0 2px 8px var(--blue-dark);
-        transition: transform 0.2s ease;
-    }
-    .user-message:hover {
-        transform: scale(1.02);
-    }
-    .bot-message {
-        background-color: #F0F0F0;
-        color: var(--text-color);
-        border-radius: 20px 20px 20px 6px;
-        padding: 14px 18px;
-        margin-right: auto;
-        max-width: 80%;
-        margin-bottom: 12px;
-        box-shadow: 0 2px 8px var(--shadow-color);
-        transition: transform 0.2s ease;
-    }
-    .bot-message:hover {
-        transform: scale(1.02);
-    }
-    .footer {
-        text-align: center;
-        margin-top: 20px;
-        color: var(--text-color);
-        font-size: 0.8rem;
-    }
-    .tech-badge {
-        display: inline-block;
-        background-color: #E6ECEF;
-        color: var(--blue);
-        padding: 4px 8px;
-        border-radius: 12px;
-        font-size: 0.7rem;
-        margin: 0 4px;
-    }
-    h2 {
-        color: var(--blue);
-        border-bottom: 2px solid var(--gold);
-        padding-bottom: 5px;
-        font-size: 1.5rem;
-        font-weight: 700;
-    }
-    .stButton > button {
-        background-color: var(--gold);
-        color: white;
-        border-radius: 12px;
-        padding: 10px 20px;
-        border: none;
-        box-shadow: 0 4px 12px var(--shadow-color);
-        font-weight: 600;
-        transition: all 0.3s ease;
-    }
-    .stButton > button:hover {
-        background-color: #8C6B01;
-        transform: translateY(-2px);
-        box-shadow: 0 6px 16px var(--shadow-color-stronger);
-    }
-    @media (max-width: 768px) {
-        .header-title {
-            font-size: 1.2rem;
-        }
-        .header-subtitle {
-            font-size: 0.8rem;
-        }
-        .chat-container, .sidebar .sidebar-content {
-            padding: 10px;
-        }
-        .stApp {
-            padding: 5px;
-        }
-        h2 {
-            font-size: 1.2rem;
-        }
-    }
-    </style>
-""", unsafe_allow_html=True)
-# Helper Functions
-def enhance_section_title(title):
-    st.markdown(f"<h2 style='border-bottom: 2px solid var(--gold); padding-bottom: 5px; color: var(--blue);'>{title}</h2>", unsafe_allow_html=True)
 def update_cleaned_data(df):
-    st.session_state.cleaned_data = df
-    if 'data_versions' not in st.session_state:
-        st.session_state.data_versions = [st.session_state.raw_data.copy()]
-    st.session_state.data_versions.append(df.copy())
-    st.session_state.dataset_text = convert_df_to_text(df)
-    st.success("✅ Action completed successfully!")
-    st.rerun()
 def convert_df_to_text(df):
     text = f"Dataset Summary: {df.shape[0]} rows, {df.shape[1]} columns\n"
@@ -238,61 +78,33 @@ def extract_plot_data(plot_info, df):
     x_col = plot_info["x"]
     y_col = plot_info["y"] if "y" in plot_info else None
     data = pd.read_json(plot_info["data"])
-    plot_text = f"Plot Type: {plot_type}\n"
-    plot_text += f"X-Axis: {x_col}\n"
     if y_col:
         plot_text += f"Y-Axis: {y_col}\n"
     if plot_type == "Scatter Plot" and y_col:
         correlation = data[x_col].corr(data[y_col])
         slope, intercept, r_value, p_value, std_err = stats.linregress(data[x_col].dropna(), data[y_col].dropna())
-        plot_text += f"Correlation: {correlation:.2f}\n"
-        plot_text += f"Linear Regression: Slope={slope:.2f}, Intercept={intercept:.2f}, R²={r_value**2:.2f}, p-value={p_value:.4f}\n"
-        plot_text += f"X Stats: Mean={data[x_col].mean():.2f}, Std={data[x_col].std():.2f}, Min={data[x_col].min():.2f}, Max={data[x_col].max():.2f}\n"
-        plot_text += f"Y Stats: Mean={data[y_col].mean():.2f}, Std={data[y_col].std():.2f}, Min={data[y_col].min():.2f}, Max={data[y_col].max():.2f}\n"
-    elif plot_type == "Histogram":
-        plot_text += f"Stats: Mean={data[x_col].mean():.2f}, Median={data[x_col].median():.2f}, Std={data[x_col].std():.2f}\n"
-        plot_text += f"Skewness: {data[x_col].skew():.2f}\n"
-        plot_text += f"Range: [{data[x_col].min():.2f}, {data[x_col].max():.2f}]\n"
-    elif plot_type == "Box Plot" and y_col:
-        q1, q3 = data[y_col].quantile(0.25), data[y_col].quantile(0.75)
-        iqr = q3 - q1
-        plot_text += f"Y Stats: Median={data[y_col].median():.2f}, Q1={q1:.2f}, Q3={q3:.2f}, IQR={iqr:.2f}\n"
-        plot_text += f"Outliers: {len(data[y_col][(data[y_col] < q1 - 1.5 * iqr) | (data[y_col] > q3 + 1.5 * iqr)])} potential outliers\n"
-    elif plot_type == "Line Chart" and y_col:
-        plot_text += f"Y Stats: Mean={data[y_col].mean():.2f}, Std={data[y_col].std():.2f}, Trend={'increasing' if data[y_col].iloc[-1] > data[y_col].iloc[0] else 'decreasing'}\n"
-    elif plot_type == "Bar Chart":
-        plot_text += f"Counts: {data[x_col].value_counts().to_dict()}\n"
-    elif plot_type == "Correlation Matrix":
-        corr = data.corr()
-        plot_text += "Correlation Matrix:\n"
-        for col1 in corr.columns:
-            for col2 in corr.index:
-                if col1 < col2:
-                    plot_text += f"{col1} vs {col2}: {corr.loc[col2, col1]:.2f}\n"
     return plot_text
 def get_chatbot_response(user_input, app_mode, vector_store=None, model="llama3-70b-8192"):
     system_prompt = (
-        "You are an AI assistant in Data-Vision Pro, a data analysis app with RAG capabilities. "
-        f"The user is on the '{app_mode}' page:\n"
-        "- **Data Upload**: Upload CSV/XLSX files, view stats, or generate reports.\n"
-        "- **Data Cleaning**: Clean data (e.g., handle missing values, encode variables).\n"
-        "- **EDA**: Visualize data (e.g., scatter plots, histograms) and analyze plots.\n"
-        "When analyzing plots, provide detailed insights based on numerical data extracted from them."
     )
     context = ""
     if vector_store:
         docs = vector_store.similarity_search(user_input, k=3)
         if docs:
-            context = "\n\nDataset and Plot Context:\n" + "\n".join([f"- {doc.page_content}" for doc in docs])
-            system_prompt += f"Use this dataset and plot context to augment your response:\n{context}"
-    else:
-        system_prompt += "No dataset or plot data is loaded. Assist based on app functionality."
     try:
         response = client.chat.completions.create(
             model=model,
             messages=[
-                {"role": "system", "content": system_prompt},
                 {"role": "user", "content": user_input}
             ],
             temperature=0.7,
@@ -302,379 +114,291 @@ def get_chatbot_response(user_input, app_mode, vector_store=None, model="llama3-
     except Exception as e:
         return f"Error: {str(e)}"
-# Command Functions
-def drop_columns(columns):
-    if 'cleaned_data' in st.session_state:
-        df = st.session_state.cleaned_data.copy()
-        columns_to_drop = [col.strip() for col in columns.split(',')]
-        valid_columns = [col for col in columns_to_drop if col in df.columns]
-        if valid_columns:
-            df.drop(valid_columns, axis=1, inplace=True)
-            update_cleaned_data(df)
-            return f"Dropped columns: {', '.join(valid_columns)}"
-        else:
-            return "No valid columns found to drop."
-    return "No dataset loaded."
-def generate_scatter_plot(params):
-    df = st.session_state.cleaned_data
-    match = re.search(r"([\w\s]+)\s+vs\s+([\w\s]+)", params)
-    if match and len(match.groups()) >= 2:
-        x_axis, y_axis = match.group(1).strip(), match.group(2).strip()
-        if x_axis in df.columns and y_axis in df.columns:
-            fig = px.scatter(df, x=x_axis, y=y_axis, title=f'Scatter Plot of {x_axis} vs {y_axis}')
-            st.plotly_chart(fig)
-            st.session_state.last_plot = {"type": "Scatter Plot", "x": x_axis, "y": y_axis, "data": df[[x_axis, y_axis]].to_json()}
-            return f"Generated scatter plot of {x_axis} vs {y_axis}"
-    return "Invalid columns for scatter plot."
-def generate_histogram(params):
-    df = st.session_state.cleaned_data
-    x_axis = params.strip()
-    if x_axis in df.columns:
-        fig = px.histogram(df, x=x_axis, title=f'Histogram of {x_axis}')
-        st.plotly_chart(fig)
-        st.session_state.last_plot = {"type": "Histogram", "x": x_axis, "data": df[[x_axis]].to_json()}
-        return f"Generated histogram of {x_axis}"
-    return "Invalid column for histogram."
-def analyze_plot():
-    if "last_plot" not in st.session_state:
-        return "No plot available to analyze."
-    plot_info = st.session_state.last_plot
-    df = pd.read_json(plot_info["data"])
-    plot_text = extract_plot_data(plot_info, df)
-    return f"Analysis of the last plot:\n{plot_text}"
-def parse_command(command):
     command = command.lower().strip()
-    if "drop columns" in command or "drop column" in command:
-        columns = command.replace("drop columns", "").replace("drop column", "").strip()
-        return drop_columns, columns
-    elif "show a scatter plot" in command or "scatter plot of" in command:
-        params = command.replace("show a scatter plot of", "").replace("scatter plot of", "").strip()
-        return generate_scatter_plot, params
-    elif "show a histogram" in command or "histogram of" in command:
-        params = command.replace("show a histogram of", "").replace("histogram of", "").strip()
-        return generate_histogram, params
-    elif "analyze plot" in command:
-        return lambda x: analyze_plot(), None
-    return None, command
-# Dataset Preview Function
-def display_dataset_preview():
-    if 'cleaned_data' in st.session_state:
-        st.subheader("Current Dataset Preview")
-        st.dataframe(st.session_state.cleaned_data.head(10), use_container_width=True)
-        st.markdown("---")
-# Main App
-def main():
-    # Header
-    st.markdown("""
-        <div class="header">
-            <h1 class="header-title">Data-Vision Pro</h1>
-            <div class="header-subtitle">Advanced Data Analysis with Groq Inference</div>
-        </div>
-    """, unsafe_allow_html=True)
-    # Sidebar Navigation
-    with st.sidebar:
-        st.markdown("### 🔮 Data-Vision Pro")
-        st.markdown("Your AI-powered data analysis suite with RAG.")
-        st.markdown("---")
-        app_mode = st.selectbox(
-            "Navigation",
-            ["Data Upload", "Data Cleaning", "EDA"],
-            format_func=lambda x: f"📌 {x}"
-        )
-        model = st.selectbox(
-            "Select Groq Model",
-            ["llama3-70b-8192", "llama3-8b-8192", "mixtral-8x7b-32768", "gemma-7b-it"],
-            index=0
-        )
-        if app_mode == "Data Upload":
-            st.info("⬆️ Upload your CSV or XLSX dataset to begin.")
-        elif app_mode == "Data Cleaning":
-            st.info("🧹 Clean and preprocess your data.")
-        elif app_mode == "EDA":
-            st.info("🔍 Explore your data visually.")
-        if 'cleaned_data' in st.session_state:
-            csv = st.session_state.cleaned_data.to_csv(index=False)
-            st.download_button(
-                label="Download Cleaned Data",
-                data=csv,
-                file_name='cleaned_data.csv',
-                mime='text/csv',
-            )
-        st.markdown("---")
-        st.markdown("Built with <span class='tech-badge'>Streamlit</span> + <span class='tech-badge'>Groq</span>", unsafe_allow_html=True)
-    # Initialize Session State
-    if 'vector_store' not in st.session_state:
-        st.session_state.vector_store = None
-    if 'chat_history' not in st.session_state:
-        st.session_state.chat_history = []
-    # Display Dataset Preview
-    display_dataset_preview()
-    # App Pages
-    if app_mode == "Data Upload":
-        st.header("📤 Data Upload & Profiling")
-        uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"], key="file_uploader")
-        if uploaded_file:
-            st.session_state.pop('raw_data', None)
-            st.session_state.pop('cleaned_data', None)
-            st.session_state.pop('data_versions', None)
-            try:
-                if uploaded_file.name.endswith('.csv'):
-                    df = pd.read_csv(uploaded_file)
-                else:
-                    df = pd.read_excel(uploaded_file)
-                if df.empty:
-                    st.error("Uploaded file is empty.")
-                    st.stop()
-                st.session_state.raw_data = df
-                st.session_state.cleaned_data = df.copy()
-                st.session_state.dataset_text = convert_df_to_text(df)
-                st.session_state.vector_store = create_vector_store(st.session_state.dataset_text)
-                if 'data_versions' not in st.session_state:
-                    st.session_state.data_versions = [df.copy()]
-                col1, col2, col3 = st.columns(3)
-                with col1: st.metric("Rows", df.shape[0])
-                with col2: st.metric("Columns", df.shape[1])
-                with col3: st.metric("Missing Values", df.isna().sum().sum())
-                if st.checkbox("Show Data Preview"):
-                    st.dataframe(df.head(10), use_container_width=True)
-                if st.button("Generate Full Profile Report"):
-                    with st.spinner("Generating report..."):
-                        pr = ProfileReport(df, explorative=True)
-                        st_profile_report(pr)
-                st.success("✅ Data loaded successfully!")
-            except Exception as e:
-                st.error(f"An error occurred: {str(e)}")
-    elif app_mode == "Data Cleaning":
-        st.header("🧹 Smart Data Cleaning")
-        if 'raw_data' not in st.session_state:
-            st.warning("Please upload data first in the Data Upload section.")
-            st.stop()
-        if 'cleaned_data' in st.session_state:
-            df = st.session_state.cleaned_data.copy()
         else:
-            st.session_state.cleaned_data = st.session_state.raw_data.copy()
-            df = st.session_state.cleaned_data.copy()
-        enhance_section_title("📊 Data Health Dashboard")
-        with st.expander("Explore Data Health Metrics", expanded=True):
-            col1, col2, col3 = st.columns(3)
-            with col1: st.metric("Columns", len(df.columns))
-            with col2: st.metric("Rows", len(df))
-            with col3: st.metric("Missing Values", df.isna().sum().sum())
-            if st.button("Generate Detailed Health Report"):
-                with st.spinner("Generating report..."):
-                    profile = ProfileReport(df, minimal=True)
-                    st_profile_report(profile)
-            if 'data_versions' in st.session_state and len(st.session_state.data_versions) > 1:
-                if st.button("Undo Last Action"):
-                    st.session_state.data_versions.pop()
-                    st.session_state.cleaned_data = st.session_state.data_versions[-1].copy()
-                    st.session_state.dataset_text = convert_df_to_text(st.session_state.cleaned_data)
-                    st.session_state.vector_store = create_vector_store(st.session_state.dataset_text)
-                    st.rerun()
-        with st.expander("🛠️ Data Cleaning Operations", expanded=True):
-            enhance_section_title("🔍 Missing Values Treatment")
-            missing_cols = df.columns[df.isna().any()].tolist()
-            if missing_cols:
-                cols = st.multiselect("Select columns with missing values", missing_cols)
-                method = st.selectbox("Choose imputation method", [
-                    "Drop Missing Values", "Fill with Mean/Median", "Fill with Custom Value", "Forward Fill", "Backward Fill"
-                ])
-                if method == "Fill with Custom Value":
-                    custom_val = st.text_input("Enter custom value:")
-                if st.button("Apply Missing Value Treatment"):
-                    new_df = df.copy()
-                    if method == "Drop Missing Values":
-                        new_df = new_df.dropna(subset=cols)
-                    elif method == "Fill with Mean/Median":
-                        for col in cols:
-                            if pd.api.types.is_numeric_dtype(new_df[col]):
-                                new_df[col] = new_df[col].fillna(new_df[col].median())
-                            else:
-                                new_df[col] = new_df[col].fillna(new_df[col].mode()[0])
-                    elif method == "Fill with Custom Value" and custom_val:
-                        new_df[cols] = new_df[cols].fillna(custom_val)
-                    elif method == "Forward Fill":
-                        new_df[cols] = new_df[cols].ffill()
-                    elif method == "Backward Fill":
-                        new_df[cols] = new_df[cols].bfill()
-                    update_cleaned_data(new_df)
-            else:
-                st.success("✨ No missing values detected!")
-            enhance_section_title("🔄 Data Type Conversion")
-            col_to_convert = st.selectbox("Select column to convert", df.columns)
-            new_type = st.selectbox("Select new data type", ["String", "Integer", "Float", "Boolean", "Datetime"])
-            if new_type == "Datetime":
-                date_format = st.text_input("Enter date format (e.g., %Y-%m-%d):", "%Y-%m-%d")
-            if st.button("Convert Data Type"):
-                new_df = df.copy()
-                if new_type == "String":
-                    new_df[col_to_convert] = new_df[col_to_convert].astype(str)
-                elif new_type == "Integer":
-                    new_df[col_to_convert] = pd.to_numeric(new_df[col_to_convert], errors='coerce').astype('Int64')
-                elif new_type == "Float":
-                    new_df[col_to_convert] = pd.to_numeric(new_df[col_to_convert], errors='coerce')
-                elif new_type == "Boolean":
-                    new_df[col_to_convert] = new_df[col_to_convert].astype(bool)
-                elif new_type == "Datetime":
-                    new_df[col_to_convert] = pd.to_datetime(new_df[col_to_convert], format=date_format, errors='coerce')
-                update_cleaned_data(new_df)
-            enhance_section_title("🗑️ Drop Columns")
-            columns_to_drop = st.multiselect("Select columns to remove", df.columns)
-            if columns_to_drop and st.button("Confirm Column Removal"):
-                new_df = df.copy()
-                new_df = new_df.drop(columns=columns_to_drop)
-                update_cleaned_data(new_df)
-            enhance_section_title("🔢 Encoding Options")
-            encoding_method = st.radio("Choose encoding method", ("Label Encoding", "One-Hot Encoding"))
-            data_to_encode = st.multiselect("Select columns to encode", df.select_dtypes(include='object').columns)
-            if data_to_encode and st.button("Apply Encoding"):
-                new_df = df.copy()
-                if encoding_method == "Label Encoding":
-                    for col in data_to_encode:
-                        le = LabelEncoder()
-                        new_df[col] = le.fit_transform(new_df[col].astype(str))
-                elif encoding_method == "One-Hot Encoding":
-                    new_df = pd.get_dummies(new_df, columns=data_to_encode, drop_first=True, dtype=int)
-                update_cleaned_data(new_df)
-            enhance_section_title("📏 StandardScaler")
-            scale_cols = st.multiselect("Select numerical columns to scale", df.select_dtypes(include=np.number).columns)
-            if scale_cols and st.button("Apply StandardScaler"):
-                new_df = df.copy()
-                scaler = StandardScaler()
-                new_df[scale_cols] = scaler.fit_transform(new_df[scale_cols])
-                update_cleaned_data(new_df)
-    elif app_mode == "EDA":
-        st.header("🔍 Interactive Data Explorer")
-        if 'cleaned_data' not in st.session_state:
-            st.warning("Please upload and clean data first.")
-            st.stop()
-        df = st.session_state.cleaned_data.copy()
-        enhance_section_title("Dataset Overview")
-        with st.container():
-            col1, col2, col3, col4 = st.columns(4)
-            col1.metric("Total Rows", df.shape[0])
-            col2.metric("Total Columns", df.shape[1])
-            missing_percentage = df.isna().sum().sum() / df.size * 100
-            col3.metric("Missing Values", f"{df.isna().sum().sum()} ({missing_percentage:.1f}%)")
-            col4.metric("Duplicates", df.duplicated().sum())
-        tab1, tab2, tab3 = st.tabs(["Quick Preview", "Column Types", "Missing Matrix"])
-        with tab1:
-            st.write("First few rows of the dataset:")
-            st.dataframe(df.head(), use_container_width=True)
-        with tab2:
-            st.write("Column Data Types:")
-            type_counts = df.dtypes.value_counts().reset_index()
-            type_counts.columns = ['Type', 'Count']
-            st.dataframe(type_counts, use_container_width=True)
-        with tab3:
-            st.write("Missing Values Matrix:")
-            fig_missing = px.imshow(df.isna(), color_continuous_scale=['#e0e0e0', '#66c2a5'])
-            fig_missing.update_layout(coloraxis_colorscale=[[0, 'lightgrey'], [1, '#FF4B4B']])
-            st.plotly_chart(fig_missing, use_container_width=True)
-        enhance_section_title("Interactive Visualization Builder")
-        with st.container():
-            col1, col2 = st.columns([1, 3])
-            with col1:
-                plot_type = st.selectbox("Choose visualization type", [
-                    "Scatter Plot", "Histogram", "Box Plot", "Line Chart", "Bar Chart", "Correlation Matrix"
-                ])
-                x_axis = st.selectbox("X-axis", df.columns) if plot_type != "Correlation Matrix" else None
-                y_axis = st.selectbox("Y-axis", df.columns) if plot_type in ["Scatter Plot", "Box Plot", "Line Chart"] else None
-                color_by = st.selectbox("Color encoding", ["None"] + df.columns.tolist(), format_func=lambda x: "No color" if x == "None" else x) if plot_type != "Correlation Matrix" else None
-            with col2:
-                try:
-                    fig = None
-                    if plot_type == "Scatter Plot" and x_axis and y_axis:
-                        fig = px.scatter(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None, title=f'Scatter Plot of {x_axis} vs {y_axis}')
-                    elif plot_type == "Histogram" and x_axis:
-                        fig = px.histogram(df, x=x_axis, color=color_by if color_by != "None" else None, nbins=30, title=f'Histogram of {x_axis}')
-                    elif plot_type == "Box Plot" and x_axis and y_axis:
-                        fig = px.box(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None, title=f'Box Plot of {x_axis} vs {y_axis}')
-                    elif plot_type == "Line Chart" and x_axis and y_axis:
-                        fig = px.line(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None, title=f'Line Chart of {x_axis} vs {y_axis}')
-                    elif plot_type == "Bar Chart" and x_axis:
-                        fig = px.bar(df, x=x_axis, color=color_by if color_by != "None" else None, title=f'Bar Chart of {x_axis}')
-                    elif plot_type == "Correlation Matrix":
-                        numeric_df = df.select_dtypes(include=np.number)
-                        if len(numeric_df.columns) > 1:
-                            corr = numeric_df.corr()
-                            fig = px.imshow(corr, text_auto=True, color_continuous_scale='RdBu_r', zmin=-1, zmax=1, title='Correlation Matrix')
-                    if fig:
-                        fig.update_layout(template="plotly_white")
-                        st.plotly_chart(fig, use_container_width=True)
-                        st.session_state.last_plot = {
-                            "type": plot_type,
-                            "x": x_axis,
-                            "y": y_axis,
-                            "data": df[[x_axis, y_axis]].to_json() if y_axis else df[[x_axis]].to_json()
-                        }
-                        plot_text = extract_plot_data(st.session_state.last_plot, df)
-                        st.session_state.vector_store = update_vector_store_with_plot(plot_text, st.session_state.vector_store)
-                        with st.expander("Extracted Plot Data"):
-                            st.text(plot_text)
-                    else:
-                        st.error("Please provide required inputs for the selected plot type.")
-                except Exception as e:
-                    st.error(f"Couldn't create visualization: {str(e)}")
-    # Chatbot Section
-    st.markdown("---")
-    st.markdown('<div class="chat-container">', unsafe_allow_html=True)
-    st.subheader("💬 AI Chatbot Assistant (RAG Enabled)")
-    st.info("Ask about your data or app features! Try: 'drop columns X, Y', 'scatter plot of X vs Y', 'analyze plot'")
-    for message in st.session_state.chat_history:
-        with st.chat_message(message["role"]):
-            st.markdown(f'<div class="{message["role"]}-message">{message["content"]}</div>', unsafe_allow_html=True)
-    user_input = st.chat_input("Ask me anything...")
-    if user_input:
-        st.session_state.chat_history.append({"role": "user", "content": user_input})
-        with st.chat_message("user"):
-            st.markdown(f'<div class="user-message">{user_input}</div>', unsafe_allow_html=True)
-        with st.spinner("Processing..."):
-            func, param = parse_command(user_input)
-            if func:
-                response = func(param) if param else func(None)
-            else:
-                response = get_chatbot_response(user_input, app_mode, st.session_state.vector_store, model)
-            st.session_state.chat_history.append({"role": "assistant", "content": response})
-        with st.chat_message("assistant"):
-            st.markdown(f'<div class="bot-message">{response}</div>', unsafe_allow_html=True)
-    st.markdown('</div>', unsafe_allow_html=True)
-    # Footer
-    st.markdown("""
-        <div class="footer">
-            <div>Built with <span class="tech-badge">Streamlit</span> + <span class="tech-badge">Groq</span> + <span class="tech-badge">LangChain</span> + <span class="tech-badge">FAISS</span></div>
-            <div style="margin-top: 8px;">Fast inference for data insights</div>
-        </div>
-    """, unsafe_allow_html=True)
-if __name__ == "__main__":
-    main()

+import gradio as gr
 import pandas as pd
 import numpy as np
 import plotly.express as px
 from ydata_profiling import ProfileReport
 import os
 from dotenv import load_dotenv
 from groq import Groq
 from langchain.embeddings import HuggingFaceEmbeddings
 import re
 from scipy import stats
+from sklearn.preprocessing import StandardScaler, LabelEncoder
 import tempfile
+import json
 # Load environment variables
 load_dotenv()
 # Initialize Groq client
 client = Groq(api_key=os.getenv("GROQ_API_KEY"))
+# Initialize HuggingFace embeddings
 embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+# Helper Functions (unchanged from your original)
 def update_cleaned_data(df):
+    gr.State(value=df)
+    if 'data_versions' not in gr.State():
+        gr.State(value=[gr.State(value=df.copy())])
+    gr.State(value=gr.State(value=gr.State(value=df.copy())))
+    return df, "✅ Action completed successfully!"
 def convert_df_to_text(df):
     text = f"Dataset Summary: {df.shape[0]} rows, {df.shape[1]} columns\n"
     x_col = plot_info["x"]
     y_col = plot_info["y"] if "y" in plot_info else None
     data = pd.read_json(plot_info["data"])
+    plot_text = f"Plot Type: {plot_type}\nX-Axis: {x_col}\n"
     if y_col:
         plot_text += f"Y-Axis: {y_col}\n"
     if plot_type == "Scatter Plot" and y_col:
         correlation = data[x_col].corr(data[y_col])
         slope, intercept, r_value, p_value, std_err = stats.linregress(data[x_col].dropna(), data[y_col].dropna())
+        plot_text += f"Correlation: {correlation:.2f}\nLinear Regression: Slope={slope:.2f}, Intercept={intercept:.2f}, R²={r_value**2:.2f}, p-value={p_value:.4f}\n"
     return plot_text
 def get_chatbot_response(user_input, app_mode, vector_store=None, model="llama3-70b-8192"):
+    """Ask the Groq chat model a question, optionally grounded in vector-store context.
+
+    Args:
+        user_input: The user's message; also used as the similarity-search query.
+        app_mode: Page name interpolated into the system prompt.
+        vector_store: Optional store; when truthy, its ``similarity_search``
+            is queried for the 3 closest documents and their ``page_content``
+            is appended to the system prompt under a "Context:" heading.
+        model: Model identifier forwarded to ``client.chat.completions.create``.
+
+    Returns:
+        ``"Error: <message>"`` if an exception is raised inside the ``try``
+        block. NOTE(review): the success-path return is not visible in this
+        view of the function; confirm what is returned.
+    """
     system_prompt = (
+        f"You are an AI assistant in Data-Vision Pro, on the '{app_mode}' page:\n"
+        "- Data Upload: Upload CSV/XLSX files, view stats, or generate reports.\n"
+        "- Data Cleaning: Clean data (e.g., handle missing values, encode variables).\n"
+        "- EDA: Visualize data (e.g., scatter plots, histograms) and analyze plots.\n"
+        "Use context if provided."
     )
     context = ""
+    # The similarity search runs outside the try block below, so its errors propagate to the caller.
     if vector_store:
         docs = vector_store.similarity_search(user_input, k=3)
         if docs:
+            context = "\n\nContext:\n" + "\n".join([f"- {doc.page_content}" for doc in docs])
     try:
         response = client.chat.completions.create(
             model=model,
             messages=[
+                # Retrieved context is appended to the system message rather than sent as its own message.
+                {"role": "system", "content": system_prompt + context},
                 {"role": "user", "content": user_input}
             ],
             temperature=0.7,
     except Exception as e:
         return f"Error: {str(e)}"
+def parse_command(command, df, vector_store, last_plot=None):
+    """Interpret a chat message as a data command and run it against ``df``.
+
+    Recognised commands (matched case-insensitively, column names included):
+
+    * ``drop columns a, b`` - drop the named columns.
+    * ``scatter plot of x vs y`` - build a scatter plot.
+    * ``histogram of x`` - build a histogram.
+    * ``analyze plot`` - summarise the plot described by ``last_plot``.
+
+    Args:
+        command: Raw chat text.
+        df: DataFrame the command operates on.
+        vector_store: Optional store that receives the plot summary produced
+            by ``analyze plot``.
+        last_plot: Plot-info dict of the most recent plot (the shape built by
+            the plotting branches below), or ``None`` when there is none.
+            Session state cannot be read from inside this function, so the
+            caller has to pass it in.
+
+    Returns:
+        Always a 3-tuple ``(df, figure, info)`` so callers can unpack it
+        uniformly. ``figure`` is a Plotly figure or ``None``. ``info`` is the
+        plot-info dict when a figure was built, a message string when a
+        command was handled without a figure, and ``None`` when the text is
+        not a recognised command.
+    """
+    lowered = command.lower().strip()
+    # Commands are matched in lower case, so resolve column names through a
+    # lower-case lookup instead of comparing against the original labels.
+    lookup = {str(col).lower(): col for col in df.columns}
+    if "drop columns" in lowered:
+        requested = lowered.split("drop columns", 1)[1].split(",")
+        names = [part.strip() for part in requested]
+        # dict.fromkeys de-duplicates while keeping the order the user typed.
+        valid_cols = list(dict.fromkeys(lookup[name] for name in names if name in lookup))
+        if valid_cols:
+            df = df.drop(columns=valid_cols)
+            return df, None, f"Dropped columns: {', '.join(str(col) for col in valid_cols)}"
+        return df, None, "No valid columns to drop."
+    if "scatter plot of" in lowered:
+        # Anchor on the command phrase; otherwise the first group also
+        # swallows the words "scatter plot of" and never matches a column.
+        match = re.search(r"scatter plot of\s+(.+?)\s+vs\s+(.+)", lowered)
+        if match:
+            x = lookup.get(match.group(1).strip())
+            y = lookup.get(match.group(2).strip())
+            if x is not None and y is not None:
+                fig = px.scatter(df, x=x, y=y)
+                plot_info = {"type": "Scatter Plot", "x": x, "y": y, "data": df[[x, y]].to_json()}
+                return df, fig, plot_info
+        return df, None, "Invalid scatter plot command."
+    if "histogram of" in lowered:
+        col = lookup.get(lowered.split("histogram of", 1)[1].strip())
+        if col is not None:
+            fig = px.histogram(df, x=col)
+            plot_info = {"type": "Histogram", "x": col, "data": df[[col]].to_json()}
+            return df, fig, plot_info
+        return df, None, "Invalid histogram command."
+    if "analyze plot" in lowered:
+        if not last_plot:
+            return df, None, "No plot to analyze yet. Create a plot first."
+        plot_text = extract_plot_data(last_plot, df)
+        if vector_store:
+            # NOTE(review): the returned store cannot be handed back through the
+            # 3-tuple; presumably the helper also updates the store in place - verify.
+            update_vector_store_with_plot(plot_text, vector_store)
+        return df, None, plot_text
+    return df, None, None
+# Custom HTML/CSS/JS for the header, tab bar and chat panel.
+# NOTE(review): confirm that Gradio executes <script> tags delivered through
+# gr.HTML; if it does not, this script has to be loaded through the Blocks
+# js/head options instead.
+custom_html = """
+<style>
+  :root {
+    --silver-light: #D8D8D8;
+    --silver-dark: #B8B8B8;
+    --blue: #5C89BC;
+    --blue-dark: #4E73A0;
+    --blue-light: #6EA8E0;
+    --gold: #A87E01;
+    --shadow-color: rgba(0,0,0,0.1);
+  }
+  .header {
+    background: linear-gradient(90deg, var(--blue) 80%, var(--blue-dark) 100%);
+    color: white;
+    padding: 20px;
+    border-radius: 16px 16px 0 0;
+    text-align: center;
+    box-shadow: 0 4px 12px var(--shadow-color);
+  }
+  .nav-tabs {
+    display: flex;
+    justify-content: space-around;
+    padding: 10px 0;
+    background: var(--silver-light);
+    border-bottom: 2px solid var(--gold);
+  }
+  .nav-tab {
+    padding: 10px 20px;
+    cursor: pointer;
+    color: var(--blue);
+    font-weight: 600;
+    transition: all 0.3s ease;
+  }
+  .nav-tab.active {
+    color: var(--gold);
+    border-bottom: 2px solid var(--gold);
+    background: white;
+    border-radius: 8px 8px 0 0;
+  }
+  .tab-content { display: none; padding: 20px; }
+  .tab-content.active { display: block; }
+  .chat-container {
+    background: white;
+    border-radius: 16px;
+    padding: 20px;
+    box-shadow: 0 6px 16px var(--shadow-color);
+    margin-top: 20px;
+  }
+  .message {
+    padding: 10px 15px;
+    margin: 5px 0;
+    border-radius: 12px;
+    max-width: 80%;
+  }
+  .user-message {
+    background: linear-gradient(45deg, var(--blue), var(--blue-light));
+    color: white;
+    margin-left: auto;
+  }
+  .bot-message {
+    background: #F0F0F0;
+    margin-right: auto;
+  }
+  .metrics {
+    display: flex;
+    gap: 20px;
+    margin: 10px 0;
+  }
+  .metric {
+    background: #F0F0F0;
+    padding: 10px;
+    border-radius: 8px;
+  }
+</style>
+<div class="header">
+  <h1>Data-Vision Pro</h1>
+  <div>Advanced Data Analysis with Groq</div>
+</div>
+<div class="nav-tabs">
+  <div class="nav-tab active" data-tab="upload">Data Upload</div>
+  <div class="nav-tab" data-tab="cleaning">Data Cleaning</div>
+  <div class="nav-tab" data-tab="eda">EDA</div>
+</div>
+<div id="upload" class="tab-content active">
+  <h2>📤 Data Upload & Profiling</h2>
+  <!-- Gradio components will be injected here -->
+</div>
+<div id="cleaning" class="tab-content">
+  <h2>🧹 Data Cleaning</h2>
+  <!-- Gradio components will be injected here -->
+</div>
+<div id="eda" class="tab-content">
+  <h2>🔍 Interactive Data Explorer</h2>
+  <!-- Gradio components will be injected here -->
+</div>
+<div class="chat-container">
+  <h2>💬 AI Chatbot Assistant</h2>
+  <div id="chat" style="max-height:300px; overflow-y:auto;"></div>
+  <input id="chat-input" placeholder="Ask me anything..." style="width:80%;">
+  <button onclick="sendChat()">Send</button>
+</div>
+<script>
+  // Page names the Python handler compares against, keyed by tab id.
+  const MODE_NAMES = { upload: 'Data Upload', cleaning: 'Data Cleaning', eda: 'EDA' };
+  // Write the selected page name into the app-mode field, if one is in the DOM.
+  function setAppMode(tabId) {
+    const field = document.querySelector('#app-mode textarea, #app-mode input')
+      || document.getElementById('app-mode');
+    if (!field || !('value' in field)) return;
+    field.value = MODE_NAMES[tabId] || tabId;
+    // Let the host framework observe the programmatic change.
+    field.dispatchEvent(new Event('input', { bubbles: true }));
+  }
+  // Append a chat bubble; textContent keeps message text from being parsed as HTML.
+  function appendMessage(cssClass, text) {
+    const chat = document.getElementById('chat');
+    const bubble = document.createElement('div');
+    bubble.className = 'message ' + cssClass;
+    bubble.textContent = text;
+    chat.appendChild(bubble);
+    chat.scrollTop = chat.scrollHeight;
+  }
+  // Tab Navigation
+  document.querySelectorAll('.nav-tab').forEach(tab => {
+    tab.addEventListener('click', () => {
+      document.querySelectorAll('.nav-tab').forEach(t => t.classList.remove('active'));
+      document.querySelectorAll('.tab-content').forEach(c => c.classList.remove('active'));
+      tab.classList.add('active');
+      document.getElementById(tab.dataset.tab).classList.add('active');
+      setAppMode(tab.dataset.tab);
+    });
+  });
+  // Chat Functionality
+  function sendChat() {
+    const input = document.getElementById('chat-input');
+    const message = input.value.trim();
+    if (!message) return;
+    input.value = '';
+    appendMessage('user-message', message);
+    // Trigger Gradio event
+    const event = new CustomEvent('chat_submit', { detail: message });
+    document.dispatchEvent(event);
+  }
+  // Listen for bot responses from Gradio
+  document.addEventListener('bot_response', (e) => {
+    appendMessage('bot-message', e.detail);
+  });
+</script>
+"""
+# Gradio Interface
+def _metrics_html(df):
+    """Render the row / column / missing-value counters for ``df`` as an HTML strip."""
+    return (
+        '<div class="metrics">'
+        f'<div class="metric">Rows: {df.shape[0]}</div>'
+        f'<div class="metric">Columns: {df.shape[1]}</div>'
+        f'<div class="metric">Missing: {df.isna().sum().sum()}</div>'
+        "</div>"
+    )
+def main_interface(file, chat_input, cleaned_data, vector_store, last_plot, app_mode, model):
+    """Handle a file upload and/or a chat message and produce every UI output.
+
+    Args:
+        file: Uploaded file (an object with a ``name`` path, or a path string), or ``None``.
+        chat_input: Chat text; empty or ``None`` skips the chat step.
+        cleaned_data: Current session DataFrame, or ``None``.
+        vector_store: Current session vector store, or ``None``.
+        last_plot: Plot-info dict of the most recent plot, or ``None``.
+        app_mode: Active page name. The short tab ids ("upload", "cleaning",
+            "eda", any case) are accepted as aliases of the full page names.
+        model: Groq model identifier forwarded to ``get_chatbot_response``.
+
+    Returns:
+        A 9-tuple in the order of the event's ``outputs`` list:
+        ``(upload_html, cleaning_html, eda_html, plot, status, chat_html,
+        cleaned_data, vector_store, last_plot)``. Gradio maps positional
+        return values onto output components; a dict is only accepted when
+        its keys are the components themselves. Outputs that did not change
+        are returned as ``gr.update()`` (components) or as the incoming value
+        (state).
+    """
+    import html  # local import: only needed to escape uploaded data for display
+    upload_html = cleaning_html = eda_html = gr.update()
+    plot_value = gr.update()
+    status = gr.update()
+    chat_html = gr.update()
+    aliases = {"upload": "Data Upload", "cleaning": "Data Cleaning", "eda": "EDA"}
+    mode = aliases.get(str(app_mode).strip().lower(), app_mode)
+    # NOTE(review): this handler is wired to both the file and the chat events,
+    # so while the page is "Data Upload" every chat message re-reads the file
+    # and resets cleaned_data. Splitting the two events would avoid that.
+    if file and mode == "Data Upload":
+        path = getattr(file, "name", file)
+        try:
+            if str(path).lower().endswith(".csv"):
+                df = pd.read_csv(path)
+            else:
+                df = pd.read_excel(path)
+        except Exception as exc:  # UI boundary: report the failure instead of crashing the event
+            status = f"Error: {exc}"
+        else:
+            # The frame becomes the new session state through the cleaned_data output.
+            cleaned_data = df
+            vector_store = create_vector_store(convert_df_to_text(df))
+            # Cell values come from the uploaded file, so escape them before embedding in HTML.
+            preview = html.escape(df.head().to_string())
+            upload_html = _metrics_html(df) + f"<pre>{preview}</pre>"
+            status = "✅ Action completed successfully!"
+    elif mode == "Data Cleaning" and cleaned_data is not None:
+        cleaning_html = _metrics_html(cleaned_data)
+    elif mode == "EDA" and cleaned_data is not None:
+        eda_html = _metrics_html(cleaned_data)
+    if chat_input:
+        df = cleaned_data if cleaned_data is not None else pd.DataFrame()
+        if "analyze plot" in chat_input.lower():
+            # Handled here because the last plot lives in session state, which
+            # only this handler receives.
+            if last_plot:
+                response = extract_plot_data(last_plot, df)
+                if vector_store:
+                    vector_store = update_vector_store_with_plot(response, vector_store)
+            else:
+                response = "No plot to analyze yet. Create a plot first."
+        else:
+            result = parse_command(chat_input, df, vector_store)
+            # Accept (df, message) as well as (df, figure, info) results.
+            if len(result) == 2:
+                new_df, fig, info = result[0], None, result[1]
+            else:
+                new_df, fig, info = result
+            if fig is not None:
+                plot_value = fig
+                last_plot = info
+                vector_store = update_vector_store_with_plot(extract_plot_data(info, df), vector_store)
+                response = f"Generated {info['type'].lower()}."
+            elif isinstance(info, str):
+                response = info
+                if "Dropped columns" in response:
+                    cleaned_data = new_df
+                    vector_store = create_vector_store(convert_df_to_text(new_df))
+            else:
+                response = get_chatbot_response(chat_input, mode, vector_store, model)
+        status = response
+        # "</" is escaped so a reply containing "</script>" cannot close the tag early.
+        payload = json.dumps(response).replace("</", "<\\/")
+        chat_html = f"""
+        <script>
+          document.dispatchEvent(new CustomEvent('bot_response', {{ detail: {payload} }}));
+        </script>
+        """
+    return (
+        upload_html,
+        cleaning_html,
+        eda_html,
+        plot_value,
+        status,
+        chat_html,
+        cleaned_data,
+        vector_store,
+        last_plot,
+    )
+# Gradio App
+with gr.Blocks(title="Data-Vision Pro") as demo:
+    # Per-session state threaded through every event: current DataFrame,
+    # vector store, and the info dict of the most recent plot.
+    cleaned_data = gr.State()
+    vector_store = gr.State()
+    last_plot = gr.State()
+    # Static header, tab bar and chat markup.
+    gr.HTML(custom_html)
+    # Hidden textbox carrying the active page name into main_interface.
+    # NOTE(review): confirm that a visible=False component is still present in
+    # the page DOM; the tab script looks it up by its "app-mode" element id.
+    app_mode = gr.Textbox(value="Data Upload", elem_id="app-mode", visible=False)
+    # Inputs
+    with gr.Row():
+        file_input = gr.File(label="Upload CSV/XLSX")
+        model = gr.Dropdown(choices=["llama3-70b-8192", "llama3-8b-8192", "mixtral-8x7b-32768", "gemma-7b-it"], value="llama3-70b-8192", label="Groq Model")
+    # Outputs
+    upload_output = gr.HTML(label="Upload Results", elem_id="upload-output")
+    cleaning_output = gr.HTML(label="Cleaning Results", elem_id="cleaning-output")
+    eda_output = gr.HTML(label="EDA Results", elem_id="eda-output")
+    plot = gr.Plot(label="Visualization")
+    status = gr.Textbox(label="Status")
+    chat_output = gr.HTML(visible=False)  # Hidden output to trigger JS
+    # Chat Input
+    chat_input = gr.Textbox(label="Chat with AI", interactive=True, placeholder="Ask me anything...")
+    # Event Handling
+    # Both events run the same handler with identical wiring, so the lists are
+    # defined once. handler_inputs follows main_interface's parameter order.
+    handler_inputs = [file_input, chat_input, cleaned_data, vector_store, last_plot, app_mode, model]
+    handler_outputs = [upload_output, cleaning_output, eda_output, plot, status, chat_output, cleaned_data, vector_store, last_plot]
+    file_input.change(main_interface, inputs=handler_inputs, outputs=handler_outputs)
+    chat_input.submit(main_interface, inputs=handler_inputs, outputs=handler_outputs)
+# Start the server only when executed as a script, so importing this module
+# (for example to reuse its helpers) does not launch the app.
+if __name__ == "__main__":
+    demo.launch()