Spaces:

CosmickVisions
/

Data-Vision

Running

App Files Files Community

CosmickVisions committed on Mar 16

Commit

4046ae3

verified ·

1 Parent(s): eb07713

Update app.py

Browse files

Files changed (1) hide show

app.py +532 -350

app.py CHANGED Viewed

@@ -17,191 +17,136 @@ from scipy import stats
 from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
 import tempfile
-# Set page config for fullscreen
-st.set_page_config(page_title="Data-Vision Pro", layout="wide", initial_sidebar_state="collapsed")
 # Load environment variables
 load_dotenv()
-# Initialize Groq client and embeddings
 client = Groq(api_key=os.getenv("GROQ_API_KEY"))
 embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-# Custom CSS with improved spacing, sizing, and UI/UX
 st.markdown("""
-   <style>
-/* Define CSS variables for reusability */
-:root {
-    --primary-blue: #003366;    /* Deep blue for headers and accents */
-    --light-gray: #F5F5F5;      /* Light gray for backgrounds */
-    --dark-gray: #333333;       /* Dark gray for text */
-    --accent-gold: #B8860B;     /* Muted gold for highlights */
-    --spacing-unit: 1.5rem;     /* Standard spacing unit */
-}
-/* Main app container */
-.stApp {
-    background-color: var(--light-gray);
-    font-family: 'Inter', sans-serif;
-    padding: var(--spacing-unit);
-    height: 100vh;
-    overflow-y: auto;
-    display: flex;
-    flex-direction: column;
-}
-/* Header styling */
-.header {
-    background-color: var(--primary-blue);
-    color: white;
-    padding: var(--spacing-unit);
-    border-radius: 8px;
-    text-align: center;
-    margin-bottom: var(--spacing-unit);
-    box-shadow: 0 2px 10px rgba(0,0,0,0.1);
-}
-.header-title {
-    font-size: 2rem;
-    font-weight: 700;
-    margin: 0;
-}
-.header-subtitle {
-    font-size: 1rem;
-    margin-top: 0.5rem;
-    opacity: 0.9;
-}
-/* Navigation bar */
-.nav-bar {
-    background-color: white;
-    border-radius: 8px;
-    padding: 1rem;
-    display: flex;
-    gap: 1rem;
-    align-items: center;
-    flex-wrap: wrap;
-    margin-bottom: var(--spacing-unit);
-    box-shadow: 0 2px 5px rgba(0,0,0,0.05);
-}
-.nav-item {
-    color: var(--primary-blue);
-    font-weight: 500;
-    padding: 0.75rem 1.5rem;
-    border-radius: 5px;
-    text-align: center;
-    transition: all 0.2s ease;
-    flex: 1;
-}
-.nav-item:hover {
-    background-color: var(--accent-gold);
-    color: white;
-}
-/* Main content area */
-.main-container {
-    background-color: white;
-    border-radius: 8px;
-    padding: 2rem;  /* Increased padding for spacing */
-    flex-grow: 1;
-    margin-bottom: var(--spacing-unit);
-    box-shadow: 0 2px 5px rgba(0,0,0,0.05);
-}
-/* Chatbot container */
-.chat-container {
-    background-color: white;
-    border-radius: 8px 8px 0 0;
-    padding: 1rem;
-    position: fixed;
-    bottom: 0;
-    left: var(--spacing-unit);
-    right: var(--spacing-unit);
-    max-height: 40vh;
-    overflow-y: auto;
-    box-shadow: 0 -2px 10px rgba(0,0,0,0.1);
-    z-index: 1000;
-}
-.chat-message-container {
-    margin-bottom: 0.5rem;
-}
-.user-message, .bot-message {
-    padding: 0.75rem 1rem;
-    border-radius: 12px;
-    margin-bottom: 0.5rem;
-    max-width: 70%;
-    word-wrap: break-word;
-}
-.user-message {
-    background-color: var(--primary-blue);
-    color: white;
-    margin-left: auto;
-    box-shadow: 0 1px 3px rgba(0,0,0,0.1);
-}
-.bot-message {
-    background-color: #F0F0F0;
-    color: var(--dark-gray);
-    margin-right: auto;
-    box-shadow: 0 1px 3px rgba(0,0,0,0.05);
-}
-/* Footer */
-.footer {
-    text-align: center;
-    color: var(--dark-gray);
-    font-size: 0.9rem;
-    padding: 1rem 0;
-    margin-top: auto;
-}
-/* Headings */
-h2 {
-    color: var(--primary-blue);
-    border-bottom: 2px solid var(--accent-gold);
-    padding-bottom: 0.5rem;
-    font-size: 1.5rem;
-    margin-bottom: 1rem;
-}
-/* Streamlit button styling */
-.stButton > button {
-    background-color: var(--accent-gold);
-    color: white;
-    border-radius: 5px;
-    padding: 0.75rem 1.5rem;
-    font-weight: 500;
-    transition: background-color 0.2s ease;
-}
-.stButton > button:hover {
-    background-color: #8C6B01;  /* Darker gold on hover */
-}
-/* Streamlit text input styling */
-.stTextInput > div > div > input {
-    border-radius: 5px;
-    padding: 0.75rem;
-}
-/* Responsive design for smaller screens */
-@media (max-width: 768px) {
-    .header-title { font-size: 1.5rem; }
-    .header-subtitle { font-size: 0.9rem; }
-    .nav-bar { flex-direction: column; padding: 0.75rem; gap: 0.5rem; }
-    .nav-item { padding: 0.5rem; }
-    .main-container { padding: 1.5rem; }
-    .chat-container { padding: 0.75rem; max-height: 50vh; }
-    h2 { font-size: 1.2rem; }
-}
-@media (max-width: 480px) {
-    .header-title { font-size: 1.2rem; }
-    .stApp { padding: 0.75rem; }
-    .chat-container { left: 0.75rem; right: 0.75rem; }
-}
-</style>
 """, unsafe_allow_html=True)
-# Helper Functions (fully implemented from original intent)
 def enhance_section_title(title):
-    st.markdown(f"<h2>{title}</h2>", unsafe_allow_html=True)
 def update_cleaned_data(df):
     st.session_state.cleaned_data = df
@@ -209,13 +154,13 @@ def update_cleaned_data(df):
         st.session_state.data_versions = [st.session_state.raw_data.copy()]
     st.session_state.data_versions.append(df.copy())
     st.session_state.dataset_text = convert_df_to_text(df)
-    st.session_state.vector_store = create_vector_store(st.session_state.dataset_text)
-    st.success("✅ Action completed!")
     st.rerun()
 def convert_df_to_text(df):
     text = f"Dataset Summary: {df.shape[0]} rows, {df.shape[1]} columns\n"
-    text += f"Missing Values: {df.isna().sum().sum()}\nColumns:\n"
     for col in df.columns:
         text += f"- {col} ({df[col].dtype}): "
         if pd.api.types.is_numeric_dtype(df[col]):
@@ -231,104 +176,161 @@ def create_vector_store(df_text):
         temp_path = temp_file.name
     loader = TextLoader(temp_path)
     documents = loader.load()
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
     texts = text_splitter.split_documents(documents)
     vector_store = FAISS.from_documents(texts, embeddings)
     os.unlink(temp_path)
     return vector_store
 def update_vector_store_with_plot(plot_text, existing_vector_store):
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-    plot_docs = text_splitter.create_documents([plot_text])
     if existing_vector_store:
-        existing_vector_store.add_documents(plot_docs)
     else:
-        existing_vector_store = FAISS.from_documents(plot_docs, embeddings)
     return existing_vector_store
 def extract_plot_data(plot_info, df):
-    plot_type = plot_info.get("type", "").lower()
-    x_col = plot_info.get("x", "")
-    y_col = plot_info.get("y", "")
-    if x_col not in df.columns or (plot_type == "scatter" and y_col not in df.columns):
-        return None, "Invalid column names."
-    if plot_type == "scatter":
-        fig = px.scatter(df, x=x_col, y=y_col)
-        return fig, f"Scatter plot: {x_col} vs {y_col}"
-    elif plot_type == "histogram":
-        fig = px.histogram(df, x=x_col)
-        return fig, f"Histogram of {x_col}"
-    return None, "Unsupported plot type."
 def get_chatbot_response(user_input, app_mode, vector_store=None, model="llama3-70b-8192"):
     context = ""
     if vector_store:
         docs = vector_store.similarity_search(user_input, k=3)
-        context = "\n\nData Context:\n" + "\n".join([f"- {doc.page_content}" for doc in docs])
-    system_prompt = f"You are an expert in {app_mode.lower()} analysis.{context}"
-    response = client.chat.completions.create(
-        model=model,
-        messages=[
-            {"role": "system", "content": system_prompt},
-            {"role": "user", "content": user_input}
-        ],
-        temperature=0.7,
-        max_tokens=1024
-    ).choices[0].message.content
-    return response
-# Command Functions (basic implementations from original intent)
 def drop_columns(columns):
     if 'cleaned_data' in st.session_state:
-        df = st.session_state.cleaned_data.drop(columns=columns)
-        update_cleaned_data(df)
 def generate_scatter_plot(params):
-    if 'cleaned_data' in st.session_state:
-        df = st.session_state.cleaned_data
-        x, y = params.get("x"), params.get("y")
-        if x in df.columns and y in df.columns:
-            fig = px.scatter(df, x=x, y=y)
-            st.plotly_chart(fig, use_container_width=True)
-            plot_text = f"Scatter plot of {x} vs {y}"
-            st.session_state.vector_store = update_vector_store_with_plot(plot_text, st.session_state.vector_store)
 def generate_histogram(params):
-    if 'cleaned_data' in st.session_state:
-        df = st.session_state.cleaned_data
-        x = params.get("x")
-        if x in df.columns:
-            fig = px.histogram(df, x=x)
-            st.plotly_chart(fig, use_container_width=True)
-            plot_text = f"Histogram of {x}"
-            st.session_state.vector_store = update_vector_store_with_plot(plot_text, st.session_state.vector_store)
 def analyze_plot():
-    if 'cleaned_data' in st.session_state:
-        st.write("Plot analysis not fully implemented yet.")
 def parse_command(command):
     command = command.lower().strip()
-    if "drop" in command:
-        columns = re.findall(r"drop\s+columns?\s+(.+)", command)
-        if columns:
-            cols = [col.strip() for col in columns[0].split(",")]
-            return {"action": "drop_columns", "columns": cols}
-    elif "scatter" in command:
-        match = re.search(r"scatter\s+plot\s+x=(\w+)\s+y=(\w+)", command)
-        if match:
-            return {"action": "scatter_plot", "x": match.group(1), "y": match.group(2)}
-    elif "histogram" in command:
-        match = re.search(r"histogram\s+of\s+(\w+)", command)
-        if match:
-            return {"action": "histogram", "x": match.group(1)}
-    return None
-# Dataset Preview
 def display_dataset_preview():
     if 'cleaned_data' in st.session_state:
-        st.subheader("Dataset Preview")
-        st.dataframe(st.session_state.cleaned_data.head(), use_container_width=True, height=200)
 # Main App
 def main():
@@ -336,127 +338,307 @@ def main():
     st.markdown("""
         <div class="header">
             <h1 class="header-title">Data-Vision Pro</h1>
-            <div class="header-subtitle">Advanced Data Analysis with Groq</div>
         </div>
     """, unsafe_allow_html=True)
-    # Navigation Bar
-    with st.container():
-        st.markdown('<div class="nav-bar">', unsafe_allow_html=True)
-        col1, col2, col3, col4 = st.columns([2, 2, 2, 1])
-        with col1:
-            uploaded_file = st.file_uploader("Upload File", type=["csv", "xlsx"], key="file_uploader")
-        with col2:
-            app_mode = st.selectbox("Mode", ["Data Upload", "Data Cleaning", "EDA"], label_visibility="collapsed")
-        with col3:
-            model = st.selectbox("Model", ["llama3-70b-8192", "llama3-8b-8192", "mixtral-8x7b-32768", "gemma-7b-it"], label_visibility="collapsed")
-        with col4:
-            if 'cleaned_data' in st.session_state:
-                csv = st.session_state.cleaned_data.to_csv(index=False)
-                st.download_button(label="Download", data=csv, file_name='cleaned_data.csv', mime='text/csv')
-        st.markdown('</div>', unsafe_allow_html=True)
     # Initialize Session State
     if 'vector_store' not in st.session_state:
         st.session_state.vector_store = None
     if 'chat_history' not in st.session_state:
         st.session_state.chat_history = []
-    if 'raw_data' not in st.session_state:
-        st.session_state.raw_data = None
-    if 'cleaned_data' not in st.session_state:
-        st.session_state.cleaned_data = None
-    if 'data_versions' not in st.session_state:
-        st.session_state.data_versions = []
-    if 'dataset_text' not in st.session_state:
-        st.session_state.dataset_text = ""
-    # Main Content
-    with st.container():
-        st.markdown('<div class="main-container">', unsafe_allow_html=True)
-        display_dataset_preview()
-        if app_mode == "Data Upload":
-            enhance_section_title("📤 Data Upload")
-            if uploaded_file:
-                try:
-                    df = pd.read_csv(uploaded_file) if uploaded_file.name.endswith('.csv') else pd.read_excel(uploaded_file)
-                    st.session_state.raw_data = df
-                    st.session_state.cleaned_data = df.copy()
-                    st.session_state.dataset_text = convert_df_to_text(df)
-                    st.session_state.vector_store = create_vector_store(st.session_state.dataset_text)
                     st.session_state.data_versions = [df.copy()]
-                    col1, col2, col3 = st.columns(3)
-                    with col1: st.metric("Rows", df.shape[0])
-                    with col2: st.metric("Columns", df.shape[1])
-                    with col3: st.metric("Missing", df.isna().sum().sum())
-                    if st.button("Generate Report"):
                         pr = ProfileReport(df, explorative=True)
                         st_profile_report(pr)
-                except Exception as e:
-                    st.error(f"Error: {e}")
-        elif app_mode == "Data Cleaning":
-            enhance_section_title("🧹 Data Cleaning")
-            if 'cleaned_data' not in st.session_state:
-                st.warning("Upload data first.")
-            else:
-                df = st.session_state.cleaned_data.copy()
-                columns_to_drop = st.multiselect("Drop Columns", df.columns)
-                if st.button("Drop Selected"):
-                    new_df = df.drop(columns=columns_to_drop)
                     update_cleaned_data(new_df)
-        elif app_mode == "EDA":
-            enhance_section_title("🔍 EDA")
-            if 'cleaned_data' not in st.session_state:
-                st.warning("Upload data first.")
             else:
-                df = st.session_state.cleaned_data.copy()
-                plot_type = st.selectbox("Plot Type", ["Scatter Plot", "Histogram"])
-                x_axis = st.selectbox("X-axis", df.columns)
-                if plot_type == "Scatter Plot":
-                    y_axis = st.selectbox("Y-axis", df.columns)
-                    if st.button("Generate"):
-                        fig = px.scatter(df, x=x_axis, y=y_axis)
-                        st.plotly_chart(fig, use_container_width=True)
-                        plot_text = f"Scatter plot of {x_axis} vs {y_axis}"
-                        st.session_state.vector_store = update_vector_store_with_plot(plot_text, st.session_state.vector_store)
-                else:
-                    if st.button("Generate"):
-                        fig = px.histogram(df, x=x_axis)
                         st.plotly_chart(fig, use_container_width=True)
-                        plot_text = f"Histogram of {x_axis}"
                         st.session_state.vector_store = update_vector_store_with_plot(plot_text, st.session_state.vector_store)
-        st.markdown('</div>', unsafe_allow_html=True)
-    # Chatbot
-    with st.container():
-        st.markdown('<div class="chat-container">', unsafe_allow_html=True)
-        st.subheader("💬 Chatbot")
-        for message in st.session_state.chat_history:
-            with st.container():
-                st.markdown(f'<div class="chat-message-container"><div class="{message["role"]}-message">{message["content"]}</div></div>', unsafe_allow_html=True)
-        if user_input := st.chat_input("Ask anything..."):
-            command = parse_command(user_input)
-            if command:
-                if command["action"] == "drop_columns":
-                    drop_columns(command["columns"])
-                elif command["action"] == "scatter_plot":
-                    generate_scatter_plot({"x": command["x"], "y": command["y"]})
-                elif command["action"] == "histogram":
-                    generate_histogram({"x": command["x"]})
-                st.session_state.chat_history.append({"role": "user", "content": user_input})
-                st.session_state.chat_history.append({"role": "assistant", "content": "Command executed."})
             else:
-                st.session_state.chat_history.append({"role": "user", "content": user_input})
                 response = get_chatbot_response(user_input, app_mode, st.session_state.vector_store, model)
-                st.session_state.chat_history.append({"role": "assistant", "content": response})
-            st.rerun()
-        st.markdown('</div>', unsafe_allow_html=True)
     # Footer
-    st.markdown('<div class="footer">Built with Streamlit & Groq</div>', unsafe_allow_html=True)
 if __name__ == "__main__":
     main()

 from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
 import tempfile
+# Set page config as the first Streamlit command
+st.set_page_config(page_title="Data-Vision Pro", layout="wide")
 # Load environment variables
 load_dotenv()
+# Initialize Groq client
 client = Groq(api_key=os.getenv("GROQ_API_KEY"))
+# Initialize HuggingFace embeddings for FAISS
 embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+# Custom CSS with Silver, Blue, and Gold Theme + Responsiveness
 st.markdown("""
+    <style>
+    :root {
+        --silver: #D8D8D8;
+        --blue: #5C89BC;
+        --gold: #A87E01;
+        --text-color: #333333;
+    }
+    .stApp {
+        background-color: var(--silver);
+        font-family: 'Inter', sans-serif;
+        max-width: 900px;
+        margin: 0 auto;
+        padding: 10px;
+    }
+    .header {
+        background-color: var(--blue);
+        color: white;
+        padding: 15px;
+        border-radius: 5px;
+        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+        text-align: center;
+    }
+    .header-title {
+        font-size: 1.5rem;
+        font-weight: 700;
+        margin: 0;
+    }
+    .header-subtitle {
+        font-size: 0.9rem;
+        margin-top: 5px;
+    }
+    .sidebar .sidebar-content {
+        background-color: white;
+        border-radius: 5px;
+        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+        padding: 15px;
+    }
+    .chat-container {
+        background-color: white;
+        border-radius: 5px;
+        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+        padding: 15px;
+        margin-top: 20px;
+    }
+    .user-message {
+        background-color: var(--blue);
+        color: white;
+        border-radius: 18px 18px 4px 18px;
+        padding: 12px 16px;
+        margin-left: auto;
+        max-width: 80%;
+        margin-bottom: 10px;
+    }
+    .bot-message {
+        background-color: #F0F0F0;
+        color: var(--text-color);
+        border-radius: 18px 18px 18px 4px;
+        padding: 12px 16px;
+        margin-right: auto;
+        max-width: 80%;
+        margin-bottom: 10px;
+    }
+    .footer {
+        text-align: center;
+        margin-top: 20px;
+        color: var(--text-color);
+        font-size: 0.8rem;
+    }
+    .tech-badge {
+        display: inline-block;
+        background-color: #E6ECEF;
+        color: var(--blue);
+        padding: 4px 8px;
+        border-radius: 12px;
+        font-size: 0.7rem;
+        margin: 0 4px;
+    }
+    h2 {
+        color: var(--blue);
+        border-bottom: 2px solid var(--gold);
+        padding-bottom: 5px;
+    }
+    .stButton > button {
+        background-color: var(--gold);
+        color: white;
+        border-radius: 5px;
+        padding: 8px 16px;
+        border: none;
+        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+    }
+    .stButton > button:hover {
+        background-color: #8C6B01;
+    }
+    @media (max-width: 768px) {
+        .header-title {
+            font-size: 1.2rem;
+        }
+        .header-subtitle {
+            font-size: 0.8rem;
+        }
+        .chat-container, .sidebar .sidebar-content {
+            padding: 10px;
+        }
+        .stApp {
+            padding: 5px;
+        }
+        h2 {
+            font-size: 1.2rem;
+        }
+    }
+    </style>
 """, unsafe_allow_html=True)
+# Helper Functions (unchanged)
 def enhance_section_title(title):
     """Render *title* as a styled ``<h2>`` section heading via ``st.markdown``."""
     heading_style = "border-bottom: 2px solid var(--gold); padding-bottom: 5px; color: var(--blue);"
     heading_html = f"<h2 style='{heading_style}'>{title}</h2>"
     # Raw HTML is required here so the inline style is honoured.
     st.markdown(heading_html, unsafe_allow_html=True)
 def update_cleaned_data(df):
     st.session_state.cleaned_data = df
         st.session_state.data_versions = [st.session_state.raw_data.copy()]
     st.session_state.data_versions.append(df.copy())
     st.session_state.dataset_text = convert_df_to_text(df)
+    st.success("✅ Action completed successfully!")
     st.rerun()
 def convert_df_to_text(df):
     text = f"Dataset Summary: {df.shape[0]} rows, {df.shape[1]} columns\n"
+    text += f"Missing Values: {df.isna().sum().sum()}\n"
+    text += "Columns:\n"
     for col in df.columns:
         text += f"- {col} ({df[col].dtype}): "
         if pd.api.types.is_numeric_dtype(df[col]):
         temp_path = temp_file.name
     loader = TextLoader(temp_path)
     documents = loader.load()
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
     texts = text_splitter.split_documents(documents)
     vector_store = FAISS.from_documents(texts, embeddings)
     os.unlink(temp_path)
     return vector_store
 def update_vector_store_with_plot(plot_text, existing_vector_store):
     """Add a textual plot description to the FAISS store used for retrieval.

     Args:
         plot_text: Plain-text description of a plot.
         existing_vector_store: Store to extend, or a falsy value (e.g. ``None``)
             to build a new one from ``plot_text``.

     Returns:
         The extended store, a newly built store, or ``existing_vector_store``
         unchanged when ``plot_text`` produces no chunks.
     """
     # Write and read with an explicit encoding so non-ASCII text (e.g. "R²")
     # round-trips regardless of the platform's locale default.
     with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False, encoding='utf-8') as temp_file:
         temp_file.write(plot_text)
         temp_path = temp_file.name
     try:
         documents = TextLoader(temp_path, encoding='utf-8').load()
         text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
         texts = text_splitter.split_documents(documents)
         if not texts:
             # Nothing to index; avoid building a store from an empty list.
             return existing_vector_store
         if existing_vector_store:
             existing_vector_store.add_documents(texts)
         else:
             existing_vector_store = FAISS.from_documents(texts, embeddings)
     finally:
         # Always remove the temp file, even when loading or indexing fails.
         os.unlink(temp_path)
     return existing_vector_store
 def extract_plot_data(plot_info, df):
     """Summarise the numbers behind a stored plot as plain text.

     Args:
         plot_info: Dict with keys "type", "x", optionally "y", and "data"
             (a JSON string of the plotted columns).
         df: Unused; all statistics are computed from ``plot_info["data"]``.
             Kept so existing callers keep working.

     Returns:
         A multi-line description in which every line ends with a newline.
     """
     import io  # local import: pandas >= 2.1 wants literal JSON wrapped in a buffer

     plot_type = plot_info["type"]
     x_col = plot_info["x"]
     y_col = plot_info.get("y")
     data = pd.read_json(io.StringIO(plot_info["data"]))

     def is_numeric(col):
         return col in data.columns and pd.api.types.is_numeric_dtype(data[col])

     lines = [f"Plot Type: {plot_type}", f"X-Axis: {x_col}"]
     if y_col:
         lines.append(f"Y-Axis: {y_col}")
     if plot_type == "Scatter Plot" and y_col:
         if is_numeric(x_col) and is_numeric(y_col):
             correlation = data[x_col].corr(data[y_col])
             lines.append(f"Correlation: {correlation:.2f}")
             # Drop incomplete rows pairwise so x and y stay aligned and equal in length.
             pairs = data[[x_col, y_col]].dropna()
             # linregress needs at least two points and some variation in x.
             if len(pairs) >= 2 and pairs[x_col].nunique() > 1:
                 slope, intercept, r_value, p_value, _ = stats.linregress(pairs[x_col], pairs[y_col])
                 lines.append(
                     f"Linear Regression: Slope={slope:.2f}, Intercept={intercept:.2f}, "
                     f"R²={r_value**2:.2f}, p-value={p_value:.4f}"
                 )
             lines.append(f"X Stats: Mean={data[x_col].mean():.2f}, Std={data[x_col].std():.2f}, Min={data[x_col].min():.2f}, Max={data[x_col].max():.2f}")
             lines.append(f"Y Stats: Mean={data[y_col].mean():.2f}, Std={data[y_col].std():.2f}, Min={data[y_col].min():.2f}, Max={data[y_col].max():.2f}")
         else:
             lines.append("Statistics unavailable: both axes must be numeric.")
     elif plot_type == "Histogram":
         if is_numeric(x_col):
             lines.append(f"Stats: Mean={data[x_col].mean():.2f}, Median={data[x_col].median():.2f}, Std={data[x_col].std():.2f}")
             lines.append(f"Skewness: {data[x_col].skew():.2f}")
             lines.append(f"Range: [{data[x_col].min():.2f}, {data[x_col].max():.2f}]")
         else:
             # Numeric formatting would fail on categorical data; report counts instead.
             lines.append(f"Counts: {data[x_col].value_counts().to_dict()}")
     elif plot_type == "Box Plot" and y_col:
         q1, q3 = data[y_col].quantile(0.25), data[y_col].quantile(0.75)
         iqr = q3 - q1
         outlier_mask = (data[y_col] < q1 - 1.5 * iqr) | (data[y_col] > q3 + 1.5 * iqr)
         lines.append(f"Y Stats: Median={data[y_col].median():.2f}, Q1={q1:.2f}, Q3={q3:.2f}, IQR={iqr:.2f}")
         lines.append(f"Outliers: {len(data[y_col][outlier_mask])} potential outliers")
     elif plot_type == "Line Chart" and y_col:
         y_values = data[y_col]
         if y_values.empty:
             trend = "n/a"  # no first/last point to compare
         else:
             trend = 'increasing' if y_values.iloc[-1] > y_values.iloc[0] else 'decreasing'
         lines.append(f"Y Stats: Mean={y_values.mean():.2f}, Std={y_values.std():.2f}, Trend={trend}")
     elif plot_type == "Bar Chart":
         lines.append(f"Counts: {data[x_col].value_counts().to_dict()}")
     elif plot_type == "Correlation Matrix":
         # Restrict to numeric columns; corr() raises on non-numeric data in recent pandas.
         corr = data.corr(numeric_only=True)
         lines.append("Correlation Matrix:")
         for col1 in corr.columns:
             for col2 in corr.index:
                 if col1 < col2:  # report each unordered pair once
                     lines.append(f"{col1} vs {col2}: {corr.loc[col2, col1]:.2f}")
     return "\n".join(lines) + "\n"
 def get_chatbot_response(user_input, app_mode, vector_store=None, model="llama3-70b-8192"):
     """Ask the Groq chat model for a reply, optionally grounded in retrieved context.

     Args:
         user_input: The user's message.
         app_mode: Name of the page the user is on; embedded in the system prompt.
         vector_store: Optional store queried with ``similarity_search`` for context.
         model: Model identifier passed to the chat completion call.

     Returns:
         The model's reply text, or ``"Error: <message>"`` if retrieval or the
         completion call raises.
     """
     system_prompt = (
         "You are an AI assistant in Data-Vision Pro, a data analysis app with RAG capabilities. "
         f"The user is on the '{app_mode}' page:\n"
         "- **Data Upload**: Upload CSV/XLSX files, view stats, or generate reports.\n"
         "- **Data Cleaning**: Clean data (e.g., handle missing values, encode variables).\n"
         "- **EDA**: Visualize data (e.g., scatter plots, histograms) and analyze plots.\n"
         "When analyzing plots, provide detailed insights based on numerical data extracted from them."
     )
     try:
         if vector_store:
             docs = vector_store.similarity_search(user_input, k=3)
             if docs:
                 context = "\n\nDataset and Plot Context:\n" + "\n".join([f"- {doc.page_content}" for doc in docs])
                 # Leading blank line keeps this from running into the previous sentence.
                 system_prompt += f"\n\nUse this dataset and plot context to augment your response:\n{context}"
         else:
             system_prompt += "\n\nNo dataset or plot data is loaded. Assist based on app functionality."
         response = client.chat.completions.create(
             model=model,
             messages=[
                 {"role": "system", "content": system_prompt},
                 {"role": "user", "content": user_input}
             ],
             temperature=0.7,
             max_tokens=1024
         )
         return response.choices[0].message.content
     except Exception as e:
         # UI boundary: surface the failure as chat text instead of crashing the page.
         return f"Error: {str(e)}"
+# Command Functions
 def drop_columns(columns):
     """Drop the named columns from the cleaned dataset held in session state.

     Args:
         columns: Comma-separated column names as a string, or an iterable of names.
             Names are matched exactly first, then case-insensitively, because
             chat commands may arrive lowercased.

     Returns:
         A status message describing what was dropped or why nothing was.
     """
     if 'cleaned_data' not in st.session_state:
         return "No dataset loaded."
     df = st.session_state.cleaned_data.copy()
     requested = columns.split(',') if isinstance(columns, str) else list(columns)
     by_lower = {str(name).lower(): name for name in df.columns}
     valid_columns = []
     for raw in requested:
         name = str(raw).strip()
         actual = name if name in df.columns else by_lower.get(name.lower())
         if actual is not None and actual not in valid_columns:
             valid_columns.append(actual)
     if not valid_columns:
         return "No valid columns found to drop."
     message = f"Dropped columns: {', '.join(map(str, valid_columns))}"
     # NOTE: update_cleaned_data ends with st.rerun(), so the return below
     # may never reach the caller.
     update_cleaned_data(df.drop(columns=valid_columns))
     return message
 def generate_scatter_plot(params):
     """Render a scatter plot from a "<x> vs <y>" request and remember it.

     Args:
         params: Text of the form "<x column> vs <y column>". Column names are
             matched exactly first, then case-insensitively.

     Returns:
         A status message; on success the plot is also stored in
         ``st.session_state.last_plot``.
     """
     if 'cleaned_data' not in st.session_state:
         return "No dataset loaded."
     df = st.session_state.cleaned_data
     match = re.search(r"(.+?)\s+vs\s+(.+)", params, flags=re.IGNORECASE)
     if not match:
         return "Invalid columns for scatter plot."
     by_lower = {str(name).lower(): name for name in df.columns}

     def resolve(name):
         name = name.strip()
         return name if name in df.columns else by_lower.get(name.lower())

     x_axis, y_axis = resolve(match.group(1)), resolve(match.group(2))
     if x_axis is None or y_axis is None:
         return "Invalid columns for scatter plot."
     fig = px.scatter(df, x=x_axis, y=y_axis, title=f'Scatter Plot of {x_axis} vs {y_axis}')
     st.plotly_chart(fig)
     # Keep the plotted columns so the plot can be analysed later.
     st.session_state.last_plot = {"type": "Scatter Plot", "x": x_axis, "y": y_axis, "data": df[[x_axis, y_axis]].to_json()}
     return f"Generated scatter plot of {x_axis} vs {y_axis}"
 def generate_histogram(params):
     """Render a histogram of one column and remember it.

     Args:
         params: Column name; matched exactly first, then case-insensitively.

     Returns:
         A status message; on success the plot is also stored in
         ``st.session_state.last_plot``.
     """
     if 'cleaned_data' not in st.session_state:
         return "No dataset loaded."
     df = st.session_state.cleaned_data
     requested = params.strip()
     if requested in df.columns:
         x_axis = requested
     else:
         # Chat commands may arrive lowercased; fall back to a case-insensitive match.
         by_lower = {str(name).lower(): name for name in df.columns}
         x_axis = by_lower.get(requested.lower())
     if x_axis is None:
         return "Invalid column for histogram."
     fig = px.histogram(df, x=x_axis, title=f'Histogram of {x_axis}')
     st.plotly_chart(fig)
     st.session_state.last_plot = {"type": "Histogram", "x": x_axis, "data": df[[x_axis]].to_json()}
     return f"Generated histogram of {x_axis}"
 def analyze_plot():
     """Describe the most recently generated plot.

     Returns:
         A text analysis built by ``extract_plot_data``, or a notice when no plot
         has been stored in ``st.session_state.last_plot``.
     """
     import io  # local import: pandas >= 2.1 deprecates passing literal JSON strings

     if "last_plot" not in st.session_state:
         return "No plot available to analyze."
     plot_info = st.session_state.last_plot
     # Wrap the stored JSON in a buffer so it is parsed as data, not as a path.
     df = pd.read_json(io.StringIO(plot_info["data"]))
     plot_text = extract_plot_data(plot_info, df)
     return f"Analysis of the last plot:\n{plot_text}"
 def parse_command(command):
     """Map a chat message to a command handler and its argument text.

     Trigger phrases are matched case-insensitively, but the argument text is
     taken from the original message so column names keep their case.

     Args:
         command: Raw chat message.

     Returns:
         ``(handler, params)`` for a recognised command, where ``params`` is the
         text after the trigger phrase (``None`` for "analyze plot");
         otherwise ``(None, <lowercased, stripped message>)``.
     """
     text = command.strip()
     drop = re.search(r"drop columns?", text, flags=re.IGNORECASE)
     if drop:
         return drop_columns, text[drop.end():].strip()
     scatter = re.search(r"show a scatter plot(?: of)?|scatter plot of", text, flags=re.IGNORECASE)
     if scatter:
         return generate_scatter_plot, text[scatter.end():].strip()
     histogram = re.search(r"show a histogram(?: of)?|histogram of", text, flags=re.IGNORECASE)
     if histogram:
         return generate_histogram, text[histogram.end():].strip()
     if re.search(r"analyze plot", text, flags=re.IGNORECASE):
         # Handlers are called with one argument; analyze_plot takes none.
         return lambda x: analyze_plot(), None
     return None, text.lower()
+# Dataset Preview Function
 def display_dataset_preview():
     """Show the first ten rows of the cleaned dataset, if one is loaded."""
     if 'cleaned_data' not in st.session_state:
         return
     preview = st.session_state.cleaned_data.head(10)
     st.subheader("Current Dataset Preview")
     st.dataframe(preview, use_container_width=True)
     # Horizontal rule separating the preview from the page content below.
     st.markdown("---")
 # Main App
 def main():
     st.markdown("""
         <div class="header">
             <h1 class="header-title">Data-Vision Pro</h1>
+            <div class="header-subtitle">Advanced Data Analysis with Groq Inference</div>
         </div>
     """, unsafe_allow_html=True)
+    # Sidebar Navigation
+    with st.sidebar:
+        st.markdown("### 🔮 Data-Vision Pro")
+        st.markdown("Your AI-powered data analysis suite with RAG.")
+        st.markdown("---")
+        app_mode = st.selectbox(
+            "Navigation",
+            ["Data Upload", "Data Cleaning", "EDA"],
+            format_func=lambda x: f"📌 {x}"
+        )
+        model = st.selectbox(
+            "Select Groq Model",
+            ["llama3-70b-8192", "llama3-8b-8192", "mixtral-8x7b-32768", "gemma-7b-it"],
+            index=0
+        )
+        if app_mode == "Data Upload":
+            st.info("⬆️ Upload your CSV or XLSX dataset to begin.")
+        elif app_mode == "Data Cleaning":
+            st.info("🧹 Clean and preprocess your data.")
+        elif app_mode == "EDA":
+            st.info("🔍 Explore your data visually.")
+        if 'cleaned_data' in st.session_state:
+            csv = st.session_state.cleaned_data.to_csv(index=False)
+            st.download_button(
+                label="Download Cleaned Data",
+                data=csv,
+                file_name='cleaned_data.csv',
+                mime='text/csv',
+            )
+        st.markdown("---")
+        st.markdown("Built with <span class='tech-badge'>Streamlit</span> + <span class='tech-badge'>Groq</span>", unsafe_allow_html=True)
     # Initialize Session State
     if 'vector_store' not in st.session_state:
         st.session_state.vector_store = None
     if 'chat_history' not in st.session_state:
         st.session_state.chat_history = []
+    # Display Dataset Preview
+    display_dataset_preview()
+    # App Pages
+    if app_mode == "Data Upload":
+        st.header("📤 Data Upload & Profiling")
+        uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"], key="file_uploader")
+        if uploaded_file:
+            st.session_state.pop('raw_data', None)
+            st.session_state.pop('cleaned_data', None)
+            st.session_state.pop('data_versions', None)
+            try:
+                if uploaded_file.name.endswith('.csv'):
+                    df = pd.read_csv(uploaded_file)
+                else:
+                    df = pd.read_excel(uploaded_file)
+                if df.empty:
+                    st.error("Uploaded file is empty.")
+                    st.stop()
+                st.session_state.raw_data = df
+                st.session_state.cleaned_data = df.copy()
+                st.session_state.dataset_text = convert_df_to_text(df)
+                st.session_state.vector_store = create_vector_store(st.session_state.dataset_text)
+                if 'data_versions' not in st.session_state:
                     st.session_state.data_versions = [df.copy()]
+                col1, col2, col3 = st.columns(3)
+                with col1: st.metric("Rows", df.shape[0])
+                with col2: st.metric("Columns", df.shape[1])
+                with col3: st.metric("Missing Values", df.isna().sum().sum())
+                if st.checkbox("Show Data Preview"):
+                    st.dataframe(df.head(10), use_container_width=True)
+                if st.button("Generate Full Profile Report"):
+                    with st.spinner("Generating report..."):
                         pr = ProfileReport(df, explorative=True)
                         st_profile_report(pr)
+                st.success("✅ Data loaded successfully!")
+            except Exception as e:
+                st.error(f"An error occurred: {str(e)}")
+    elif app_mode == "Data Cleaning":
+        st.header("🧹 Smart Data Cleaning")
+        if 'raw_data' not in st.session_state:
+            st.warning("Please upload data first in the Data Upload section.")
+            st.stop()
+        if 'cleaned_data' in st.session_state:
+            df = st.session_state.cleaned_data.copy()
+        else:
+            st.session_state.cleaned_data = st.session_state.raw_data.copy()
+            df = st.session_state.cleaned_data.copy()
+        enhance_section_title("📊 Data Health Dashboard")
+        with st.expander("Explore Data Health Metrics", expanded=True):
+            col1, col2, col3 = st.columns(3)
+            with col1: st.metric("Columns", len(df.columns))
+            with col2: st.metric("Rows", len(df))
+            with col3: st.metric("Missing Values", df.isna().sum().sum())
+            if st.button("Generate Detailed Health Report"):
+                with st.spinner("Generating report..."):
+                    profile = ProfileReport(df, minimal=True)
+                    st_profile_report(profile)
+            if 'data_versions' in st.session_state and len(st.session_state.data_versions) > 1:
+                if st.button("Undo Last Action"):
+                    st.session_state.data_versions.pop()
+                    st.session_state.cleaned_data = st.session_state.data_versions[-1].copy()
+                    st.session_state.dataset_text = convert_df_to_text(st.session_state.cleaned_data)
+                    st.session_state.vector_store = create_vector_store(st.session_state.dataset_text)
+                    st.rerun()
+        with st.expander("🛠️ Data Cleaning Operations", expanded=True):
+            enhance_section_title("🔍 Missing Values Treatment")
+            missing_cols = df.columns[df.isna().any()].tolist()
+            if missing_cols:
+                cols = st.multiselect("Select columns with missing values", missing_cols)
+                method = st.selectbox("Choose imputation method", [
+                    "Drop Missing Values", "Fill with Mean/Median", "Fill with Custom Value", "Forward Fill", "Backward Fill"
+                ])
+                if method == "Fill with Custom Value":
+                    custom_val = st.text_input("Enter custom value:")
+                if st.button("Apply Missing Value Treatment"):
+                    new_df = df.copy()
+                    if method == "Drop Missing Values":
+                        new_df = new_df.dropna(subset=cols)
+                    elif method == "Fill with Mean/Median":
+                        for col in cols:
+                            if pd.api.types.is_numeric_dtype(new_df[col]):
+                                new_df[col] = new_df[col].fillna(new_df[col].median())
+                            else:
+                                new_df[col] = new_df[col].fillna(new_df[col].mode()[0])
+                    elif method == "Fill with Custom Value" and custom_val:
+                        new_df[cols] = new_df[cols].fillna(custom_val)
+                    elif method == "Forward Fill":
+                        new_df[cols] = new_df[cols].ffill()
+                    elif method == "Backward Fill":
+                        new_df[cols] = new_df[cols].bfill()
                     update_cleaned_data(new_df)
             else:
+                st.success("✨ No missing values detected!")
+            enhance_section_title("🔄 Data Type Conversion")
+            col_to_convert = st.selectbox("Select column to convert", df.columns)
+            new_type = st.selectbox("Select new data type", ["String", "Integer", "Float", "Boolean", "Datetime"])
+            if new_type == "Datetime":
+                date_format = st.text_input("Enter date format (e.g., %Y-%m-%d):", "%Y-%m-%d")
+            if st.button("Convert Data Type"):
+                new_df = df.copy()
+                if new_type == "String":
+                    new_df[col_to_convert] = new_df[col_to_convert].astype(str)
+                elif new_type == "Integer":
+                    new_df[col_to_convert] = pd.to_numeric(new_df[col_to_convert], errors='coerce').astype('Int64')
+                elif new_type == "Float":
+                    new_df[col_to_convert] = pd.to_numeric(new_df[col_to_convert], errors='coerce')
+                elif new_type == "Boolean":
+                    new_df[col_to_convert] = new_df[col_to_convert].astype(bool)
+                elif new_type == "Datetime":
+                    new_df[col_to_convert] = pd.to_datetime(new_df[col_to_convert], format=date_format, errors='coerce')
+                update_cleaned_data(new_df)
+            enhance_section_title("🗑️ Drop Columns")
+            columns_to_drop = st.multiselect("Select columns to remove", df.columns)
+            if columns_to_drop and st.button("Confirm Column Removal"):
+                new_df = df.copy()
+                new_df = new_df.drop(columns=columns_to_drop)
+                update_cleaned_data(new_df)
+            enhance_section_title("🔢 Encoding Options")
+            encoding_method = st.radio("Choose encoding method", ("Label Encoding", "One-Hot Encoding"))
+            data_to_encode = st.multiselect("Select columns to encode", df.select_dtypes(include='object').columns)
+            if data_to_encode and st.button("Apply Encoding"):
+                new_df = df.copy()
+                if encoding_method == "Label Encoding":
+                    for col in data_to_encode:
+                        le = LabelEncoder()
+                        new_df[col] = le.fit_transform(new_df[col].astype(str))
+                elif encoding_method == "One-Hot Encoding":
+                    new_df = pd.get_dummies(new_df, columns=data_to_encode, drop_first=True, dtype=int)
+                update_cleaned_data(new_df)
+            enhance_section_title("📏 StandardScaler")
+            scale_cols = st.multiselect("Select numerical columns to scale", df.select_dtypes(include=np.number).columns)
+            if scale_cols and st.button("Apply StandardScaler"):
+                new_df = df.copy()
+                scaler = StandardScaler()
+                new_df[scale_cols] = scaler.fit_transform(new_df[scale_cols])
+                update_cleaned_data(new_df)
+    elif app_mode == "EDA":
+        st.header("🔍 Interactive Data Explorer")
+        if 'cleaned_data' not in st.session_state:
+            st.warning("Please upload and clean data first.")
+            st.stop()
+        df = st.session_state.cleaned_data.copy()
+        enhance_section_title("Dataset Overview")
+        with st.container():
+            col1, col2, col3, col4 = st.columns(4)
+            col1.metric("Total Rows", df.shape[0])
+            col2.metric("Total Columns", df.shape[1])
+            missing_percentage = df.isna().sum().sum() / df.size * 100
+            col3.metric("Missing Values", f"{df.isna().sum().sum()} ({missing_percentage:.1f}%)")
+            col4.metric("Duplicates", df.duplicated().sum())
+        tab1, tab2, tab3 = st.tabs(["Quick Preview", "Column Types", "Missing Matrix"])
+        with tab1:
+            st.write("First few rows of the dataset:")
+            st.dataframe(df.head(), use_container_width=True)
+        with tab2:
+            st.write("Column Data Types:")
+            type_counts = df.dtypes.value_counts().reset_index()
+            type_counts.columns = ['Type', 'Count']
+            st.dataframe(type_counts, use_container_width=True)
+        with tab3:
+            st.write("Missing Values Matrix:")
+            fig_missing = px.imshow(df.isna(), color_continuous_scale=['#e0e0e0', '#66c2a5'])
+            fig_missing.update_layout(coloraxis_colorscale=[[0, 'lightgrey'], [1, '#FF4B4B']])
+            st.plotly_chart(fig_missing, use_container_width=True)
+        enhance_section_title("Interactive Visualization Builder")
+        with st.container():
+            col1, col2 = st.columns([1, 3])
+            with col1:
+                plot_type = st.selectbox("Choose visualization type", [
+                    "Scatter Plot", "Histogram", "Box Plot", "Line Chart", "Bar Chart", "Correlation Matrix"
+                ])
+                x_axis = st.selectbox("X-axis", df.columns) if plot_type != "Correlation Matrix" else None
+                y_axis = st.selectbox("Y-axis", df.columns) if plot_type in ["Scatter Plot", "Box Plot", "Line Chart"] else None
+                color_by = st.selectbox("Color encoding", ["None"] + df.columns.tolist(), format_func=lambda x: "No color" if x == "None" else x) if plot_type != "Correlation Matrix" else None
+            with col2:
+                try:
+                    fig = None
+                    if plot_type == "Scatter Plot" and x_axis and y_axis:
+                        fig = px.scatter(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None, title=f'Scatter Plot of {x_axis} vs {y_axis}')
+                    elif plot_type == "Histogram" and x_axis:
+                        fig = px.histogram(df, x=x_axis, color=color_by if color_by != "None" else None, nbins=30, title=f'Histogram of {x_axis}')
+                    elif plot_type == "Box Plot" and x_axis and y_axis:
+                        fig = px.box(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None, title=f'Box Plot of {x_axis} vs {y_axis}')
+                    elif plot_type == "Line Chart" and x_axis and y_axis:
+                        fig = px.line(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None, title=f'Line Chart of {x_axis} vs {y_axis}')
+                    elif plot_type == "Bar Chart" and x_axis:
+                        fig = px.bar(df, x=x_axis, color=color_by if color_by != "None" else None, title=f'Bar Chart of {x_axis}')
+                    elif plot_type == "Correlation Matrix":
+                        numeric_df = df.select_dtypes(include=np.number)
+                        if len(numeric_df.columns) > 1:
+                            corr = numeric_df.corr()
+                            fig = px.imshow(corr, text_auto=True, color_continuous_scale='RdBu_r', zmin=-1, zmax=1, title='Correlation Matrix')
+                    if fig:
+                        fig.update_layout(template="plotly_white")
                         st.plotly_chart(fig, use_container_width=True)
+                        st.session_state.last_plot = {
+                            "type": plot_type,
+                            "x": x_axis,
+                            "y": y_axis,
+                            "data": df[[x_axis, y_axis]].to_json() if y_axis else df[[x_axis]].to_json()
+                        }
+                        plot_text = extract_plot_data(st.session_state.last_plot, df)
                         st.session_state.vector_store = update_vector_store_with_plot(plot_text, st.session_state.vector_store)
+                        with st.expander("Extracted Plot Data"):
+                            st.text(plot_text)
+                    else:
+                        st.error("Please provide required inputs for the selected plot type.")
+                except Exception as e:
+                    st.error(f"Couldn't create visualization: {str(e)}")
+    # Chatbot Section
+    st.markdown("---")
+    st.markdown('<div class="chat-container">', unsafe_allow_html=True)
+    st.subheader("💬 AI Chatbot Assistant (RAG Enabled)")
+    st.info("Ask about your data or app features! Try: 'drop columns X, Y', 'scatter plot of X vs Y', 'analyze plot'")
+    for message in st.session_state.chat_history:
+        with st.chat_message(message["role"]):
+            st.markdown(f'<div class="{message["role"]}-message">{message["content"]}</div>', unsafe_allow_html=True)
+    user_input = st.chat_input("Ask me anything...")
+    if user_input:
+        st.session_state.chat_history.append({"role": "user", "content": user_input})
+        with st.chat_message("user"):
+            st.markdown(f'<div class="user-message">{user_input}</div>', unsafe_allow_html=True)
+        with st.spinner("Processing..."):
+            func, param = parse_command(user_input)
+            if func:
+                response = func(param) if param else func(None)
             else:
                 response = get_chatbot_response(user_input, app_mode, st.session_state.vector_store, model)
+            st.session_state.chat_history.append({"role": "assistant", "content": response})
+        with st.chat_message("assistant"):
+            st.markdown(f'<div class="bot-message">{response}</div>', unsafe_allow_html=True)
+    st.markdown('</div>', unsafe_allow_html=True)
     # Footer
+    st.markdown("""
+        <div class="footer">
+            <div>Built with <span class="tech-badge">Streamlit</span> + <span class="tech-badge">Groq</span> + <span class="tech-badge">LangChain</span> + <span class="tech-badge">FAISS</span></div>
+            <div style="margin-top: 8px;">Fast inference for data insights</div>
+        </div>
+    """, unsafe_allow_html=True)
 # Script entry point: call main() only when this file is executed directly,
 # not when it is imported as a module.
 if __name__ == "__main__":
     main()