Update app.py
app.py CHANGED
@@ -5,20 +5,33 @@ import plotly.express as px
 import plotly.graph_objects as go
 from plotly.subplots import make_subplots
 import io
-
 import warnings
 import google.generativeai as genai
 import os
- from dotenv import load_dotenv
 import logging
- import json
 from contextlib import redirect_stdout

 # --- Configuration ---
 warnings.filterwarnings('ignore')
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
- …
- …

 def safe_exec(code_string: str, local_vars: dict) -> tuple:
     """Safely execute a string of Python code and capture its output."""
@@ -26,291 +39,282 @@ def safe_exec(code_string: str, local_vars: dict) -> tuple:
     try:
         with redirect_stdout(output_buffer):
             exec(code_string, globals(), local_vars)
- …
-         return …
     except Exception as e:
- …
-         logging.error(f"Error executing AI-generated code: {error_message}")
-         return None, None, error_message
-
- # --- Core Data Processing ---

- def …
-     """Loads …
-     if file_obj …
-         return …

     try:
         df = pd.read_csv(file_obj.name)

-         # …
         for col in df.select_dtypes(include=['object']).columns:
             try:
                 df[col] = pd.to_datetime(df[col], errors='raise')
-                 logging.info(f"Successfully converted column '{col}' to datetime.")
             except (ValueError, TypeError):
- …

         metadata = extract_dataset_metadata(df)

- …
             'metadata': metadata,
-             '…
         }
- …
         }
-
-         # Check for time series tab visibility
-         time_series_visible = len(metadata['datetime_cols']) > 0
-
-         return (
-             state_dict,
-             f"✅ Loaded `{state_dict['filename']}` ({metadata['shape'][0]} rows, {metadata['shape'][1]} cols)",
-             gr.update(**update_args), gr.update(**update_args), gr.update(**update_args),
-             gr.update(choices=metadata['numeric_cols'], value=None, interactive=True),
-             gr.update(choices=metadata['datetime_cols'], value=None, interactive=True),
-             gr.update(visible=time_series_visible),  # Show/hide Time Series tab
-             gr.update(visible=True)  # Show Chatbot tab
-         )
     except Exception as e:
-         logging.error(f"Error …
-         return …

- def extract_dataset_metadata(df…
-     """Extracts …
     rows, cols = df.shape
-     columns = df.columns.tolist()
-
-     numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
-     categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
-     datetime_cols = df.select_dtypes(include=['datetime64', 'datetime64[ns]']).columns.tolist()
-
-     missing_data = df.isnull().sum()
-     data_quality = round((df.notna().sum().sum() / (rows * cols)) * 100, 1) if rows * cols > 0 else 0
-
     return {
         'shape': (rows, cols),
-         'columns': columns,
-         'numeric_cols': …
-         'categorical_cols': …
-         'datetime_cols': …
-         'dtypes': df.dtypes.…
-         'missing_data': missing_data.to_dict(),
-         'data_quality': data_quality,
-         'head': df.head().to_string()
     }

- …
-         return "❌ Please upload a dataset first.", "", 0
-     if not api_key:
-         return "❌ Please enter your Gemini API key.", "", 0

- …
-     - **Categorical Columns:** {', '.join(metadata['categorical_cols'])}
-     - **Datetime Columns:** {', '.join(metadata['datetime_cols'])}
-     - **Data Quality (Non-missing values):** {metadata['data_quality']}%
-     - **First 5 rows:**
-     {metadata['head']}
-
-     **Your Task:**
-     Based on the metadata, generate a report in Markdown format. Use emojis to make it visually appealing. The report should have the following sections:
-
-     # 📊 AI-Powered Dataset Overview
-
-     ## 🤔 What is this dataset likely about?
-     (Predict the domain and purpose of the dataset, e.g., "This appears to be customer transaction data for an e-commerce platform.")
-
-     ## 💡 Potential Key Questions to Explore
-     - (Suggest 3-4 interesting business or research questions the data could answer.)
-     - (Example: "Which products are most frequently purchased together?")
-
-     ## 📈 Potential Analyses & Visualizations
-     - (List 3-4 types of analyses that would be valuable.)
-     - (Example: "Time series analysis of sales to identify seasonality.")
-
-     ## ⚠️ Data Quality & Potential Issues
-     - (Briefly comment on the data quality score and mention if the presence of datetime columns is a good sign for certain analyses.)
-     """
-
-     try:
-         genai.configure(api_key=api_key)
-         model = genai.GenerativeModel('gemini-1.5-flash')
-         response = model.generate_content(prompt)
-         story = response.text
-     except Exception as e:
-         story = f"## ⚠️ AI Generation Failed\n**Error:** {str(e)}\n\nPlease check your API key and network connection. A fallback analysis is provided below.\n\n" \
-                 f"### Fallback Analysis\nThis dataset contains **{metadata['shape'][0]}** records and **{metadata['shape'][1]}** features. " \
-                 f"It includes **{len(metadata['numeric_cols'])}** numeric, **{len(metadata['categorical_cols'])}** categorical, " \
-                 f"and **{len(metadata['datetime_cols'])}** time-based columns. The overall data quality is **{metadata['data_quality']}%**, " \
-                 f"which is a good starting point for analysis."
-
-     # Basic Info Summary
-     basic_info = f"""
-     📁 **File:** `{state_dict.get('filename', 'N/A')}`
-     📊 **Size:** {metadata['shape'][0]:,} rows × {metadata['shape'][1]} columns
-     🔢 **Features:**
-     • **Numeric:** {len(metadata['numeric_cols'])}
-     • **Categorical:** {len(metadata['categorical_cols'])}
-     • **DateTime:** {len(metadata['datetime_cols'])}
-     🎯 **Data Quality:** {metadata['data_quality']}%
-     """

- …
-         return None, "Select a column to analyze."

- …
-     summary = ""
-
-     if column_name in metadata['numeric_cols']:
-         fig = make_subplots(rows=1, cols=2, subplot_titles=("Histogram", "Box Plot"))
-         fig.add_trace(go.Histogram(x=df[column_name], name="Histogram"), row=1, col=1)
-         fig.add_trace(go.Box(y=df[column_name], name="Box Plot"), row=1, col=2)
-         fig.update_layout(title_text=f"Distribution of '{column_name}'", showlegend=False)
-         summary = df[column_name].describe().to_frame().to_markdown()

- …
-     return fig, summary
-
- # --- Tab 3: Bivariate Analysis ---
-
- def generate_bivariate_plot(x_col, y_col, state_dict):
-     """Generates plots to explore the relationship between two variables."""
-     if not x_col or not y_col or not state_dict:
-         return None, "Select two columns to analyze."
-     if x_col == y_col:
-         return None, "Please select two different columns."
-
-     df = state_dict['df']
-     metadata = state_dict['metadata']

- …
-
- # --- Tab 4: Time Series Analysis ---
-
- def generate_time_series_plot(time_col, value_col, resample_freq, state_dict):
-     """Generates a time series plot with resampling."""
-     if not time_col or not value_col or not state_dict:
-         return None, "Select Time and Value columns."
-
-     df = state_dict['df'].copy()

- …
     if not api_key:
-         history…
-         return history, None, ""
-
-     if not state_dict:
-         history.append((user_message, "Please upload a dataset before asking questions."))
-         return history, None, ""
-
-     history.append((user_message, None))

- …
-     # Construct a robust prompt for the AI
     prompt = f"""
-     You are …
-     You …
- …
     **Instructions:**
-     1. Analyze the user's …
-     2. …
-     3. …
-     4. …
-     5. …
-     6. …
-     7. …
- …
-     **…
-     - **Shape:** {df_metadata['shape'][0]} rows, {df_metadata['shape'][1]} columns
-     - **Columns and Data Types:**
-     {df_metadata['dtypes']}
-
-     ---
-     **User Question:** "{user_message}"
-     ---
-
     **Your JSON Response:**
     """
@@ -319,168 +323,118 @@ def respond_to_chat(user_message, history, state_dict, api_key):
         model = genai.GenerativeModel('gemini-1.5-flash')
         response = model.generate_content(prompt)

-         # Clean and parse …
- …
-         response_json = json.loads(response_text)
-
         thought = response_json.get("thought", "Thinking...")
-         code_to_run = response_json.get("code", "")

- …
-         # Execute …
-         local_vars = {'df': …
-         stdout, fig_result, error = safe_exec(code_to_run, local_vars)

         if error:
- …
-             history[-1] = (user_message, bot_message)
-             return history, None, ""
-
         if stdout:
- …

-         history[-1] = (user_message, bot_message)
-
-         return history, fig_result, ""
-
     except Exception as e:
-         error_msg = f"…
-         logging.error(f"Chatbot error: {error_msg}")
         history[-1] = (user_message, error_msg)
- …
-
- # --- Gradio …
- …
-         gr.…
-         gr.…
-
-         with gr.…
-
-         with …
- …
-                 uni_summary_output = gr.Markdown(label="Summary Statistics")
-
-             # Tab 3: Bivariate Analysis
-             with gr.Tab("📊 Bivariate Analysis", id=2):
-                 with gr.Row():
-                     bi_x_select = gr.Dropdown(label="Select X-Axis Column", interactive=False)
-                     bi_y_select = gr.Dropdown(label="Select Y-Axis Column", interactive=False)
-                 bi_btn = gr.Button("🎨 Generate Bivariate Plot", variant="secondary")
-                 with gr.Row():
-                     bi_plot_output = gr.Plot(label="Relationship Plot")
-                     bi_summary_output = gr.Markdown(label="Analysis Summary")
-
-             # Tab 4: Time Series (conditionally visible)
-             with gr.Tab("⏳ Time Series Analysis", id=3, visible=False) as ts_tab:
-                 with gr.Row():
-                     ts_time_col = gr.Dropdown(label="Select Time Column", interactive=False)
-                     ts_value_col = gr.Dropdown(label="Select Value Column", interactive=False)
-                     ts_resample = gr.Radio(['D', 'W', 'M', 'Q', 'Y'], label="Resample Frequency", value='M')
-                 ts_btn = gr.Button("📈 Plot Time Series", variant="secondary")
-                 ts_plot_output = gr.Plot(label="Time Series Plot")
-                 ts_status_output = gr.Markdown()
-
-             # Tab 5: AI Analyst Chat (conditionally visible)
-             with gr.Tab("💬 AI Analyst Chat", id=4, visible=False) as chat_tab:
-                 chatbot = gr.Chatbot(label="Chat with Gemini Analyst", height=500)
-                 chat_plot_output = gr.Plot(label="AI Generated Plot")
-                 with gr.Row():
-                     chat_input = gr.Textbox(label="Your Question", placeholder="e.g., 'Show me the distribution of age'", scale=4)
-                     chat_submit_btn = gr.Button("Submit", variant="primary", scale=1)
-                     chat_clear_btn = gr.Button("Clear Chat")
-
-         # --- Event Handlers ---
-
-         # File upload triggers data loading and UI updates
-         file_input.upload(
-             fn=load_and_process_file,
-             inputs=[file_input, global_state],
-             outputs=[global_state, status_output, uni_col_select, bi_x_select, bi_y_select, ts_value_col, ts_time_col, ts_tab, chat_tab]
-         )
-
-         # Tab 1: Overview
-         overview_btn.click(
-             fn=analyze_dataset_overview,
-             inputs=[global_state, api_key_input],
-             outputs=[story_output, basic_info_output, quality_score]
-         )
- …

 if __name__ == "__main__":
- …
-     # load_dotenv()
-     app = create_gradio_interface()
-     app.launch(debug=True)
 import plotly.graph_objects as go
 from plotly.subplots import make_subplots
 import io
+ import json
 import warnings
 import google.generativeai as genai
 import os
 import logging
 from contextlib import redirect_stdout
+ from sklearn.model_selection import train_test_split
+ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+ from sklearn.linear_model import LogisticRegression, LinearRegression
+ from sklearn.metrics import accuracy_score, confusion_matrix, r2_score, mean_squared_error
+ from sklearn.preprocessing import LabelEncoder

 # --- Configuration ---
 warnings.filterwarnings('ignore')
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+ THEME = gr.themes.Glass(primary_hue="blue", secondary_hue="cyan").set(
+     body_background_fill="rgba(0,0,0,0.8)",
+     block_background_fill="rgba(0,0,0,0.6)",
+     block_border_width="1px",
+     border_color_primary="rgba(255,255,255,0.1)"
+ )
+ MODEL_REGISTRY = {
+     "Classification": {"Random Forest": RandomForestClassifier, "Logistic Regression": LogisticRegression},
+     "Regression": {"Random Forest": RandomForestRegressor, "Linear Regression": LinearRegression}
+ }
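
The registry maps a problem type and a display name to an sklearn estimator class; instantiation happens later. A minimal, self-contained sketch of the lookup, noting that `LinearRegression` accepts no `random_state`, so constructor keywords must be chosen per class:

    from sklearn.linear_model import LinearRegression
    from sklearn.ensemble import RandomForestRegressor

    registry = {"Regression": {"Random Forest": RandomForestRegressor,
                               "Linear Regression": LinearRegression}}
    model_cls = registry["Regression"]["Random Forest"]
    model = model_cls(random_state=42)                        # ensemble models take random_state
    baseline = registry["Regression"]["Linear Regression"]()  # LinearRegression does not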
+
+ # --- Core Logic ---

 def safe_exec(code_string: str, local_vars: dict) -> tuple:
     """Safely execute a string of Python code and capture its output."""
     try:
         with redirect_stdout(output_buffer):
             exec(code_string, globals(), local_vars)
+         stdout = output_buffer.getvalue()
+         fig = local_vars.get('fig')
+         df_out = local_vars.get('df_result')
+         return stdout, fig, df_out, None
     except Exception as e:
+         return None, None, None, f"Execution Error: {str(e)}"
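
A minimal usage sketch (assuming it runs in this module, where `output_buffer` is the `io.StringIO()` set up in the function body): the executed snippet reports results by assigning to `fig` or `df_result`, or by printing.

    import pandas as pd
    snippet = "df_result = df.describe()\nprint('done')"
    out, fig, df_res, err = safe_exec(snippet, {'df': pd.DataFrame({'a': [1, 2, 3]})})
    # err is None, out contains 'done', df_res holds the describe() table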

+ def prime_data(file_obj):
+     """Loads, analyzes, and primes the entire application state upon file upload."""
+     if not file_obj:
+         return {phoenix_tabs: gr.update(visible=False)}

     try:
         df = pd.read_csv(file_obj.name)

+         # Smart type conversion
         for col in df.select_dtypes(include=['object']).columns:
             try:
                 df[col] = pd.to_datetime(df[col], errors='raise')
             except (ValueError, TypeError):
+                 if df[col].nunique() / len(df) < 0.5:  # If not too many unique values
+                     df[col] = df[col].astype('category')

+         # --- Phoenix Eye: Proactive Insights Engine ---
+         insights = {}
         metadata = extract_dataset_metadata(df)

+         # 1. Missing Data
+         missing = df.isnull().sum()
+         insights['missing'] = missing[missing > 0].sort_values(ascending=False)
+
+         # 2. High Cardinality
+         insights['high_cardinality'] = {c: df[c].nunique() for c in metadata['categorical_cols'] if df[c].nunique() > 50}
+
+         # 3. High Correlations
+         if len(metadata['numeric_cols']) > 1:
+             corr = df[metadata['numeric_cols']].corr().abs()
+             sol = corr.unstack()
+             so = sol.sort_values(kind="quicksort", ascending=False)
+             so = so[so < 1]  # Remove self-correlation
+             insights['high_correlations'] = so.head(5)
+
+         # 4. Outlier Detection (IQR method)
+         outliers = {}
+         for col in metadata['numeric_cols']:
+             Q1, Q3 = df[col].quantile(0.25), df[col].quantile(0.75)
+             IQR = Q3 - Q1
+             outlier_count = ((df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))).sum()
+             if outlier_count > 0:
+                 outliers[col] = outlier_count
+         insights['outliers'] = outliers
+
+         # 5. ML Target Suggestion
+         suggestions = []
+         for col in metadata['categorical_cols']:
+             if df[col].nunique() == 2:
+                 suggestions.append(f"{col} (Binary Classification)")
+         for col in metadata['numeric_cols']:
+             if df[col].nunique() > 20:  # Heuristic for continuous target
+                 suggestions.append(f"{col} (Regression)")
+         insights['ml_suggestions'] = suggestions
+
+         state = {
+             'df_original': df,
+             'df_modified': df.copy(),
+             'filename': os.path.basename(file_obj.name),
             'metadata': metadata,
+             'proactive_insights': insights
         }
+
+         # Generate UI updates
+         overview_md = generate_phoenix_eye_markdown(state)
+         all_cols = metadata['columns']
+         num_cols = metadata['numeric_cols']
+         cat_cols = metadata['categorical_cols']

+         return {
+             global_state: state,
+             phoenix_tabs: gr.update(visible=True),
+             phoenix_eye_output: overview_md,
+             # Data Medic updates
+             medic_col_select: gr.update(choices=insights['missing'].index.tolist() or [], interactive=True),
+             # Oracle updates
+             oracle_target_select: gr.update(choices=all_cols, interactive=True),
+             oracle_feature_select: gr.update(choices=all_cols, interactive=True),
         }
+
     except Exception as e:
+         logging.error(f"Priming Error: {e}")
+         return {phoenix_eye_output: gr.update(value=f"❌ **Error:** {e}")}
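
The outlier rule above is the classic 1.5×IQR fence; on a toy series it flags exactly the one extreme value:

    import pandas as pd
    s = pd.Series([1, 2, 2, 3, 3, 3, 4, 100])
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    iqr = q3 - q1
    print(((s < q1 - 1.5 * iqr) | (s > q3 + 1.5 * iqr)).sum())  # -> 1 (the value 100)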

+ def extract_dataset_metadata(df):
+     """Extracts typed metadata from a DataFrame."""
     rows, cols = df.shape
     return {
         'shape': (rows, cols),
+         'columns': df.columns.tolist(),
+         'numeric_cols': df.select_dtypes(include=np.number).columns.tolist(),
+         'categorical_cols': df.select_dtypes(include=['object', 'category']).columns.tolist(),
+         'datetime_cols': df.select_dtypes(include=['datetime64', 'datetime64[ns]']).columns.tolist(),
+         'dtypes': df.dtypes.apply(lambda x: x.name).to_dict()
     }
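
Called on a small frame, the helper returns plain Python values that are easy to serialize into prompts; a sketch:

    import pandas as pd
    tiny = pd.DataFrame({'age': [25, 31], 'city': pd.Categorical(['Pune', 'Oslo'])})
    meta = extract_dataset_metadata(tiny)
    # meta['shape'] == (2, 2); meta['numeric_cols'] == ['age']
    # meta['dtypes'] == {'age': 'int64', 'city': 'category'}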

+ def generate_phoenix_eye_markdown(state):
+     """Creates the markdown for the proactive insights dashboard."""
+     insights = state['proactive_insights']
+     md = f"## 🦅 Phoenix Eye: Proactive Insights for `{state['filename']}`\n"
+     md += f"Dataset has **{state['metadata']['shape'][0]} rows** and **{state['metadata']['shape'][1]} columns**.\n\n"

+     # ML Suggestions
+     md += "### 🔮 Potential ML Targets\n"
+     if insights['ml_suggestions']:
+         for s in insights['ml_suggestions']: md += f"- `{s}`\n"
+     else: md += "No obvious ML target columns found.\n"
+     md += "\n"
+
+     # Missing Data
+     md += "### 🔧 Missing Data\n"
+     if not insights['missing'].empty:
+         md += "Found missing values in these columns. Use the **Data Medic** tab to fix.\n"
+         md += insights['missing'].to_frame('Missing Count').to_markdown() + "\n"
+     else: md += "✅ No missing data found!\n"
+     md += "\n"
+
+     # High Correlation
+     md += "### 🔗 Top Correlations\n"
+     if 'high_correlations' in insights and not insights['high_correlations'].empty:
+         md += insights['high_correlations'].to_frame('Correlation').to_markdown() + "\n"
+     else: md += "No strong correlations found between numeric features.\n"
+     md += "\n"
+
+     # Outliers
+     md += "### 🚨 Outlier Alert\n"
+     if insights['outliers']:
+         for col, count in insights['outliers'].items(): md += f"- `{col}` has **{count}** potential outliers.\n"
+     else: md += "✅ No significant outliers detected.\n"
+     md += "\n"

+     # High Cardinality
+     md += "### 🔠 High Cardinality Warning\n"
+     if insights['high_cardinality']:
+         for col, count in insights['high_cardinality'].items(): md += f"- `{col}` has **{count}** unique values, which may be problematic for some models.\n"
+     else: md += "✅ No high-cardinality categorical columns found.\n"
+     md += "\n"
+
+     return md
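
The tables embedded in this report come from `Series.to_frame(...).to_markdown()`, which requires the `tabulate` package to be installed; for example:

    import pandas as pd
    missing = pd.Series({'age': 3, 'city': 1})
    print(missing.to_frame('Missing Count').to_markdown())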

+ # --- Tab Handlers ---

+ def medic_preview_imputation(state, col, method):
+     """Shows a before-and-after plot for data imputation."""
+     if not col: return None
+     df_orig = state['df_original']
+     df_mod = df_orig.copy()

+     if method == 'mean': value = df_mod[col].mean()
+     elif method == 'median': value = df_mod[col].median()
+     else: value = df_mod[col].mode()[0]

+     df_mod[col] = df_mod[col].fillna(value)

+     fig = go.Figure()
+     fig.add_trace(go.Histogram(x=df_orig[col], name='Before', opacity=0.7))
+     fig.add_trace(go.Histogram(x=df_mod[col], name='After', opacity=0.7))
+     fig.update_layout(barmode='overlay', title=f"'{col}' Distribution: Before vs. After Imputation", legend_title_text='Dataset')
+     return fig
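
The before/after preview relies on two overlaid, semi-transparent histograms; the same pattern in isolation:

    import pandas as pd
    import plotly.graph_objects as go
    before = pd.Series([1.0, 2.0, None, 4.0])
    after = before.fillna(before.median())
    fig = go.Figure([go.Histogram(x=before, name='Before', opacity=0.7),
                     go.Histogram(x=after, name='After', opacity=0.7)])
    fig.update_layout(barmode='overlay')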

+ def medic_apply_imputation(state, col, method):
+     """Applies imputation and updates the main state."""
+     if not col: return state, "No column selected.", gr.update()
+     df_mod = state['df_modified'].copy()
+
+     if method == 'mean': value = df_mod[col].mean()
+     elif method == 'median': value = df_mod[col].median()
+     else: value = df_mod[col].mode()[0]

+     df_mod[col] = df_mod[col].fillna(value)
+     state['df_modified'] = df_mod
+
+     # Re-run proactive insights on the modified df
+     state['proactive_insights']['missing'] = df_mod.isnull().sum()
+     state['proactive_insights']['missing'] = state['proactive_insights']['missing'][state['proactive_insights']['missing'] > 0]
+
+     return state, f"✅ Applied '{method}' imputation to '{col}'.", gr.update(choices=state['proactive_insights']['missing'].index.tolist())
+
+ def download_cleaned_data(state):
+     """Saves the modified dataframe to a csv and returns the path."""
+     if state:
+         df = state['df_modified']
+         # gr.File needs a filesystem path, so write the CSV to a temp file first
+         import tempfile
+         tmp = tempfile.NamedTemporaryFile(suffix=".csv", delete=False, mode="w")
+         df.to_csv(tmp.name, index=False)
+         return gr.File.update(value=tmp.name, visible=True)
+     return gr.File.update(visible=False)
+
+ def oracle_run_model(state, target, features, model_name):
+     """Trains a simple ML model and returns metrics and plots."""
+     if not target or not features: return None, None, "Please select a target and at least one feature."
+
+     df = state['df_modified'].copy()
+
+     # Preprocessing
+     df.dropna(subset=features + [target], inplace=True)
+     if df.empty: return None, None, "Not enough data after dropping NA values."
+
+     le = LabelEncoder()
+     for col in features + [target]:
+         if df[col].dtype == 'object' or df[col].dtype.name == 'category':
+             df[col] = le.fit_transform(df[col])
+
+     X = df[features]
+     y = df[target]
+
+     problem_type = "Classification" if y.nunique() <= 10 else "Regression"
+
+     if model_name not in MODEL_REGISTRY[problem_type]:
+         return None, None, f"Model {model_name} is not suitable for {problem_type}."

+     model_cls = MODEL_REGISTRY[problem_type][model_name]
+     # LinearRegression takes no random_state, so only pass it where supported
+     model = model_cls(random_state=42) if model_cls is not LinearRegression else model_cls()

+     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
+     model.fit(X_train, y_train)
+     preds = model.predict(X_test)
+
+     # Results
+     if problem_type == "Classification":
+         acc = accuracy_score(y_test, preds)
+         cm = confusion_matrix(y_test, preds)
+         cm_fig = px.imshow(cm, text_auto=True, title=f"Confusion Matrix (Accuracy: {acc:.2f})")

+         if hasattr(model, 'feature_importances_'):
+             fi = pd.Series(model.feature_importances_, index=features).sort_values(ascending=False)
+             fi_fig = px.bar(fi, title="Feature Importance")
+             return fi_fig, cm_fig, f"**Classification Report:**\n- Accuracy: {acc:.2f}"
+         else:
+             return None, cm_fig, f"**Classification Report:**\n- Accuracy: {acc:.2f}"
+
+     else:  # Regression
+         r2 = r2_score(y_test, preds)
+         rmse = np.sqrt(mean_squared_error(y_test, preds))
+
+         preds_fig = px.scatter(x=y_test, y=preds, labels={'x': 'Actual Values', 'y': 'Predicted Values'},
+                                title=f"Predictions vs. Actuals (R²: {r2:.2f})", trendline='ols')
+
+         if hasattr(model, 'feature_importances_'):
+             fi = pd.Series(model.feature_importances_, index=features).sort_values(ascending=False)
+             fi_fig = px.bar(fi, title="Feature Importance")
+             return fi_fig, preds_fig, f"**Regression Report:**\n- R² Score: {r2:.2f}\n- RMSE: {rmse:.2f}"
+         else:
+             return None, preds_fig, f"**Regression Report:**\n- R² Score: {r2:.2f}\n- RMSE: {rmse:.2f}"
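
End to end, the Oracle is the standard split/fit/score loop; a self-contained sketch on synthetic data:

    import numpy as np
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import r2_score
    from sklearn.model_selection import train_test_split

    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 3))
    y = X @ np.array([2.0, -1.0, 0.5]) + rng.normal(scale=0.1, size=200)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=42)
    model = RandomForestRegressor(random_state=42).fit(X_tr, y_tr)
    print(round(r2_score(y_te, model.predict(X_te)), 2))  # typically high on this synthetic data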
+
+ def copilot_respond(user_message, history, state, api_key):
+     """Handles the AI Co-pilot chat interaction."""
     if not api_key:
+         # This function is a generator, so even the guard path must yield
+         yield history + [(user_message, "I need a Gemini API key to function.")], None, None, ""
+         return

+     history += [(user_message, None)]
     prompt = f"""
+     You are 'Phoenix Co-pilot', a world-class AI data analyst. Your goal is to help the user by writing and executing Python code.
+     You have access to a pandas DataFrame named `df`. This is the user's LATEST data, including any cleaning they've performed.
+
+     **DataFrame Info:**
+     - Columns and dtypes: {json.dumps(state['metadata']['dtypes'])}
+
     **Instructions:**
+     1. Analyze the user's request: '{user_message}'.
+     2. Formulate a plan (thought).
+     3. Write Python code to execute the plan.
+     4. Use `pandas`, `numpy`, and `plotly.express as px`.
+     5. To show a plot, assign it to a variable `fig`. Ex: `fig = px.histogram(df, x='age')`.
+     6. To show a dataframe, assign it to a variable `df_result`. Ex: `df_result = df.describe()`.
+     7. Use `print()` for text output.
+     8. **NEVER** modify `df` in place. Use `df.copy()` if needed.
+     9. Respond **ONLY** with a single, valid JSON object with keys "thought" and "code".
+
+     **User Request:** "{user_message}"
+
     **Your JSON Response:**
     """

         model = genai.GenerativeModel('gemini-1.5-flash')
         response = model.generate_content(prompt)

+         # Clean and parse JSON
+         response_json = json.loads(response.text.strip().replace("```json", "").replace("```", ""))
         thought = response_json.get("thought", "Thinking...")
+         code_to_run = response_json.get("code", "print('No code generated.')")

+         bot_thinking = f"🧠 **Thinking:** *{thought}*"
+         history[-1] = (user_message, bot_thinking)
+         yield history, None, None, gr.update(value=code_to_run)

+         # Execute Code
+         local_vars = {'df': state['df_modified'], 'px': px, 'pd': pd, 'np': np}
+         stdout, fig_result, df_result, error = safe_exec(code_to_run, local_vars)
+
+         bot_response = bot_thinking + "\n\n---\n\n"

         if error:
+             bot_response += f"💥 **Execution Error:**\n```\n{error}\n```"
         if stdout:
+             bot_response += f"📋 **Output:**\n```\n{stdout}\n```"
+         if not error and not stdout and not fig_result and not isinstance(df_result, pd.DataFrame):
+             bot_response += "✅ Code executed, but produced no direct output."
+
+         history[-1] = (user_message, bot_response)
+         yield history, fig_result, df_result, gr.update(value=code_to_run)

     except Exception as e:
+         error_msg = f"A critical error occurred: {e}. The AI may have returned invalid JSON. Check the generated code."
         history[-1] = (user_message, error_msg)
+         yield history, None, None, ""
+
+ # --- Gradio UI Construction ---
+
+ with gr.Blocks(theme=THEME, title="Phoenix AI Data Explorer") as demo:
+     global_state = gr.State({})
+
+     gr.Markdown("# 🔥 Phoenix AI Data Explorer")
+     gr.Markdown("The next-generation analytic tool. Upload your data to awaken the Phoenix.")
+
+     with gr.Row():
+         file_input = gr.File(label="📁 Upload CSV", file_types=[".csv"])
+         api_key_input = gr.Textbox(label="🔑 Gemini API Key", type="password", placeholder="Enter Google AI Studio key...")
+
+     with gr.Tabs(visible=False) as phoenix_tabs:
+         with gr.Tab("🦅 Phoenix Eye"):
+             phoenix_eye_output = gr.Markdown()
+
+         with gr.Tab("🩺 Data Medic"):
+             gr.Markdown("### Cleanse Your Data\nSelect a column with missing values and choose a method to fill them.")
+             with gr.Row():
+                 medic_col_select = gr.Dropdown(label="Select Column to Clean")
+                 medic_method_select = gr.Radio(['mean', 'median', 'mode'], label="Imputation Method", value='mean')
+             medic_preview_btn = gr.Button("🔍 Preview Changes")
+             medic_plot = gr.Plot()
+             with gr.Row():
+                 medic_apply_btn = gr.Button("✅ Apply & Save Changes", variant="primary")
+                 medic_status = gr.Textbox(label="Status", interactive=False)
+             with gr.Accordion("Download Cleaned Data", open=False):
+                 download_btn = gr.Button("⬇️ Download Cleaned CSV")
+                 download_file_output = gr.File(label="Download Link", visible=False)
+
+         with gr.Tab("🔮 The Oracle (Predictive Modeling)"):
+             gr.Markdown("### Glimpse the Future\nTrain a simple model to see the predictive power of your data.")
+             with gr.Row():
+                 oracle_target_select = gr.Dropdown(label="🎯 Select Target Variable")
+                 oracle_feature_select = gr.Dropdown(label="✨ Select Features", multiselect=True)
+                 oracle_model_select = gr.Dropdown(choices=["Random Forest", "Logistic Regression", "Linear Regression"], label="🧠 Select Model")
+             oracle_run_btn = gr.Button("🚀 Train Model!", variant="primary")
+             oracle_status = gr.Markdown()
+             with gr.Row():
+                 oracle_fig1 = gr.Plot()
+                 oracle_fig2 = gr.Plot()

+         with gr.Tab("🤖 AI Co-pilot"):
+             gr.Markdown("### Your Conversational Analyst\nAsk any question about your data in plain English.")
+             copilot_chatbot = gr.Chatbot(label="Chat History", height=400)
+             with gr.Accordion("AI Generated Results", open=True):
+                 copilot_fig_output = gr.Plot()
+                 copilot_df_output = gr.Dataframe(interactive=False)
+             with gr.Accordion("Generated Code", open=False):
+                 copilot_code_output = gr.Code(language="python", interactive=False)
+
+             with gr.Row():
+                 copilot_input = gr.Textbox(label="Your Question", placeholder="e.g., 'What's the correlation between age and salary?'", scale=4)
+                 copilot_submit_btn = gr.Button("Submit", variant="primary", scale=1)
+
+     # --- Event Wiring ---
+     file_input.upload(
+         fn=prime_data,
+         inputs=file_input,
+         outputs=[global_state, phoenix_tabs, phoenix_eye_output, medic_col_select, oracle_target_select, oracle_feature_select],
+         show_progress="full"
+     )
+
+     # Data Medic
+     medic_preview_btn.click(medic_preview_imputation, [global_state, medic_col_select, medic_method_select], medic_plot)
+     medic_apply_btn.click(medic_apply_imputation, [global_state, medic_col_select, medic_method_select], [global_state, medic_status, medic_col_select])
+     download_btn.click(download_cleaned_data, [global_state], download_file_output)
+
+     # Oracle
+     oracle_run_btn.click(
+         oracle_run_model,
+         [global_state, oracle_target_select, oracle_feature_select, oracle_model_select],
+         [oracle_fig1, oracle_fig2, oracle_status],
+         show_progress="full"
+     )
+
+     # AI Co-pilot
+     copilot_submit_btn.click(
+         copilot_respond,
+         [copilot_input, copilot_chatbot, global_state, api_key_input],
+         [copilot_chatbot, copilot_fig_output, copilot_df_output, copilot_code_output]
+     ).then(lambda: "", None, copilot_input)  # Clear input after submit

 if __name__ == "__main__":
+     demo.launch(debug=True)