Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,486 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
import plotly.express as px
|
5 |
+
import plotly.graph_objects as go
|
6 |
+
from plotly.subplots import make_subplots
|
7 |
+
import io
|
8 |
+
from scipy import stats
|
9 |
+
import warnings
|
10 |
+
import google.generativeai as genai
|
11 |
+
import os
|
12 |
+
from dotenv import load_dotenv
|
13 |
+
import logging
|
14 |
+
import json
|
15 |
+
from contextlib import redirect_stdout
|
16 |
+
|
17 |
+
# --- Configuration ---
# Silence noisy library warnings (pandas/plotly deprecations) so they don't
# clutter the Space logs; app-level events still go through `logging` below.
warnings.filterwarnings('ignore')
# Timestamped INFO-level logging used for file-load events and for errors
# raised while executing AI-generated code.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
20 |
+
|
21 |
+
# --- Helper Functions ---
|
22 |
+
|
23 |
+
def safe_exec(code_string: str, local_vars: dict) -> tuple:
    """Safely execute a string of Python code and capture its output.

    SECURITY NOTE: ``exec`` runs the snippet with full interpreter
    privileges — callers must treat the code as untrusted (here it comes
    from an LLM) and never feed it user-supplied code paths elsewhere.

    Args:
        code_string: Python source to execute (may assign a plotly figure
            to a variable named ``fig``).
        local_vars: Names to expose to the snippet (e.g. ``df``, ``px``).
            The dict is copied, so the caller's mapping is not mutated.

    Returns:
        Tuple ``(stdout_text, fig_or_None, error_message_or_None)``;
        on failure the first two elements are ``None``.
    """
    output_buffer = io.StringIO()
    # BUG FIX: the previous exec(code, globals(), local_vars) form uses
    # class-body scoping — a function *defined* by the snippet could not see
    # names like `df` (NameError). Executing in one merged namespace restores
    # normal module-level scoping, and also stops leaking this app's own
    # globals into the untrusted snippet.
    namespace = dict(local_vars)
    try:
        with redirect_stdout(output_buffer):
            exec(code_string, namespace)

        stdout_output = output_buffer.getvalue()
        fig = namespace.get('fig', None)
        return stdout_output, fig, None
    except Exception as e:
        error_message = f"Execution Error: {str(e)}"
        logging.error(f"Error executing AI-generated code: {error_message}")
        return None, None, error_message
|
37 |
+
|
38 |
+
# --- Core Data Processing ---
|
39 |
+
|
40 |
+
def load_and_process_file(file_obj, state_dict):
    """Loads a CSV file and performs initial processing, updating the global state.

    Wired to ``file_input.upload`` with nine registered outputs, so EVERY
    return path must yield exactly nine values:
    (state, status_md, uni_col_select, bi_x_select, bi_y_select,
     ts_value_col, ts_time_col, ts_tab, chat_tab).

    Args:
        file_obj: Gradio file wrapper exposing a ``.name`` path, or None.
        state_dict: Current app state dict; replaced on a successful load.
    """
    if file_obj is None:
        # BUG FIX: this branch previously returned only 5 values for the 9
        # registered outputs, so Gradio raised an error instead of showing
        # the message. Pad with no-op updates and keep the AI tabs hidden.
        return (
            state_dict,
            "Please upload a file.",
            gr.update(), gr.update(), gr.update(), gr.update(), gr.update(),
            gr.update(visible=False),
            gr.update(visible=False),
        )

    try:
        df = pd.read_csv(file_obj.name)

        # Attempt to convert object columns to datetime; columns that fail
        # to parse are left untouched (strings).
        for col in df.select_dtypes(include=['object']).columns:
            try:
                df[col] = pd.to_datetime(df[col], errors='raise')
                logging.info(f"Successfully converted column '{col}' to datetime.")
            except (ValueError, TypeError):
                continue

        metadata = extract_dataset_metadata(df)

        state_dict = {
            'df': df,
            'metadata': metadata,
            'filename': os.path.basename(file_obj.name)
        }

        # Shared kwargs for the three "any column" dropdowns.
        update_args = {
            'choices': metadata['columns'],
            'value': None,
            'interactive': True
        }

        # The Time Series tab is only useful when a datetime column exists.
        time_series_visible = len(metadata['datetime_cols']) > 0

        return (
            state_dict,
            f"β Loaded `{state_dict['filename']}` ({metadata['shape'][0]} rows, {metadata['shape'][1]} cols)",
            gr.update(**update_args), gr.update(**update_args), gr.update(**update_args),
            gr.update(choices=metadata['numeric_cols'], value=None, interactive=True),
            gr.update(choices=metadata['datetime_cols'], value=None, interactive=True),
            gr.update(visible=time_series_visible),  # Show/hide Time Series tab
            gr.update(visible=True)  # Show Chatbot tab
        )
    except Exception as e:
        logging.error(f"Error loading file: {e}")
        return state_dict, f"β Error: {e}", gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(visible=False), gr.update(visible=False)
|
86 |
+
|
87 |
+
def extract_dataset_metadata(df: pd.DataFrame) -> dict:
    """Extracts comprehensive metadata from a DataFrame.

    Args:
        df: The loaded dataset.

    Returns:
        Dict with keys: 'shape' (rows, cols), 'columns', 'numeric_cols',
        'categorical_cols', 'datetime_cols', 'dtypes' (string dump),
        'missing_data' (per-column null counts), 'data_quality'
        (percentage of non-missing cells, one decimal), and 'head'
        (first five rows as text).
    """
    rows, cols = df.shape
    columns = df.columns.tolist()

    numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    # GENERALIZED: 'datetime64'/'datetime64[ns]' alone misses tz-aware
    # columns (dtype 'datetime64[ns, tz]'); the 'datetimetz' selector
    # catches those too, so tz-aware columns reach the Time Series tab.
    datetime_cols = df.select_dtypes(include=['datetime', 'datetimetz']).columns.tolist()

    missing_data = df.isnull().sum()
    # Guard against division by zero for an empty frame.
    data_quality = round((df.notna().sum().sum() / (rows * cols)) * 100, 1) if rows * cols > 0 else 0

    return {
        'shape': (rows, cols),
        'columns': columns,
        'numeric_cols': numeric_cols,
        'categorical_cols': categorical_cols,
        'datetime_cols': datetime_cols,
        'dtypes': df.dtypes.to_string(),
        'missing_data': missing_data.to_dict(),
        'data_quality': data_quality,
        'head': df.head().to_string()
    }
|
110 |
+
|
111 |
+
# --- Tab 1: AI Overview ---
|
112 |
+
|
113 |
+
def analyze_dataset_overview(state_dict, api_key: str):
    """Generates an AI-powered narrative overview of the dataset.

    Sends only metadata (plus ``df.head()``) to Gemini — never the full
    dataset — and degrades to a static fallback summary if the API call
    fails for any reason.

    Args:
        state_dict: App state with 'metadata' and 'filename'.
        api_key: Gemini API key entered by the user.

    Returns:
        Tuple ``(story_markdown, basic_info_markdown, data_quality_pct)``;
        the last value feeds the numeric quality-score widget.
    """
    # Guard clauses: both checks short-circuit before any API usage.
    if not state_dict:
        return "β Please upload a dataset first.", "", 0
    if not api_key:
        return "β Please enter your Gemini API key.", "", 0

    metadata = state_dict['metadata']

    # Create prompt for Gemini
    prompt = f"""
    You are an expert data analyst and storyteller. Your task is to provide a high-level, engaging overview of a dataset based on its metadata.

    **Dataset Metadata:**
    - **Shape:** {metadata['shape'][0]} rows, {metadata['shape'][1]} columns
    - **Column Names:** {', '.join(metadata['columns'])}
    - **Numeric Columns:** {', '.join(metadata['numeric_cols'])}
    - **Categorical Columns:** {', '.join(metadata['categorical_cols'])}
    - **Datetime Columns:** {', '.join(metadata['datetime_cols'])}
    - **Data Quality (Non-missing values):** {metadata['data_quality']}%
    - **First 5 rows:**
    {metadata['head']}

    **Your Task:**
    Based on the metadata, generate a report in Markdown format. Use emojis to make it visually appealing. The report should have the following sections:

    # π AI-Powered Dataset Overview

    ## π€ What is this dataset likely about?
    (Predict the domain and purpose of the dataset, e.g., "This appears to be customer transaction data for an e-commerce platform.")

    ## π‘ Potential Key Questions to Explore
    - (Suggest 3-4 interesting business or research questions the data could answer.)
    - (Example: "Which products are most frequently purchased together?")

    ## π Potential Analyses & Visualizations
    - (List 3-4 types of analyses that would be valuable.)
    - (Example: "Time series analysis of sales to identify seasonality.")

    ## β οΈ Data Quality & Potential Issues
    - (Briefly comment on the data quality score and mention if the presence of datetime columns is a good sign for certain analyses.)
    """

    try:
        genai.configure(api_key=api_key)
        # NOTE(review): model name is pinned; update if the Gemini API
        # retires 'gemini-1.5-flash'.
        model = genai.GenerativeModel('gemini-1.5-flash')
        response = model.generate_content(prompt)
        story = response.text
    except Exception as e:
        # Any failure (bad key, network, quota) falls back to a purely
        # local summary built from the already-computed metadata.
        story = f"## β οΈ AI Generation Failed\n**Error:** {str(e)}\n\nPlease check your API key and network connection. A fallback analysis is provided below.\n\n" \
                f"### Fallback Analysis\nThis dataset contains **{metadata['shape'][0]}** records and **{metadata['shape'][1]}** features. " \
                f"It includes **{len(metadata['numeric_cols'])}** numeric, **{len(metadata['categorical_cols'])}** categorical, " \
                f"and **{len(metadata['datetime_cols'])}** time-based columns. The overall data quality is **{metadata['data_quality']}%**, " \
                f"which is a good starting point for analysis."

    # Basic Info Summary
    basic_info = f"""
    π **File:** `{state_dict.get('filename', 'N/A')}`
    π **Size:** {metadata['shape'][0]:,} rows Γ {metadata['shape'][1]} columns
    π’ **Features:**
    β’ **Numeric:** {len(metadata['numeric_cols'])}
    β’ **Categorical:** {len(metadata['categorical_cols'])}
    β’ **DateTime:** {len(metadata['datetime_cols'])}
    π― **Data Quality:** {metadata['data_quality']}%
    """

    return story, basic_info, metadata['data_quality']
|
180 |
+
|
181 |
+
# --- Tab 2: Univariate Analysis ---
|
182 |
+
|
183 |
+
def generate_univariate_plot(column_name, state_dict):
    """Generates plots for a single selected variable.

    Numeric columns get a histogram + box plot, categoricals a top-20 bar
    chart, datetimes a monthly record-count line.

    Args:
        column_name: Column chosen in the dropdown.
        state_dict: App state with 'df' and 'metadata'.

    Returns:
        ``(figure_or_None, markdown_summary)``; the figure is None when no
        column is selected or the column matches no known type bucket.
    """
    if not column_name or not state_dict:
        return None, "Select a column to analyze."

    df = state_dict['df']
    metadata = state_dict['metadata']

    fig = None
    summary = ""

    if column_name in metadata['numeric_cols']:
        fig = make_subplots(rows=1, cols=2, subplot_titles=("Histogram", "Box Plot"))
        fig.add_trace(go.Histogram(x=df[column_name], name="Histogram"), row=1, col=1)
        fig.add_trace(go.Box(y=df[column_name], name="Box Plot"), row=1, col=2)
        fig.update_layout(title_text=f"Distribution of '{column_name}'", showlegend=False)
        summary = df[column_name].describe().to_frame().to_markdown()

    elif column_name in metadata['categorical_cols']:
        top_n = 20  # cap bars so high-cardinality columns stay readable
        counts = df[column_name].value_counts()
        title = f"Top {min(top_n, len(counts))} Categories for '{column_name}'"
        fig = px.bar(counts.nlargest(top_n), title=title, labels={'index': column_name, 'value': 'Count'})
        fig.update_layout(showlegend=False)
        summary = counts.to_frame().to_markdown()

    elif column_name in metadata['datetime_cols']:
        # Bucket records per calendar month for the trend line.
        counts = df[column_name].dt.to_period("M").value_counts().sort_index()
        fig = px.line(x=counts.index.to_timestamp(), y=counts.values, title=f"Records over Time for '{column_name}'")
        fig.update_layout(xaxis_title="Time", yaxis_title="Record Count")
        # BUG FIX: describe(datetime_is_numeric=True) was removed in
        # pandas 2.0 and raised TypeError for every datetime column.
        # Modern pandas summarizes datetimes natively; keep the legacy
        # keyword only as a fallback for pandas < 2.0 environments.
        try:
            summary = df[column_name].describe().to_frame().to_markdown()
        except Exception:
            summary = df[column_name].describe(datetime_is_numeric=True).to_frame().to_markdown()

    return fig, summary
|
216 |
+
|
217 |
+
# --- Tab 3: Bivariate Analysis ---
|
218 |
+
|
219 |
+
def generate_bivariate_plot(x_col, y_col, state_dict):
    """Generates plots to explore the relationship between two variables.

    The chart type follows the numeric/categorical combination of the two
    selections: scatter with an OLS trendline (numeric/numeric), horizontal
    box plots with the numeric variable on the x-axis (mixed), or a count
    heatmap (categorical/categorical — datetimes fall in this bucket too).

    Returns:
        ``(figure_or_None, markdown_summary)``.
    """
    # Guard clauses: need two distinct selections and loaded data.
    if not x_col or not y_col or not state_dict:
        return None, "Select two columns to analyze."
    if x_col == y_col:
        return None, "Please select two different columns."

    frame = state_dict['df']
    numeric = set(state_dict['metadata']['numeric_cols'])
    x_is_num = x_col in numeric
    y_is_num = y_col in numeric
    title = f"{x_col} vs. {y_col}"

    if x_is_num and y_is_num:
        fig = px.scatter(frame, x=x_col, y=y_col, title=f"Scatter Plot: {title}",
                         trendline="ols", trendline_color_override="red")
        summary = frame[[x_col, y_col]].corr().to_markdown()
    elif x_is_num != y_is_num:
        # Mixed pair: normalize so the numeric column is always on the
        # x-axis, giving horizontal box plots for either selection order.
        num_col, cat_col = (x_col, y_col) if x_is_num else (y_col, x_col)
        fig = px.box(frame, x=num_col, y=cat_col, title=f"Box Plot: {title}")
        summary = frame.groupby(cat_col)[num_col].describe().to_markdown()
    else:
        # Both categorical: cross-tabulated counts as a heatmap.
        crosstab = pd.crosstab(frame[x_col], frame[y_col])
        fig = px.imshow(crosstab, title=f"Heatmap of Counts: {title}", text_auto=True)
        summary = crosstab.to_markdown()

    return fig, f"### Analysis Summary\n{summary}"
|
250 |
+
|
251 |
+
# --- Tab 4: Time Series Analysis ---
|
252 |
+
|
253 |
+
def generate_time_series_plot(time_col, value_col, resample_freq, state_dict):
    """Generates a time series plot with resampling.

    Args:
        time_col: Column to parse as the datetime index.
        value_col: Column whose per-period mean is plotted.
        resample_freq: Pandas offset alias from the radio ('D','W','M','Q','Y').
        state_dict: App state holding the DataFrame.

    Returns:
        ``(figure_or_None, status_or_error_message)``.
    """
    if not time_col or not value_col or not state_dict:
        return None, "Select Time and Value columns."

    df = state_dict['df'].copy()

    try:
        df[time_col] = pd.to_datetime(df[time_col])
        indexed = df.set_index(time_col)[value_col]
        try:
            df_resampled = indexed.resample(resample_freq).mean().reset_index()
        except ValueError:
            # ROBUSTNESS: 'M'/'Q'/'Y' aliases are deprecated since pandas 2.2
            # and rejected by newer versions in favor of 'ME'/'QE'/'YE'.
            # Retry with the modern spelling so the hard-coded radio choices
            # keep working; anything else genuinely invalid re-raises into
            # the outer handler.
            modern = {'M': 'ME', 'Q': 'QE', 'Y': 'YE'}
            if resample_freq not in modern:
                raise
            df_resampled = indexed.resample(modern[resample_freq]).mean().reset_index()

        fig = px.line(df_resampled, x=time_col, y=value_col,
                      title=f"Time Series of {value_col} (Resampled to '{resample_freq}')")
        fig.update_layout(xaxis_title="Date", yaxis_title=f"Mean of {value_col}")
        return fig, f"Showing mean of '{value_col}' aggregated by '{resample_freq}'."
    except Exception as e:
        # Non-numeric value columns, unparsable dates, etc. surface here.
        return None, f"Error: {e}"
|
270 |
+
|
271 |
+
# --- Tab 5: AI Analyst Chat ---
|
272 |
+
|
273 |
+
def respond_to_chat(user_message, history, state_dict, api_key):
    """Handles the chat interaction with the AI Analyst.

    Flow: validate prerequisites -> ask Gemini for a JSON plan of the form
    ``{"thought": ..., "code": ...}`` -> run the returned code via
    ``safe_exec`` against the loaded DataFrame -> render stdout and/or a
    plotly figure back to the chat.

    Args:
        user_message: Question typed by the user.
        history: Chatbot history as a list of (user, bot) message tuples.
        state_dict: App state with 'df', 'metadata' and 'filename'.
        api_key: Gemini API key from the textbox on the AI Overview tab.

    Returns:
        ``(history, figure_or_None, "")`` — the trailing empty string
        clears the input textbox.
    """
    # Guard clauses: answer locally (no API call) when prerequisites fail.
    if not api_key:
        history.append((user_message, "I can't answer without a Gemini API key. Please enter it in the 'AI Overview' tab."))
        return history, None, ""

    if not state_dict:
        history.append((user_message, "Please upload a dataset before asking questions."))
        return history, None, ""

    # Placeholder entry; replaced in-place (history[-1]) once the reply is ready.
    history.append((user_message, None))

    df_metadata = state_dict['metadata']

    # Construct a robust prompt for the AI.
    # Only metadata is sent — never the DataFrame contents.
    prompt = f"""
    You are an AI Data Analyst assistant. Your name is 'Gemini Analyst'.
    You are given a pandas DataFrame named `df`.
    Your goal is to answer the user's question about this DataFrame by writing and executing Python code.

    **Instructions:**
    1. Analyze the user's question.
    2. Write Python code to answer it.
    3. You can use pandas, numpy, and plotly.express.
    4. If you create a plot, you **MUST** assign it to a variable named `fig`. The plot will be displayed to the user.
    5. If you are just calculating something or printing text, the `print()` output will be shown.
    6. **DO NOT** write any code that modifies the DataFrame (e.g., `df.dropna(inplace=True)`). Use `df.copy()` if you need to modify data.
    7. Respond **ONLY** with a JSON object containing two keys: "thought" and "code".
       - "thought": A short, one-sentence explanation of your plan.
       - "code": A string containing the Python code to execute.

    **DataFrame Metadata:**
    - **Filename:** {state_dict['filename']}
    - **Shape:** {df_metadata['shape'][0]} rows, {df_metadata['shape'][1]} columns
    - **Columns and Data Types:**
    {df_metadata['dtypes']}

    ---
    **User Question:** "{user_message}"
    ---

    **Your JSON Response:**
    """

    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-flash')
        response = model.generate_content(prompt)

        # Clean and parse the JSON response
        # (strips the Markdown code fences models often wrap JSON in;
        # json.loads failures fall through to the outer except).
        response_text = response.text.strip().replace("```json", "").replace("```", "")
        response_json = json.loads(response_text)

        thought = response_json.get("thought", "Thinking...")
        code_to_run = response_json.get("code", "")

        bot_message = f"π§ **Thought:** {thought}\n\n"

        # Execute the code with only the DataFrame and plotting/array libs
        # in scope. NOTE(review): exec-based — model output is untrusted
        # code; see the security note on safe_exec.
        local_vars = {'df': state_dict['df'], 'px': px, 'pd': pd, 'np': np}
        stdout, fig_result, error = safe_exec(code_to_run, local_vars)

        if error:
            bot_message += f"π₯ **Error:**\n```\n{error}\n```"
            history[-1] = (user_message, bot_message)
            return history, None, ""

        if stdout:
            bot_message += f"π **Output:**\n```\n{stdout}\n```"

        if not fig_result and not stdout:
            bot_message += "β Code executed successfully, but it produced no visible output."

        history[-1] = (user_message, bot_message)

        return history, fig_result, ""

    except Exception as e:
        # Catches API failures and invalid / non-JSON model replies alike.
        error_msg = f"An unexpected error occurred: {e}. The AI might have returned an invalid response. Please try rephrasing your question."
        logging.error(f"Chatbot error: {error_msg}")
        history[-1] = (user_message, error_msg)
        return history, None, ""
|
355 |
+
|
356 |
+
# --- Gradio Interface ---
|
357 |
+
|
358 |
+
def create_gradio_interface():
    """Builds and returns the full Gradio application interface.

    Layout: a top row (file upload + API key) above five tabs — AI
    Overview, Univariate, Bivariate, Time Series (hidden until a datetime
    column is detected) and AI Analyst Chat (hidden until data loads).
    All per-session data lives in a ``gr.State`` dict so concurrent users
    do not share DataFrames.

    Returns:
        The assembled ``gr.Blocks`` app (not yet launched).
    """
    with gr.Blocks(title="π AI Data Explorer", theme=gr.themes.Soft()) as demo:
        # Global state to hold data: {'df', 'metadata', 'filename'} once loaded.
        global_state = gr.State({})

        # Header
        gr.Markdown("# π AI Data Explorer: Your Advanced Analytic Tool")
        gr.Markdown("Upload a CSV, then explore your data with interactive tabs and a powerful AI Analyst.")

        # --- Top Row: File Upload and API Key ---
        with gr.Row():
            with gr.Column(scale=2):
                file_input = gr.File(label="π Upload CSV File", file_types=[".csv"])
                status_output = gr.Markdown("Status: Waiting for file...")
            with gr.Column(scale=1):
                api_key_input = gr.Textbox(
                    label="π Gemini API Key",
                    placeholder="Enter your key here...",
                    type="password",
                    info="Get your free key from Google AI Studio"
                )

        # --- Main Tabs ---
        with gr.Tabs() as tabs:
            # Tab 1: AI Overview
            with gr.Tab("π€ AI Overview", id=0):
                overview_btn = gr.Button("π§ Generate AI Overview", variant="primary")
                with gr.Row():
                    story_output = gr.Markdown(label="π AI-Generated Story")
                    with gr.Column():
                        basic_info_output = gr.Markdown(label="π Basic Information")
                        quality_score = gr.Number(label="π― Data Quality Score (%)", interactive=False)

            # Tab 2: Univariate Analysis
            # (dropdowns start non-interactive; enabled by load_and_process_file)
            with gr.Tab("π Univariate Analysis", id=1):
                uni_col_select = gr.Dropdown(label="Select a Column to Analyze", interactive=False)
                with gr.Row():
                    uni_plot_output = gr.Plot(label="Distribution Plot")
                    uni_summary_output = gr.Markdown(label="Summary Statistics")

            # Tab 3: Bivariate Analysis
            with gr.Tab("π Bivariate Analysis", id=2):
                with gr.Row():
                    bi_x_select = gr.Dropdown(label="Select X-Axis Column", interactive=False)
                    bi_y_select = gr.Dropdown(label="Select Y-Axis Column", interactive=False)
                bi_btn = gr.Button("π¨ Generate Bivariate Plot", variant="secondary")
                with gr.Row():
                    bi_plot_output = gr.Plot(label="Relationship Plot")
                    bi_summary_output = gr.Markdown(label="Analysis Summary")

            # Tab 4: Time Series (conditionally visible)
            with gr.Tab("β³ Time Series Analysis", id=3, visible=False) as ts_tab:
                with gr.Row():
                    ts_time_col = gr.Dropdown(label="Select Time Column", interactive=False)
                    ts_value_col = gr.Dropdown(label="Select Value Column", interactive=False)
                    ts_resample = gr.Radio(['D', 'W', 'M', 'Q', 'Y'], label="Resample Frequency", value='M')
                ts_btn = gr.Button("π Plot Time Series", variant="secondary")
                ts_plot_output = gr.Plot(label="Time Series Plot")
                ts_status_output = gr.Markdown()

            # Tab 5: AI Analyst Chat (conditionally visible)
            with gr.Tab("π¬ AI Analyst Chat", id=4, visible=False) as chat_tab:
                chatbot = gr.Chatbot(label="Chat with Gemini Analyst", height=500)
                chat_plot_output = gr.Plot(label="AI Generated Plot")
                with gr.Row():
                    chat_input = gr.Textbox(label="Your Question", placeholder="e.g., 'Show me the distribution of age'", scale=4)
                    chat_submit_btn = gr.Button("Submit", variant="primary", scale=1)
                chat_clear_btn = gr.Button("Clear Chat")

        # --- Event Handlers ---

        # File upload triggers data loading and UI updates
        # (nine outputs: load_and_process_file must always return nine values).
        file_input.upload(
            fn=load_and_process_file,
            inputs=[file_input, global_state],
            outputs=[global_state, status_output, uni_col_select, bi_x_select, bi_y_select, ts_value_col, ts_time_col, ts_tab, chat_tab]
        )

        # Tab 1: Overview
        overview_btn.click(
            fn=analyze_dataset_overview,
            inputs=[global_state, api_key_input],
            outputs=[story_output, basic_info_output, quality_score]
        )

        # Tab 2: Univariate — re-plots on every dropdown change, no button.
        uni_col_select.change(
            fn=generate_univariate_plot,
            inputs=[uni_col_select, global_state],
            outputs=[uni_plot_output, uni_summary_output]
        )

        # Tab 3: Bivariate
        bi_btn.click(
            fn=generate_bivariate_plot,
            inputs=[bi_x_select, bi_y_select, global_state],
            outputs=[bi_plot_output, bi_summary_output]
        )

        # Tab 4: Time Series
        ts_btn.click(
            fn=generate_time_series_plot,
            inputs=[ts_time_col, ts_value_col, ts_resample, global_state],
            outputs=[ts_plot_output, ts_status_output]
        )

        # Tab 5: AI Chat — button click and textbox Enter share one handler.
        chat_submit_btn.click(
            fn=respond_to_chat,
            inputs=[chat_input, chatbot, global_state, api_key_input],
            outputs=[chatbot, chat_plot_output, chat_input]
        )
        chat_input.submit(
            fn=respond_to_chat,
            inputs=[chat_input, chatbot, global_state, api_key_input],
            outputs=[chatbot, chat_plot_output, chat_input]
        )
        # Reset chat history, the AI plot, and the input box.
        chat_clear_btn.click(lambda: ([], None, ""), None, [chatbot, chat_plot_output, chat_input])


    return demo
|
480 |
+
|
481 |
+
# --- Main Application Execution ---
if __name__ == "__main__":
    # For local development, you might use load_dotenv()
    # load_dotenv()
    app = create_gradio_interface()
    # debug=True surfaces tracebacks in the browser; disable for production.
    app.launch(debug=True)
|