Update app.py
Browse files
app.py
CHANGED
@@ -1,84 +1,108 @@
|
|
1 |
# -*- coding: utf-8 -*-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
An
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
from __future__ import annotations
|
21 |
|
22 |
import warnings
|
23 |
import logging
|
24 |
import os
|
25 |
-
|
|
|
|
|
|
|
26 |
import numpy as np
|
|
|
27 |
import plotly.express as px
|
28 |
import plotly.graph_objects as go
|
29 |
-
from plotly.subplots import make_subplots
|
30 |
-
import gradio as gr
|
31 |
import google.generativeai as genai
|
32 |
-
from typing import Optional, Dict, Any, Tuple, List
|
33 |
-
from datetime import datetime
|
34 |
|
35 |
-
# --- Configuration &
|
36 |
|
37 |
-
|
38 |
-
|
|
|
|
|
|
|
39 |
|
40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
|
42 |
class DataAnalyzer:
|
43 |
"""
|
44 |
-
|
45 |
-
|
46 |
-
and AI-powered analysis, ensuring data is processed only once.
|
47 |
"""
|
48 |
def __init__(self, df: pd.DataFrame):
|
49 |
if not isinstance(df, pd.DataFrame):
|
50 |
raise TypeError("Input must be a pandas DataFrame.")
|
51 |
self.df = df
|
52 |
self._metadata: Optional[Dict[str, Any]] = None
|
53 |
-
logging.info(f"DataAnalyzer
|
54 |
|
55 |
@property
|
56 |
def metadata(self) -> Dict[str, Any]:
|
57 |
-
"""Lazy-loads and caches dataset metadata."""
|
58 |
if self._metadata is None:
|
|
|
59 |
self._metadata = self._extract_metadata()
|
60 |
return self._metadata
|
61 |
|
62 |
def _extract_metadata(self) -> Dict[str, Any]:
|
63 |
-
"""
|
64 |
-
logging.info("Extracting dataset metadata...")
|
65 |
rows, cols = self.df.shape
|
66 |
numeric_cols = self.df.select_dtypes(include=np.number).columns.tolist()
|
67 |
categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns.tolist()
|
68 |
-
datetime_cols = self.df.select_dtypes(include=['datetime64']).columns.tolist()
|
69 |
|
70 |
-
# High correlation
|
71 |
high_corr_pairs = []
|
72 |
if len(numeric_cols) > 1:
|
73 |
corr_matrix = self.df[numeric_cols].corr().abs()
|
74 |
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
|
|
|
75 |
high_corr_pairs = (
|
76 |
-
|
77 |
.reset_index()
|
78 |
-
.rename(columns={'level_0': '
|
79 |
-
.query('Correlation > 0.7')
|
80 |
-
.sort_values('Correlation', ascending=False)
|
81 |
-
.head(5)
|
82 |
.to_dict('records')
|
83 |
)
|
84 |
|
@@ -87,341 +111,359 @@ class DataAnalyzer:
|
|
87 |
'columns': self.df.columns.tolist(),
|
88 |
'numeric_cols': numeric_cols,
|
89 |
'categorical_cols': categorical_cols,
|
90 |
-
'
|
91 |
-
'memory_usage': f"{self.df.memory_usage(deep=True).sum() / 1e6:.2f} MB",
|
92 |
'total_missing': int(self.df.isnull().sum().sum()),
|
93 |
-
'data_quality_score': round((self.df.notna().sum().sum() / self.df.size) * 100,
|
94 |
'high_corr_pairs': high_corr_pairs,
|
95 |
}
|
96 |
|
97 |
-
def
|
98 |
-
"""Generates
|
99 |
-
logging.info("Generating
|
100 |
-
|
101 |
-
# Missing data
|
102 |
missing = self.df.isnull().sum()
|
103 |
missing_df = pd.DataFrame({
|
104 |
-
'Missing
|
105 |
-
'Percentage (%)': (missing / len(self.df) * 100).round(2)
|
106 |
-
}).reset_index().rename(columns={'index': 'Column'}).sort_values('Missing
|
107 |
-
|
108 |
-
# Numeric
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
'Top Value': self.df[col].mode().iloc[0] if not self.df[col].mode().empty else 'N/A',
|
118 |
-
'Frequency': self.df[col].value_counts().iloc[0] if not self.df[col].value_counts().empty else 0
|
119 |
-
}
|
120 |
-
cat_stats_list.append(stats)
|
121 |
-
categorical_stats_df = pd.DataFrame(cat_stats_list)
|
122 |
-
|
123 |
-
return missing_df, numeric_stats_df, categorical_stats_df
|
124 |
|
125 |
-
def
|
126 |
-
"""Creates a set of
|
127 |
-
logging.info("Generating
|
|
|
128 |
|
129 |
-
# Data type distribution
|
130 |
dtype_counts = self.df.dtypes.astype(str).value_counts()
|
131 |
-
|
132 |
values=dtype_counts.values, names=dtype_counts.index,
|
133 |
-
title="
|
|
|
134 |
)
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
x=
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
corr_matrix = self.df[self.metadata['numeric_cols']].corr()
|
148 |
-
corr_fig = px.imshow(
|
149 |
corr_matrix, text_auto=".2f", aspect="auto",
|
150 |
-
title="
|
151 |
-
color_continuous_scale='RdBu_r'
|
152 |
)
|
153 |
else:
|
154 |
-
|
155 |
|
156 |
-
return
|
157 |
-
|
158 |
-
def
|
159 |
-
"""
|
160 |
-
logging.info("Generating AI
|
|
|
161 |
|
|
|
162 |
prompt = f"""
|
163 |
-
As an
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
- **
|
168 |
-
- **
|
169 |
-
|
170 |
-
|
171 |
-
- **
|
172 |
-
- **
|
173 |
-
- **
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
*
|
183 |
-
*
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
*
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
|
|
|
|
|
|
|
|
201 |
"""
|
202 |
try:
|
203 |
genai.configure(api_key=api_key)
|
204 |
-
model = genai.GenerativeModel(
|
205 |
response = model.generate_content(prompt)
|
206 |
return response.text
|
207 |
except Exception as e:
|
208 |
-
logging.error(f"Gemini API call failed: {e}")
|
209 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
210 |
|
211 |
# --- Gradio UI & Event Handlers ---
|
212 |
|
213 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
214 |
"""
|
215 |
-
|
216 |
-
|
217 |
"""
|
218 |
if file_obj is None:
|
219 |
-
raise gr.Error("
|
220 |
if not api_key:
|
221 |
-
raise gr.Error("
|
222 |
|
223 |
try:
|
|
|
224 |
df = pd.read_csv(file_obj.name)
|
225 |
analyzer = DataAnalyzer(df)
|
|
|
|
|
|
|
|
|
|
|
226 |
|
227 |
-
#
|
228 |
-
|
229 |
-
|
230 |
-
dtype_fig, missing_fig, corr_fig = analyzer.get_initial_visuals()
|
231 |
-
|
232 |
-
# Prepare UI updates
|
233 |
-
all_cols = analyzer.metadata['columns']
|
234 |
-
num_cols = analyzer.metadata['numeric_cols']
|
235 |
-
cat_cols = analyzer.metadata['categorical_cols']
|
236 |
|
237 |
-
#
|
238 |
return {
|
|
|
239 |
state_analyzer: analyzer,
|
240 |
-
|
241 |
-
|
242 |
-
btn_download_report: gr.Button(visible=True),
|
243 |
# Profiling Tab
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
# Visuals Tab
|
248 |
-
|
249 |
-
plot_missing:
|
250 |
-
|
251 |
-
# Interactive
|
252 |
-
dd_hist_col: gr.Dropdown(choices=num_cols, label="Select Numeric Column
|
253 |
-
dd_scatter_x: gr.Dropdown(choices=num_cols, label="
|
254 |
-
dd_scatter_y: gr.Dropdown(choices=num_cols, label="
|
255 |
-
dd_scatter_color: gr.Dropdown(choices=all_cols, label="
|
256 |
-
|
257 |
-
dd_box_num: gr.Dropdown(choices=num_cols, label="Select Numeric Column for Box Plot", visible=True),
|
258 |
-
# Column Drilldown Tab
|
259 |
-
dd_drilldown_col: gr.Dropdown(choices=all_cols, label="Select Column to Analyze", visible=True),
|
260 |
}
|
261 |
|
262 |
except Exception as e:
|
263 |
-
logging.error(f"
|
264 |
-
raise gr.Error(f"
|
265 |
-
|
266 |
-
# --- Interactive Plotting Functions ---
|
267 |
|
268 |
-
def create_histogram(analyzer: DataAnalyzer, col: str) -> go.Figure:
|
269 |
-
if not col: return go.Figure()
|
270 |
-
return px.histogram(analyzer.df, x=col, title=f"Distribution of {col}", marginal="box")
|
271 |
|
272 |
-
def
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
stats_md += f"- **Data Type:** `{col_series.dtype}`\n"
|
289 |
-
stats_md += f"- **Missing Values:** {col_series.isnull().sum()} ({col_series.isnull().mean():.2%})\n"
|
290 |
-
stats_md += f"- **Unique Values:** {col_series.nunique()}\n"
|
291 |
-
|
292 |
-
# Generate plot based on type
|
293 |
-
fig = go.Figure()
|
294 |
-
if pd.api.types.is_numeric_dtype(col_series):
|
295 |
-
stats_md += f"- **Mean:** {col_series.mean():.2f}\n"
|
296 |
-
stats_md += f"- **Median:** {col_series.median():.2f}\n"
|
297 |
-
stats_md += f"- **Std Dev:** {col_series.std():.2f}\n"
|
298 |
-
fig = create_histogram(analyzer, col)
|
299 |
-
elif pd.api.types.is_categorical_dtype(col_series) or pd.api.types.is_object_dtype(col_series):
|
300 |
-
top5 = col_series.value_counts().head(5)
|
301 |
-
stats_md += f"- **Top 5 Values:**\n"
|
302 |
-
for val, count in top5.items():
|
303 |
-
stats_md += f" - `{val}`: {count} times\n"
|
304 |
-
fig = px.bar(top5, x=top5.index, y=top5.values, title=f"Top 5 Value Counts for {col}")
|
305 |
-
fig.update_xaxes(title=col)
|
306 |
-
fig.update_yaxes(title="Count")
|
307 |
-
|
308 |
-
return stats_md, fig
|
309 |
-
|
310 |
-
def download_report(analyzer: DataAnalyzer, ai_report_text: str) -> str:
|
311 |
-
"""Saves the AI report and basic stats to a markdown file for download."""
|
312 |
-
if not analyzer: return None
|
313 |
-
|
314 |
-
filename = f"AI_Report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md"
|
315 |
-
|
316 |
-
# Create the full report content
|
317 |
-
full_report = f"# AutoEDA Analysis Report\n\n"
|
318 |
-
full_report += f"**Date Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
|
319 |
-
full_report += f"**Dataset Shape:** {analyzer.metadata['shape'][0]} rows x {analyzer.metadata['shape'][1]} columns\n\n"
|
320 |
full_report += "---\n\n"
|
321 |
full_report += ai_report_text
|
322 |
|
323 |
with open(filename, "w", encoding="utf-8") as f:
|
324 |
f.write(full_report)
|
325 |
|
326 |
-
logging.info(f"
|
327 |
return filename
|
328 |
|
329 |
-
# --- Gradio Interface Definition ---
|
330 |
-
|
331 |
-
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), title="🚀 AutoEDA Pro") as demo:
|
332 |
-
# State object to hold the DataAnalyzer instance
|
333 |
-
state_analyzer = gr.State()
|
334 |
-
|
335 |
-
gr.Markdown("# 🚀 AutoEDA Pro: Your AI Data Science Assistant")
|
336 |
-
gr.Markdown("Upload a CSV, enter your Gemini API key, and click 'Analyze!' to unlock a comprehensive, AI-powered report on your data.")
|
337 |
-
|
338 |
-
with gr.Row():
|
339 |
-
with gr.Column(scale=2):
|
340 |
-
file_input = gr.File(label="📁 Upload your CSV File", file_types=[".csv"])
|
341 |
-
with gr.Column(scale=2):
|
342 |
-
api_key_input = gr.Textbox(label="🔑 Google Gemini API Key", type="password", placeholder="Enter your key here...")
|
343 |
-
with gr.Column(scale=1, min_width=150):
|
344 |
-
analyze_btn = gr.Button("✨ Analyze!", variant="primary", scale=1)
|
345 |
-
|
346 |
-
with gr.Tabs():
|
347 |
-
with gr.Tab("🤖 AI Report & Overview"):
|
348 |
-
md_ai_report = gr.Markdown("Your AI-generated report will appear here...")
|
349 |
-
btn_download_report = gr.Button("⬇️ Download Full Report", visible=False)
|
350 |
-
|
351 |
-
with gr.Tab("📊 Data Profiling"):
|
352 |
-
gr.Markdown("### Detailed Data Profile")
|
353 |
-
gr.Markdown("**Missing Data Analysis**")
|
354 |
-
df_missing_data = gr.DataFrame(interactive=False)
|
355 |
-
gr.Markdown("**Numeric Feature Statistics**")
|
356 |
-
df_numeric_stats = gr.DataFrame(interactive=False)
|
357 |
-
gr.Markdown("**Categorical Feature Statistics**")
|
358 |
-
df_categorical_stats = gr.DataFrame(interactive=False)
|
359 |
-
|
360 |
-
with gr.Tab("📈 Overview Visuals"):
|
361 |
-
gr.Markdown("### At-a-Glance Visualizations")
|
362 |
-
with gr.Row():
|
363 |
-
plot_dtype = gr.Plot()
|
364 |
-
plot_missing = gr.Plot()
|
365 |
-
with gr.Row():
|
366 |
-
plot_corr = gr.Plot()
|
367 |
-
|
368 |
-
with gr.Tab("🎨 Interactive Visuals"):
|
369 |
-
gr.Markdown("### Explore Your Data Visually")
|
370 |
-
with gr.Row():
|
371 |
-
with gr.Column():
|
372 |
-
dd_hist_col = gr.Dropdown(label="Select Column", visible=False)
|
373 |
-
plot_hist = gr.Plot()
|
374 |
-
with gr.Column():
|
375 |
-
dd_box_cat = gr.Dropdown(label="Select Category", visible=False)
|
376 |
-
dd_box_num = gr.Dropdown(label="Select Value", visible=False)
|
377 |
-
plot_box = gr.Plot()
|
378 |
-
with gr.Row():
|
379 |
-
gr.Markdown("#### Scatter Plot Explorer")
|
380 |
-
with gr.Row():
|
381 |
-
dd_scatter_x = gr.Dropdown(label="X-axis", visible=False)
|
382 |
-
dd_scatter_y = gr.Dropdown(label="Y-axis", visible=False)
|
383 |
-
dd_scatter_color = gr.Dropdown(label="Color", visible=False)
|
384 |
-
plot_scatter = gr.Plot()
|
385 |
-
|
386 |
-
with gr.Tab("🔍 Column Drilldown"):
|
387 |
-
gr.Markdown("### Deep Dive into a Single Column")
|
388 |
-
dd_drilldown_col = gr.Dropdown(label="Select Column", visible=False)
|
389 |
-
with gr.Row():
|
390 |
-
md_drilldown_stats = gr.Markdown()
|
391 |
-
plot_drilldown = gr.Plot()
|
392 |
-
|
393 |
-
# --- Event Listeners ---
|
394 |
-
|
395 |
-
# Main analysis trigger
|
396 |
-
analyze_btn.click(
|
397 |
-
fn=process_uploaded_file,
|
398 |
-
inputs=[file_input, api_key_input],
|
399 |
-
outputs=[
|
400 |
-
state_analyzer, md_ai_report, btn_download_report,
|
401 |
-
df_missing_data, df_numeric_stats, df_categorical_stats,
|
402 |
-
plot_dtype, plot_missing, plot_corr,
|
403 |
-
dd_hist_col, dd_scatter_x, dd_scatter_y, dd_scatter_color,
|
404 |
-
dd_box_cat, dd_box_num, dd_drilldown_col
|
405 |
-
]
|
406 |
-
)
|
407 |
-
|
408 |
-
# Interactive plot triggers
|
409 |
-
dd_hist_col.change(fn=create_histogram, inputs=[state_analyzer, dd_hist_col], outputs=plot_hist)
|
410 |
-
dd_scatter_x.change(fn=create_scatterplot, inputs=[state_analyzer, dd_scatter_x, dd_scatter_y, dd_scatter_color], outputs=plot_scatter)
|
411 |
-
dd_scatter_y.change(fn=create_scatterplot, inputs=[state_analyzer, dd_scatter_x, dd_scatter_y, dd_scatter_color], outputs=plot_scatter)
|
412 |
-
dd_scatter_color.change(fn=create_scatterplot, inputs=[state_analyzer, dd_scatter_x, dd_scatter_y, dd_scatter_color], outputs=plot_scatter)
|
413 |
-
dd_box_cat.change(fn=create_boxplot, inputs=[state_analyzer, dd_box_cat, dd_box_num], outputs=plot_box)
|
414 |
-
dd_box_num.change(fn=create_boxplot, inputs=[state_analyzer, dd_box_cat, dd_box_num], outputs=plot_box)
|
415 |
-
|
416 |
-
# Drilldown trigger
|
417 |
-
dd_drilldown_col.change(fn=analyze_single_column, inputs=[state_analyzer, dd_drilldown_col], outputs=[md_drilldown_stats, plot_drilldown])
|
418 |
-
|
419 |
-
# Download trigger
|
420 |
-
btn_download_report.click(fn=download_report, inputs=[state_analyzer, md_ai_report], outputs=gr.File(label="Download Report"))
|
421 |
-
|
422 |
-
gr.Markdown("---")
|
423 |
-
gr.Markdown("💡 **Tip**: Get your free Google Gemini API key from [Google AI Studio](https://aistudio.google.com/app/apikey).")
|
424 |
-
gr.Markdown("MCP Expert System v2.0 - Analysis Complete.")
|
425 |
|
426 |
if __name__ == "__main__":
|
427 |
-
|
|
|
|
1 |
# -*- coding: utf-8 -*-
|
2 |
+
#
|
3 |
+
# PROJECT: CognitiveEDA - The AI-Augmented Data Discovery Platform
|
4 |
+
#
|
5 |
+
# DESCRIPTION: An enterprise-grade Gradio application that revolutionizes Exploratory
|
6 |
+
# Data Analysis (EDA). By integrating Google's Gemini Pro LLM, this
|
7 |
+
# tool transcends traditional data profiling. It automates the generation
|
8 |
+
# of statistical summaries, interactive visualizations, and, most
|
9 |
+
# importantly, a rich, narrative-driven analysis. It delivers
|
10 |
+
# executive summaries, data quality assessments, actionable insights,
|
11 |
+
# and strategic recommendations in a single, streamlined workflow.
|
12 |
+
#
|
13 |
+
# ARCHITECTURE: The application is built upon a robust, object-oriented foundation.
|
14 |
+
# - DataAnalyzer (Core Engine): An encapsulated class that holds the
|
15 |
+
# DataFrame state and performs all statistical calculations and
|
16 |
+
# metadata extraction efficiently, ensuring data is processed once.
|
17 |
+
# - AI Integration: A dedicated module communicates with the Gemini API,
|
18 |
+
# using a sophisticated, structured prompt to ensure consistent,
|
19 |
+
# high-quality analytical narratives.
|
20 |
+
# - Gradio Interface (UI Layer): A multi-tabbed, interactive dashboard
|
21 |
+
# that logically separates the AI narrative, data profiling, static
|
22 |
+
# visuals, and interactive exploration tools. State is managed
|
23 |
+
# efficiently to provide a responsive user experience.
|
24 |
+
#
|
25 |
+
# FEATURES:
|
26 |
+
# - AI-Powered Executive Summary: Generates a high-level overview for stakeholders.
|
27 |
+
# - Automated Data Quality Audit: Provides a quality score and actionable cleaning steps.
|
28 |
+
# - Insight Discovery Engine: Uncovers hidden patterns, correlations, and anomalies.
|
29 |
+
# - Strategic Recommendations: Suggests next steps, modeling approaches, and business use cases.
|
30 |
+
# - Comprehensive Profiling: Detailed statistical tables for all data types.
|
31 |
+
# - Interactive Visualization Suite: Dynamic plots for deep-dive analysis.
|
32 |
+
# - One-Click Report Export: Downloads the complete AI-generated analysis as a Markdown file.
|
33 |
+
#
|
34 |
+
# AUTHOR: An MCP Expert in Data & AI Solutions
|
35 |
+
# VERSION: 3.0 (Enterprise Edition)
|
36 |
+
# LAST-UPDATE: 2023-10-27
|
37 |
+
|
38 |
from __future__ import annotations
|
39 |
|
40 |
import warnings
|
41 |
import logging
|
42 |
import os
|
43 |
+
from datetime import datetime
|
44 |
+
from typing import Any, Dict, List, Optional, Tuple
|
45 |
+
|
46 |
+
import gradio as gr
|
47 |
import numpy as np
|
48 |
+
import pandas as pd
|
49 |
import plotly.express as px
|
50 |
import plotly.graph_objects as go
|
|
|
|
|
51 |
import google.generativeai as genai
|
|
|
|
|
52 |
|
53 |
+
# --- Configuration & Constants ---
|
54 |
|
55 |
+
logging.basicConfig(
|
56 |
+
level=logging.INFO,
|
57 |
+
format='%(asctime)s - [%(levelname)s] - (%(filename)s:%(lineno)d) - %(message)s'
|
58 |
+
)
|
59 |
+
warnings.filterwarnings('ignore', category=FutureWarning)
|
60 |
|
61 |
+
class Config:
|
62 |
+
"""Application-wide configuration settings."""
|
63 |
+
APP_TITLE = "🚀 CognitiveEDA: AI-Augmented Data Discovery Platform"
|
64 |
+
GEMINI_MODEL = 'gemini-1.5-flash-latest'
|
65 |
+
CORR_THRESHOLD = 0.75 # Threshold for highlighting high correlation
|
66 |
+
TOP_N_CATEGORIES = 10 # For bar charts of categorical features
|
67 |
+
|
68 |
+
# --- Core Analysis Engine ---
|
69 |
|
70 |
class DataAnalyzer:
|
71 |
"""
|
72 |
+
Encapsulates all data analysis logic, acting as the single source of truth
|
73 |
+
for the uploaded dataset and its derived metadata.
|
|
|
74 |
"""
|
75 |
def __init__(self, df: pd.DataFrame):
|
76 |
if not isinstance(df, pd.DataFrame):
|
77 |
raise TypeError("Input must be a pandas DataFrame.")
|
78 |
self.df = df
|
79 |
self._metadata: Optional[Dict[str, Any]] = None
|
80 |
+
logging.info(f"DataAnalyzer instantiated with DataFrame of shape: {self.df.shape}")
|
81 |
|
82 |
@property
|
83 |
def metadata(self) -> Dict[str, Any]:
|
84 |
+
"""Lazy-loads and caches comprehensive dataset metadata for efficient reuse."""
|
85 |
if self._metadata is None:
|
86 |
+
logging.info("First access to metadata, performing extraction...")
|
87 |
self._metadata = self._extract_metadata()
|
88 |
return self._metadata
|
89 |
|
90 |
def _extract_metadata(self) -> Dict[str, Any]:
|
91 |
+
"""Performs a deep scan of the DataFrame to extract key characteristics."""
|
|
|
92 |
rows, cols = self.df.shape
|
93 |
numeric_cols = self.df.select_dtypes(include=np.number).columns.tolist()
|
94 |
categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns.tolist()
|
|
|
95 |
|
96 |
+
# Advanced: High correlation pair detection
|
97 |
high_corr_pairs = []
|
98 |
if len(numeric_cols) > 1:
|
99 |
corr_matrix = self.df[numeric_cols].corr().abs()
|
100 |
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
|
101 |
+
high_corr_series = upper_tri.stack()
|
102 |
high_corr_pairs = (
|
103 |
+
high_corr_series[high_corr_series > Config.CORR_THRESHOLD]
|
104 |
.reset_index()
|
105 |
+
.rename(columns={'level_0': 'Feature 1', 'level_1': 'Feature 2', 0: 'Correlation'})
|
|
|
|
|
|
|
106 |
.to_dict('records')
|
107 |
)
|
108 |
|
|
|
111 |
'columns': self.df.columns.tolist(),
|
112 |
'numeric_cols': numeric_cols,
|
113 |
'categorical_cols': categorical_cols,
|
114 |
+
'memory_usage_mb': f"{self.df.memory_usage(deep=True).sum() / 1e6:.2f}",
|
|
|
115 |
'total_missing': int(self.df.isnull().sum().sum()),
|
116 |
+
'data_quality_score': round((self.df.notna().sum().sum() / self.df.size) * 100, 2),
|
117 |
'high_corr_pairs': high_corr_pairs,
|
118 |
}
|
119 |
|
120 |
+
def get_profiling_tables(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
|
121 |
+
"""Generates structured DataFrames for data profiling."""
|
122 |
+
logging.info("Generating profiling tables for missing, numeric, and categorical data.")
|
123 |
+
# Missing data profile
|
|
|
124 |
missing = self.df.isnull().sum()
|
125 |
missing_df = pd.DataFrame({
|
126 |
+
'Missing Count': missing,
|
127 |
+
'Missing Percentage (%)': (missing / len(self.df) * 100).round(2)
|
128 |
+
}).reset_index().rename(columns={'index': 'Column'}).sort_values('Missing Count', ascending=False)
|
129 |
+
|
130 |
+
# Numeric features profile
|
131 |
+
numeric_stats = self.df[self.metadata['numeric_cols']].describe(percentiles=[.01, .25, .5, .75, .99]).T
|
132 |
+
numeric_stats_df = numeric_stats.round(3).reset_index().rename(columns={'index': 'Column'})
|
133 |
+
|
134 |
+
# Categorical features profile
|
135 |
+
cat_stats = self.df[self.metadata['categorical_cols']].describe(include=['object', 'category']).T
|
136 |
+
cat_stats_df = cat_stats.reset_index().rename(columns={'index': 'Column'})
|
137 |
+
|
138 |
+
return missing_df, numeric_stats_df, cat_stats_df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
|
140 |
+
def get_overview_visuals(self) -> Tuple[go.Figure, go.Figure, go.Figure]:
|
141 |
+
"""Creates a set of key visualizations for a high-level overview."""
|
142 |
+
logging.info("Generating overview visualizations (types, missing data, correlation).")
|
143 |
+
meta = self.metadata
|
144 |
|
|
|
145 |
dtype_counts = self.df.dtypes.astype(str).value_counts()
|
146 |
+
fig_types = px.pie(
|
147 |
values=dtype_counts.values, names=dtype_counts.index,
|
148 |
+
title="<b>📊 Data Type Composition</b>", hole=0.4,
|
149 |
+
color_discrete_sequence=px.colors.qualitative.Pastel
|
150 |
)
|
151 |
+
fig_types.update_traces(textposition='outside', textinfo='percent+label')
|
152 |
+
|
153 |
+
missing_df = self.df.isnull().sum().reset_index(name='count').query('count > 0')
|
154 |
+
fig_missing = px.bar(
|
155 |
+
missing_df, x='index', y='count', title="<b>🕳️ Missing Values Distribution</b>",
|
156 |
+
labels={'index': 'Column Name', 'count': 'Number of Missing Values'},
|
157 |
+
).update_xaxes(categoryorder="total descending")
|
158 |
+
|
159 |
+
fig_corr = go.Figure()
|
160 |
+
if len(meta['numeric_cols']) > 1:
|
161 |
+
corr_matrix = self.df[meta['numeric_cols']].corr()
|
162 |
+
fig_corr = px.imshow(
|
|
|
|
|
163 |
corr_matrix, text_auto=".2f", aspect="auto",
|
164 |
+
title=f"<b>🔗 Correlation Matrix (Threshold > {Config.CORR_THRESHOLD})</b>",
|
165 |
+
color_continuous_scale='RdBu_r', zmin=-1, zmax=1
|
166 |
)
|
167 |
else:
|
168 |
+
fig_corr.update_layout(title="<b>🔗 Correlation Matrix (Insufficient Numeric Data)</b>")
|
169 |
|
170 |
+
return fig_types, fig_missing, fig_corr
|
171 |
+
|
172 |
+
def generate_ai_narrative(self, api_key: str) -> str:
|
173 |
+
"""Orchestrates the generation of the full AI-driven report using Gemini."""
|
174 |
+
logging.info("Generating AI narrative with the Gemini API.")
|
175 |
+
meta = self.metadata
|
176 |
|
177 |
+
# A more sophisticated, structured prompt for a better report
|
178 |
prompt = f"""
|
179 |
+
As "Cognitive Analyst," an elite AI data scientist, your task is to generate a comprehensive, multi-part data discovery report.
|
180 |
+
Analyze the following dataset context and produce a professional, insightful, and clear analysis in Markdown format.
|
181 |
+
|
182 |
+
**DATASET CONTEXT:**
|
183 |
+
- **Shape:** {meta['shape'][0]} rows, {meta['shape'][1]} columns.
|
184 |
+
- **Column Schema:**
|
185 |
+
- Numeric: {', '.join(meta['numeric_cols']) if meta['numeric_cols'] else 'None'}
|
186 |
+
- Categorical: {', '.join(meta['categorical_cols']) if meta['categorical_cols'] else 'None'}
|
187 |
+
- **Data Quality Score:** {meta['data_quality_score']}% (Percentage of non-missing cells)
|
188 |
+
- **Total Missing Values:** {meta['total_missing']:,}
|
189 |
+
- **High-Correlation Pairs (>{Config.CORR_THRESHOLD}):** {meta['high_corr_pairs'] if meta['high_corr_pairs'] else 'None detected.'}
|
190 |
+
- **Data Snippet (First 5 Rows):**
|
191 |
+
{self.df.head(5).to_markdown(index=False)}
|
192 |
+
|
193 |
+
**REQUIRED REPORT STRUCTURE (Strictly use this Markdown format):**
|
194 |
+
|
195 |
+
# 🚀 AI Data Discovery Report
|
196 |
+
|
197 |
+
## 📄 1. Executive Summary
|
198 |
+
* **Primary Objective:** (Deduce the most likely purpose of this dataset. What problem is it trying to solve?)
|
199 |
+
* **Key Finding:** (State the single most interesting or impactful insight you've discovered.)
|
200 |
+
* **Overall State:** (Briefly comment on the data's quality and readiness for analysis.)
|
201 |
+
|
202 |
+
## 🧐 2. Data Profile & Quality Assessment
|
203 |
+
* **First Impression:** (Describe the dataset's structure, size, and composition.)
|
204 |
+
* **Data Quality Audit:** (Elaborate on the **{meta['data_quality_score']}%** quality score. Are the **{meta['total_missing']}** missing values concentrated in specific columns? Is this a major concern?)
|
205 |
+
* **Redundancy Check:** (Comment on the detected high-correlation pairs. Is there a risk of multicollinearity in modeling?)
|
206 |
+
|
207 |
+
## 💡 3. Key Insights & Potential Stories
|
208 |
+
* **Insight 1 (e.g., Anomaly Detected 🕵️):** (Describe a surprising pattern, outlier, or distribution in a key numeric column.)
|
209 |
+
* **Insight 2 (e.g., Categorical Trend 📊):** (Analyze a key categorical column. What does its distribution reveal? Is there a dominant category?)
|
210 |
+
* **Insight 3 (e.g., Relationship Hint 🔗):** (Speculate on a potential relationship between two or more columns, even if not highly correlated.)
|
211 |
+
|
212 |
+
## 🛠️ 4. Actionable Recommendations
|
213 |
+
* **Data Cleaning:**
|
214 |
+
- **Step 1:** (Provide a specific recommendation for handling missing data, e.g., "For `column_name`, with X% missing, consider imputation using the median due to its skewed distribution.")
|
215 |
+
- **Step 2:** (Suggest actions for correlated features, e.g., "Consider dropping `Feature A` or using dimensionality reduction (PCA) due to its high correlation with `Feature B`.")
|
216 |
+
* **Feature Engineering:**
|
217 |
+
- **Idea 1:** (Suggest creating a new feature, e.g., "Combine `year` and `month` into a `date` feature for time-series analysis.")
|
218 |
+
* **Next Analytical Steps:**
|
219 |
+
- **Hypothesis to Test:** (Propose a business or research question to investigate further, e.g., "Does `customer_segment` significantly impact `total_spend`?")
|
220 |
+
- **Modeling Potential:** (Suggest a suitable machine learning model, e.g., "This dataset is well-suited for a classification model to predict `is_churn`.")
|
221 |
"""
|
222 |
try:
|
223 |
genai.configure(api_key=api_key)
|
224 |
+
model = genai.GenerativeModel(Config.GEMINI_MODEL)
|
225 |
response = model.generate_content(prompt)
|
226 |
return response.text
|
227 |
except Exception as e:
|
228 |
+
logging.error(f"Gemini API call failed: {e}", exc_info=True)
|
229 |
+
error_message = (
|
230 |
+
"❌ **AI Report Generation Failed**\n\n"
|
231 |
+
f"**Error Details:** `{str(e)}`\n\n"
|
232 |
+
"**Troubleshooting Steps:**\n"
|
233 |
+
"1. Verify that your Google Gemini API key is correct and active.\n"
|
234 |
+
"2. Check your network connection and firewall settings.\n"
|
235 |
+
"3. Ensure the Gemini API is not experiencing an outage."
|
236 |
+
)
|
237 |
+
return error_message
|
238 |
|
239 |
# --- Gradio UI & Event Handlers ---
|
240 |
|
241 |
+
def create_ui():
    """Defines and builds the Gradio user interface.

    Returns:
        gr.Blocks: the fully wired (but not yet launched) Gradio app.
    """

    # --- Interactive Plotting Functions (scoped inside UI creation for clarity) ---
    def create_histogram(analyzer: DataAnalyzer, col: str) -> go.Figure:
        # Histogram with a marginal box plot for the chosen column.
        # Returns an empty figure until analysis has run and a column is picked.
        if not col or not analyzer: return go.Figure()
        return px.histogram(analyzer.df, x=col, title=f"<b>Distribution of {col}</b>", marginal="box", template="plotly_white")

    def create_scatterplot(analyzer: DataAnalyzer, x_col: str, y_col:str, color_col:str) -> go.Figure:
        # Bivariate scatter; color_col is optional (may be None/empty).
        if not all([analyzer, x_col, y_col]): return go.Figure()
        return px.scatter(
            analyzer.df, x=x_col, y=y_col, color=color_col,
            title=f"<b>Scatter Plot: {x_col} vs. {y_col}</b>", template="plotly_white",
            color_continuous_scale=px.colors.sequential.Viridis
        )

    def analyze_single_column(analyzer: DataAnalyzer, col: str) -> Tuple[str, go.Figure]:
        """Builds a Markdown stats summary plus a figure for one column."""
        if not col or not analyzer: return "", go.Figure()

        series = analyzer.df[col]
        # Stats common to every dtype.
        stats_md = f"### 🔎 **Deep Dive: `{col}`**\n"
        stats_md += f"- **Data Type:** `{series.dtype}`\n"
        stats_md += f"- **Unique Values:** `{series.nunique()}`\n"
        stats_md += f"- **Missing:** `{series.isnull().sum()}` ({series.isnull().mean():.2%})\n"

        fig = go.Figure()
        if pd.api.types.is_numeric_dtype(series):
            # Numeric column: central tendency / spread, then a histogram.
            stats_md += f"- **Mean:** `{series.mean():.3f}` | **Std Dev:** `{series.std():.3f}`\n"
            stats_md += f"- **Median:** `{series.median():.3f}` | **Min:** `{series.min():.3f}` | **Max:** `{series.max():.3f}`\n"
            fig = create_histogram(analyzer, col)
        else:
            # Categorical column: horizontal bar chart of the most frequent values.
            top_n = series.value_counts().nlargest(Config.TOP_N_CATEGORIES)
            stats_md += f"- **Top Value:** `{top_n.index[0]}` ({top_n.iloc[0]} occurrences)\n"
            fig = px.bar(
                top_n, y=top_n.index, x=top_n.values, orientation='h',
                title=f"<b>Top {Config.TOP_N_CATEGORIES} Categories in `{col}`</b>",
                labels={'y': col, 'x': 'Count'}, template="plotly_white"
            ).update_yaxes(categoryorder="total ascending")

        return stats_md, fig

    # --- Main UI Blocks ---
    with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="cyan"), title=Config.APP_TITLE) as demo:
        # Store for the main DataAnalyzer object (shared across event handlers).
        state_analyzer = gr.State()

        gr.Markdown(f"<h1>{Config.APP_TITLE}</h1>")
        gr.Markdown("Upload a CSV file, provide your Gemini API key, and receive an instant, AI-driven analysis of your data.")

        # Top input row: file upload, API key, and the analysis trigger.
        with gr.Row():
            with gr.Column(scale=3):
                upload_button = gr.File(label="1. Upload CSV File", file_types=[".csv"])
            with gr.Column(scale=2):
                api_key_input = gr.Textbox(label="2. Enter Google Gemini API Key", type="password")
            with gr.Column(scale=1, min_width=150):
                analyze_button = gr.Button("✨ Generate Analysis", variant="primary")

        with gr.Tabs() as tabs:
            with gr.Tab("🤖 AI Narrative", id=0):
                ai_report_output = gr.Markdown("Your AI-generated report will appear here once analysis is complete...")
                # Hidden until an analysis has produced a report to export.
                download_report_button = gr.Button("⬇️ Download Full Report", visible=False)

            # NOTE(review): tab label " Profile" appears to have lost a leading
            # emoji/character — confirm intended label.
            with gr.Tab(" Profile", id=1):
                gr.Markdown("### **Detailed Data Profile**")
                gr.Markdown("#### Missing Data Summary")
                profile_missing_df = gr.DataFrame(interactive=False, label="Missing Values")
                gr.Markdown("#### Numeric Features Summary")
                profile_numeric_df = gr.DataFrame(interactive=False, label="Numeric Stats")
                gr.Markdown("#### Categorical Features Summary")
                profile_categorical_df = gr.DataFrame(interactive=False, label="Categorical Stats")

            with gr.Tab("📈 Overview Visuals", id=2):
                gr.Markdown("### **At-a-Glance Visualizations**")
                with gr.Row():
                    plot_types = gr.Plot()
                    plot_missing = gr.Plot()
                    plot_correlation = gr.Plot()

            with gr.Tab("🎨 Interactive Explorer", id=3):
                gr.Markdown("### **Visually Explore Feature Relationships**")
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("#### Univariate Analysis")
                        # Dropdowns start hidden; the main analysis populates
                        # their choices and reveals them.
                        dd_hist_col = gr.Dropdown(label="Select Column for Histogram", visible=False)
                    with gr.Column(scale=2):
                        plot_histogram = gr.Plot()

                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("#### Bivariate Analysis (Scatter Plot)")
                        dd_scatter_x = gr.Dropdown(label="X-Axis (Numeric)", visible=False)
                        dd_scatter_y = gr.Dropdown(label="Y-Axis (Numeric)", visible=False)
                        dd_scatter_color = gr.Dropdown(label="Color By (Optional)", visible=False)
                    with gr.Column(scale=2):
                        plot_scatter = gr.Plot()

            with gr.Tab("🔍 Column Deep-Dive", id=4):
                gr.Markdown("### **Inspect a Single Column in Detail**")
                dd_drilldown_col = gr.Dropdown(label="Select Column to Analyze", visible=False)
                with gr.Row():
                    md_drilldown_stats = gr.Markdown()
                    plot_drilldown = gr.Plot()

        gr.HTML("""
        <div style="text-align: center; margin-top: 20px; font-family: sans-serif; color: #777;">
            <p>💡 Need an API key? Get one from <a href="https://aistudio.google.com/app/apikey" target="_blank">Google AI Studio</a>.</p>
            <p>CognitiveEDA v3.0 | An MCP Expert System</p>
        </div>
        """)

        # --- Event Listeners & Control Flow ---

        # All components updated by the main analysis handler, in a fixed order.
        outputs_for_main_analysis = [
            state_analyzer, ai_report_output, download_report_button,
            profile_missing_df, profile_numeric_df, profile_categorical_df,
            plot_types, plot_missing, plot_correlation,
            dd_hist_col, dd_scatter_x, dd_scatter_y, dd_scatter_color, dd_drilldown_col
        ]

        # run_full_analysis is defined at module level below create_ui; the
        # reference resolves at click time, after the module has fully loaded.
        analyze_button.click(
            fn=run_full_analysis,
            inputs=[upload_button, api_key_input],
            outputs=outputs_for_main_analysis
        )

        # Interactive plot triggers
        dd_hist_col.change(fn=create_histogram, inputs=[state_analyzer, dd_hist_col], outputs=plot_histogram)

        # Changing any of the three scatter dropdowns re-renders the scatter plot.
        scatter_inputs = [state_analyzer, dd_scatter_x, dd_scatter_y, dd_scatter_color]
        dd_scatter_x.change(fn=create_scatterplot, inputs=scatter_inputs, outputs=plot_scatter)
        dd_scatter_y.change(fn=create_scatterplot, inputs=scatter_inputs, outputs=plot_scatter)
        dd_scatter_color.change(fn=create_scatterplot, inputs=scatter_inputs, outputs=plot_scatter)

        dd_drilldown_col.change(
            fn=analyze_single_column,
            inputs=[state_analyzer, dd_drilldown_col],
            outputs=[md_drilldown_stats, plot_drilldown]
        )

        download_report_button.click(
            fn=download_report_file,
            inputs=[state_analyzer, ai_report_output],
            outputs=gr.File(label="Download Report")
        )

    return demo
|
387 |
+
|
388 |
+
# --- Main Application Logic ---
|
389 |
+
|
390 |
+
def run_full_analysis(file_obj: gr.File, api_key: str) -> Tuple[Any, ...]:
    """
    Orchestrates the entire analysis pipeline upon button click.

    Args:
        file_obj: The uploaded CSV file handle from the gr.File component.
        api_key: The user's Google Gemini API key.

    Returns:
        A tuple of values/updates ordered exactly like
        ``outputs_for_main_analysis`` in ``create_ui``: analyzer state,
        AI report markdown, download-button visibility update, three
        profiling tables, three overview figures, and five dropdown updates.

    Raises:
        gr.Error: If an input is missing or any pipeline stage fails.
    """
    # Fail fast on missing inputs before doing any work.
    if file_obj is None:
        raise gr.Error("CRITICAL: No file uploaded. Please select a CSV file.")
    if not api_key:
        raise gr.Error("CRITICAL: Gemini API key is missing. Please provide your key.")

    try:
        logging.info(f"Processing uploaded file: {file_obj.name}")
        df = pd.read_csv(file_obj.name)
        analyzer = DataAnalyzer(df)

        # Run every analysis stage up front so the UI refreshes in one shot.
        ai_report = analyzer.generate_ai_narrative(api_key)
        missing_df, num_df, cat_df = analyzer.get_profiling_tables()
        fig_types, fig_missing, fig_corr = analyzer.get_overview_visuals()

        # Column lists used to populate the interactive-explorer dropdowns.
        meta = analyzer.metadata
        all_cols, num_cols = meta['columns'], meta['numeric_cols']

        # BUGFIX: the previous version returned a dict keyed by Gradio
        # components (state_analyzer, ai_report_output, ...). Those names are
        # locals of create_ui(), not module-level globals, so constructing
        # that dict here raised NameError at runtime. Returning an ordered
        # tuple that matches `outputs_for_main_analysis` avoids referencing
        # them entirely.
        return (
            analyzer,                 # state_analyzer
            ai_report,                # ai_report_output
            gr.Button(visible=True),  # download_report_button (reveal)
            missing_df,               # profile_missing_df
            num_df,                   # profile_numeric_df
            cat_df,                   # profile_categorical_df
            fig_types,                # plot_types
            fig_missing,              # plot_missing
            fig_corr,                 # plot_correlation
            gr.Dropdown(choices=num_cols, label="Select Numeric Column", visible=True),    # dd_hist_col
            gr.Dropdown(choices=num_cols, label="X-Axis (Numeric)", visible=True),         # dd_scatter_x
            gr.Dropdown(choices=num_cols, label="Y-Axis (Numeric)", visible=True),         # dd_scatter_y
            gr.Dropdown(choices=all_cols, label="Color By (Optional)", visible=True),      # dd_scatter_color
            gr.Dropdown(choices=all_cols, label="Select Column to Analyze", visible=True), # dd_drilldown_col
        )

    except Exception as e:
        logging.error(f"A critical error occurred during file processing: {e}", exc_info=True)
        # Surface the failure in the UI while preserving the cause chain.
        raise gr.Error(f"Analysis Failed! The process stopped due to: {str(e)}") from e
|
|
|
|
|
439 |
|
|
|
|
|
|
|
440 |
|
441 |
+
def download_report_file(analyzer: DataAnalyzer, ai_report_text: str) -> Optional[str]:
    """Generates a comprehensive Markdown report file for download.

    Args:
        analyzer: The cached DataAnalyzer holding the dataset and its metadata.
        ai_report_text: The AI-generated narrative currently shown in the UI.

    Returns:
        Path of the written Markdown file, or None when no analysis has been
        run yet (nothing to download).
    """
    if not analyzer:
        logging.warning("Download attempted without a valid analyzer object.")
        return None

    # Single timestamp so the filename and the report header always agree.
    generated_at = datetime.now()
    filename = f"CognitiveEDA_Report_{generated_at.strftime('%Y%m%d_%H%M%S')}.md"
    meta = analyzer.metadata

    # Assemble the full report: overview header, divider, then AI narrative.
    report_parts = [
        "# CognitiveEDA - Data Discovery Report\n",
        f"**Generated:** {generated_at.strftime('%Y-%m-%d %H:%M:%S')}\n\n",
        "## Dataset Overview\n",
        f"- **Shape:** {meta['shape'][0]} rows x {meta['shape'][1]} columns\n",
        f"- **Memory Footprint:** {meta['memory_usage_mb']} MB\n",
        f"- **Data Quality Score:** {meta['data_quality_score']}%\n\n",
        "---\n\n",
        ai_report_text,
    ]
    full_report = "".join(report_parts)

    with open(filename, "w", encoding="utf-8") as f:
        f.write(full_report)

    # BUGFIX: the previous log line contained a "(unknown)" placeholder
    # instead of the actual generated path.
    logging.info(f"Report file generated successfully: {filename}")
    return filename
|
465 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
466 |
|
467 |
if __name__ == "__main__":
    # Build the UI and serve it on all interfaces (container-friendly bind).
    eda_app = create_ui()
    eda_app.launch(debug=True, server_name="0.0.0.0")
|