|
|
|
""" |
|
🚀 AutoEDA: AI-Powered Exploratory Data Analysis Tool |
|
|
|
An advanced Gradio application for automated exploratory data analysis, |
|
data profiling, and AI-driven insights using Google's Gemini API. |
|
|
|
Key Features: |
|
- Unified Analysis Workflow: Upload a CSV and get a full report across all tabs. |
|
- AI-Powered Storytelling: Generates a narrative overview, use cases, and findings. |
|
- Actionable AI Suggestions: Provides data cleaning recommendations. |
|
- Interactive Visualizations: Users can select columns to generate plots dynamically. |
|
- In-depth Profiling: Detailed statistics for numeric and categorical data. |
|
- Column-Level Drilldown: Inspect individual features in detail. |
|
- Report Download: Export the AI-generated analysis as a Markdown file. |
|
|
|
Author: World-Class MCP Expert |
|
Version: 2.0 |
|
""" |
|
from __future__ import annotations

import logging
import os
import tempfile
import textwrap
import warnings
from datetime import datetime
from typing import Optional, Dict, Any, Tuple, List

import google.generativeai as genai
import gradio as gr
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
|
|
|
|
|
|
|
# Suppress every Python warning process-wide so the app console only shows log lines.
warnings.filterwarnings('ignore')
# Root logger: INFO level, each record prefixed with timestamp and level name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
|
|
|
|
|
|
class DataAnalyzer:
    """
    Bundle every analysis operation performed on a single DataFrame.

    The instance keeps the DataFrame and computes a metadata summary once, on
    first access of ``metadata``; the profiling, visualization and AI-report
    methods all read from that cached summary.
    """

    # Prompt sent to Gemini. It is dedented *before* formatting so that
    # multi-line substitutions (the sample table) cannot interfere with
    # the common-indentation detection.
    _REPORT_PROMPT_TEMPLATE = textwrap.dedent("""
        As an expert data analyst and storyteller, your task is to analyze the provided dataset summary and generate a comprehensive, insightful, and accessible report.

        **Dataset Metadata:**
        - **Shape:** {rows} rows, {cols} columns.
        - **Column Names:** {column_names}
        - **Numeric Columns:** {numeric_names}
        - **Categorical Columns:** {categorical_names}
        - **Overall Data Quality:** {quality}%
        - **Total Missing Values:** {total_missing:,}
        - **Highly Correlated Pairs (>0.7):** {corr_summary}
        - **Sample Data (First 3 Rows):**
        {sample}

        **Your Report Structure (Use Markdown):**

        # 🚀 AI-Powered Data Analysis Report

        ## 📖 1. The Story of the Data
        * **What is this dataset about?** (Deduce the purpose and subject matter of the data.)
        * **What domain or industry does it belong to?** (e.g., E-commerce, Finance, Healthcare.)
        * **Who might use this data?** (e.g., Marketers, Scientists, Financial Analysts.)

        ## 🎯 2. Key Insights & Interesting Findings
        - **Finding 1:** (Describe a significant pattern, trend, or anomaly. Use emojis to highlight.)
        - **Finding 2:** (Mention another interesting discovery, perhaps from correlations or categorical data.)
        - **Finding 3:** (Highlight a potential business or research opportunity revealed by the data.)

        ## 🧹 3. Data Quality & Cleaning Recommendations
        * **Overall Quality Assessment:** (Comment on the {quality}% score and {total_missing} missing values.)
        * **Actionable Steps:**
        - **Recommendation 1:** (e.g., "For column 'X' with Y% missing values, consider imputation using the mean/median/mode.")
        - **Recommendation 2:** (e.g., "Columns 'A' and 'B' are highly correlated ({corr_example}). Consider dropping one for modeling to avoid multicollinearity.")
        - **Recommendation 3:** (e.g., "Column 'Z' is categorical but stored as a number. Recommend converting it to a category type.")

        ## 🔮 4. Potential Next Steps & Use Cases
        - **Analysis Idea 1:** (e.g., "Build a predictive model for customer churn.")
        - **Dashboard Idea 2:** (e.g., "Create a sales performance dashboard tracking KPIs over time.")
        - **Research Question 3:** (e.g., "Investigate the factors influencing employee attrition.")
        """)

    def __init__(self, df: pd.DataFrame):
        """
        Store the DataFrame to analyze.

        Raises:
            TypeError: if ``df`` is not a pandas DataFrame.
        """
        if not isinstance(df, pd.DataFrame):
            raise TypeError("Input must be a pandas DataFrame.")
        self.df = df
        self._metadata: Optional[Dict[str, Any]] = None
        logging.info(f"DataAnalyzer initialized with DataFrame of shape: {self.df.shape}")

    @property
    def metadata(self) -> Dict[str, Any]:
        """Return the dataset metadata, computing and caching it on first access."""
        if self._metadata is None:
            self._metadata = self._extract_metadata()
        return self._metadata

    def _extract_metadata(self) -> Dict[str, Any]:
        """
        Build the metadata summary of the DataFrame.

        Returns:
            A dict with keys ``shape``, ``columns``, ``numeric_cols``,
            ``categorical_cols``, ``datetime_cols``, ``memory_usage``,
            ``total_missing``, ``data_quality_score`` and ``high_corr_pairs``
            (up to five ``{'Var 1', 'Var 2', 'Correlation'}`` records whose
            absolute correlation exceeds 0.7, strongest first).
        """
        logging.info("Extracting dataset metadata...")
        rows, cols = self.df.shape
        numeric_cols = self.df.select_dtypes(include=np.number).columns.tolist()
        # 'string' also picks up columns stored with pandas' dedicated string dtype.
        categorical_cols = self.df.select_dtypes(include=['object', 'category', 'string']).columns.tolist()
        datetime_cols = self.df.select_dtypes(include=['datetime64']).columns.tolist()

        high_corr_pairs: List[Dict[str, Any]] = []
        if len(numeric_cols) > 1:
            corr_matrix = self.df[numeric_cols].corr().abs()
            # Strict upper triangle: every unordered pair once, no self-correlation.
            upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
            pair_corr = corr_matrix.where(upper_mask).stack()
            # The comparison also discards NaN entries (masked cells, constant columns).
            strongest = pair_corr[pair_corr > 0.7].sort_values(ascending=False).head(5)
            # Reading the pair labels from the MultiIndex avoids depending on
            # the auto-generated 'level_0'/'level_1' names of reset_index().
            high_corr_pairs = [
                {'Var 1': var_1, 'Var 2': var_2, 'Correlation': float(corr)}
                for (var_1, var_2), corr in strongest.items()
            ]

        total_cells = self.df.size
        if total_cells:
            quality_score = round(float((self.df.notna().sum().sum() / total_cells) * 100), 1)
        else:
            # An empty frame has no cells, hence none missing; avoid dividing by zero.
            quality_score = 100.0

        return {
            'shape': (rows, cols),
            'columns': self.df.columns.tolist(),
            'numeric_cols': numeric_cols,
            'categorical_cols': categorical_cols,
            'datetime_cols': datetime_cols,
            'memory_usage': f"{self.df.memory_usage(deep=True).sum() / 1e6:.2f} MB",
            'total_missing': int(self.df.isnull().sum().sum()),
            'data_quality_score': quality_score,
            'high_corr_pairs': high_corr_pairs,
        }

    def get_profiling_report(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """
        Build the three profiling tables.

        Returns:
            ``(missing_df, numeric_stats_df, categorical_stats_df)``. Each
            table has a ``Column`` column; the numeric and categorical tables
            are empty (but keep their headers) when the dataset has no
            columns of that kind.
        """
        logging.info("Generating data profiling report.")

        # --- Missing values per column ---
        missing = self.df.isnull().sum()
        row_count = len(self.df)
        if row_count:
            missing_pct = (missing / row_count * 100).round(2)
        else:
            # No rows: report 0% rather than the NaN produced by 0 / 0.
            missing_pct = missing * 0.0
        missing_df = pd.DataFrame({
            'Column': missing.index,
            'Missing Values': missing.to_numpy(),
            'Percentage (%)': missing_pct.to_numpy(),
        }).sort_values('Missing Values', ascending=False, kind='stable')

        # --- Numeric statistics ---
        numeric_cols = self.metadata['numeric_cols']
        if numeric_cols:
            numeric_stats_df = (
                self.df[numeric_cols]
                .describe()
                .round(3)
                .T
                .rename_axis('Column')
                .reset_index()
            )
        else:
            # describe() raises ValueError on a frame without columns, so a
            # dataset with no numeric features gets an empty table instead.
            numeric_stats_df = pd.DataFrame(
                columns=['Column', 'count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
            )

        # --- Categorical statistics ---
        cat_rows = []
        for col in self.metadata['categorical_cols']:
            series = self.df[col]
            modes = series.mode()
            counts = series.value_counts()
            cat_rows.append({
                'Column': col,
                'Unique Values': series.nunique(),
                'Top Value': modes.iloc[0] if not modes.empty else 'N/A',
                'Frequency': counts.iloc[0] if not counts.empty else 0,
            })
        categorical_stats_df = pd.DataFrame(
            cat_rows, columns=['Column', 'Unique Values', 'Top Value', 'Frequency']
        )

        return missing_df, numeric_stats_df, categorical_stats_df

    def get_initial_visuals(self) -> "Tuple[go.Figure, go.Figure, go.Figure]":
        """
        Create the standard overview plots.

        Returns:
            ``(dtype_fig, missing_fig, corr_fig)``: a donut chart of column
            dtypes, a horizontal bar chart of missing values per column, and
            a correlation heatmap (an empty titled figure when fewer than two
            numeric columns exist).
        """
        logging.info("Generating initial overview visualizations.")

        # --- Data type distribution ---
        dtype_counts = self.df.dtypes.astype(str).value_counts()
        dtype_fig = px.pie(
            values=dtype_counts.values, names=dtype_counts.index,
            title="📊 Data Type Distribution", hole=0.3
        )
        dtype_fig.update_traces(textposition='inside', textinfo='percent+label')

        # --- Missing values per column ---
        missing_fig = px.bar(
            x=self.df.isnull().sum(), y=self.df.columns,
            orientation='h', title="🕳️ Missing Values Overview",
            labels={'x': 'Number of Missing Values', 'y': 'Column'},
        )
        missing_fig.update_yaxes(categoryorder="total ascending")

        # --- Correlation heatmap ---
        numeric_cols = self.metadata['numeric_cols']
        if len(numeric_cols) > 1:
            corr_fig = px.imshow(
                self.df[numeric_cols].corr(), text_auto=".2f", aspect="auto",
                title="🔗 Correlation Matrix (Numeric Features)",
                color_continuous_scale='RdBu_r'
            )
        else:
            corr_fig = go.Figure()
            corr_fig.update_layout(title="🔗 Correlation Matrix (Not enough numeric columns)")

        return dtype_fig, missing_fig, corr_fig

    def generate_ai_report(self, api_key: str, model_name: str = 'gemini-1.5-flash-latest') -> str:
        """
        Generate a narrative data report with the Gemini API.

        Args:
            api_key: Gemini API key passed to ``genai.configure``.
            model_name: Gemini model identifier; the default is the model the
                application has always used.

        Returns:
            The model's Markdown text, or a Markdown error message when the
            API call fails (the exception is logged, not raised).
        """
        logging.info("Generating AI report with Gemini.")

        meta = self.metadata
        corr_pairs = meta['high_corr_pairs']

        sample_rows = self.df.head(3)
        try:
            sample_table = sample_rows.to_markdown()
        except ImportError:
            # to_markdown() depends on the optional 'tabulate' package.
            sample_table = sample_rows.to_string()

        def _join_names(names) -> str:
            # Column labels are not guaranteed to be strings.
            return ', '.join(map(str, names)) or 'None'

        prompt = self._REPORT_PROMPT_TEMPLATE.format(
            rows=meta['shape'][0],
            cols=meta['shape'][1],
            column_names=_join_names(meta['columns']),
            numeric_names=_join_names(meta['numeric_cols']),
            categorical_names=_join_names(meta['categorical_cols']),
            quality=meta['data_quality_score'],
            total_missing=meta['total_missing'],
            corr_summary=corr_pairs if corr_pairs else 'None detected.',
            corr_example='e.g., ' + str(corr_pairs[0]) if corr_pairs else '',
            sample=sample_table,
        )

        try:
            genai.configure(api_key=api_key)
            model = genai.GenerativeModel(model_name)
            response = model.generate_content(prompt)
            return response.text
        except Exception as e:
            # Deliberately broad: any API failure is turned into a message the UI can show.
            logging.error(f"Gemini API call failed: {e}")
            return f"❌ **Error generating AI report.**\n**Reason:** {str(e)}\n\nPlease check your API key and network connection. A fallback analysis could not be generated."
|
|
|
|
|
|
|
def process_uploaded_file(file_obj: gr.File, api_key: str) -> Dict[Any, Any]:
    """
    Run the full analysis for an uploaded CSV and update every UI component.

    Args:
        file_obj: The value Gradio passes for the upload component.
        api_key: Gemini API key entered by the user.

    Returns:
        A dict mapping each output component to its new value or update.

    Raises:
        gr.Error: when no file or API key was supplied, or when any step of
            the analysis fails (the original exception is chained as cause).
    """
    if file_obj is None:
        raise gr.Error("📁 Please upload a CSV file first!")
    if not api_key or not api_key.strip():
        raise gr.Error("🔑 Please enter your Gemini API key!")

    # NOTE(review): depending on the Gradio version / `type` setting the upload
    # arrives as a path string or as an object exposing `.name`; accept both.
    csv_path = getattr(file_obj, "name", file_obj)

    try:
        df = pd.read_csv(csv_path)
        analyzer = DataAnalyzer(df)

        # Local analysis first: if it fails, no API call has been spent yet.
        missing_df, num_stats, cat_stats = analyzer.get_profiling_report()
        dtype_fig, missing_fig, corr_fig = analyzer.get_initial_visuals()
        ai_report = analyzer.generate_ai_report(api_key.strip())

        all_cols = analyzer.metadata['columns']
        num_cols = analyzer.metadata['numeric_cols']
        cat_cols = analyzer.metadata['categorical_cols']

        return {
            # Shared state read by the interactive callbacks
            state_analyzer: analyzer,

            # AI report tab
            md_ai_report: ai_report,
            btn_download_report: gr.Button(visible=True),

            # Profiling tab
            df_missing_data: missing_df,
            df_numeric_stats: num_stats,
            df_categorical_stats: cat_stats,

            # Overview visuals tab
            plot_dtype: dtype_fig,
            plot_missing: missing_fig,
            plot_corr: corr_fig,

            # Interactive visuals tab: populate and reveal the dropdowns
            dd_hist_col: gr.Dropdown(choices=num_cols, label="Select Numeric Column for Histogram", visible=True),
            dd_scatter_x: gr.Dropdown(choices=num_cols, label="Select X-axis (Numeric)", visible=True),
            dd_scatter_y: gr.Dropdown(choices=num_cols, label="Select Y-axis (Numeric)", visible=True),
            dd_scatter_color: gr.Dropdown(choices=all_cols, label="Select Color (Categorical/Numeric)", visible=True),
            dd_box_cat: gr.Dropdown(choices=cat_cols, label="Select Categorical Column for Box Plot", visible=True),
            dd_box_num: gr.Dropdown(choices=num_cols, label="Select Numeric Column for Box Plot", visible=True),

            # Drilldown tab
            dd_drilldown_col: gr.Dropdown(choices=all_cols, label="Select Column to Analyze", visible=True),
        }

    except Exception as e:
        # Top-level UI boundary: log the traceback and surface a readable error.
        logging.error(f"An error occurred during file processing: {e}", exc_info=True)
        raise gr.Error(f"Processing failed! Error: {str(e)}") from e
|
|
|
|
|
|
|
def create_histogram(analyzer: DataAnalyzer, col: str) -> go.Figure:
    """
    Plot the distribution of ``col`` as a histogram with a marginal box plot.

    Returns an empty figure when no analysis has run yet, no column is
    selected, or the selection does not exist in the current dataset (e.g. a
    value left over from a previously analyzed file).
    """
    if analyzer is None or not col or col not in analyzer.df.columns:
        return go.Figure()
    return px.histogram(analyzer.df, x=col, title=f"Distribution of {col}", marginal="box")
|
|
|
def create_scatterplot(analyzer: DataAnalyzer, x_col: str, y_col: str, color_col: str) -> go.Figure:
    """
    Plot ``y_col`` against ``x_col``, optionally colored by ``color_col``.

    Returns an empty figure until both axes are selected and present in the
    current dataset. A color column that is unset or unknown is ignored
    rather than treated as an error.
    """
    if analyzer is None or not x_col or not y_col:
        return go.Figure()
    columns = analyzer.df.columns
    if x_col not in columns or y_col not in columns:
        return go.Figure()
    color = color_col if color_col in columns else None
    return px.scatter(
        analyzer.df, x=x_col, y=y_col, color=color,
        title=f"Scatter Plot: {x_col} vs. {y_col}",
    )
|
|
|
def create_boxplot(analyzer: DataAnalyzer, cat_col: str, num_col: str) -> go.Figure:
    """
    Draw a box plot of ``num_col`` grouped by ``cat_col``.

    Returns an empty figure until both columns are selected and present in
    the current dataset.
    """
    if analyzer is None or not cat_col or not num_col:
        return go.Figure()
    columns = analyzer.df.columns
    if cat_col not in columns or num_col not in columns:
        return go.Figure()
    return px.box(analyzer.df, x=cat_col, y=num_col, title=f"Box Plot: {num_col} by {cat_col}")
|
|
|
def analyze_single_column(analyzer: DataAnalyzer, col: str) -> Tuple[str, go.Figure]:
    """
    Summarize one column as Markdown plus a matching figure.

    Numeric columns get mean/median/std and a histogram; categorical, object
    and string columns get their five most frequent values and a bar chart.
    Any other dtype gets only the general statistics and an empty figure.

    Returns:
        ``(stats_markdown, figure)``; ``("", empty figure)`` when there is no
        analyzer, no selection, or the column is not in the current dataset.
    """
    if analyzer is None or not col or col not in analyzer.df.columns:
        return "", go.Figure()

    col_series = analyzer.df[col]

    # General statistics shown for every dtype
    lines = [
        f"### 🔎 Analysis of Column: `{col}`",
        f"- **Data Type:** `{col_series.dtype}`",
        f"- **Missing Values:** {col_series.isnull().sum()} ({col_series.isnull().mean():.2%})",
        f"- **Unique Values:** {col_series.nunique()}",
    ]

    fig = go.Figure()
    # isinstance check replaces the deprecated pd.api.types.is_categorical_dtype.
    is_categorical = isinstance(col_series.dtype, pd.CategoricalDtype)
    if pd.api.types.is_numeric_dtype(col_series):
        lines.append(f"- **Mean:** {col_series.mean():.2f}")
        lines.append(f"- **Median:** {col_series.median():.2f}")
        lines.append(f"- **Std Dev:** {col_series.std():.2f}")
        fig = create_histogram(analyzer, col)
    elif (
        is_categorical
        or pd.api.types.is_object_dtype(col_series)
        or pd.api.types.is_string_dtype(col_series)
    ):
        top5 = col_series.value_counts().head(5)
        lines.append("- **Top 5 Values:**")
        # Two leading spaces so Markdown nests these under "Top 5 Values".
        lines.extend(f"  - `{val}`: {count} times" for val, count in top5.items())
        fig = px.bar(top5, x=top5.index, y=top5.values, title=f"Top 5 Value Counts for {col}")
        fig.update_xaxes(title=col)
        fig.update_yaxes(title="Count")

    return "\n".join(lines) + "\n", fig
|
|
|
def download_report(analyzer: "DataAnalyzer", ai_report_text: str) -> Optional[str]:
    """
    Write the AI report, preceded by a short header, to a Markdown file.

    The file is created with a unique name in the system temporary directory,
    so simultaneous downloads cannot overwrite each other and the working
    directory is not filled with reports.

    Args:
        analyzer: Analyzer of the current dataset; only ``metadata['shape']``
            is read.
        ai_report_text: Markdown body of the report; ``None`` is treated as
            an empty body.

    Returns:
        Path of the written file, or ``None`` when no analysis has run yet.
    """
    if analyzer is None:
        return None

    # One timestamp for both the filename and the header so they always agree.
    generated_at = datetime.now()
    rows, cols = analyzer.metadata['shape']

    # Blank lines between header entries keep them as separate Markdown paragraphs.
    full_report = (
        "# AutoEDA Analysis Report\n\n"
        f"**Date Generated:** {generated_at.strftime('%Y-%m-%d %H:%M:%S')}\n\n"
        f"**Dataset Shape:** {rows} rows x {cols} columns\n\n"
        "---\n\n"
        f"{ai_report_text or ''}"
    )

    # delete=False: the file must outlive this function so it can be served.
    with tempfile.NamedTemporaryFile(
        mode="w",
        encoding="utf-8",
        prefix=f"AI_Report_{generated_at.strftime('%Y%m%d_%H%M%S')}_",
        suffix=".md",
        delete=False,
    ) as report_file:
        report_file.write(full_report)
        filename = report_file.name

    logging.info(f"Generated download report: {filename}")
    return filename
|
|
|
|
|
|
|
# Gradio UI definition. Component variables are module-level on purpose:
# process_uploaded_file() uses them as keys of the update dict it returns.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), title="🚀 AutoEDA Pro") as demo:
    # Holds the DataAnalyzer of the current dataset between callbacks.
    state_analyzer = gr.State()

    gr.Markdown("# 🚀 AutoEDA Pro: Your AI Data Science Assistant")
    gr.Markdown("Upload a CSV, enter your Gemini API key, and click 'Analyze!' to unlock a comprehensive, AI-powered report on your data.")

    # --- Inputs ---
    with gr.Row():
        with gr.Column(scale=2):
            file_input = gr.File(label="📁 Upload your CSV File", file_types=[".csv"])
        with gr.Column(scale=2):
            api_key_input = gr.Textbox(label="🔑 Google Gemini API Key", type="password", placeholder="Enter your key here...")
        with gr.Column(scale=1, min_width=150):
            analyze_btn = gr.Button("✨ Analyze!", variant="primary", scale=1)

    # --- Result tabs ---
    with gr.Tabs():
        with gr.Tab("🤖 AI Report & Overview"):
            md_ai_report = gr.Markdown("Your AI-generated report will appear here...")
            # Hidden until an analysis has produced a report.
            btn_download_report = gr.Button("⬇️ Download Full Report", visible=False)

        with gr.Tab("📊 Data Profiling"):
            gr.Markdown("### Detailed Data Profile")
            gr.Markdown("**Missing Data Analysis**")
            df_missing_data = gr.DataFrame(interactive=False)
            gr.Markdown("**Numeric Feature Statistics**")
            df_numeric_stats = gr.DataFrame(interactive=False)
            gr.Markdown("**Categorical Feature Statistics**")
            df_categorical_stats = gr.DataFrame(interactive=False)

        with gr.Tab("📈 Overview Visuals"):
            gr.Markdown("### At-a-Glance Visualizations")
            with gr.Row():
                plot_dtype = gr.Plot()
                plot_missing = gr.Plot()
            with gr.Row():
                plot_corr = gr.Plot()

        with gr.Tab("🎨 Interactive Visuals"):
            gr.Markdown("### Explore Your Data Visually")
            with gr.Row():
                with gr.Column():
                    dd_hist_col = gr.Dropdown(label="Select Column", visible=False)
                    plot_hist = gr.Plot()
                with gr.Column():
                    dd_box_cat = gr.Dropdown(label="Select Category", visible=False)
                    dd_box_num = gr.Dropdown(label="Select Value", visible=False)
                    plot_box = gr.Plot()
            with gr.Row():
                gr.Markdown("#### Scatter Plot Explorer")
            with gr.Row():
                dd_scatter_x = gr.Dropdown(label="X-axis", visible=False)
                dd_scatter_y = gr.Dropdown(label="Y-axis", visible=False)
                dd_scatter_color = gr.Dropdown(label="Color", visible=False)
            plot_scatter = gr.Plot()

        with gr.Tab("🔍 Column Drilldown"):
            gr.Markdown("### Deep Dive into a Single Column")
            dd_drilldown_col = gr.Dropdown(label="Select Column", visible=False)
            with gr.Row():
                md_drilldown_stats = gr.Markdown()
                plot_drilldown = gr.Plot()

    # --- Event wiring ---

    # One click runs the whole analysis and updates every tab.
    analyze_btn.click(
        fn=process_uploaded_file,
        inputs=[file_input, api_key_input],
        outputs=[
            state_analyzer, md_ai_report, btn_download_report,
            df_missing_data, df_numeric_stats, df_categorical_stats,
            plot_dtype, plot_missing, plot_corr,
            dd_hist_col, dd_scatter_x, dd_scatter_y, dd_scatter_color,
            dd_box_cat, dd_box_num, dd_drilldown_col
        ]
    )

    # Interactive plots: any change of a related dropdown redraws the plot.
    dd_hist_col.change(fn=create_histogram, inputs=[state_analyzer, dd_hist_col], outputs=plot_hist)

    scatter_inputs = [state_analyzer, dd_scatter_x, dd_scatter_y, dd_scatter_color]
    for scatter_dropdown in (dd_scatter_x, dd_scatter_y, dd_scatter_color):
        scatter_dropdown.change(fn=create_scatterplot, inputs=scatter_inputs, outputs=plot_scatter)

    box_inputs = [state_analyzer, dd_box_cat, dd_box_num]
    for box_dropdown in (dd_box_cat, dd_box_num):
        box_dropdown.change(fn=create_boxplot, inputs=box_inputs, outputs=plot_box)

    # Column drilldown
    dd_drilldown_col.change(fn=analyze_single_column, inputs=[state_analyzer, dd_drilldown_col], outputs=[md_drilldown_stats, plot_drilldown])

    # Report download: the file component is created here, below the tabs.
    file_report_download = gr.File(label="Download Report")
    btn_download_report.click(fn=download_report, inputs=[state_analyzer, md_ai_report], outputs=file_report_download)

    gr.Markdown("---")
    gr.Markdown("💡 **Tip**: Get your free Google Gemini API key from [Google AI Studio](https://aistudio.google.com/app/apikey).")
    gr.Markdown("MCP Expert System v2.0 - Analysis Complete.")
|
|
|
# Start the Gradio server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch(debug=True)