|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from __future__ import annotations |
|
|
|
import warnings |
|
import logging |
|
import os |
|
import sys |
|
import importlib.util |
|
from datetime import datetime |
|
from typing import Any, Dict, List, Optional, Tuple |
|
|
|
import gradio as gr |
|
import numpy as np |
|
import pandas as pd |
|
import plotly.express as px |
|
import plotly.graph_objects as go |
|
import google.generativeai as genai |
|
|
|
|
|
|
|
# Root logger configuration, applied once at import time: INFO and above,
# with every record showing timestamp, level and the emitting file/line.
logging.basicConfig(
    format='%(asctime)s - [%(levelname)s] - (%(filename)s:%(lineno)d) - %(message)s',
    level=logging.INFO,
)

# Suppress every FutureWarning for the whole process.
warnings.filterwarnings(action='ignore', category=FutureWarning)
|
|
|
class Config:
    """Static, application-wide settings read by the analyzer and the UI."""

    # Heading rendered at the top of the page and used as the Blocks title.
    APP_TITLE = "π CognitiveEDA: AI-Augmented Data Discovery Platform"

    # Model identifier handed to genai.GenerativeModel.
    GEMINI_MODEL = 'gemini-1.5-flash-latest'

    # Absolute correlation above which a pair of numeric columns is reported.
    CORR_THRESHOLD = 0.75

    # How many of the most frequent values are charted for a non-numeric column.
    TOP_N_CATEGORIES = 10
|
|
|
|
|
|
|
class DataAnalyzer:
    """Profile a DataFrame: summary metadata, tables, overview charts and an AI narrative.

    The metadata dictionary is computed lazily on first access and cached on
    the instance; the DataFrame itself is stored by reference, not copied.
    """

    # Percentiles requested from ``describe`` for numeric columns.
    _NUMERIC_PERCENTILES = [.01, .25, .5, .75, .99]

    # Headers used for a profiling table that has no columns to describe, so
    # callers still receive a frame with the usual layout.
    _NUMERIC_STAT_COLUMNS = [
        'Column', 'count', 'mean', 'std', 'min',
        '1%', '25%', '50%', '75%', '99%', 'max',
    ]
    _CATEGORICAL_STAT_COLUMNS = ['Column', 'count', 'unique', 'top', 'freq']

    def __init__(self, df: pd.DataFrame):
        """Store ``df`` for analysis.

        Raises:
            TypeError: If ``df`` is not a pandas DataFrame.
        """
        if not isinstance(df, pd.DataFrame):
            raise TypeError("Input must be a pandas DataFrame.")
        self.df = df
        self._metadata: Optional[Dict[str, Any]] = None
        logging.info("DataAnalyzer instantiated with DataFrame of shape: %s", self.df.shape)

    @property
    def metadata(self) -> Dict[str, Any]:
        """Summary metadata for the DataFrame, extracted once and then cached."""
        if self._metadata is None:
            logging.info("First access to metadata, performing extraction...")
            self._metadata = self._extract_metadata()
        return self._metadata

    def _extract_metadata(self) -> Dict[str, Any]:
        """Compute shape, column groups, memory use, missing counts and correlated pairs.

        Returns:
            A dict with the keys ``shape``, ``columns``, ``numeric_cols``,
            ``categorical_cols``, ``memory_usage_mb`` (a string with two
            decimals), ``total_missing``, ``data_quality_score`` (percentage of
            non-missing cells) and ``high_corr_pairs`` (a list of records with
            ``Feature 1``, ``Feature 2`` and ``Correlation``).
        """
        rows, cols = self.df.shape
        numeric_cols = self.df.select_dtypes(include=np.number).columns.tolist()
        categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns.tolist()

        total_cells = self.df.size
        total_missing = int(self.df.isnull().sum().sum())
        if total_cells:
            quality_score = round((total_cells - total_missing) / total_cells * 100, 2)
        else:
            # A frame without cells has nothing missing; report 100 instead of
            # dividing by zero.
            quality_score = 100.0

        high_corr_pairs: List[Dict[str, Any]] = []
        if len(numeric_cols) > 1:
            corr_matrix = self.df[numeric_cols].corr().abs()
            labels = corr_matrix.columns.tolist()
            values = corr_matrix.to_numpy()
            # Walk the strict upper triangle so every pair is visited exactly
            # once and the diagonal is skipped. NaN correlations never pass
            # the comparison, so they are dropped.
            for i in range(len(labels)):
                for j in range(i + 1, len(labels)):
                    if values[i, j] > Config.CORR_THRESHOLD:
                        high_corr_pairs.append({
                            'Feature 1': labels[i],
                            'Feature 2': labels[j],
                            'Correlation': float(values[i, j]),
                        })

        return {
            'shape': (rows, cols),
            'columns': self.df.columns.tolist(),
            'numeric_cols': numeric_cols,
            'categorical_cols': categorical_cols,
            'memory_usage_mb': f"{self.df.memory_usage(deep=True).sum() / 1e6:.2f}",
            'total_missing': total_missing,
            'data_quality_score': quality_score,
            'high_corr_pairs': high_corr_pairs,
        }

    def get_profiling_tables(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """Build the missing-value, numeric and categorical profiling tables.

        Returns:
            ``(missing_df, numeric_stats_df, cat_stats_df)``. When the data has
            no numeric (or no categorical) columns the corresponding table is
            an empty frame with the usual headers.
        """
        logging.info("Generating profiling tables for missing, numeric, and categorical data.")

        missing = self.df.isnull().sum()
        row_count = len(self.df)
        if row_count:
            missing_pct = (missing / row_count * 100).round(2)
        else:
            # No rows: every count is zero, so the percentage is zero too.
            missing_pct = missing.astype(float)
        missing_df = (
            pd.DataFrame({'Missing Count': missing, 'Missing Percentage (%)': missing_pct})
            .reset_index()
            .rename(columns={'index': 'Column'})
            .sort_values('Missing Count', ascending=False)
        )

        # ``describe`` raises on a frame without columns, so each table is
        # only computed when there is something to describe.
        numeric_cols = self.metadata['numeric_cols']
        if numeric_cols:
            numeric_stats = self.df[numeric_cols].describe(percentiles=self._NUMERIC_PERCENTILES).T
            numeric_stats_df = numeric_stats.round(3).reset_index().rename(columns={'index': 'Column'})
        else:
            numeric_stats_df = pd.DataFrame(columns=self._NUMERIC_STAT_COLUMNS)

        categorical_cols = self.metadata['categorical_cols']
        if categorical_cols:
            cat_stats = self.df[categorical_cols].describe(include=['object', 'category']).T
            cat_stats_df = cat_stats.reset_index().rename(columns={'index': 'Column'})
        else:
            cat_stats_df = pd.DataFrame(columns=self._CATEGORICAL_STAT_COLUMNS)

        return missing_df, numeric_stats_df, cat_stats_df

    def get_overview_visuals(self) -> Tuple[go.Figure, go.Figure, go.Figure]:
        """Create the dtype pie chart, the missing-values bar chart and the correlation heatmap.

        Returns:
            ``(fig_types, fig_missing, fig_corr)``. With fewer than two numeric
            columns ``fig_corr`` is an empty figure whose title says so.
        """
        logging.info("Generating overview visualizations (types, missing data, correlation).")
        meta = self.metadata

        dtype_counts = self.df.dtypes.astype(str).value_counts()
        fig_types = px.pie(
            values=dtype_counts.values,
            names=dtype_counts.index,
            title="<b>π Data Type Composition</b>",
            hole=0.4,
            color_discrete_sequence=px.colors.qualitative.Pastel,
        )
        fig_types.update_traces(textposition='outside', textinfo='percent+label')

        # Only columns that actually have missing values are charted.
        missing_df = self.df.isnull().sum().reset_index(name='count').query('count > 0')
        fig_missing = px.bar(
            missing_df,
            x='index',
            y='count',
            title="<b>π³οΈ Missing Values Distribution</b>",
            labels={'index': 'Column Name', 'count': 'Number of Missing Values'},
        ).update_xaxes(categoryorder="total descending")

        if len(meta['numeric_cols']) > 1:
            corr_matrix = self.df[meta['numeric_cols']].corr()
            fig_corr = px.imshow(
                corr_matrix,
                text_auto=".2f",
                aspect="auto",
                title=f"<b>π Correlation Matrix (Threshold > {Config.CORR_THRESHOLD})</b>",
                color_continuous_scale='RdBu_r',
                zmin=-1,
                zmax=1,
            )
        else:
            fig_corr = go.Figure()
            fig_corr.update_layout(title="<b>π Correlation Matrix (Insufficient Numeric Data)</b>")

        return fig_types, fig_missing, fig_corr

    def generate_ai_narrative(self, api_key: str) -> str:
        """Ask the Gemini model for a Markdown report describing the dataset.

        Args:
            api_key: Google Gemini API key used to configure the client.

        Returns:
            The model's response text, or a Markdown error message with
            troubleshooting steps if the API call raises.
        """
        logging.info("Generating AI narrative with the Gemini API.")
        meta = self.metadata
        data_snippet_md = self.df.head(5).to_markdown(index=False)

        # str() keeps the join working when column labels are not strings.
        numeric_names = ', '.join(map(str, meta['numeric_cols'])) or 'None'
        categorical_names = ', '.join(map(str, meta['categorical_cols'])) or 'None'
        high_corr = meta['high_corr_pairs'] or 'None detected.'

        # The prompt is assembled line by line so that source-code indentation
        # never leaks into the Markdown sent to the model.
        # NOTE(review): the report-structure section is a literal "..." in the
        # reviewed source; it is kept as-is here.
        prompt_lines = [
            'As "Cognitive Analyst," an elite AI data scientist, your task is to generate a comprehensive, multi-part data discovery report.',
            'Analyze the following dataset context and produce a professional, insightful, and clear analysis in Markdown format.',
            '',
            '**DATASET CONTEXT:**',
            f"- **Shape:** {meta['shape'][0]} rows, {meta['shape'][1]} columns.",
            '- **Column Schema:**',
            f'  - Numeric: {numeric_names}',
            f'  - Categorical: {categorical_names}',
            f"- **Data Quality Score:** {meta['data_quality_score']}% (Percentage of non-missing cells)",
            f"- **Total Missing Values:** {meta['total_missing']:,}",
            f"- **High-Correlation Pairs (>{Config.CORR_THRESHOLD}):** {high_corr}",
            '- **Data Snippet (First 5 Rows):**',
            data_snippet_md,
            '',
            '**REQUIRED REPORT STRUCTURE (Strictly use this Markdown format):**',
            '...',
        ]
        prompt = "\n".join(prompt_lines)

        try:
            genai.configure(api_key=api_key)
            model = genai.GenerativeModel(Config.GEMINI_MODEL)
            response = model.generate_content(prompt)
            return response.text
        except Exception as e:
            # Deliberately broad: any failure is turned into a readable
            # message for the report pane instead of aborting the analysis.
            logging.error(f"Gemini API call failed: {e}", exc_info=True)
            return (
                "β **AI Report Generation Failed**\n\n"
                f"**Error Details:** `{str(e)}`\n\n"
                "**Troubleshooting Steps:**\n"
                "1. Verify that your Google Gemini API key is correct and active.\n"
                "2. Check your network connection and firewall settings.\n"
                "3. Ensure the Gemini API is not experiencing an outage."
            )
|
|
|
|
|
|
|
def create_ui():
    """Build the Gradio Blocks application and wire up its event handlers.

    Returns:
        The assembled ``gr.Blocks`` instance; launching it is left to the caller.
    """

    def format_stat(value: Any) -> str:
        """Render a summary statistic with three decimals, or ``N/A`` when it is missing."""
        if pd.isna(value):
            return "N/A"
        # float() so that every numeric scalar type accepts the format spec.
        return f"{float(value):.3f}"

    def create_histogram(analyzer: DataAnalyzer, col: str) -> go.Figure:
        """Histogram with a box-plot marginal for ``col``; a blank figure until both inputs are set."""
        if analyzer is None or not col:
            return go.Figure()
        return px.histogram(
            analyzer.df,
            x=col,
            title=f"<b>Distribution of {col}</b>",
            marginal="box",
            template="plotly_white",
        )

    def create_scatterplot(analyzer: DataAnalyzer, x_col: str, y_col: str, color_col: str) -> go.Figure:
        """Scatter plot of ``x_col`` against ``y_col``, optionally coloured by ``color_col``."""
        if analyzer is None or not x_col or not y_col:
            return go.Figure()
        return px.scatter(
            analyzer.df,
            x=x_col,
            y=y_col,
            # An empty selection means "no colouring", which plotly expects as None.
            color=color_col or None,
            title=f"<b>Scatter Plot: {x_col} vs. {y_col}</b>",
            template="plotly_white",
            color_continuous_scale=px.colors.sequential.Viridis,
        )

    def analyze_single_column(analyzer: DataAnalyzer, col: str) -> Tuple[str, go.Figure]:
        """Return Markdown statistics and a chart for one column.

        Numeric columns get summary statistics and a histogram; other columns
        get their most frequent value and a bar chart of the top categories.
        A non-numeric column with no non-missing values gets an explanatory
        line and an empty figure.
        """
        if analyzer is None or not col:
            return "", go.Figure()

        series = analyzer.df[col]
        stats_md = (
            f"### π **Deep Dive: `{col}`**\n"
            f"- **Data Type:** `{series.dtype}`\n"
            f"- **Unique Values:** `{series.nunique()}`\n"
            f"- **Missing:** `{series.isnull().sum()}` ({series.isnull().mean():.2%})\n"
        )

        if pd.api.types.is_numeric_dtype(series):
            stats_md += (
                f"- **Mean:** `{format_stat(series.mean())}` | **Std Dev:** `{format_stat(series.std())}`\n"
                f"- **Median:** `{format_stat(series.median())}` | **Min:** `{format_stat(series.min())}`"
                f" | **Max:** `{format_stat(series.max())}`\n"
            )
            return stats_md, create_histogram(analyzer, col)

        top_n = series.value_counts().nlargest(Config.TOP_N_CATEGORIES)
        if top_n.empty:
            # value_counts ignores missing entries, so a column that is
            # entirely missing has no "top value" to index into.
            stats_md += "- **Top Value:** `N/A` (no non-missing values)\n"
            return stats_md, go.Figure()

        stats_md += f"- **Top Value:** `{top_n.index[0]}` ({top_n.iloc[0]} occurrences)\n"
        fig = px.bar(
            top_n,
            y=top_n.index,
            x=top_n.values,
            orientation='h',
            title=f"<b>Top {Config.TOP_N_CATEGORIES} Categories in `{col}`</b>",
            labels={'y': col, 'x': 'Count'},
            template="plotly_white",
        ).update_yaxes(categoryorder="total ascending")
        return stats_md, fig

    # NOTE(review): the reviewed source had lost its indentation; the container
    # nesting below is reconstructed from the component arguments - confirm it
    # against the running app.
    with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="cyan"), title=Config.APP_TITLE) as demo:
        # Holds the DataAnalyzer of the most recent analysis for the interactive tabs.
        state_analyzer = gr.State()

        gr.Markdown(f"<h1>{Config.APP_TITLE}</h1>")
        gr.Markdown("Upload a CSV file, provide your Gemini API key, and receive an instant, AI-driven analysis of your data.")

        with gr.Row():
            upload_button = gr.File(label="1. Upload CSV File", file_types=[".csv"], scale=3)
            api_key_input = gr.Textbox(label="2. Enter Google Gemini API Key", type="password", scale=2)
            analyze_button = gr.Button("β¨ Generate Analysis", variant="primary", scale=1, min_width=150)

        with gr.Tabs():
            with gr.Tab("π€ AI Narrative"):
                ai_report_output = gr.Markdown("Your AI-generated report will appear here...")
                download_report_button = gr.Button("β¬οΈ Download Full Report", visible=False)

            with gr.Tab("Profile"):
                profile_missing_df = gr.DataFrame(interactive=False, label="Missing Values")
                profile_numeric_df = gr.DataFrame(interactive=False, label="Numeric Stats")
                profile_categorical_df = gr.DataFrame(interactive=False, label="Categorical Stats")

            with gr.Tab("π Overview Visuals"):
                with gr.Row():
                    plot_types, plot_missing = gr.Plot(), gr.Plot()
                plot_correlation = gr.Plot()

            with gr.Tab("π¨ Interactive Explorer"):
                with gr.Row(equal_height=False):
                    with gr.Column(scale=1):
                        dd_hist_col = gr.Dropdown(label="Select Column for Histogram", visible=False)
                    with gr.Column(scale=2):
                        plot_histogram = gr.Plot()
                with gr.Row(equal_height=False):
                    with gr.Column(scale=1):
                        dd_scatter_x = gr.Dropdown(label="X-Axis (Numeric)", visible=False)
                        dd_scatter_y = gr.Dropdown(label="Y-Axis (Numeric)", visible=False)
                        dd_scatter_color = gr.Dropdown(label="Color By (Optional)", visible=False)
                    with gr.Column(scale=2):
                        plot_scatter = gr.Plot()

            with gr.Tab("π Column Deep-Dive"):
                dd_drilldown_col = gr.Dropdown(label="Select Column to Analyze", visible=False)
                with gr.Row():
                    md_drilldown_stats, plot_drilldown = gr.Markdown(), gr.Plot()

        gr.HTML("""<div style="text-align: center; margin-top: 20px; font-family: sans-serif; color: #777;"><p>π‘ Need an API key? Get one from <a href="https://aistudio.google.com/app/apikey" target="_blank">Google AI Studio</a>.</p><p>CognitiveEDA v3.2 | An MCP Expert System</p></div>""")

        # The order here must match the list returned by run_full_analysis.
        outputs_for_main_analysis = [
            state_analyzer,
            ai_report_output,
            download_report_button,
            profile_missing_df,
            profile_numeric_df,
            profile_categorical_df,
            plot_types,
            plot_missing,
            plot_correlation,
            dd_hist_col,
            dd_scatter_x,
            dd_scatter_y,
            dd_scatter_color,
            dd_drilldown_col,
        ]
        analyze_button.click(fn=run_full_analysis, inputs=[upload_button, api_key_input], outputs=outputs_for_main_analysis)

        dd_hist_col.change(fn=create_histogram, inputs=[state_analyzer, dd_hist_col], outputs=plot_histogram)

        # Any of the three scatter dropdowns changing redraws the same plot.
        scatter_inputs = [state_analyzer, dd_scatter_x, dd_scatter_y, dd_scatter_color]
        for dd in [dd_scatter_x, dd_scatter_y, dd_scatter_color]:
            dd.change(fn=create_scatterplot, inputs=scatter_inputs, outputs=plot_scatter)

        dd_drilldown_col.change(fn=analyze_single_column, inputs=[state_analyzer, dd_drilldown_col], outputs=[md_drilldown_stats, plot_drilldown])

        download_report_button.click(fn=download_report_file, inputs=[state_analyzer, ai_report_output], outputs=gr.File(label="Download Report"))

    return demo
|
|
|
|
|
|
|
|
|
def run_full_analysis(file_obj: gr.File, api_key: str) -> list:
    """Run the whole analysis pipeline for an uploaded CSV.

    Args:
        file_obj: The uploaded file as delivered by the ``gr.File`` input;
            either an object exposing the path as ``.name`` or the path itself.
        api_key: Google Gemini API key entered by the user.

    Returns:
        A list of 14 values: the analyzer, the AI report, the download-button
        update, three profiling tables, three overview figures and five
        dropdown updates.

    Raises:
        gr.Error: If no file or API key was provided, or if any step of the
            analysis fails.
    """
    if file_obj is None:
        raise gr.Error("CRITICAL: No file uploaded. Please select a CSV file.")
    # A key made only of whitespace is as unusable as an empty one.
    api_key = api_key.strip() if api_key else ""
    if not api_key:
        raise gr.Error("CRITICAL: Gemini API key is missing. Please provide your key.")

    try:
        # NOTE(review): depending on the Gradio version the upload arrives as
        # a wrapper with a ``.name`` path or as a plain path string; both are
        # accepted here - confirm against the installed version.
        csv_path = getattr(file_obj, "name", file_obj)
        logging.info(f"Processing uploaded file: {csv_path}")
        df = pd.read_csv(csv_path)
        analyzer = DataAnalyzer(df)

        # Local profiling runs first so a problem with the data surfaces
        # before the remote Gemini call is made.
        missing_df, num_df, cat_df = analyzer.get_profiling_tables()
        fig_types, fig_missing, fig_corr = analyzer.get_overview_visuals()
        ai_report = analyzer.generate_ai_narrative(api_key)

        meta = analyzer.metadata
        all_cols, num_cols = meta['columns'], meta['numeric_cols']

        return [
            analyzer,
            ai_report,
            gr.Button(visible=True),
            missing_df,
            num_df,
            cat_df,
            fig_types,
            fig_missing,
            fig_corr,
            gr.Dropdown(choices=num_cols, label="Select Numeric Column", visible=True),
            gr.Dropdown(choices=num_cols, label="X-Axis (Numeric)", visible=True),
            gr.Dropdown(choices=num_cols, label="Y-Axis (Numeric)", visible=True),
            gr.Dropdown(choices=all_cols, label="Color By (Optional)", visible=True),
            gr.Dropdown(choices=all_cols, label="Select Column to Analyze", visible=True),
        ]
    except Exception as e:
        # Top-level boundary for the click handler: log the traceback and
        # surface a readable message in the UI, keeping the cause attached.
        logging.error(f"A critical error occurred during file processing: {e}", exc_info=True)
        raise gr.Error(f"Analysis Failed! The process stopped due to: {str(e)}") from e
|
|
|
|
|
def download_report_file(analyzer: DataAnalyzer, ai_report_text: str) -> Optional[str]:
    """Write the full Markdown report to a file and return its path.

    The file is created inside a fresh temporary directory, so concurrent
    downloads never overwrite each other and the working directory stays clean.

    Args:
        analyzer: Analyzer of the current dataset; only its ``metadata`` is read.
        ai_report_text: The AI narrative to append; ``None`` is treated as empty.

    Returns:
        The path of the written ``.md`` file, or ``None`` when there is no analyzer.
    """
    import tempfile  # local import: only this function needs it

    if not analyzer:
        logging.warning("Download attempted without a valid analyzer object.")
        return None

    # One timestamp for both the filename and the report header.
    generated_at = datetime.now()
    filename = f"CognitiveEDA_Report_{generated_at.strftime('%Y%m%d_%H%M%S')}.md"
    report_path = os.path.join(tempfile.mkdtemp(prefix="cognitive_eda_"), filename)

    meta = analyzer.metadata
    full_report = (
        "# CognitiveEDA - Data Discovery Report\n"
        f"**Generated:** {generated_at.strftime('%Y-%m-%d %H:%M:%S')}\n\n"
        "## Dataset Overview\n"
        f"- **Shape:** {meta['shape'][0]} rows x {meta['shape'][1]} columns\n"
        f"- **Memory Footprint:** {meta['memory_usage_mb']} MB\n"
        f"- **Data Quality Score:** {meta['data_quality_score']}%\n\n"
        "---\n\n"
        f"{ai_report_text or ''}"
    )

    with open(report_path, "w", encoding="utf-8") as f:
        f.write(full_report)

    logging.info(f"Report file generated successfully: {report_path}")
    return report_path
|
|
|
def perform_pre_flight_checks():
    """Verify that the required third-party packages can be located.

    Prints an installation hint and exits the process with status 1 when any
    package is missing; otherwise returns ``None``.

    NOTE(review): the module-level imports at the top of the file run before
    this function can be called, so a missing package that is imported there
    would fail earlier than this check.
    """
    logging.info("Performing pre-flight dependency checks...")
    required_packages = ["pandas", "gradio", "plotly", "google.generativeai", "tabulate"]

    def is_missing(pkg: str) -> bool:
        """True when ``pkg`` cannot be located."""
        try:
            return importlib.util.find_spec(pkg) is None
        except (ImportError, ValueError):
            # For a dotted name find_spec imports the parent package first and
            # raises if that parent is absent; treat that as "missing" too.
            return True

    missing_packages = [pkg for pkg in required_packages if is_missing(pkg)]
    if missing_packages:
        missing_list = ', '.join(missing_packages)
        logging.critical(f"Missing critical packages: {missing_list}")
        banner = "=" * 80
        print(
            "\n" + banner + "\n"
            "ERROR: Your environment is missing critical dependencies.\n"
            f"Missing package(s): {missing_list}\n"
            "Please install all required packages using the requirements.txt file:\n"
            "pip install -r requirements.txt\n"
            + banner + "\n"
        )
        sys.exit(1)

    logging.info("All dependencies are satisfied. Proceeding with launch.")
|
|
|
if __name__ == "__main__":
    # Script entry point: check dependencies, build the UI, then serve it on
    # all network interfaces with Gradio's debug mode enabled.
    perform_pre_flight_checks()
    create_ui().launch(server_name="0.0.0.0", debug=True)