|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from __future__ import annotations |
|
|
|
import warnings |
|
import logging |
|
import os |
|
import sys |
|
import importlib.util |
|
from datetime import datetime |
|
from typing import Any, Dict, List, Optional, Tuple |
|
|
|
import gradio as gr |
|
import numpy as np |
|
import pandas as pd |
|
import plotly.express as px |
|
import plotly.graph_objects as go |
|
import google.generativeai as genai |
|
|
|
|
|
from analysis_modules import analyze_time_series, generate_word_cloud, perform_clustering |
|
|
|
|
|
# Configure the root logger once at import time: INFO level, with the source
# file name and line number included in every record.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - [%(levelname)s] - (%(filename)s:%(lineno)d) - %(message)s',
)

# Suppress FutureWarning messages for the whole process.
warnings.filterwarnings('ignore', category=FutureWarning)
|
|
|
class Config:
    """Application-wide constants."""

    # Title used for the browser tab and the page heading in create_ui().
    # NOTE(review): the leading glyph looks like a mis-decoded emoji —
    # confirm the file's encoding before changing it.
    APP_TITLE: str = "π CognitiveEDA: The Adaptive Intelligence Engine"

    # Name of the Gemini model (not referenced by the code visible here).
    GEMINI_MODEL: str = 'gemini-1.5-flash-latest'

    # Absolute correlation above which DataAnalyzer reports a feature pair.
    CORR_THRESHOLD: float = 0.75

    # Not referenced by the code visible here; presumably caps how many
    # categories are shown per column — TODO confirm.
    TOP_N_CATEGORIES: int = 10

    # Datasets with more rows than this are sampled down in run_full_analysis().
    MAX_UI_ROWS: int = 50000
|
|
|
|
|
class DataAnalyzer:
    """Wraps a pandas DataFrame and derives profiling metadata from it.

    The metadata dictionary is computed on first access to :attr:`metadata`
    and cached on the instance afterwards.
    """

    def __init__(self, df: pd.DataFrame):
        """Store ``df`` for later analysis.

        Args:
            df: The dataset to analyze.

        Raises:
            TypeError: If ``df`` is not a pandas DataFrame.
        """
        if not isinstance(df, pd.DataFrame):
            raise TypeError("Input must be a pandas DataFrame.")
        self.df = df
        self._metadata: Optional[Dict[str, Any]] = None
        logging.info(f"DataAnalyzer instantiated with DataFrame of shape: {self.df.shape}")

    @property
    def metadata(self) -> Dict[str, Any]:
        """Return the cached metadata dictionary, computing it on first use."""
        if self._metadata is None:
            self._metadata = self._extract_metadata()
        return self._metadata

    @staticmethod
    def _mean_text_length(series) -> float:
        """Return the mean string length of ``series``, or 0.0 if it has none.

        The ``.str`` accessor raises AttributeError for object/category
        columns that contain no strings at all (for example integer
        categories); such columns are treated as having no text.
        """
        try:
            mean_length = series.str.len().mean()
        except AttributeError:
            return 0.0
        # An all-null column yields NaN, which must not count as text either.
        return 0.0 if pd.isna(mean_length) else float(mean_length)

    def _extract_metadata(self) -> Dict[str, Any]:
        """Compute column groupings, quality figures and correlated pairs.

        Returns:
            A dictionary with the keys ``shape``, ``columns``,
            ``numeric_cols``, ``categorical_cols``, ``datetime_cols``,
            ``text_cols``, ``memory_usage_mb`` (a string with two decimals),
            ``total_missing``, ``data_quality_score`` (percentage of non-null
            cells, 0.0 for a frame without cells) and ``high_corr_pairs``
            (records with ``Feature 1``, ``Feature 2``, ``Correlation``).
        """
        rows, cols = self.df.shape
        numeric_cols = self.df.select_dtypes(include=np.number).columns.tolist()
        categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns.tolist()
        datetime_cols = self.df.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist()
        # Categorical columns whose values average more than 50 characters
        # are treated as free text.
        text_cols = [
            col for col in categorical_cols
            if self._mean_text_length(self.df[col]) > 50
        ]

        high_corr_pairs = []
        if len(numeric_cols) > 1:
            corr_matrix = self.df[numeric_cols].corr().abs()
            corr_values = corr_matrix.to_numpy()
            # Walk the strict upper triangle so each pair appears once and the
            # diagonal is skipped. NaN correlations fail the comparison and
            # are dropped. Reading labels from the index directly avoids
            # depending on the column names reset_index() would generate.
            row_idx, col_idx = np.triu_indices_from(corr_values, k=1)
            high_corr_pairs = [
                {
                    'Feature 1': corr_matrix.index[i],
                    'Feature 2': corr_matrix.columns[j],
                    'Correlation': float(corr_values[i, j]),
                }
                for i, j in zip(row_idx, col_idx)
                if corr_values[i, j] > Config.CORR_THRESHOLD
            ]

        total_cells = self.df.size
        filled_cells = int(self.df.notna().sum().sum())
        # Guard the division: a frame with no cells has no measurable quality.
        quality_score = round(filled_cells / total_cells * 100, 2) if total_cells else 0.0

        return {
            'shape': (rows, cols),
            'columns': self.df.columns.tolist(),
            'numeric_cols': numeric_cols,
            'categorical_cols': categorical_cols,
            'datetime_cols': datetime_cols,
            'text_cols': text_cols,
            'memory_usage_mb': f"{self.df.memory_usage(deep=True).sum() / 1e6:.2f}",
            'total_missing': int(self.df.isnull().sum().sum()),
            'data_quality_score': quality_score,
            'high_corr_pairs': high_corr_pairs,
        }

    def get_profiling_tables(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """Return three profiling DataFrames.

        NOTE(review): the implementation is elided in this view. The caller
        run_full_analysis() unpacks the result as missing-value, numeric and
        categorical tables, in that order — confirm against the full source.
        """
        ...

    def get_overview_visuals(self) -> Tuple[go.Figure, go.Figure, go.Figure]:
        """Return three overview figures.

        NOTE(review): the implementation is elided in this view. The caller
        run_full_analysis() unpacks the result as data-type, missing-value
        and correlation figures, in that order — confirm against the full
        source.
        """
        ...

    def generate_ai_narrative(self, api_key: str, context: Dict[str, Any]) -> str:
        """Generates a context-aware AI narrative.

        Args:
            api_key: Gemini API key. Not used by the part of the body visible
                here.
            context: Flags steering the prompt; ``is_timeseries`` and
                ``has_text`` are read.

        Returns:
            The narrative text. The visible code returns a fixed placeholder
            string; the model call is elided.
        """
        logging.info(f"Generating AI narrative with context: {context.keys()}")
        meta = self.metadata
        # Presumably interpolated into the elided part of the prompt — TODO confirm.
        data_snippet_md = self.df.head(5).to_markdown(index=False)

        context_prompt = "**DATASET CONTEXT:**\n"
        if context.get('is_timeseries'):
            context_prompt += "- **Analysis Mode:** Time-Series. Focus on trends, seasonality, and stationarity.\n"
        if context.get('has_text'):
            context_prompt += "- **Analysis Mode:** Text Analysis. Note potential for NLP tasks like sentiment analysis or topic modeling.\n"

        prompt = f"""
        As "Cognitive Analyst," an elite AI data scientist, your task is to generate a comprehensive data discovery report.
        {context_prompt}
        - **Shape:** {meta['shape'][0]} rows, {meta['shape'][1]} columns.
        ... (rest of the prompt from v3.2)
        """

        # Remainder of the implementation is elided in this view.
        ...
        return "AI Narrative Placeholder"
|
|
|
|
|
|
|
def create_ui():
    """Assemble the adaptive Gradio interface and wire up its events.

    The layout has a header with the file upload, API-key box and analyze
    button, followed by a tab set. Three tabs (time-series, text, clustering)
    start hidden; run_full_analysis() returns updates that control their
    visibility.

    Returns:
        The configured ``gr.Blocks`` application (not launched).
    """
    theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")

    with gr.Blocks(theme=theme, title=Config.APP_TITLE) as app:
        # Carries the DataAnalyzer produced by the analyze button between events.
        state_analyzer = gr.State()

        # ---- Header and primary inputs ----
        gr.Markdown(f"<h1>{Config.APP_TITLE}</h1>")
        gr.Markdown("Upload your data (CSV, Excel) and let the AI build a custom analysis dashboard for you.")
        with gr.Row():
            upload_button = gr.File(
                label="1. Upload Data File",
                file_types=[".csv", ".xlsx", ".xls"],
                scale=3,
            )
            api_key_input = gr.Textbox(
                label="2. Enter Google Gemini API Key",
                type="password",
                scale=2,
            )
            analyze_button = gr.Button("β¨ Build My Dashboard", variant="primary", scale=1)

        with gr.Tabs():
            # ---- Always-visible tabs ----
            with gr.Tab("π€ AI Narrative"):
                ai_report_output = gr.Markdown("### Your AI-generated report will appear here...")
                download_report_button = gr.Button("β¬οΈ Download Full Report", visible=False)

            with gr.Tab("π Profile"):
                gr.Markdown("### **Detailed Data Profile**")
                profile_missing_df = gr.DataFrame(interactive=False, label="Missing Values")
                profile_numeric_df = gr.DataFrame(interactive=False, label="Numeric Stats")
                profile_categorical_df = gr.DataFrame(interactive=False, label="Categorical Stats")

            with gr.Tab("π Overview Visuals"):
                with gr.Row():
                    plot_types = gr.Plot()
                    plot_missing = gr.Plot()
                plot_correlation = gr.Plot()

            # ---- Tabs that start hidden ----
            with gr.Tab("β Time-Series Analysis", visible=False) as tab_timeseries:
                gr.Markdown("### **Decompose and Analyze Time-Series Data**")
                with gr.Row():
                    dd_ts_date = gr.Dropdown(label="Select Date/Time Column", interactive=True)
                    dd_ts_value = gr.Dropdown(label="Select Value Column", interactive=True)
                plot_ts_decomp = gr.Plot()
                md_ts_stats = gr.Markdown()

            with gr.Tab("π Text Analysis", visible=False) as tab_text:
                gr.Markdown("### **Visualize High-Frequency Words**")
                dd_text_col = gr.Dropdown(label="Select Text Column", interactive=True)
                html_word_cloud = gr.HTML()

            with gr.Tab("π§© Clustering (K-Means)", visible=False) as tab_cluster:
                gr.Markdown("### **Discover Latent Groups with K-Means Clustering**")
                with gr.Row():
                    num_clusters = gr.Slider(
                        minimum=2,
                        maximum=10,
                        value=4,
                        step=1,
                        label="Number of Clusters (K)",
                        interactive=True,
                    )
                plot_cluster = gr.Plot()
                md_cluster_summary = gr.Markdown()

        # ---- Main analysis event ----
        # The order here must match the list returned by run_full_analysis().
        analyze_button.click(
            fn=run_full_analysis,
            inputs=[upload_button, api_key_input],
            outputs=[
                state_analyzer, ai_report_output, download_report_button,
                profile_missing_df, profile_numeric_df, profile_categorical_df,
                plot_types, plot_missing, plot_correlation,
                tab_timeseries, dd_ts_date, dd_ts_value,
                tab_text, dd_text_col,
                tab_cluster, num_clusters,
            ],
        )

        # ---- Per-tab events ----
        # Changing either time-series dropdown re-runs the analysis with both
        # current selections.
        for ts_dropdown in (dd_ts_date, dd_ts_value):
            ts_dropdown.change(
                fn=lambda a, d, v: analyze_time_series(a.df, d, v),
                inputs=[state_analyzer, dd_ts_date, dd_ts_value],
                outputs=[plot_ts_decomp, md_ts_stats],
            )

        dd_text_col.change(
            fn=lambda a, t: generate_word_cloud(a.df, t),
            inputs=[state_analyzer, dd_text_col],
            outputs=html_word_cloud,
        )

        num_clusters.change(
            fn=lambda a, k: perform_clustering(a.df, a.metadata['numeric_cols'], k),
            inputs=[state_analyzer, num_clusters],
            outputs=[plot_cluster, md_cluster_summary],
        )

    return app
|
|
|
|
|
def run_full_analysis(file_obj: gr.File, api_key: str) -> list:
    """Load the uploaded file, profile it, and build every UI update.

    Args:
        file_obj: The upload handed over by the Gradio file component. Either
            an object exposing the on-disk path as ``.name`` or a plain path
            string is accepted.
        api_key: Gemini API key. Only checked for presence here.

    Returns:
        A 16-item list in the order of the ``outputs`` wired to the analyze
        button in create_ui(): the analyzer, the report text, the download
        button update, three profiling tables, three overview figures, then
        the time-series, text and clustering tab/control updates.

    Raises:
        gr.Error: If no file or API key was supplied, or if loading or
            analyzing the data fails (chained to the original exception).
    """
    if file_obj is None:
        raise gr.Error("CRITICAL: No file uploaded.")
    # A whitespace-only key is as unusable as an empty one.
    if not api_key or not api_key.strip():
        raise gr.Error("CRITICAL: Gemini API key is missing.")

    try:
        file_path = getattr(file_obj, "name", file_obj)
        logging.info(f"Processing uploaded file: {file_path}")

        # Compare the extension case-insensitively so "DATA.CSV" is not
        # handed to the Excel reader.
        extension = os.path.splitext(str(file_path))[1].lower()
        if extension == '.csv':
            df = pd.read_csv(file_path)
        else:
            df = pd.read_excel(file_path)

        if len(df) > Config.MAX_UI_ROWS:
            logging.info(f"Large dataset detected ({len(df)} rows). Sampling to {Config.MAX_UI_ROWS} for UI.")
            # Fixed seed keeps the sample reproducible between runs.
            df_display = df.sample(n=Config.MAX_UI_ROWS, random_state=42)
        else:
            df_display = df

        analyzer = DataAnalyzer(df_display)
        meta = analyzer.metadata

        # Context flags for the narrative; the visible code does not pass
        # them on, and the report below is a fixed placeholder.
        ai_context = {'is_timeseries': bool(meta['datetime_cols']), 'has_text': bool(meta['text_cols'])}

        ai_report = "AI Narrative generation is ready. Trigger on demand."
        missing_df, num_df, cat_df = analyzer.get_profiling_tables()
        fig_types, fig_missing, fig_corr = analyzer.get_overview_visuals()

        # Each optional tab is shown only when the data supports it.
        show_ts_tab = gr.Tab(visible=bool(meta['datetime_cols']))
        show_text_tab = gr.Tab(visible=bool(meta['text_cols']))
        show_cluster_tab = gr.Tab(visible=len(meta['numeric_cols']) > 1)

        return [
            analyzer, ai_report, gr.Button(visible=True),
            missing_df, num_df, cat_df, fig_types, fig_missing, fig_corr,
            show_ts_tab, gr.Dropdown(choices=meta['datetime_cols']), gr.Dropdown(choices=meta['numeric_cols']),
            show_text_tab, gr.Dropdown(choices=meta['text_cols']),
            show_cluster_tab, gr.Slider(visible=True),
        ]
    except Exception as e:
        # Top-level boundary: log the traceback, then surface a UI error that
        # keeps the original exception as its cause.
        logging.exception(f"A critical error occurred: {e}")
        raise gr.Error(f"Analysis Failed! Error: {str(e)}") from e
|
|
|
def perform_pre_flight_checks():
    """Placeholder for start-up checks.

    NOTE(review): the implementation is elided in this view, and nothing in
    the visible code calls this function — confirm against the full source.
    """
    ...
|
|
|
if __name__ == "__main__":
    # Build the interface and serve it on all network interfaces with
    # Gradio's debug mode enabled.
    create_ui().launch(debug=True, server_name="0.0.0.0")