Update app.py
Browse files
app.py
CHANGED
@@ -1,250 +1,427 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
4 |
import plotly.express as px
|
5 |
-
import
|
6 |
-
import
|
7 |
-
import
|
8 |
import google.generativeai as genai
|
9 |
-
import
|
10 |
-
from
|
11 |
-
|
|
|
12 |
|
13 |
-
# --- Configuration & Constants ---
|
14 |
warnings.filterwarnings('ignore')
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
|
|
|
|
49 |
)
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
return
|
123 |
|
124 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
|
126 |
-
|
127 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
128 |
try:
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
df = pd.read_csv(filename)
|
134 |
-
elif extension == '.txt':
|
135 |
-
# Use sep=None to auto-detect the delimiter (tabs, spaces, etc.)
|
136 |
-
df = pd.read_csv(filename, sep=None, engine='python')
|
137 |
-
elif extension in ['.xls', '.xlsx']:
|
138 |
-
df = pd.read_excel(filename)
|
139 |
-
else:
|
140 |
-
raise ValueError(f"Unsupported file type: {extension}")
|
141 |
-
|
142 |
-
# Continue with processing once the DataFrame is loaded
|
143 |
-
for col in df.select_dtypes(include=['object']).columns:
|
144 |
-
try: df[col] = pd.to_datetime(df[col], errors='raise')
|
145 |
-
except (ValueError, TypeError): continue
|
146 |
-
|
147 |
-
metadata = self._extract_dataset_metadata(df)
|
148 |
-
state = {'df': df, 'metadata': metadata, 'dashboard_plots': []}
|
149 |
-
rows, cols, quality = metadata['shape'][0], metadata['shape'][1], metadata['data_quality']
|
150 |
-
page_updates = self._switch_page("cockpit", [0,1,2,3])
|
151 |
-
return (state, f"✅ **{os.path.basename(filename)}** loaded.", *page_updates, f"{rows:,}", f"{cols}", f"{quality}%", f"{len(metadata['datetime_cols'])}",
|
152 |
-
gr.update(choices=metadata['columns'], interactive=True), gr.update(choices=metadata['columns'], interactive=True), gr.update(interactive=True))
|
153 |
except Exception as e:
|
154 |
-
|
155 |
-
return
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
180 |
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
210 |
|
211 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
212 |
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
try:
|
220 |
-
genai.configure(api_key=api_key); response_json = self._sanitize_and_parse_json(genai.GenerativeModel('gemini-1.5-flash').generate_content(prompt).text)
|
221 |
-
plan, code, insight = response_json.get("plan"), response_json.get("code"), response_json.get("insight")
|
222 |
-
stdout, fig, df_result, error = self._safe_exec(code, {'df': state['df'], 'px': px, 'pd': pd})
|
223 |
-
|
224 |
-
history[-1] = (user_message, f"**Plan:** {plan}")
|
225 |
-
explanation = f"**Insight:** {insight}"
|
226 |
-
if stdout: explanation += f"\n\n**Console Output:**\n```\n{stdout}\n```"
|
227 |
-
if error: gr.Error(f"AI Code Execution Failed: {error}")
|
228 |
-
|
229 |
-
yield (history, gr.update(visible=bool(explanation), value=explanation), gr.update(visible=bool(code), value=code),
|
230 |
-
gr.update(visible=bool(fig), value=fig), gr.update(visible=bool(df_result is not None), value=df_result))
|
231 |
-
except Exception as e:
|
232 |
-
history[-1] = (user_message, f"I encountered an error processing the AI response. Please rephrase your question.\n\n**Details:** `{str(e)}`")
|
233 |
-
yield history, *[gr.update(visible=False)]*4
|
234 |
-
|
235 |
-
def _safe_exec(self, code_string: str, local_vars: Dict) -> Tuple[Any, ...]:
|
236 |
-
try:
|
237 |
-
output_buffer = io.StringIO()
|
238 |
-
with redirect_stdout(output_buffer): exec(code_string, globals(), local_vars)
|
239 |
-
return output_buffer.getvalue(), local_vars.get('fig'), local_vars.get('result_df'), None
|
240 |
-
except Exception as e: return None, None, None, str(e)
|
241 |
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
248 |
|
249 |
-
|
250 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""
|
3 |
+
🚀 AutoEDA: AI-Powered Exploratory Data Analysis Tool
|
4 |
+
|
5 |
+
An advanced Gradio application for automated exploratory data analysis,
|
6 |
+
data profiling, and AI-driven insights using Google's Gemini API.
|
7 |
+
|
8 |
+
Key Features:
|
9 |
+
- Unified Analysis Workflow: Upload a CSV and get a full report across all tabs.
|
10 |
+
- AI-Powered Storytelling: Generates a narrative overview, use cases, and findings.
|
11 |
+
- Actionable AI Suggestions: Provides data cleaning recommendations.
|
12 |
+
- Interactive Visualizations: Users can select columns to generate plots dynamically.
|
13 |
+
- In-depth Profiling: Detailed statistics for numeric and categorical data.
|
14 |
+
- Column-Level Drilldown: Inspect individual features in detail.
|
15 |
+
- Report Download: Export the AI-generated analysis as a Markdown file.
|
16 |
+
|
17 |
+
Author: World-Class MCP Expert
|
18 |
+
Version: 2.0
|
19 |
+
"""
|
20 |
+
from __future__ import annotations
|
21 |
+
|
22 |
+
import warnings
|
23 |
+
import logging
|
24 |
+
import os
|
25 |
import pandas as pd
|
26 |
import numpy as np
|
27 |
import plotly.express as px
|
28 |
+
import plotly.graph_objects as go
|
29 |
+
from plotly.subplots import make_subplots
|
30 |
+
import gradio as gr
|
31 |
import google.generativeai as genai
|
32 |
+
from typing import Optional, Dict, Any, Tuple, List
|
33 |
+
from datetime import datetime
|
34 |
+
|
35 |
+
# --- Configuration & Setup ---
|
36 |
|
|
|
37 |
warnings.filterwarnings('ignore')
|
38 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
39 |
+
|
40 |
+
# --- Core Analysis Logic (The "Engine") ---
|
41 |
+
|
42 |
+
class DataAnalyzer:
|
43 |
+
"""
|
44 |
+
A comprehensive class to encapsulate all data analysis operations.
|
45 |
+
It holds the dataframe and provides methods for profiling, visualization,
|
46 |
+
and AI-powered analysis, ensuring data is processed only once.
|
47 |
+
"""
|
48 |
+
def __init__(self, df: pd.DataFrame):
|
49 |
+
if not isinstance(df, pd.DataFrame):
|
50 |
+
raise TypeError("Input must be a pandas DataFrame.")
|
51 |
+
self.df = df
|
52 |
+
self._metadata: Optional[Dict[str, Any]] = None
|
53 |
+
logging.info(f"DataAnalyzer initialized with DataFrame of shape: {self.df.shape}")
|
54 |
+
|
55 |
+
@property
|
56 |
+
def metadata(self) -> Dict[str, Any]:
|
57 |
+
"""Lazy-loads and caches dataset metadata."""
|
58 |
+
if self._metadata is None:
|
59 |
+
self._metadata = self._extract_metadata()
|
60 |
+
return self._metadata
|
61 |
|
62 |
+
def _extract_metadata(self) -> Dict[str, Any]:
|
63 |
+
"""Extracts comprehensive metadata from the DataFrame."""
|
64 |
+
logging.info("Extracting dataset metadata...")
|
65 |
+
rows, cols = self.df.shape
|
66 |
+
numeric_cols = self.df.select_dtypes(include=np.number).columns.tolist()
|
67 |
+
categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns.tolist()
|
68 |
+
datetime_cols = self.df.select_dtypes(include=['datetime64']).columns.tolist()
|
69 |
+
|
70 |
+
# High correlation pairs
|
71 |
+
high_corr_pairs = []
|
72 |
+
if len(numeric_cols) > 1:
|
73 |
+
corr_matrix = self.df[numeric_cols].corr().abs()
|
74 |
+
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
|
75 |
+
high_corr_pairs = (
|
76 |
+
upper_tri.stack()
|
77 |
+
.reset_index()
|
78 |
+
.rename(columns={'level_0': 'Var 1', 'level_1': 'Var 2', 0: 'Correlation'})
|
79 |
+
.query('Correlation > 0.7')
|
80 |
+
.sort_values('Correlation', ascending=False)
|
81 |
+
.head(5)
|
82 |
+
.to_dict('records')
|
83 |
)
|
84 |
+
|
85 |
+
return {
|
86 |
+
'shape': (rows, cols),
|
87 |
+
'columns': self.df.columns.tolist(),
|
88 |
+
'numeric_cols': numeric_cols,
|
89 |
+
'categorical_cols': categorical_cols,
|
90 |
+
'datetime_cols': datetime_cols,
|
91 |
+
'memory_usage': f"{self.df.memory_usage(deep=True).sum() / 1e6:.2f} MB",
|
92 |
+
'total_missing': int(self.df.isnull().sum().sum()),
|
93 |
+
'data_quality_score': round((self.df.notna().sum().sum() / self.df.size) * 100, 1),
|
94 |
+
'high_corr_pairs': high_corr_pairs,
|
95 |
+
}
|
96 |
+
|
97 |
+
def get_profiling_report(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
|
98 |
+
"""Generates detailed data profiling tables."""
|
99 |
+
logging.info("Generating data profiling report.")
|
100 |
+
|
101 |
+
# Missing data
|
102 |
+
missing = self.df.isnull().sum()
|
103 |
+
missing_df = pd.DataFrame({
|
104 |
+
'Missing Values': missing,
|
105 |
+
'Percentage (%)': (missing / len(self.df) * 100).round(2)
|
106 |
+
}).reset_index().rename(columns={'index': 'Column'}).sort_values('Missing Values', ascending=False)
|
107 |
+
|
108 |
+
# Numeric stats
|
109 |
+
numeric_stats_df = self.df[self.metadata['numeric_cols']].describe().round(3).T.reset_index().rename(columns={'index': 'Column'})
|
110 |
+
|
111 |
+
# Categorical stats
|
112 |
+
cat_stats_list = []
|
113 |
+
for col in self.metadata['categorical_cols']:
|
114 |
+
stats = {
|
115 |
+
'Column': col,
|
116 |
+
'Unique Values': self.df[col].nunique(),
|
117 |
+
'Top Value': self.df[col].mode().iloc[0] if not self.df[col].mode().empty else 'N/A',
|
118 |
+
'Frequency': self.df[col].value_counts().iloc[0] if not self.df[col].value_counts().empty else 0
|
119 |
+
}
|
120 |
+
cat_stats_list.append(stats)
|
121 |
+
categorical_stats_df = pd.DataFrame(cat_stats_list)
|
122 |
+
|
123 |
+
return missing_df, numeric_stats_df, categorical_stats_df
|
124 |
+
|
125 |
+
def get_initial_visuals(self) -> Tuple[go.Figure, go.Figure, go.Figure]:
|
126 |
+
"""Creates a set of standard, non-interactive overview plots."""
|
127 |
+
logging.info("Generating initial overview visualizations.")
|
128 |
+
|
129 |
+
# Data type distribution
|
130 |
+
dtype_counts = self.df.dtypes.astype(str).value_counts()
|
131 |
+
dtype_fig = px.pie(
|
132 |
+
values=dtype_counts.values, names=dtype_counts.index,
|
133 |
+
title="📊 Data Type Distribution", hole=0.3
|
134 |
+
)
|
135 |
+
dtype_fig.update_traces(textposition='inside', textinfo='percent+label')
|
136 |
+
|
137 |
+
# Missing data overview
|
138 |
+
missing_fig = px.bar(
|
139 |
+
x=self.df.isnull().sum(), y=self.df.columns,
|
140 |
+
orientation='h', title="🕳️ Missing Values Overview",
|
141 |
+
labels={'x': 'Number of Missing Values', 'y': 'Column'},
|
142 |
+
).update_yaxes(categoryorder="total ascending")
|
143 |
+
|
144 |
+
# Correlation heatmap
|
145 |
+
corr_fig = go.Figure()
|
146 |
+
if len(self.metadata['numeric_cols']) > 1:
|
147 |
+
corr_matrix = self.df[self.metadata['numeric_cols']].corr()
|
148 |
+
corr_fig = px.imshow(
|
149 |
+
corr_matrix, text_auto=".2f", aspect="auto",
|
150 |
+
title="🔗 Correlation Matrix (Numeric Features)",
|
151 |
+
color_continuous_scale='RdBu_r'
|
152 |
+
)
|
153 |
+
else:
|
154 |
+
corr_fig.update_layout(title="🔗 Correlation Matrix (Not enough numeric columns)")
|
155 |
+
|
156 |
+
return dtype_fig, missing_fig, corr_fig
|
157 |
|
158 |
+
    def generate_ai_report(self, api_key: str) -> str:
        """Generates a full data story and analysis using the Gemini API.

        Args:
            api_key: Google Gemini API key supplied by the user in the UI.

        Returns:
            The model's Markdown report text, or a Markdown-formatted error
            message if the API call fails (this method never raises).
        """
        logging.info("Generating AI report with Gemini.")

        # Single prompt: dataset metadata interpolated up front, followed by
        # the exact Markdown structure the model is asked to fill in.
        # NOTE(review): leading whitespace inside the literal is sent to the
        # model verbatim — presumably harmless for an LLM prompt.
        prompt = f"""
        As an expert data analyst and storyteller, your task is to analyze the provided dataset summary and generate a comprehensive, insightful, and accessible report.

        **Dataset Metadata:**
        - **Shape:** {self.metadata['shape'][0]} rows, {self.metadata['shape'][1]} columns.
        - **Column Names:** {', '.join(self.metadata['columns'])}
        - **Numeric Columns:** {', '.join(self.metadata['numeric_cols'])}
        - **Categorical Columns:** {', '.join(self.metadata['categorical_cols'])}
        - **Overall Data Quality:** {self.metadata['data_quality_score']}%
        - **Total Missing Values:** {self.metadata['total_missing']:,}
        - **Highly Correlated Pairs (>0.7):** {self.metadata['high_corr_pairs'] if self.metadata['high_corr_pairs'] else 'None detected.'}
        - **Sample Data (First 3 Rows):**
        {self.df.head(3).to_markdown()}

        **Your Report Structure (Use Markdown):**

        # 🚀 AI-Powered Data Analysis Report

        ## 📖 1. The Story of the Data
        * **What is this dataset about?** (Deduce the purpose and subject matter of the data.)
        * **What domain or industry does it belong to?** (e.g., E-commerce, Finance, Healthcare.)
        * **Who might use this data?** (e.g., Marketers, Scientists, Financial Analysts.)

        ## 🎯 2. Key Insights & Interesting Findings
        - **Finding 1:** (Describe a significant pattern, trend, or anomaly. Use emojis to highlight.)
        - **Finding 2:** (Mention another interesting discovery, perhaps from correlations or categorical data.)
        - **Finding 3:** (Highlight a potential business or research opportunity revealed by the data.)

        ## 🧹 3. Data Quality & Cleaning Recommendations
        * **Overall Quality Assessment:** (Comment on the {self.metadata['data_quality_score']}% score and {self.metadata['total_missing']} missing values.)
        * **Actionable Steps:**
            - **Recommendation 1:** (e.g., "For column 'X' with Y% missing values, consider imputation using the mean/median/mode.")
            - **Recommendation 2:** (e.g., "Columns 'A' and 'B' are highly correlated ({'e.g., ' + str(self.metadata['high_corr_pairs'][0]) if self.metadata['high_corr_pairs'] else ''}). Consider dropping one for modeling to avoid multicollinearity.")
            - **Recommendation 3:** (e.g., "Column 'Z' is categorical but stored as a number. Recommend converting it to a category type.")

        ## 🔮 4. Potential Next Steps & Use Cases
        - **Analysis Idea 1:** (e.g., "Build a predictive model for customer churn.")
        - **Dashboard Idea 2:** (e.g., "Create a sales performance dashboard tracking KPIs over time.")
        - **Research Question 3:** (e.g., "Investigate the factors influencing employee attrition.")
        """
        # Network call: any failure (bad key, quota, connectivity) is caught
        # and returned as a Markdown error string so the rest of the analysis
        # pipeline still renders in the UI.
        try:
            genai.configure(api_key=api_key)
            model = genai.GenerativeModel('gemini-1.5-flash-latest')
            response = model.generate_content(prompt)
            return response.text
        except Exception as e:
            logging.error(f"Gemini API call failed: {e}")
            return f"❌ **Error generating AI report.**\n**Reason:** {str(e)}\n\nPlease check your API key and network connection. A fallback analysis could not be generated."
|
210 |
+
|
211 |
+
# --- Gradio UI & Event Handlers ---
|
212 |
+
|
213 |
+
def process_uploaded_file(file_obj: gr.File, api_key: str) -> tuple:
    """
    Main function to process the uploaded file. It runs all analyses
    and returns updates for all UI components in one go.
    """
    # Fail fast with user-facing Gradio errors before doing any work.
    if file_obj is None:
        raise gr.Error("📁 Please upload a CSV file first!")
    if not api_key:
        raise gr.Error("🔑 Please enter your Gemini API key!")

    try:
        # file_obj.name is the temp-file path where Gradio stored the upload.
        df = pd.read_csv(file_obj.name)
        analyzer = DataAnalyzer(df)

        # Perform all analyses (AI report is the slow, network-bound step).
        ai_report = analyzer.generate_ai_report(api_key)
        missing_df, num_stats, cat_stats = analyzer.get_profiling_report()
        dtype_fig, missing_fig, corr_fig = analyzer.get_initial_visuals()

        # Prepare UI updates: dropdown choices per column group.
        all_cols = analyzer.metadata['columns']
        num_cols = analyzer.metadata['numeric_cols']
        cat_cols = analyzer.metadata['categorical_cols']

        # The return dictionary maps UI components to their new values/configurations.
        # Keys are the module-level component objects defined in the Blocks UI;
        # this dict form requires the same components to be listed in the
        # click handler's `outputs`.
        return {
            state_analyzer: analyzer,
            # Overview Tab
            md_ai_report: ai_report,
            btn_download_report: gr.Button(visible=True),
            # Profiling Tab
            df_missing_data: missing_df,
            df_numeric_stats: num_stats,
            df_categorical_stats: cat_stats,
            # Visuals Tab
            plot_dtype: dtype_fig,
            plot_missing: missing_fig,
            plot_corr: corr_fig,
            # Interactive Visuals Tab — dropdowns become visible only after a
            # successful analysis, populated with the relevant column names.
            dd_hist_col: gr.Dropdown(choices=num_cols, label="Select Numeric Column for Histogram", visible=True),
            dd_scatter_x: gr.Dropdown(choices=num_cols, label="Select X-axis (Numeric)", visible=True),
            dd_scatter_y: gr.Dropdown(choices=num_cols, label="Select Y-axis (Numeric)", visible=True),
            dd_scatter_color: gr.Dropdown(choices=all_cols, label="Select Color (Categorical/Numeric)", visible=True),
            dd_box_cat: gr.Dropdown(choices=cat_cols, label="Select Categorical Column for Box Plot", visible=True),
            dd_box_num: gr.Dropdown(choices=num_cols, label="Select Numeric Column for Box Plot", visible=True),
            # Column Drilldown Tab
            dd_drilldown_col: gr.Dropdown(choices=all_cols, label="Select Column to Analyze", visible=True),
        }

    except Exception as e:
        # Keep the full stack trace in the logs; surface a readable message in the UI.
        logging.error(f"An error occurred during file processing: {e}", exc_info=True)
        raise gr.Error(f"Processing failed! Error: {str(e)}")
|
265 |
+
|
266 |
+
# --- Interactive Plotting Functions ---
|
267 |
+
|
268 |
+
def create_histogram(analyzer: DataAnalyzer, col: str) -> go.Figure:
    """Histogram (with marginal box plot) of one column; blank figure until a column is chosen."""
    if not col:
        return go.Figure()
    return px.histogram(
        analyzer.df,
        x=col,
        title=f"Distribution of {col}",
        marginal="box",
    )
|
271 |
+
|
272 |
+
def create_scatterplot(analyzer: DataAnalyzer, x_col: str, y_col: str, color_col: str) -> go.Figure:
    """Scatter plot of two columns, optionally colored by a third; blank until both axes are chosen."""
    if not x_col or not y_col:
        return go.Figure()
    return px.scatter(
        analyzer.df,
        x=x_col,
        y=y_col,
        color=color_col,
        title=f"Scatter Plot: {x_col} vs. {y_col}",
    )
|
276 |
+
|
277 |
+
def create_boxplot(analyzer: DataAnalyzer, cat_col: str, num_col: str) -> go.Figure:
    """Box plot of a numeric column grouped by a categorical one; blank until both are chosen."""
    if not cat_col or not num_col:
        return go.Figure()
    return px.box(
        analyzer.df,
        x=cat_col,
        y=num_col,
        title=f"Box Plot: {num_col} by {cat_col}",
    )
|
280 |
|
281 |
+
def analyze_single_column(analyzer: DataAnalyzer, col: str) -> Tuple[str, go.Figure]:
    """Produce a Markdown stats summary and a matching plot for one column.

    Numeric columns get mean/median/std plus a histogram; categorical or
    object columns get their top-5 value counts as a bar chart. Returns
    empty output when no column is selected.
    """
    if not col:
        return "", go.Figure()

    col_series = analyzer.df[col]

    # Stats common to every dtype.
    stats_md = f"### 🔎 Analysis of Column: `{col}`\n"
    stats_md += f"- **Data Type:** `{col_series.dtype}`\n"
    stats_md += f"- **Missing Values:** {col_series.isnull().sum()} ({col_series.isnull().mean():.2%})\n"
    stats_md += f"- **Unique Values:** {col_series.nunique()}\n"

    # Plot depends on the column's dtype; unsupported dtypes (e.g. datetime)
    # fall through with an empty figure.
    fig = go.Figure()
    if pd.api.types.is_numeric_dtype(col_series):
        stats_md += f"- **Mean:** {col_series.mean():.2f}\n"
        stats_md += f"- **Median:** {col_series.median():.2f}\n"
        stats_md += f"- **Std Dev:** {col_series.std():.2f}\n"
        fig = create_histogram(analyzer, col)
    # isinstance check replaces pd.api.types.is_categorical_dtype, which is
    # deprecated since pandas 2.0 and slated for removal.
    elif isinstance(col_series.dtype, pd.CategoricalDtype) or pd.api.types.is_object_dtype(col_series):
        top5 = col_series.value_counts().head(5)
        stats_md += "- **Top 5 Values:**\n"
        for val, count in top5.items():
            stats_md += f"  - `{val}`: {count} times\n"
        fig = px.bar(top5, x=top5.index, y=top5.values, title=f"Top 5 Value Counts for {col}")
        fig.update_xaxes(title=col)
        fig.update_yaxes(title="Count")

    return stats_md, fig
|
309 |
+
|
310 |
+
def download_report(analyzer: DataAnalyzer, ai_report_text: str) -> str:
    """Saves the AI report and basic stats to a markdown file for download.

    Args:
        analyzer: The analyzer produced by the last run, or None/falsy if
            no analysis has happened yet (in which case nothing is written).
        ai_report_text: The Markdown report currently shown in the UI.

    Returns:
        The generated filename (Gradio serves it as a download), or None.
    """
    if not analyzer:
        return None

    # Timestamped name avoids clobbering earlier reports in the same session.
    filename = f"AI_Report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md"

    # Small header (generation time + dataset shape) prepended to the AI text.
    full_report = "# AutoEDA Analysis Report\n\n"
    full_report += f"**Date Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
    full_report += f"**Dataset Shape:** {analyzer.metadata['shape'][0]} rows x {analyzer.metadata['shape'][1]} columns\n\n"
    full_report += "---\n\n"
    full_report += ai_report_text

    with open(filename, "w", encoding="utf-8") as f:
        f.write(full_report)

    # Log the actual path — the original message logged a hard-coded
    # "(unknown)" placeholder instead of the filename.
    logging.info(f"Generated download report: {filename}")
    return filename
|
328 |
+
|
329 |
+
# --- Gradio Interface Definition ---
|
330 |
+
|
331 |
+
# Top-level UI definition: component variables created here are referenced by
# name inside process_uploaded_file's return dict, so construction order and
# identifiers must stay in sync with that handler.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), title="🚀 AutoEDA Pro") as demo:
    # State object to hold the DataAnalyzer instance across interactions
    state_analyzer = gr.State()

    gr.Markdown("# 🚀 AutoEDA Pro: Your AI Data Science Assistant")
    gr.Markdown("Upload a CSV, enter your Gemini API key, and click 'Analyze!' to unlock a comprehensive, AI-powered report on your data.")

    # Input row: file upload, API key, and the single analysis trigger.
    with gr.Row():
        with gr.Column(scale=2):
            file_input = gr.File(label="📁 Upload your CSV File", file_types=[".csv"])
        with gr.Column(scale=2):
            api_key_input = gr.Textbox(label="🔑 Google Gemini API Key", type="password", placeholder="Enter your key here...")
        with gr.Column(scale=1, min_width=150):
            analyze_btn = gr.Button("✨ Analyze!", variant="primary", scale=1)

    with gr.Tabs():
        with gr.Tab("🤖 AI Report & Overview"):
            md_ai_report = gr.Markdown("Your AI-generated report will appear here...")
            # Hidden until an analysis succeeds (see process_uploaded_file).
            btn_download_report = gr.Button("⬇️ Download Full Report", visible=False)

        with gr.Tab("📊 Data Profiling"):
            gr.Markdown("### Detailed Data Profile")
            gr.Markdown("**Missing Data Analysis**")
            df_missing_data = gr.DataFrame(interactive=False)
            gr.Markdown("**Numeric Feature Statistics**")
            df_numeric_stats = gr.DataFrame(interactive=False)
            gr.Markdown("**Categorical Feature Statistics**")
            df_categorical_stats = gr.DataFrame(interactive=False)

        with gr.Tab("📈 Overview Visuals"):
            gr.Markdown("### At-a-Glance Visualizations")
            with gr.Row():
                plot_dtype = gr.Plot()
                plot_missing = gr.Plot()
            with gr.Row():
                plot_corr = gr.Plot()

        with gr.Tab("🎨 Interactive Visuals"):
            gr.Markdown("### Explore Your Data Visually")
            # Dropdowns start hidden; choices are filled in after analysis.
            with gr.Row():
                with gr.Column():
                    dd_hist_col = gr.Dropdown(label="Select Column", visible=False)
                    plot_hist = gr.Plot()
                with gr.Column():
                    dd_box_cat = gr.Dropdown(label="Select Category", visible=False)
                    dd_box_num = gr.Dropdown(label="Select Value", visible=False)
                    plot_box = gr.Plot()
            with gr.Row():
                gr.Markdown("#### Scatter Plot Explorer")
            with gr.Row():
                dd_scatter_x = gr.Dropdown(label="X-axis", visible=False)
                dd_scatter_y = gr.Dropdown(label="Y-axis", visible=False)
                dd_scatter_color = gr.Dropdown(label="Color", visible=False)
            plot_scatter = gr.Plot()

        with gr.Tab("🔍 Column Drilldown"):
            gr.Markdown("### Deep Dive into a Single Column")
            dd_drilldown_col = gr.Dropdown(label="Select Column", visible=False)
            with gr.Row():
                md_drilldown_stats = gr.Markdown()
                plot_drilldown = gr.Plot()

    # --- Event Listeners ---

    # Main analysis trigger: the outputs list must contain exactly the
    # components keyed in process_uploaded_file's return dict.
    analyze_btn.click(
        fn=process_uploaded_file,
        inputs=[file_input, api_key_input],
        outputs=[
            state_analyzer, md_ai_report, btn_download_report,
            df_missing_data, df_numeric_stats, df_categorical_stats,
            plot_dtype, plot_missing, plot_corr,
            dd_hist_col, dd_scatter_x, dd_scatter_y, dd_scatter_color,
            dd_box_cat, dd_box_num, dd_drilldown_col
        ]
    )

    # Interactive plot triggers: each dropdown change redraws its plot.
    # All three scatter dropdowns share one handler so any change re-renders.
    dd_hist_col.change(fn=create_histogram, inputs=[state_analyzer, dd_hist_col], outputs=plot_hist)
    dd_scatter_x.change(fn=create_scatterplot, inputs=[state_analyzer, dd_scatter_x, dd_scatter_y, dd_scatter_color], outputs=plot_scatter)
    dd_scatter_y.change(fn=create_scatterplot, inputs=[state_analyzer, dd_scatter_x, dd_scatter_y, dd_scatter_color], outputs=plot_scatter)
    dd_scatter_color.change(fn=create_scatterplot, inputs=[state_analyzer, dd_scatter_x, dd_scatter_y, dd_scatter_color], outputs=plot_scatter)
    dd_box_cat.change(fn=create_boxplot, inputs=[state_analyzer, dd_box_cat, dd_box_num], outputs=plot_box)
    dd_box_num.change(fn=create_boxplot, inputs=[state_analyzer, dd_box_cat, dd_box_num], outputs=plot_box)

    # Drilldown trigger
    dd_drilldown_col.change(fn=analyze_single_column, inputs=[state_analyzer, dd_drilldown_col], outputs=[md_drilldown_stats, plot_drilldown])

    # Download trigger
    btn_download_report.click(fn=download_report, inputs=[state_analyzer, md_ai_report], outputs=gr.File(label="Download Report"))

    gr.Markdown("---")
    gr.Markdown("💡 **Tip**: Get your free Google Gemini API key from [Google AI Studio](https://aistudio.google.com/app/apikey).")
    gr.Markdown("MCP Expert System v2.0 - Analysis Complete.")

if __name__ == "__main__":
    demo.launch(debug=True)
|