Update app.py
Browse files
app.py
CHANGED
@@ -1,84 +1,108 @@
|
|
1 |
# -*- coding: utf-8 -*-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
An
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
from __future__ import annotations
|
21 |
|
22 |
import warnings
|
23 |
import logging
|
24 |
import os
|
25 |
-
|
|
|
|
|
|
|
26 |
import numpy as np
|
|
|
27 |
import plotly.express as px
|
28 |
import plotly.graph_objects as go
|
29 |
-
from plotly.subplots import make_subplots
|
30 |
-
import gradio as gr
|
31 |
import google.generativeai as genai
|
32 |
-
from typing import Optional, Dict, Any, Tuple, List
|
33 |
-
from datetime import datetime
|
34 |
|
35 |
-
# --- Configuration &
|
36 |
|
37 |
-
|
38 |
-
|
|
|
|
|
|
|
39 |
|
40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
|
42 |
class DataAnalyzer:
|
43 |
"""
|
44 |
-
|
45 |
-
|
46 |
-
and AI-powered analysis, ensuring data is processed only once.
|
47 |
"""
|
48 |
def __init__(self, df: pd.DataFrame):
|
49 |
if not isinstance(df, pd.DataFrame):
|
50 |
raise TypeError("Input must be a pandas DataFrame.")
|
51 |
self.df = df
|
52 |
self._metadata: Optional[Dict[str, Any]] = None
|
53 |
-
logging.info(f"DataAnalyzer
|
54 |
|
55 |
@property
|
56 |
def metadata(self) -> Dict[str, Any]:
|
57 |
-
"""Lazy-loads and caches dataset metadata."""
|
58 |
if self._metadata is None:
|
|
|
59 |
self._metadata = self._extract_metadata()
|
60 |
return self._metadata
|
61 |
|
62 |
def _extract_metadata(self) -> Dict[str, Any]:
|
63 |
-
"""
|
64 |
-
logging.info("Extracting dataset metadata...")
|
65 |
rows, cols = self.df.shape
|
66 |
numeric_cols = self.df.select_dtypes(include=np.number).columns.tolist()
|
67 |
categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns.tolist()
|
68 |
-
datetime_cols = self.df.select_dtypes(include=['datetime64']).columns.tolist()
|
69 |
|
70 |
-
# High correlation
|
71 |
high_corr_pairs = []
|
72 |
if len(numeric_cols) > 1:
|
73 |
corr_matrix = self.df[numeric_cols].corr().abs()
|
74 |
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
|
|
|
75 |
high_corr_pairs = (
|
76 |
-
|
77 |
.reset_index()
|
78 |
-
.rename(columns={'level_0': '
|
79 |
-
.query('Correlation > 0.7')
|
80 |
-
.sort_values('Correlation', ascending=False)
|
81 |
-
.head(5)
|
82 |
.to_dict('records')
|
83 |
)
|
84 |
|
@@ -87,341 +111,359 @@ class DataAnalyzer:
|
|
87 |
'columns': self.df.columns.tolist(),
|
88 |
'numeric_cols': numeric_cols,
|
89 |
'categorical_cols': categorical_cols,
|
90 |
-
'
|
91 |
-
'memory_usage': f"{self.df.memory_usage(deep=True).sum() / 1e6:.2f} MB",
|
92 |
'total_missing': int(self.df.isnull().sum().sum()),
|
93 |
-
'data_quality_score': round((self.df.notna().sum().sum() / self.df.size) * 100,
|
94 |
'high_corr_pairs': high_corr_pairs,
|
95 |
}
|
96 |
|
97 |
-
def
|
98 |
-
"""Generates
|
99 |
-
logging.info("Generating
|
100 |
-
|
101 |
-
# Missing data
|
102 |
missing = self.df.isnull().sum()
|
103 |
missing_df = pd.DataFrame({
|
104 |
-
'Missing
|
105 |
-
'Percentage (%)': (missing / len(self.df) * 100).round(2)
|
106 |
-
}).reset_index().rename(columns={'index': 'Column'}).sort_values('Missing
|
107 |
-
|
108 |
-
# Numeric
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
'Top Value': self.df[col].mode().iloc[0] if not self.df[col].mode().empty else 'N/A',
|
118 |
-
'Frequency': self.df[col].value_counts().iloc[0] if not self.df[col].value_counts().empty else 0
|
119 |
-
}
|
120 |
-
cat_stats_list.append(stats)
|
121 |
-
categorical_stats_df = pd.DataFrame(cat_stats_list)
|
122 |
-
|
123 |
-
return missing_df, numeric_stats_df, categorical_stats_df
|
124 |
|
125 |
-
def
|
126 |
-
"""Creates a set of
|
127 |
-
logging.info("Generating
|
|
|
128 |
|
129 |
-
# Data type distribution
|
130 |
dtype_counts = self.df.dtypes.astype(str).value_counts()
|
131 |
-
|
132 |
values=dtype_counts.values, names=dtype_counts.index,
|
133 |
-
title="
|
|
|
134 |
)
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
x=
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
corr_matrix = self.df[self.metadata['numeric_cols']].corr()
|
148 |
-
corr_fig = px.imshow(
|
149 |
corr_matrix, text_auto=".2f", aspect="auto",
|
150 |
-
title="
|
151 |
-
color_continuous_scale='RdBu_r'
|
152 |
)
|
153 |
else:
|
154 |
-
|
155 |
|
156 |
-
return
|
157 |
-
|
158 |
-
def
|
159 |
-
"""
|
160 |
-
logging.info("Generating AI
|
|
|
161 |
|
|
|
162 |
prompt = f"""
|
163 |
-
As an
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
- **
|
168 |
-
- **
|
169 |
-
|
170 |
-
|
171 |
-
- **
|
172 |
-
- **
|
173 |
-
- **
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
*
|
183 |
-
*
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
*
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
|
|
|
|
|
|
|
|
201 |
"""
|
202 |
try:
|
203 |
genai.configure(api_key=api_key)
|
204 |
-
model = genai.GenerativeModel(
|
205 |
response = model.generate_content(prompt)
|
206 |
return response.text
|
207 |
except Exception as e:
|
208 |
-
logging.error(f"Gemini API call failed: {e}")
|
209 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
210 |
|
211 |
# --- Gradio UI & Event Handlers ---
|
212 |
|
213 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
214 |
"""
|
215 |
-
|
216 |
-
|
217 |
"""
|
218 |
if file_obj is None:
|
219 |
-
raise gr.Error("
|
220 |
if not api_key:
|
221 |
-
raise gr.Error("
|
222 |
|
223 |
try:
|
|
|
224 |
df = pd.read_csv(file_obj.name)
|
225 |
analyzer = DataAnalyzer(df)
|
|
|
|
|
|
|
|
|
|
|
226 |
|
227 |
-
#
|
228 |
-
|
229 |
-
|
230 |
-
dtype_fig, missing_fig, corr_fig = analyzer.get_initial_visuals()
|
231 |
-
|
232 |
-
# Prepare UI updates
|
233 |
-
all_cols = analyzer.metadata['columns']
|
234 |
-
num_cols = analyzer.metadata['numeric_cols']
|
235 |
-
cat_cols = analyzer.metadata['categorical_cols']
|
236 |
|
237 |
-
#
|
238 |
return {
|
|
|
239 |
state_analyzer: analyzer,
|
240 |
-
|
241 |
-
|
242 |
-
btn_download_report: gr.Button(visible=True),
|
243 |
# Profiling Tab
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
# Visuals Tab
|
248 |
-
|
249 |
-
plot_missing:
|
250 |
-
|
251 |
-
# Interactive
|
252 |
-
dd_hist_col: gr.Dropdown(choices=num_cols, label="Select Numeric Column
|
253 |
-
dd_scatter_x: gr.Dropdown(choices=num_cols, label="
|
254 |
-
dd_scatter_y: gr.Dropdown(choices=num_cols, label="
|
255 |
-
dd_scatter_color: gr.Dropdown(choices=all_cols, label="
|
256 |
-
|
257 |
-
dd_box_num: gr.Dropdown(choices=num_cols, label="Select Numeric Column for Box Plot", visible=True),
|
258 |
-
# Column Drilldown Tab
|
259 |
-
dd_drilldown_col: gr.Dropdown(choices=all_cols, label="Select Column to Analyze", visible=True),
|
260 |
}
|
261 |
|
262 |
except Exception as e:
|
263 |
-
logging.error(f"
|
264 |
-
raise gr.Error(f"
|
265 |
-
|
266 |
-
# --- Interactive Plotting Functions ---
|
267 |
|
268 |
-
def create_histogram(analyzer: DataAnalyzer, col: str) -> go.Figure:
|
269 |
-
if not col: return go.Figure()
|
270 |
-
return px.histogram(analyzer.df, x=col, title=f"Distribution of {col}", marginal="box")
|
271 |
|
272 |
-
def
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
stats_md += f"- **Data Type:** `{col_series.dtype}`\n"
|
289 |
-
stats_md += f"- **Missing Values:** {col_series.isnull().sum()} ({col_series.isnull().mean():.2%})\n"
|
290 |
-
stats_md += f"- **Unique Values:** {col_series.nunique()}\n"
|
291 |
-
|
292 |
-
# Generate plot based on type
|
293 |
-
fig = go.Figure()
|
294 |
-
if pd.api.types.is_numeric_dtype(col_series):
|
295 |
-
stats_md += f"- **Mean:** {col_series.mean():.2f}\n"
|
296 |
-
stats_md += f"- **Median:** {col_series.median():.2f}\n"
|
297 |
-
stats_md += f"- **Std Dev:** {col_series.std():.2f}\n"
|
298 |
-
fig = create_histogram(analyzer, col)
|
299 |
-
elif pd.api.types.is_categorical_dtype(col_series) or pd.api.types.is_object_dtype(col_series):
|
300 |
-
top5 = col_series.value_counts().head(5)
|
301 |
-
stats_md += f"- **Top 5 Values:**\n"
|
302 |
-
for val, count in top5.items():
|
303 |
-
stats_md += f" - `{val}`: {count} times\n"
|
304 |
-
fig = px.bar(top5, x=top5.index, y=top5.values, title=f"Top 5 Value Counts for {col}")
|
305 |
-
fig.update_xaxes(title=col)
|
306 |
-
fig.update_yaxes(title="Count")
|
307 |
-
|
308 |
-
return stats_md, fig
|
309 |
-
|
310 |
-
def download_report(analyzer: DataAnalyzer, ai_report_text: str) -> str:
|
311 |
-
"""Saves the AI report and basic stats to a markdown file for download."""
|
312 |
-
if not analyzer: return None
|
313 |
-
|
314 |
-
filename = f"AI_Report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md"
|
315 |
-
|
316 |
-
# Create the full report content
|
317 |
-
full_report = f"# AutoEDA Analysis Report\n\n"
|
318 |
-
full_report += f"**Date Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
|
319 |
-
full_report += f"**Dataset Shape:** {analyzer.metadata['shape'][0]} rows x {analyzer.metadata['shape'][1]} columns\n\n"
|
320 |
full_report += "---\n\n"
|
321 |
full_report += ai_report_text
|
322 |
|
323 |
with open(filename, "w", encoding="utf-8") as f:
|
324 |
f.write(full_report)
|
325 |
|
326 |
-
logging.info(f"
|
327 |
return filename
|
328 |
|
329 |
-
# --- Gradio Interface Definition ---
|
330 |
-
|
331 |
-
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), title="🚀 AutoEDA Pro") as demo:
|
332 |
-
# State object to hold the DataAnalyzer instance
|
333 |
-
state_analyzer = gr.State()
|
334 |
-
|
335 |
-
gr.Markdown("# 🚀 AutoEDA Pro: Your AI Data Science Assistant")
|
336 |
-
gr.Markdown("Upload a CSV, enter your Gemini API key, and click 'Analyze!' to unlock a comprehensive, AI-powered report on your data.")
|
337 |
-
|
338 |
-
with gr.Row():
|
339 |
-
with gr.Column(scale=2):
|
340 |
-
file_input = gr.File(label="📁 Upload your CSV File", file_types=[".csv"])
|
341 |
-
with gr.Column(scale=2):
|
342 |
-
api_key_input = gr.Textbox(label="🔑 Google Gemini API Key", type="password", placeholder="Enter your key here...")
|
343 |
-
with gr.Column(scale=1, min_width=150):
|
344 |
-
analyze_btn = gr.Button("✨ Analyze!", variant="primary", scale=1)
|
345 |
-
|
346 |
-
with gr.Tabs():
|
347 |
-
with gr.Tab("🤖 AI Report & Overview"):
|
348 |
-
md_ai_report = gr.Markdown("Your AI-generated report will appear here...")
|
349 |
-
btn_download_report = gr.Button("⬇️ Download Full Report", visible=False)
|
350 |
-
|
351 |
-
with gr.Tab("📊 Data Profiling"):
|
352 |
-
gr.Markdown("### Detailed Data Profile")
|
353 |
-
gr.Markdown("**Missing Data Analysis**")
|
354 |
-
df_missing_data = gr.DataFrame(interactive=False)
|
355 |
-
gr.Markdown("**Numeric Feature Statistics**")
|
356 |
-
df_numeric_stats = gr.DataFrame(interactive=False)
|
357 |
-
gr.Markdown("**Categorical Feature Statistics**")
|
358 |
-
df_categorical_stats = gr.DataFrame(interactive=False)
|
359 |
-
|
360 |
-
with gr.Tab("📈 Overview Visuals"):
|
361 |
-
gr.Markdown("### At-a-Glance Visualizations")
|
362 |
-
with gr.Row():
|
363 |
-
plot_dtype = gr.Plot()
|
364 |
-
plot_missing = gr.Plot()
|
365 |
-
with gr.Row():
|
366 |
-
plot_corr = gr.Plot()
|
367 |
-
|
368 |
-
with gr.Tab("🎨 Interactive Visuals"):
|
369 |
-
gr.Markdown("### Explore Your Data Visually")
|
370 |
-
with gr.Row():
|
371 |
-
with gr.Column():
|
372 |
-
dd_hist_col = gr.Dropdown(label="Select Column", visible=False)
|
373 |
-
plot_hist = gr.Plot()
|
374 |
-
with gr.Column():
|
375 |
-
dd_box_cat = gr.Dropdown(label="Select Category", visible=False)
|
376 |
-
dd_box_num = gr.Dropdown(label="Select Value", visible=False)
|
377 |
-
plot_box = gr.Plot()
|
378 |
-
with gr.Row():
|
379 |
-
gr.Markdown("#### Scatter Plot Explorer")
|
380 |
-
with gr.Row():
|
381 |
-
dd_scatter_x = gr.Dropdown(label="X-axis", visible=False)
|
382 |
-
dd_scatter_y = gr.Dropdown(label="Y-axis", visible=False)
|
383 |
-
dd_scatter_color = gr.Dropdown(label="Color", visible=False)
|
384 |
-
plot_scatter = gr.Plot()
|
385 |
-
|
386 |
-
with gr.Tab("🔍 Column Drilldown"):
|
387 |
-
gr.Markdown("### Deep Dive into a Single Column")
|
388 |
-
dd_drilldown_col = gr.Dropdown(label="Select Column", visible=False)
|
389 |
-
with gr.Row():
|
390 |
-
md_drilldown_stats = gr.Markdown()
|
391 |
-
plot_drilldown = gr.Plot()
|
392 |
-
|
393 |
-
# --- Event Listeners ---
|
394 |
-
|
395 |
-
# Main analysis trigger
|
396 |
-
analyze_btn.click(
|
397 |
-
fn=process_uploaded_file,
|
398 |
-
inputs=[file_input, api_key_input],
|
399 |
-
outputs=[
|
400 |
-
state_analyzer, md_ai_report, btn_download_report,
|
401 |
-
df_missing_data, df_numeric_stats, df_categorical_stats,
|
402 |
-
plot_dtype, plot_missing, plot_corr,
|
403 |
-
dd_hist_col, dd_scatter_x, dd_scatter_y, dd_scatter_color,
|
404 |
-
dd_box_cat, dd_box_num, dd_drilldown_col
|
405 |
-
]
|
406 |
-
)
|
407 |
-
|
408 |
-
# Interactive plot triggers
|
409 |
-
dd_hist_col.change(fn=create_histogram, inputs=[state_analyzer, dd_hist_col], outputs=plot_hist)
|
410 |
-
dd_scatter_x.change(fn=create_scatterplot, inputs=[state_analyzer, dd_scatter_x, dd_scatter_y, dd_scatter_color], outputs=plot_scatter)
|
411 |
-
dd_scatter_y.change(fn=create_scatterplot, inputs=[state_analyzer, dd_scatter_x, dd_scatter_y, dd_scatter_color], outputs=plot_scatter)
|
412 |
-
dd_scatter_color.change(fn=create_scatterplot, inputs=[state_analyzer, dd_scatter_x, dd_scatter_y, dd_scatter_color], outputs=plot_scatter)
|
413 |
-
dd_box_cat.change(fn=create_boxplot, inputs=[state_analyzer, dd_box_cat, dd_box_num], outputs=plot_box)
|
414 |
-
dd_box_num.change(fn=create_boxplot, inputs=[state_analyzer, dd_box_cat, dd_box_num], outputs=plot_box)
|
415 |
-
|
416 |
-
# Drilldown trigger
|
417 |
-
dd_drilldown_col.change(fn=analyze_single_column, inputs=[state_analyzer, dd_drilldown_col], outputs=[md_drilldown_stats, plot_drilldown])
|
418 |
-
|
419 |
-
# Download trigger
|
420 |
-
btn_download_report.click(fn=download_report, inputs=[state_analyzer, md_ai_report], outputs=gr.File(label="Download Report"))
|
421 |
-
|
422 |
-
gr.Markdown("---")
|
423 |
-
gr.Markdown("💡 **Tip**: Get your free Google Gemini API key from [Google AI Studio](https://aistudio.google.com/app/apikey).")
|
424 |
-
gr.Markdown("MCP Expert System v2.0 - Analysis Complete.")
|
425 |
|
426 |
if __name__ == "__main__":
|
427 |
-
|
|
|
|
1 |
# -*- coding: utf-8 -*-
|
2 |
+
#
|
3 |
+
# PROJECT: CognitiveEDA - The AI-Augmented Data Discovery Platform
|
4 |
+
#
|
5 |
+
# DESCRIPTION: An enterprise-grade Gradio application that revolutionizes Exploratory
|
6 |
+
# Data Analysis (EDA). By integrating Google's Gemini Pro LLM, this
|
7 |
+
# tool transcends traditional data profiling. It automates the generation
|
8 |
+
# of statistical summaries, interactive visualizations, and, most
|
9 |
+
# importantly, a rich, narrative-driven analysis. It delivers
|
10 |
+
# executive summaries, data quality assessments, actionable insights,
|
11 |
+
# and strategic recommendations in a single, streamlined workflow.
|
12 |
+
#
|
13 |
+
# ARCHITECTURE: The application is built upon a robust, object-oriented foundation.
|
14 |
+
# - DataAnalyzer (Core Engine): An encapsulated class that holds the
|
15 |
+
# DataFrame state and performs all statistical calculations and
|
16 |
+
# metadata extraction efficiently, ensuring data is processed once.
|
17 |
+
# - AI Integration: A dedicated module communicates with the Gemini API,
|
18 |
+
# using a sophisticated, structured prompt to ensure consistent,
|
19 |
+
# high-quality analytical narratives.
|
20 |
+
# - Gradio Interface (UI Layer): A multi-tabbed, interactive dashboard
|
21 |
+
# that logically separates the AI narrative, data profiling, static
|
22 |
+
# visuals, and interactive exploration tools. State is managed
|
23 |
+
# efficiently to provide a responsive user experience.
|
24 |
+
#
|
25 |
+
# FEATURES:
|
26 |
+
# - AI-Powered Executive Summary: Generates a high-level overview for stakeholders.
|
27 |
+
# - Automated Data Quality Audit: Provides a quality score and actionable cleaning steps.
|
28 |
+
# - Insight Discovery Engine: Uncovers hidden patterns, correlations, and anomalies.
|
29 |
+
# - Strategic Recommendations: Suggests next steps, modeling approaches, and business use cases.
|
30 |
+
# - Comprehensive Profiling: Detailed statistical tables for all data types.
|
31 |
+
# - Interactive Visualization Suite: Dynamic plots for deep-dive analysis.
|
32 |
+
# - One-Click Report Export: Downloads the complete AI-generated analysis as a Markdown file.
|
33 |
+
#
|
34 |
+
# AUTHOR: An MCP Expert in Data & AI Solutions
|
35 |
+
# VERSION: 3.0 (Enterprise Edition)
|
36 |
+
# LAST-UPDATE: 2023-10-27
|
37 |
+
|
38 |
from __future__ import annotations
|
39 |
|
40 |
import warnings
|
41 |
import logging
|
42 |
import os
|
43 |
+
from datetime import datetime
|
44 |
+
from typing import Any, Dict, List, Optional, Tuple
|
45 |
+
|
46 |
+
import gradio as gr
|
47 |
import numpy as np
|
48 |
+
import pandas as pd
|
49 |
import plotly.express as px
|
50 |
import plotly.graph_objects as go
|
|
|
|
|
51 |
import google.generativeai as genai
|
|
|
|
|
52 |
|
53 |
+
# --- Configuration & Constants ---
|
54 |
|
55 |
+
logging.basicConfig(
|
56 |
+
level=logging.INFO,
|
57 |
+
format='%(asctime)s - [%(levelname)s] - (%(filename)s:%(lineno)d) - %(message)s'
|
58 |
+
)
|
59 |
+
warnings.filterwarnings('ignore', category=FutureWarning)
|
60 |
|
61 |
+
class Config:
|
62 |
+
"""Application-wide configuration settings."""
|
63 |
+
APP_TITLE = "🚀 CognitiveEDA: AI-Augmented Data Discovery Platform"
|
64 |
+
GEMINI_MODEL = 'gemini-1.5-flash-latest'
|
65 |
+
CORR_THRESHOLD = 0.75 # Threshold for highlighting high correlation
|
66 |
+
TOP_N_CATEGORIES = 10 # For bar charts of categorical features
|
67 |
+
|
68 |
+
# --- Core Analysis Engine ---
|
69 |
|
70 |
class DataAnalyzer:
|
71 |
"""
|
72 |
+
Encapsulates all data analysis logic, acting as the single source of truth
|
73 |
+
for the uploaded dataset and its derived metadata.
|
|
|
74 |
"""
|
75 |
def __init__(self, df: pd.DataFrame):
|
76 |
if not isinstance(df, pd.DataFrame):
|
77 |
raise TypeError("Input must be a pandas DataFrame.")
|
78 |
self.df = df
|
79 |
self._metadata: Optional[Dict[str, Any]] = None
|
80 |
+
logging.info(f"DataAnalyzer instantiated with DataFrame of shape: {self.df.shape}")
|
81 |
|
82 |
@property
|
83 |
def metadata(self) -> Dict[str, Any]:
|
84 |
+
"""Lazy-loads and caches comprehensive dataset metadata for efficient reuse."""
|
85 |
if self._metadata is None:
|
86 |
+
logging.info("First access to metadata, performing extraction...")
|
87 |
self._metadata = self._extract_metadata()
|
88 |
return self._metadata
|
89 |
|
90 |
def _extract_metadata(self) -> Dict[str, Any]:
|
91 |
+
"""Performs a deep scan of the DataFrame to extract key characteristics."""
|
|
|
92 |
rows, cols = self.df.shape
|
93 |
numeric_cols = self.df.select_dtypes(include=np.number).columns.tolist()
|
94 |
categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns.tolist()
|
|
|
95 |
|
96 |
+
# Advanced: High correlation pair detection
|
97 |
high_corr_pairs = []
|
98 |
if len(numeric_cols) > 1:
|
99 |
corr_matrix = self.df[numeric_cols].corr().abs()
|
100 |
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
|
101 |
+
high_corr_series = upper_tri.stack()
|
102 |
high_corr_pairs = (
|
103 |
+
high_corr_series[high_corr_series > Config.CORR_THRESHOLD]
|
104 |
.reset_index()
|
105 |
+
.rename(columns={'level_0': 'Feature 1', 'level_1': 'Feature 2', 0: 'Correlation'})
|
|
|
|
|
|
|
106 |
.to_dict('records')
|
107 |
)
|
108 |
|
|
|
111 |
'columns': self.df.columns.tolist(),
|
112 |
'numeric_cols': numeric_cols,
|
113 |
'categorical_cols': categorical_cols,
|
114 |
+
'memory_usage_mb': f"{self.df.memory_usage(deep=True).sum() / 1e6:.2f}",
|
|
|
115 |
'total_missing': int(self.df.isnull().sum().sum()),
|
116 |
+
'data_quality_score': round((self.df.notna().sum().sum() / self.df.size) * 100, 2),
|
117 |
'high_corr_pairs': high_corr_pairs,
|
118 |
}
|
119 |
|
120 |
+
def get_profiling_tables(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
|
121 |
+
"""Generates structured DataFrames for data profiling."""
|
122 |
+
logging.info("Generating profiling tables for missing, numeric, and categorical data.")
|
123 |
+
# Missing data profile
|
|
|
124 |
missing = self.df.isnull().sum()
|
125 |
missing_df = pd.DataFrame({
|
126 |
+
'Missing Count': missing,
|
127 |
+
'Missing Percentage (%)': (missing / len(self.df) * 100).round(2)
|
128 |
+
}).reset_index().rename(columns={'index': 'Column'}).sort_values('Missing Count', ascending=False)
|
129 |
+
|
130 |
+
# Numeric features profile
|
131 |
+
numeric_stats = self.df[self.metadata['numeric_cols']].describe(percentiles=[.01, .25, .5, .75, .99]).T
|
132 |
+
numeric_stats_df = numeric_stats.round(3).reset_index().rename(columns={'index': 'Column'})
|
133 |
+
|
134 |
+
# Categorical features profile
|
135 |
+
cat_stats = self.df[self.metadata['categorical_cols']].describe(include=['object', 'category']).T
|
136 |
+
cat_stats_df = cat_stats.reset_index().rename(columns={'index': 'Column'})
|
137 |
+
|
138 |
+
return missing_df, numeric_stats_df, cat_stats_df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
|
140 |
+
def get_overview_visuals(self) -> Tuple[go.Figure, go.Figure, go.Figure]:
|
141 |
+
"""Creates a set of key visualizations for a high-level overview."""
|
142 |
+
logging.info("Generating overview visualizations (types, missing data, correlation).")
|
143 |
+
meta = self.metadata
|
144 |
|
|
|
145 |
dtype_counts = self.df.dtypes.astype(str).value_counts()
|
146 |
+
fig_types = px.pie(
|
147 |
values=dtype_counts.values, names=dtype_counts.index,
|
148 |
+
title="<b>📊 Data Type Composition</b>", hole=0.4,
|
149 |
+
color_discrete_sequence=px.colors.qualitative.Pastel
|
150 |
)
|
151 |
+
fig_types.update_traces(textposition='outside', textinfo='percent+label')
|
152 |
+
|
153 |
+
missing_df = self.df.isnull().sum().reset_index(name='count').query('count > 0')
|
154 |
+
fig_missing = px.bar(
|
155 |
+
missing_df, x='index', y='count', title="<b>🕳️ Missing Values Distribution</b>",
|
156 |
+
labels={'index': 'Column Name', 'count': 'Number of Missing Values'},
|
157 |
+
).update_xaxes(categoryorder="total descending")
|
158 |
+
|
159 |
+
fig_corr = go.Figure()
|
160 |
+
if len(meta['numeric_cols']) > 1:
|
161 |
+
corr_matrix = self.df[meta['numeric_cols']].corr()
|
162 |
+
fig_corr = px.imshow(
|
|
|
|
|
163 |
corr_matrix, text_auto=".2f", aspect="auto",
|
164 |
+
title=f"<b>🔗 Correlation Matrix (Threshold > {Config.CORR_THRESHOLD})</b>",
|
165 |
+
color_continuous_scale='RdBu_r', zmin=-1, zmax=1
|
166 |
)
|
167 |
else:
|
168 |
+
fig_corr.update_layout(title="<b>🔗 Correlation Matrix (Insufficient Numeric Data)</b>")
|
169 |
|
170 |
+
return fig_types, fig_missing, fig_corr
|
171 |
+
|
172 |
+
def generate_ai_narrative(self, api_key: str) -> str:
|
173 |
+
"""Orchestrates the generation of the full AI-driven report using Gemini."""
|
174 |
+
logging.info("Generating AI narrative with the Gemini API.")
|
175 |
+
meta = self.metadata
|
176 |
|
177 |
+
# A more sophisticated, structured prompt for a better report
|
178 |
prompt = f"""
|
179 |
+
As "Cognitive Analyst," an elite AI data scientist, your task is to generate a comprehensive, multi-part data discovery report.
|
180 |
+
Analyze the following dataset context and produce a professional, insightful, and clear analysis in Markdown format.
|
181 |
+
|
182 |
+
**DATASET CONTEXT:**
|
183 |
+
- **Shape:** {meta['shape'][0]} rows, {meta['shape'][1]} columns.
|
184 |
+
- **Column Schema:**
|
185 |
+
- Numeric: {', '.join(meta['numeric_cols']) if meta['numeric_cols'] else 'None'}
|
186 |
+
- Categorical: {', '.join(meta['categorical_cols']) if meta['categorical_cols'] else 'None'}
|
187 |
+
- **Data Quality Score:** {meta['data_quality_score']}% (Percentage of non-missing cells)
|
188 |
+
- **Total Missing Values:** {meta['total_missing']:,}
|
189 |
+
- **High-Correlation Pairs (>{Config.CORR_THRESHOLD}):** {meta['high_corr_pairs'] if meta['high_corr_pairs'] else 'None detected.'}
|
190 |
+
- **Data Snippet (First 5 Rows):**
|
191 |
+
{self.df.head(5).to_markdown(index=False)}
|
192 |
+
|
193 |
+
**REQUIRED REPORT STRUCTURE (Strictly use this Markdown format):**
|
194 |
+
|
195 |
+
# 🚀 AI Data Discovery Report
|
196 |
+
|
197 |
+
## 📄 1. Executive Summary
|
198 |
+
* **Primary Objective:** (Deduce the most likely purpose of this dataset. What problem is it trying to solve?)
|
199 |
+
* **Key Finding:** (State the single most interesting or impactful insight you've discovered.)
|
200 |
+
* **Overall State:** (Briefly comment on the data's quality and readiness for analysis.)
|
201 |
+
|
202 |
+
## 🧐 2. Data Profile & Quality Assessment
|
203 |
+
* **First Impression:** (Describe the dataset's structure, size, and composition.)
|
204 |
+
* **Data Quality Audit:** (Elaborate on the **{meta['data_quality_score']}%** quality score. Are the **{meta['total_missing']}** missing values concentrated in specific columns? Is this a major concern?)
|
205 |
+
* **Redundancy Check:** (Comment on the detected high-correlation pairs. Is there a risk of multicollinearity in modeling?)
|
206 |
+
|
207 |
+
## 💡 3. Key Insights & Potential Stories
|
208 |
+
* **Insight 1 (e.g., Anomaly Detected 🕵️):** (Describe a surprising pattern, outlier, or distribution in a key numeric column.)
|
209 |
+
* **Insight 2 (e.g., Categorical Trend 📊):** (Analyze a key categorical column. What does its distribution reveal? Is there a dominant category?)
|
210 |
+
* **Insight 3 (e.g., Relationship Hint 🔗):** (Speculate on a potential relationship between two or more columns, even if not highly correlated.)
|
211 |
+
|
212 |
+
## 🛠️ 4. Actionable Recommendations
|
213 |
+
* **Data Cleaning:**
|
214 |
+
- **Step 1:** (Provide a specific recommendation for handling missing data, e.g., "For `column_name`, with X% missing, consider imputation using the median due to its skewed distribution.")
|
215 |
+
- **Step 2:** (Suggest actions for correlated features, e.g., "Consider dropping `Feature A` or using dimensionality reduction (PCA) due to its high correlation with `Feature B`.")
|
216 |
+
* **Feature Engineering:**
|
217 |
+
- **Idea 1:** (Suggest creating a new feature, e.g., "Combine `year` and `month` into a `date` feature for time-series analysis.")
|
218 |
+
* **Next Analytical Steps:**
|
219 |
+
- **Hypothesis to Test:** (Propose a business or research question to investigate further, e.g., "Does `customer_segment` significantly impact `total_spend`?")
|
220 |
+
- **Modeling Potential:** (Suggest a suitable machine learning model, e.g., "This dataset is well-suited for a classification model to predict `is_churn`.")
|
221 |
"""
|
222 |
try:
|
223 |
genai.configure(api_key=api_key)
|
224 |
+
model = genai.GenerativeModel(Config.GEMINI_MODEL)
|
225 |
response = model.generate_content(prompt)
|
226 |
return response.text
|
227 |
except Exception as e:
|
228 |
+
logging.error(f"Gemini API call failed: {e}", exc_info=True)
|
229 |
+
error_message = (
|
230 |
+
"❌ **AI Report Generation Failed**\n\n"
|
231 |
+
f"**Error Details:** `{str(e)}`\n\n"
|
232 |
+
"**Troubleshooting Steps:**\n"
|
233 |
+
"1. Verify that your Google Gemini API key is correct and active.\n"
|
234 |
+
"2. Check your network connection and firewall settings.\n"
|
235 |
+
"3. Ensure the Gemini API is not experiencing an outage."
|
236 |
+
)
|
237 |
+
return error_message
|
238 |
|
239 |
# --- Gradio UI & Event Handlers ---
|
240 |
|
241 |
+
def create_ui():
    """Defines and builds the Gradio user interface.

    Returns:
        gr.Blocks: the fully wired (but not yet launched) Gradio app.
    """

    # --- Interactive Plotting Functions (scoped inside UI creation for clarity) ---
    def create_histogram(analyzer: DataAnalyzer, col: str) -> go.Figure:
        # Histogram with a marginal box plot for the chosen column.
        # Returns an empty figure until analysis has run and a column is picked.
        if not col or not analyzer: return go.Figure()
        return px.histogram(analyzer.df, x=col, title=f"<b>Distribution of {col}</b>", marginal="box", template="plotly_white")

    def create_scatterplot(analyzer: DataAnalyzer, x_col: str, y_col:str, color_col:str) -> go.Figure:
        # Bivariate scatter; color_col is optional (may be None/empty).
        if not all([analyzer, x_col, y_col]): return go.Figure()
        return px.scatter(
            analyzer.df, x=x_col, y=y_col, color=color_col,
            title=f"<b>Scatter Plot: {x_col} vs. {y_col}</b>", template="plotly_white",
            color_continuous_scale=px.colors.sequential.Viridis
        )

    def analyze_single_column(analyzer: DataAnalyzer, col: str) -> Tuple[str, go.Figure]:
        """Builds a Markdown stats summary plus a figure for one column."""
        if not col or not analyzer: return "", go.Figure()

        series = analyzer.df[col]
        # Stats common to every dtype.
        stats_md = f"### 🔎 **Deep Dive: `{col}`**\n"
        stats_md += f"- **Data Type:** `{series.dtype}`\n"
        stats_md += f"- **Unique Values:** `{series.nunique()}`\n"
        stats_md += f"- **Missing:** `{series.isnull().sum()}` ({series.isnull().mean():.2%})\n"

        fig = go.Figure()
        if pd.api.types.is_numeric_dtype(series):
            # Numeric column: central tendency / spread, then a histogram.
            stats_md += f"- **Mean:** `{series.mean():.3f}` | **Std Dev:** `{series.std():.3f}`\n"
            stats_md += f"- **Median:** `{series.median():.3f}` | **Min:** `{series.min():.3f}` | **Max:** `{series.max():.3f}`\n"
            fig = create_histogram(analyzer, col)
        else:
            # Categorical column: horizontal bar chart of the most frequent values.
            top_n = series.value_counts().nlargest(Config.TOP_N_CATEGORIES)
            stats_md += f"- **Top Value:** `{top_n.index[0]}` ({top_n.iloc[0]} occurrences)\n"
            fig = px.bar(
                top_n, y=top_n.index, x=top_n.values, orientation='h',
                title=f"<b>Top {Config.TOP_N_CATEGORIES} Categories in `{col}`</b>",
                labels={'y': col, 'x': 'Count'}, template="plotly_white"
            ).update_yaxes(categoryorder="total ascending")

        return stats_md, fig

    # --- Main UI Blocks ---
    with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="cyan"), title=Config.APP_TITLE) as demo:
        # Store for the main DataAnalyzer object (shared across event handlers).
        state_analyzer = gr.State()

        gr.Markdown(f"<h1>{Config.APP_TITLE}</h1>")
        gr.Markdown("Upload a CSV file, provide your Gemini API key, and receive an instant, AI-driven analysis of your data.")

        # Top input row: file upload, API key, and the analysis trigger.
        with gr.Row():
            with gr.Column(scale=3):
                upload_button = gr.File(label="1. Upload CSV File", file_types=[".csv"])
            with gr.Column(scale=2):
                api_key_input = gr.Textbox(label="2. Enter Google Gemini API Key", type="password")
            with gr.Column(scale=1, min_width=150):
                analyze_button = gr.Button("✨ Generate Analysis", variant="primary")

        with gr.Tabs() as tabs:
            with gr.Tab("🤖 AI Narrative", id=0):
                ai_report_output = gr.Markdown("Your AI-generated report will appear here once analysis is complete...")
                # Hidden until an analysis has produced a report to export.
                download_report_button = gr.Button("⬇️ Download Full Report", visible=False)

            # NOTE(review): tab label " Profile" appears to have lost a leading
            # emoji/character — confirm intended label.
            with gr.Tab(" Profile", id=1):
                gr.Markdown("### **Detailed Data Profile**")
                gr.Markdown("#### Missing Data Summary")
                profile_missing_df = gr.DataFrame(interactive=False, label="Missing Values")
                gr.Markdown("#### Numeric Features Summary")
                profile_numeric_df = gr.DataFrame(interactive=False, label="Numeric Stats")
                gr.Markdown("#### Categorical Features Summary")
                profile_categorical_df = gr.DataFrame(interactive=False, label="Categorical Stats")

            with gr.Tab("📈 Overview Visuals", id=2):
                gr.Markdown("### **At-a-Glance Visualizations**")
                with gr.Row():
                    plot_types = gr.Plot()
                    plot_missing = gr.Plot()
                    plot_correlation = gr.Plot()

            with gr.Tab("🎨 Interactive Explorer", id=3):
                gr.Markdown("### **Visually Explore Feature Relationships**")
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("#### Univariate Analysis")
                        # Dropdowns start hidden; the main analysis populates
                        # their choices and reveals them.
                        dd_hist_col = gr.Dropdown(label="Select Column for Histogram", visible=False)
                    with gr.Column(scale=2):
                        plot_histogram = gr.Plot()

                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("#### Bivariate Analysis (Scatter Plot)")
                        dd_scatter_x = gr.Dropdown(label="X-Axis (Numeric)", visible=False)
                        dd_scatter_y = gr.Dropdown(label="Y-Axis (Numeric)", visible=False)
                        dd_scatter_color = gr.Dropdown(label="Color By (Optional)", visible=False)
                    with gr.Column(scale=2):
                        plot_scatter = gr.Plot()

            with gr.Tab("🔍 Column Deep-Dive", id=4):
                gr.Markdown("### **Inspect a Single Column in Detail**")
                dd_drilldown_col = gr.Dropdown(label="Select Column to Analyze", visible=False)
                with gr.Row():
                    md_drilldown_stats = gr.Markdown()
                    plot_drilldown = gr.Plot()

        gr.HTML("""
        <div style="text-align: center; margin-top: 20px; font-family: sans-serif; color: #777;">
            <p>💡 Need an API key? Get one from <a href="https://aistudio.google.com/app/apikey" target="_blank">Google AI Studio</a>.</p>
            <p>CognitiveEDA v3.0 | An MCP Expert System</p>
        </div>
        """)

        # --- Event Listeners & Control Flow ---

        # All components updated by the main analysis handler, in a fixed order.
        outputs_for_main_analysis = [
            state_analyzer, ai_report_output, download_report_button,
            profile_missing_df, profile_numeric_df, profile_categorical_df,
            plot_types, plot_missing, plot_correlation,
            dd_hist_col, dd_scatter_x, dd_scatter_y, dd_scatter_color, dd_drilldown_col
        ]

        # run_full_analysis is defined at module level below create_ui; the
        # reference resolves at click time, after the module has fully loaded.
        analyze_button.click(
            fn=run_full_analysis,
            inputs=[upload_button, api_key_input],
            outputs=outputs_for_main_analysis
        )

        # Interactive plot triggers
        dd_hist_col.change(fn=create_histogram, inputs=[state_analyzer, dd_hist_col], outputs=plot_histogram)

        # Changing any of the three scatter dropdowns re-renders the scatter plot.
        scatter_inputs = [state_analyzer, dd_scatter_x, dd_scatter_y, dd_scatter_color]
        dd_scatter_x.change(fn=create_scatterplot, inputs=scatter_inputs, outputs=plot_scatter)
        dd_scatter_y.change(fn=create_scatterplot, inputs=scatter_inputs, outputs=plot_scatter)
        dd_scatter_color.change(fn=create_scatterplot, inputs=scatter_inputs, outputs=plot_scatter)

        dd_drilldown_col.change(
            fn=analyze_single_column,
            inputs=[state_analyzer, dd_drilldown_col],
            outputs=[md_drilldown_stats, plot_drilldown]
        )

        download_report_button.click(
            fn=download_report_file,
            inputs=[state_analyzer, ai_report_output],
            outputs=gr.File(label="Download Report")
        )

    return demo
|
387 |
+
|
388 |
+
# --- Main Application Logic ---
|
389 |
+
|
390 |
+
def run_full_analysis(file_obj: gr.File, api_key: str) -> Tuple[Any, ...]:
    """
    Orchestrates the entire analysis pipeline upon button click.

    Args:
        file_obj: The uploaded CSV file handle from the gr.File component.
        api_key: The user's Google Gemini API key.

    Returns:
        A tuple of values/updates ordered exactly like
        ``outputs_for_main_analysis`` in ``create_ui``: analyzer state,
        AI report markdown, download-button visibility update, three
        profiling tables, three overview figures, and five dropdown updates.

    Raises:
        gr.Error: If an input is missing or any pipeline stage fails.
    """
    # Fail fast on missing inputs before doing any work.
    if file_obj is None:
        raise gr.Error("CRITICAL: No file uploaded. Please select a CSV file.")
    if not api_key:
        raise gr.Error("CRITICAL: Gemini API key is missing. Please provide your key.")

    try:
        logging.info(f"Processing uploaded file: {file_obj.name}")
        df = pd.read_csv(file_obj.name)
        analyzer = DataAnalyzer(df)

        # Run every analysis stage up front so the UI refreshes in one shot.
        ai_report = analyzer.generate_ai_narrative(api_key)
        missing_df, num_df, cat_df = analyzer.get_profiling_tables()
        fig_types, fig_missing, fig_corr = analyzer.get_overview_visuals()

        # Column lists used to populate the interactive-explorer dropdowns.
        meta = analyzer.metadata
        all_cols, num_cols = meta['columns'], meta['numeric_cols']

        # BUGFIX: the previous version returned a dict keyed by Gradio
        # components (state_analyzer, ai_report_output, ...). Those names are
        # locals of create_ui(), not module-level globals, so constructing
        # that dict here raised NameError at runtime. Returning an ordered
        # tuple that matches `outputs_for_main_analysis` avoids referencing
        # them entirely.
        return (
            analyzer,                 # state_analyzer
            ai_report,                # ai_report_output
            gr.Button(visible=True),  # download_report_button (reveal)
            missing_df,               # profile_missing_df
            num_df,                   # profile_numeric_df
            cat_df,                   # profile_categorical_df
            fig_types,                # plot_types
            fig_missing,              # plot_missing
            fig_corr,                 # plot_correlation
            gr.Dropdown(choices=num_cols, label="Select Numeric Column", visible=True),    # dd_hist_col
            gr.Dropdown(choices=num_cols, label="X-Axis (Numeric)", visible=True),         # dd_scatter_x
            gr.Dropdown(choices=num_cols, label="Y-Axis (Numeric)", visible=True),         # dd_scatter_y
            gr.Dropdown(choices=all_cols, label="Color By (Optional)", visible=True),      # dd_scatter_color
            gr.Dropdown(choices=all_cols, label="Select Column to Analyze", visible=True), # dd_drilldown_col
        )

    except Exception as e:
        logging.error(f"A critical error occurred during file processing: {e}", exc_info=True)
        # Surface the failure in the UI while preserving the cause chain.
        raise gr.Error(f"Analysis Failed! The process stopped due to: {str(e)}") from e
|
|
|
|
|
439 |
|
|
|
|
|
|
|
440 |
|
441 |
+
def download_report_file(analyzer: DataAnalyzer, ai_report_text: str) -> Optional[str]:
    """Generates a comprehensive Markdown report file for download.

    Args:
        analyzer: The cached DataAnalyzer holding the dataset and its metadata.
        ai_report_text: The AI-generated narrative currently shown in the UI.

    Returns:
        Path of the written Markdown file, or None when no analysis has been
        run yet (nothing to download).
    """
    if not analyzer:
        logging.warning("Download attempted without a valid analyzer object.")
        return None

    # Single timestamp so the filename and the report header always agree.
    generated_at = datetime.now()
    filename = f"CognitiveEDA_Report_{generated_at.strftime('%Y%m%d_%H%M%S')}.md"
    meta = analyzer.metadata

    # Assemble the full report: overview header, divider, then AI narrative.
    report_parts = [
        "# CognitiveEDA - Data Discovery Report\n",
        f"**Generated:** {generated_at.strftime('%Y-%m-%d %H:%M:%S')}\n\n",
        "## Dataset Overview\n",
        f"- **Shape:** {meta['shape'][0]} rows x {meta['shape'][1]} columns\n",
        f"- **Memory Footprint:** {meta['memory_usage_mb']} MB\n",
        f"- **Data Quality Score:** {meta['data_quality_score']}%\n\n",
        "---\n\n",
        ai_report_text,
    ]
    full_report = "".join(report_parts)

    with open(filename, "w", encoding="utf-8") as f:
        f.write(full_report)

    # BUGFIX: the previous log line contained a "(unknown)" placeholder
    # instead of the actual generated path.
    logging.info(f"Report file generated successfully: {filename}")
    return filename
|
465 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
466 |
|
467 |
if __name__ == "__main__":
    # Build the UI and serve it on all interfaces (container-friendly bind).
    eda_app = create_ui()
    eda_app.launch(debug=True, server_name="0.0.0.0")
|