mgbam committed on
Commit
4b2fe64
·
verified ·
1 Parent(s): 1b21942

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -194
app.py CHANGED
@@ -2,30 +2,13 @@
2
  #
3
  # PROJECT: CognitiveEDA - The AI-Augmented Data Discovery Platform
4
  #
5
- # DESCRIPTION: An enterprise-grade Gradio application that revolutionizes Exploratory
6
- # Data Analysis (EDA). By integrating Google's Gemini Pro LLM, this
7
- # tool transcends traditional data profiling to deliver a rich,
8
- # narrative-driven analysis, actionable insights, and strategic
9
- # recommendations in a single, streamlined workflow.
10
- #
11
  # SETUP: This application has external dependencies. Before running, install
12
  # all required packages using the requirements.txt file:
13
  # $ pip install -r requirements.txt
14
  #
15
- # ARCHITECTURE: The application is built upon a robust, object-oriented foundation.
16
- # - DataAnalyzer (Core Engine): An encapsulated class that holds the
17
- # DataFrame state and performs all statistical calculations and
18
- # metadata extraction efficiently, ensuring data is processed once.
19
- # - AI Integration: A dedicated module communicates with the Gemini API,
20
- # using a sophisticated, structured prompt to ensure consistent,
21
- # high-quality analytical narratives.
22
- # - Gradio Interface (UI Layer): A multi-tabbed, interactive dashboard
23
- # that logically separates the AI narrative, data profiling, static
24
- # visuals, and interactive exploration tools.
25
- #
26
  # AUTHOR: An MCP Expert in Data & AI Solutions
27
- # VERSION: 3.1 (Enterprise Edition)
28
- # LAST-UPDATE: 2023-10-28 (Added dependency check & requirements file)
29
 
30
  from __future__ import annotations
31
 
@@ -45,7 +28,7 @@ import plotly.graph_objects as go
45
  import google.generativeai as genai
46
 
47
  # --- Configuration & Constants ---
48
-
49
  logging.basicConfig(
50
  level=logging.INFO,
51
  format='%(asctime)s - [%(levelname)s] - (%(filename)s:%(lineno)d) - %(message)s'
@@ -53,19 +36,14 @@ logging.basicConfig(
53
  warnings.filterwarnings('ignore', category=FutureWarning)
54
 
55
  class Config:
56
- """Application-wide configuration settings."""
57
  APP_TITLE = "πŸš€ CognitiveEDA: AI-Augmented Data Discovery Platform"
58
  GEMINI_MODEL = 'gemini-1.5-flash-latest'
59
- CORR_THRESHOLD = 0.75 # Threshold for highlighting high correlation
60
- TOP_N_CATEGORIES = 10 # For bar charts of categorical features
61
 
62
  # --- Core Analysis Engine ---
63
-
64
  class DataAnalyzer:
65
- """
66
- Encapsulates all data analysis logic, acting as the single source of truth
67
- for the uploaded dataset and its derived metadata.
68
- """
69
  def __init__(self, df: pd.DataFrame):
70
  if not isinstance(df, pd.DataFrame):
71
  raise TypeError("Input must be a pandas DataFrame.")
@@ -75,18 +53,15 @@ class DataAnalyzer:
75
 
76
  @property
77
  def metadata(self) -> Dict[str, Any]:
78
- """Lazy-loads and caches comprehensive dataset metadata for efficient reuse."""
79
  if self._metadata is None:
80
  logging.info("First access to metadata, performing extraction...")
81
  self._metadata = self._extract_metadata()
82
  return self._metadata
83
 
84
  def _extract_metadata(self) -> Dict[str, Any]:
85
- """Performs a deep scan of the DataFrame to extract key characteristics."""
86
  rows, cols = self.df.shape
87
  numeric_cols = self.df.select_dtypes(include=np.number).columns.tolist()
88
  categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns.tolist()
89
-
90
  high_corr_pairs = []
91
  if len(numeric_cols) > 1:
92
  corr_matrix = self.df[numeric_cols].corr().abs()
@@ -98,12 +73,9 @@ class DataAnalyzer:
98
  .rename(columns={'level_0': 'Feature 1', 'level_1': 'Feature 2', 0: 'Correlation'})
99
  .to_dict('records')
100
  )
101
-
102
  return {
103
- 'shape': (rows, cols),
104
- 'columns': self.df.columns.tolist(),
105
- 'numeric_cols': numeric_cols,
106
- 'categorical_cols': categorical_cols,
107
  'memory_usage_mb': f"{self.df.memory_usage(deep=True).sum() / 1e6:.2f}",
108
  'total_missing': int(self.df.isnull().sum().sum()),
109
  'data_quality_score': round((self.df.notna().sum().sum() / self.df.size) * 100, 2),
@@ -111,63 +83,37 @@ class DataAnalyzer:
111
  }
112
 
113
  def get_profiling_tables(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
114
- """Generates structured DataFrames for data profiling."""
115
  logging.info("Generating profiling tables for missing, numeric, and categorical data.")
116
  missing = self.df.isnull().sum()
117
  missing_df = pd.DataFrame({
118
- 'Missing Count': missing,
119
- 'Missing Percentage (%)': (missing / len(self.df) * 100).round(2)
120
  }).reset_index().rename(columns={'index': 'Column'}).sort_values('Missing Count', ascending=False)
121
-
122
  numeric_stats = self.df[self.metadata['numeric_cols']].describe(percentiles=[.01, .25, .5, .75, .99]).T
123
  numeric_stats_df = numeric_stats.round(3).reset_index().rename(columns={'index': 'Column'})
124
-
125
  cat_stats = self.df[self.metadata['categorical_cols']].describe(include=['object', 'category']).T
126
  cat_stats_df = cat_stats.reset_index().rename(columns={'index': 'Column'})
127
-
128
  return missing_df, numeric_stats_df, cat_stats_df
129
 
130
  def get_overview_visuals(self) -> Tuple[go.Figure, go.Figure, go.Figure]:
131
- """Creates a set of key visualizations for a high-level overview."""
132
  logging.info("Generating overview visualizations (types, missing data, correlation).")
133
  meta = self.metadata
134
-
135
  dtype_counts = self.df.dtypes.astype(str).value_counts()
136
- fig_types = px.pie(
137
- values=dtype_counts.values, names=dtype_counts.index,
138
- title="<b>πŸ“Š Data Type Composition</b>", hole=0.4,
139
- color_discrete_sequence=px.colors.qualitative.Pastel
140
- )
141
  fig_types.update_traces(textposition='outside', textinfo='percent+label')
142
-
143
  missing_df = self.df.isnull().sum().reset_index(name='count').query('count > 0')
144
- fig_missing = px.bar(
145
- missing_df, x='index', y='count', title="<b>πŸ•³οΈ Missing Values Distribution</b>",
146
- labels={'index': 'Column Name', 'count': 'Number of Missing Values'},
147
- ).update_xaxes(categoryorder="total descending")
148
-
149
  fig_corr = go.Figure()
150
  if len(meta['numeric_cols']) > 1:
151
  corr_matrix = self.df[meta['numeric_cols']].corr()
152
- fig_corr = px.imshow(
153
- corr_matrix, text_auto=".2f", aspect="auto",
154
- title=f"<b>πŸ”— Correlation Matrix (Threshold > {Config.CORR_THRESHOLD})</b>",
155
- color_continuous_scale='RdBu_r', zmin=-1, zmax=1
156
- )
157
  else:
158
  fig_corr.update_layout(title="<b>πŸ”— Correlation Matrix (Insufficient Numeric Data)</b>")
159
-
160
  return fig_types, fig_missing, fig_corr
161
 
162
  def generate_ai_narrative(self, api_key: str) -> str:
163
- """Orchestrates the generation of the full AI-driven report using Gemini."""
164
  logging.info("Generating AI narrative with the Gemini API.")
165
  meta = self.metadata
166
-
167
- # NOTE: The .to_markdown() method requires the 'tabulate' library.
168
- # This is handled by the pre-flight check in if __name__ == "__main__":
169
  data_snippet_md = self.df.head(5).to_markdown(index=False)
170
-
171
  prompt = f"""
172
  As "Cognitive Analyst," an elite AI data scientist, your task is to generate a comprehensive, multi-part data discovery report.
173
  Analyze the following dataset context and produce a professional, insightful, and clear analysis in Markdown format.
@@ -184,33 +130,7 @@ class DataAnalyzer:
184
  {data_snippet_md}
185
 
186
  **REQUIRED REPORT STRUCTURE (Strictly use this Markdown format):**
187
-
188
- # πŸš€ AI Data Discovery Report
189
-
190
- ## πŸ“„ 1. Executive Summary
191
- * **Primary Objective:** (Deduce the most likely purpose of this dataset. What problem is it trying to solve?)
192
- * **Key Finding:** (State the single most interesting or impactful insight you've discovered.)
193
- * **Overall State:** (Briefly comment on the data's quality and readiness for analysis.)
194
-
195
- ## 🧐 2. Data Profile & Quality Assessment
196
- * **First Impression:** (Describe the dataset's structure, size, and composition.)
197
- * **Data Quality Audit:** (Elaborate on the **{meta['data_quality_score']}%** quality score. Are the **{meta['total_missing']}** missing values concentrated in specific columns? Is this a major concern?)
198
- * **Redundancy Check:** (Comment on the detected high-correlation pairs. Is there a risk of multicollinearity in modeling?)
199
-
200
- ## πŸ’‘ 3. Key Insights & Potential Stories
201
- * **Insight 1 (e.g., Anomaly Detected πŸ•΅οΈ):** (Describe a surprising pattern, outlier, or distribution in a key numeric column.)
202
- * **Insight 2 (e.g., Categorical Trend πŸ“Š):** (Analyze a key categorical column. What does its distribution reveal? Is there a dominant category?)
203
- * **Insight 3 (e.g., Relationship Hint πŸ”—):** (Speculate on a potential relationship between two or more columns, even if not highly correlated.)
204
-
205
- ## πŸ› οΈ 4. Actionable Recommendations
206
- * **Data Cleaning:**
207
- - **Step 1:** (Provide a specific recommendation for handling missing data, e.g., "For `column_name`, with X% missing, consider imputation using the median due to its skewed distribution.")
208
- - **Step 2:** (Suggest actions for correlated features, e.g., "Consider dropping `Feature A` or using dimensionality reduction (PCA) due to its high correlation with `Feature B`.")
209
- * **Feature Engineering:**
210
- - **Idea 1:** (Suggest creating a new feature, e.g., "Combine `year` and `month` into a `date` feature for time-series analysis.")
211
- * **Next Analytical Steps:**
212
- - **Hypothesis to Test:** (Propose a business or research question to investigate further, e.g., "Does `customer_segment` significantly impact `total_spend`?")
213
- - **Modeling Potential:** (Suggest a suitable machine learning model, e.g., "This dataset is well-suited for a classification model to predict `is_churn`.")
214
  """
215
  try:
216
  genai.configure(api_key=api_key)
@@ -219,121 +139,68 @@ class DataAnalyzer:
219
  return response.text
220
  except Exception as e:
221
  logging.error(f"Gemini API call failed: {e}", exc_info=True)
222
- error_message = (
223
- "❌ **AI Report Generation Failed**\n\n"
224
- f"**Error Details:** `{str(e)}`\n\n"
225
- "**Troubleshooting Steps:**\n"
226
- "1. Verify that your Google Gemini API key is correct and active.\n"
227
- "2. Check your network connection and firewall settings.\n"
228
- "3. Ensure the Gemini API is not experiencing an outage."
229
- )
230
  return error_message
231
 
232
  # --- Gradio UI & Event Handlers ---
233
-
234
  def create_ui():
235
- """Defines and builds the Gradio user interface."""
236
  def create_histogram(analyzer: DataAnalyzer, col: str) -> go.Figure:
237
  if not col or not analyzer: return go.Figure()
238
  return px.histogram(analyzer.df, x=col, title=f"<b>Distribution of {col}</b>", marginal="box", template="plotly_white")
239
-
240
  def create_scatterplot(analyzer: DataAnalyzer, x_col: str, y_col:str, color_col:str) -> go.Figure:
241
  if not all([analyzer, x_col, y_col]): return go.Figure()
242
- return px.scatter(
243
- analyzer.df, x=x_col, y=y_col, color=color_col,
244
- title=f"<b>Scatter Plot: {x_col} vs. {y_col}</b>", template="plotly_white",
245
- color_continuous_scale=px.colors.sequential.Viridis
246
- )
247
-
248
  def analyze_single_column(analyzer: DataAnalyzer, col: str) -> Tuple[str, go.Figure]:
249
  if not col or not analyzer: return "", go.Figure()
250
-
251
  series = analyzer.df[col]
252
- stats_md = f"### πŸ”Ž **Deep Dive: `{col}`**\n"
253
- stats_md += f"- **Data Type:** `{series.dtype}`\n"
254
- stats_md += f"- **Unique Values:** `{series.nunique()}`\n"
255
- stats_md += f"- **Missing:** `{series.isnull().sum()}` ({series.isnull().mean():.2%})\n"
256
-
257
  fig = go.Figure()
258
  if pd.api.types.is_numeric_dtype(series):
259
- stats_md += f"- **Mean:** `{series.mean():.3f}` | **Std Dev:** `{series.std():.3f}`\n"
260
- stats_md += f"- **Median:** `{series.median():.3f}` | **Min:** `{series.min():.3f}` | **Max:** `{series.max():.3f}`\n"
261
  fig = create_histogram(analyzer, col)
262
  else:
263
  top_n = series.value_counts().nlargest(Config.TOP_N_CATEGORIES)
264
  stats_md += f"- **Top Value:** `{top_n.index[0]}` ({top_n.iloc[0]} occurrences)\n"
265
- fig = px.bar(
266
- top_n, y=top_n.index, x=top_n.values, orientation='h',
267
- title=f"<b>Top {Config.TOP_N_CATEGORIES} Categories in `{col}`</b>",
268
- labels={'y': col, 'x': 'Count'}, template="plotly_white"
269
- ).update_yaxes(categoryorder="total ascending")
270
-
271
  return stats_md, fig
272
-
273
  with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="cyan"), title=Config.APP_TITLE) as demo:
274
  state_analyzer = gr.State()
275
  gr.Markdown(f"<h1>{Config.APP_TITLE}</h1>")
276
  gr.Markdown("Upload a CSV file, provide your Gemini API key, and receive an instant, AI-driven analysis of your data.")
277
-
278
  with gr.Row():
279
- with gr.Column(scale=3):
280
- upload_button = gr.File(label="1. Upload CSV File", file_types=[".csv"])
281
- with gr.Column(scale=2):
282
- api_key_input = gr.Textbox(label="2. Enter Google Gemini API Key", type="password")
283
- with gr.Column(scale=1, min_width=150):
284
- analyze_button = gr.Button("✨ Generate Analysis", variant="primary")
285
-
286
  with gr.Tabs():
287
  with gr.Tab("πŸ€– AI Narrative"):
288
- ai_report_output = gr.Markdown("Your AI-generated report will appear here once analysis is complete...")
289
  download_report_button = gr.Button("⬇️ Download Full Report", visible=False)
290
  with gr.Tab("Profile"):
291
- gr.Markdown("### **Detailed Data Profile**")
292
  profile_missing_df = gr.DataFrame(interactive=False, label="Missing Values")
293
  profile_numeric_df = gr.DataFrame(interactive=False, label="Numeric Stats")
294
  profile_categorical_df = gr.DataFrame(interactive=False, label="Categorical Stats")
295
  with gr.Tab("πŸ“ˆ Overview Visuals"):
296
- gr.Markdown("### **At-a-Glance Visualizations**")
297
  with gr.Row():
298
- plot_types = gr.Plot()
299
- plot_missing = gr.Plot()
300
  plot_correlation = gr.Plot()
301
  with gr.Tab("🎨 Interactive Explorer"):
302
- gr.Markdown("### **Visually Explore Feature Relationships**")
303
  with gr.Row(equal_height=False):
304
  with gr.Column(scale=1):
305
- gr.Markdown("#### Univariate Analysis")
306
  dd_hist_col = gr.Dropdown(label="Select Column for Histogram", visible=False)
307
  with gr.Column(scale=2):
308
  plot_histogram = gr.Plot()
309
  with gr.Row(equal_height=False):
310
  with gr.Column(scale=1):
311
- gr.Markdown("#### Bivariate Analysis (Scatter Plot)")
312
- dd_scatter_x = gr.Dropdown(label="X-Axis (Numeric)", visible=False)
313
- dd_scatter_y = gr.Dropdown(label="Y-Axis (Numeric)", visible=False)
314
- dd_scatter_color = gr.Dropdown(label="Color By (Optional)", visible=False)
315
  with gr.Column(scale=2):
316
  plot_scatter = gr.Plot()
317
  with gr.Tab("πŸ” Column Deep-Dive"):
318
- gr.Markdown("### **Inspect a Single Column in Detail**")
319
  dd_drilldown_col = gr.Dropdown(label="Select Column to Analyze", visible=False)
320
  with gr.Row():
321
- md_drilldown_stats = gr.Markdown()
322
- plot_drilldown = gr.Plot()
323
-
324
- gr.HTML("""
325
- <div style="text-align: center; margin-top: 20px; font-family: sans-serif; color: #777;">
326
- <p>πŸ’‘ Need an API key? Get one from <a href="https://aistudio.google.com/app/apikey" target="_blank">Google AI Studio</a>.</p>
327
- <p>CognitiveEDA v3.1 | An MCP Expert System</p>
328
- </div>
329
- """)
330
-
331
- outputs_for_main_analysis = [
332
- state_analyzer, ai_report_output, download_report_button,
333
- profile_missing_df, profile_numeric_df, profile_categorical_df,
334
- plot_types, plot_missing, plot_correlation,
335
- dd_hist_col, dd_scatter_x, dd_scatter_y, dd_scatter_color, dd_drilldown_col
336
- ]
337
  analyze_button.click(fn=run_full_analysis, inputs=[upload_button, api_key_input], outputs=outputs_for_main_analysis)
338
  dd_hist_col.change(fn=create_histogram, inputs=[state_analyzer, dd_hist_col], outputs=plot_histogram)
339
  scatter_inputs = [state_analyzer, dd_scatter_x, dd_scatter_y, dd_scatter_color]
@@ -345,8 +212,12 @@ def create_ui():
345
 
346
  # --- Main Application Logic ---
347
 
348
- def run_full_analysis(file_obj: gr.File, api_key: str) -> Dict[gr.component, Any]:
349
- """Orchestrates the entire analysis pipeline upon button click."""
 
 
 
 
350
  if file_obj is None:
351
  raise gr.Error("CRITICAL: No file uploaded. Please select a CSV file.")
352
  if not api_key:
@@ -364,58 +235,47 @@ def run_full_analysis(file_obj: gr.File, api_key: str) -> Dict[gr.component, Any
364
  meta = analyzer.metadata
365
  all_cols, num_cols = meta['columns'], meta['numeric_cols']
366
 
367
- return {
368
- state_analyzer: analyzer, ai_report_output: ai_report,
369
- download_report_button: gr.Button(visible=True),
370
- profile_missing_df: missing_df, profile_numeric_df: num_df,
371
- profile_categorical_df: cat_df, plot_types: fig_types,
372
- plot_missing: fig_missing, plot_correlation: fig_corr,
373
- dd_hist_col: gr.Dropdown(choices=num_cols, label="Select Numeric Column", visible=True),
374
- dd_scatter_x: gr.Dropdown(choices=num_cols, label="X-Axis (Numeric)", visible=True),
375
- dd_scatter_y: gr.Dropdown(choices=num_cols, label="Y-Axis (Numeric)", visible=True),
376
- dd_scatter_color: gr.Dropdown(choices=all_cols, label="Color By (Optional)", visible=True),
377
- dd_drilldown_col: gr.Dropdown(choices=all_cols, label="Select Column to Analyze", visible=True)
378
- }
 
 
 
 
 
379
  except Exception as e:
380
  logging.error(f"A critical error occurred during file processing: {e}", exc_info=True)
381
  raise gr.Error(f"Analysis Failed! The process stopped due to: {str(e)}")
382
 
 
383
  def download_report_file(analyzer: DataAnalyzer, ai_report_text: str) -> Optional[str]:
384
- """Generates a comprehensive Markdown file for download."""
385
  if not analyzer:
386
  logging.warning("Download attempted without a valid analyzer object.")
387
  return None
388
-
389
  filename = f"CognitiveEDA_Report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md"
390
  meta = analyzer.metadata
391
- full_report = f"# CognitiveEDA - Data Discovery Report\n"
392
- full_report += f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"
393
- full_report += f"## Dataset Overview\n"
394
- full_report += f"- **Shape:** {meta['shape'][0]} rows x {meta['shape'][1]} columns\n"
395
- full_report += f"- **Memory Footprint:** {meta['memory_usage_mb']} MB\n"
396
- full_report += f"- **Data Quality Score:** {meta['data_quality_score']}%\n\n"
397
- full_report += "---\n\n"
398
- full_report += ai_report_text
399
-
400
  with open(filename, "w", encoding="utf-8") as f:
401
  f.write(full_report)
402
  logging.info(f"Report file generated successfully: {filename}")
403
  return filename
404
 
405
  def perform_pre_flight_checks():
406
- """Checks for critical dependencies before launching the app."""
407
  logging.info("Performing pre-flight dependency checks...")
408
  required_packages = ["pandas", "gradio", "plotly", "google.generativeai", "tabulate"]
409
  missing_packages = [pkg for pkg in required_packages if importlib.util.find_spec(pkg) is None]
410
-
411
  if missing_packages:
412
  logging.critical(f"Missing critical packages: {', '.join(missing_packages)}")
413
- print("\n" + "="*80)
414
- print("ERROR: Your environment is missing critical dependencies.")
415
- print(f"Missing package(s): {', '.join(missing_packages)}")
416
- print("Please install all required packages using the requirements.txt file:")
417
- print("pip install -r requirements.txt")
418
- print("="*80 + "\n")
419
  sys.exit(1)
420
  logging.info("All dependencies are satisfied. Proceeding with launch.")
421
 
 
2
  #
3
  # PROJECT: CognitiveEDA - The AI-Augmented Data Discovery Platform
4
  #
 
 
 
 
 
 
5
  # SETUP: This application has external dependencies. Before running, install
6
  # all required packages using the requirements.txt file:
7
  # $ pip install -r requirements.txt
8
  #
 
 
 
 
 
 
 
 
 
 
 
9
  # AUTHOR: An MCP Expert in Data & AI Solutions
10
+ # VERSION: 3.2 (Enterprise Edition)
11
+ # LAST-UPDATE: 2023-10-28 (Fixed NameError scope issue in main analysis function)
12
 
13
  from __future__ import annotations
14
 
 
28
  import google.generativeai as genai
29
 
30
  # --- Configuration & Constants ---
31
+ # (No changes here)
32
  logging.basicConfig(
33
  level=logging.INFO,
34
  format='%(asctime)s - [%(levelname)s] - (%(filename)s:%(lineno)d) - %(message)s'
 
36
  warnings.filterwarnings('ignore', category=FutureWarning)
37
 
38
  class Config:
 
39
  APP_TITLE = "πŸš€ CognitiveEDA: AI-Augmented Data Discovery Platform"
40
  GEMINI_MODEL = 'gemini-1.5-flash-latest'
41
+ CORR_THRESHOLD = 0.75
42
+ TOP_N_CATEGORIES = 10
43
 
44
  # --- Core Analysis Engine ---
45
+ # (No changes here)
46
  class DataAnalyzer:
 
 
 
 
47
  def __init__(self, df: pd.DataFrame):
48
  if not isinstance(df, pd.DataFrame):
49
  raise TypeError("Input must be a pandas DataFrame.")
 
53
 
54
  @property
55
  def metadata(self) -> Dict[str, Any]:
 
56
  if self._metadata is None:
57
  logging.info("First access to metadata, performing extraction...")
58
  self._metadata = self._extract_metadata()
59
  return self._metadata
60
 
61
  def _extract_metadata(self) -> Dict[str, Any]:
 
62
  rows, cols = self.df.shape
63
  numeric_cols = self.df.select_dtypes(include=np.number).columns.tolist()
64
  categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns.tolist()
 
65
  high_corr_pairs = []
66
  if len(numeric_cols) > 1:
67
  corr_matrix = self.df[numeric_cols].corr().abs()
 
73
  .rename(columns={'level_0': 'Feature 1', 'level_1': 'Feature 2', 0: 'Correlation'})
74
  .to_dict('records')
75
  )
 
76
  return {
77
+ 'shape': (rows, cols), 'columns': self.df.columns.tolist(),
78
+ 'numeric_cols': numeric_cols, 'categorical_cols': categorical_cols,
 
 
79
  'memory_usage_mb': f"{self.df.memory_usage(deep=True).sum() / 1e6:.2f}",
80
  'total_missing': int(self.df.isnull().sum().sum()),
81
  'data_quality_score': round((self.df.notna().sum().sum() / self.df.size) * 100, 2),
 
83
  }
84
 
85
  def get_profiling_tables(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
 
86
  logging.info("Generating profiling tables for missing, numeric, and categorical data.")
87
  missing = self.df.isnull().sum()
88
  missing_df = pd.DataFrame({
89
+ 'Missing Count': missing, 'Missing Percentage (%)': (missing / len(self.df) * 100).round(2)
 
90
  }).reset_index().rename(columns={'index': 'Column'}).sort_values('Missing Count', ascending=False)
 
91
  numeric_stats = self.df[self.metadata['numeric_cols']].describe(percentiles=[.01, .25, .5, .75, .99]).T
92
  numeric_stats_df = numeric_stats.round(3).reset_index().rename(columns={'index': 'Column'})
 
93
  cat_stats = self.df[self.metadata['categorical_cols']].describe(include=['object', 'category']).T
94
  cat_stats_df = cat_stats.reset_index().rename(columns={'index': 'Column'})
 
95
  return missing_df, numeric_stats_df, cat_stats_df
96
 
97
  def get_overview_visuals(self) -> Tuple[go.Figure, go.Figure, go.Figure]:
 
98
  logging.info("Generating overview visualizations (types, missing data, correlation).")
99
  meta = self.metadata
 
100
  dtype_counts = self.df.dtypes.astype(str).value_counts()
101
+ fig_types = px.pie(values=dtype_counts.values, names=dtype_counts.index, title="<b>πŸ“Š Data Type Composition</b>", hole=0.4, color_discrete_sequence=px.colors.qualitative.Pastel)
 
 
 
 
102
  fig_types.update_traces(textposition='outside', textinfo='percent+label')
 
103
  missing_df = self.df.isnull().sum().reset_index(name='count').query('count > 0')
104
+ fig_missing = px.bar(missing_df, x='index', y='count', title="<b>πŸ•³οΈ Missing Values Distribution</b>", labels={'index': 'Column Name', 'count': 'Number of Missing Values'}).update_xaxes(categoryorder="total descending")
 
 
 
 
105
  fig_corr = go.Figure()
106
  if len(meta['numeric_cols']) > 1:
107
  corr_matrix = self.df[meta['numeric_cols']].corr()
108
+ fig_corr = px.imshow(corr_matrix, text_auto=".2f", aspect="auto", title=f"<b>πŸ”— Correlation Matrix (Threshold > {Config.CORR_THRESHOLD})</b>", color_continuous_scale='RdBu_r', zmin=-1, zmax=1)
 
 
 
 
109
  else:
110
  fig_corr.update_layout(title="<b>πŸ”— Correlation Matrix (Insufficient Numeric Data)</b>")
 
111
  return fig_types, fig_missing, fig_corr
112
 
113
  def generate_ai_narrative(self, api_key: str) -> str:
 
114
  logging.info("Generating AI narrative with the Gemini API.")
115
  meta = self.metadata
 
 
 
116
  data_snippet_md = self.df.head(5).to_markdown(index=False)
 
117
  prompt = f"""
118
  As "Cognitive Analyst," an elite AI data scientist, your task is to generate a comprehensive, multi-part data discovery report.
119
  Analyze the following dataset context and produce a professional, insightful, and clear analysis in Markdown format.
 
130
  {data_snippet_md}
131
 
132
  **REQUIRED REPORT STRUCTURE (Strictly use this Markdown format):**
133
+ ...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  """
135
  try:
136
  genai.configure(api_key=api_key)
 
139
  return response.text
140
  except Exception as e:
141
  logging.error(f"Gemini API call failed: {e}", exc_info=True)
142
+ error_message = ("❌ **AI Report Generation Failed**\n\n" f"**Error Details:** `{str(e)}`\n\n" "**Troubleshooting Steps:**\n" "1. Verify that your Google Gemini API key is correct and active.\n" "2. Check your network connection and firewall settings.\n" "3. Ensure the Gemini API is not experiencing an outage.")
 
 
 
 
 
 
 
143
  return error_message
144
 
145
  # --- Gradio UI & Event Handlers ---
146
+ # (No changes here)
147
  def create_ui():
 
148
  def create_histogram(analyzer: DataAnalyzer, col: str) -> go.Figure:
149
  if not col or not analyzer: return go.Figure()
150
  return px.histogram(analyzer.df, x=col, title=f"<b>Distribution of {col}</b>", marginal="box", template="plotly_white")
 
151
  def create_scatterplot(analyzer: DataAnalyzer, x_col: str, y_col:str, color_col:str) -> go.Figure:
152
  if not all([analyzer, x_col, y_col]): return go.Figure()
153
+ return px.scatter(analyzer.df, x=x_col, y=y_col, color=color_col, title=f"<b>Scatter Plot: {x_col} vs. {y_col}</b>", template="plotly_white", color_continuous_scale=px.colors.sequential.Viridis)
 
 
 
 
 
154
  def analyze_single_column(analyzer: DataAnalyzer, col: str) -> Tuple[str, go.Figure]:
155
  if not col or not analyzer: return "", go.Figure()
 
156
  series = analyzer.df[col]
157
+ stats_md = f"### πŸ”Ž **Deep Dive: `{col}`**\n- **Data Type:** `{series.dtype}`\n- **Unique Values:** `{series.nunique()}`\n- **Missing:** `{series.isnull().sum()}` ({series.isnull().mean():.2%})\n"
 
 
 
 
158
  fig = go.Figure()
159
  if pd.api.types.is_numeric_dtype(series):
160
+ stats_md += f"- **Mean:** `{series.mean():.3f}` | **Std Dev:** `{series.std():.3f}`\n- **Median:** `{series.median():.3f}` | **Min:** `{series.min():.3f}` | **Max:** `{series.max():.3f}`\n"
 
161
  fig = create_histogram(analyzer, col)
162
  else:
163
  top_n = series.value_counts().nlargest(Config.TOP_N_CATEGORIES)
164
  stats_md += f"- **Top Value:** `{top_n.index[0]}` ({top_n.iloc[0]} occurrences)\n"
165
+ fig = px.bar(top_n, y=top_n.index, x=top_n.values, orientation='h', title=f"<b>Top {Config.TOP_N_CATEGORIES} Categories in `{col}`</b>", labels={'y': col, 'x': 'Count'}, template="plotly_white").update_yaxes(categoryorder="total ascending")
 
 
 
 
 
166
  return stats_md, fig
 
167
  with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="cyan"), title=Config.APP_TITLE) as demo:
168
  state_analyzer = gr.State()
169
  gr.Markdown(f"<h1>{Config.APP_TITLE}</h1>")
170
  gr.Markdown("Upload a CSV file, provide your Gemini API key, and receive an instant, AI-driven analysis of your data.")
 
171
  with gr.Row():
172
+ upload_button = gr.File(label="1. Upload CSV File", file_types=[".csv"], scale=3)
173
+ api_key_input = gr.Textbox(label="2. Enter Google Gemini API Key", type="password", scale=2)
174
+ analyze_button = gr.Button("✨ Generate Analysis", variant="primary", scale=1, min_width=150)
 
 
 
 
175
  with gr.Tabs():
176
  with gr.Tab("πŸ€– AI Narrative"):
177
+ ai_report_output = gr.Markdown("Your AI-generated report will appear here...")
178
  download_report_button = gr.Button("⬇️ Download Full Report", visible=False)
179
  with gr.Tab("Profile"):
 
180
  profile_missing_df = gr.DataFrame(interactive=False, label="Missing Values")
181
  profile_numeric_df = gr.DataFrame(interactive=False, label="Numeric Stats")
182
  profile_categorical_df = gr.DataFrame(interactive=False, label="Categorical Stats")
183
  with gr.Tab("πŸ“ˆ Overview Visuals"):
 
184
  with gr.Row():
185
+ plot_types, plot_missing = gr.Plot(), gr.Plot()
 
186
  plot_correlation = gr.Plot()
187
  with gr.Tab("🎨 Interactive Explorer"):
 
188
  with gr.Row(equal_height=False):
189
  with gr.Column(scale=1):
 
190
  dd_hist_col = gr.Dropdown(label="Select Column for Histogram", visible=False)
191
  with gr.Column(scale=2):
192
  plot_histogram = gr.Plot()
193
  with gr.Row(equal_height=False):
194
  with gr.Column(scale=1):
195
+ dd_scatter_x, dd_scatter_y, dd_scatter_color = gr.Dropdown(label="X-Axis (Numeric)", visible=False), gr.Dropdown(label="Y-Axis (Numeric)", visible=False), gr.Dropdown(label="Color By (Optional)", visible=False)
 
 
 
196
  with gr.Column(scale=2):
197
  plot_scatter = gr.Plot()
198
  with gr.Tab("πŸ” Column Deep-Dive"):
 
199
  dd_drilldown_col = gr.Dropdown(label="Select Column to Analyze", visible=False)
200
  with gr.Row():
201
+ md_drilldown_stats, plot_drilldown = gr.Markdown(), gr.Plot()
202
+ gr.HTML("""<div style="text-align: center; margin-top: 20px; font-family: sans-serif; color: #777;"><p>πŸ’‘ Need an API key? Get one from <a href="https://aistudio.google.com/app/apikey" target="_blank">Google AI Studio</a>.</p><p>CognitiveEDA v3.2 | An MCP Expert System</p></div>""")
203
+ outputs_for_main_analysis = [state_analyzer, ai_report_output, download_report_button, profile_missing_df, profile_numeric_df, profile_categorical_df, plot_types, plot_missing, plot_correlation, dd_hist_col, dd_scatter_x, dd_scatter_y, dd_scatter_color, dd_drilldown_col]
 
 
 
 
 
 
 
 
 
 
 
 
 
204
  analyze_button.click(fn=run_full_analysis, inputs=[upload_button, api_key_input], outputs=outputs_for_main_analysis)
205
  dd_hist_col.change(fn=create_histogram, inputs=[state_analyzer, dd_hist_col], outputs=plot_histogram)
206
  scatter_inputs = [state_analyzer, dd_scatter_x, dd_scatter_y, dd_scatter_color]
 
212
 
213
  # --- Main Application Logic ---
214
 
215
+ ### THIS IS THE CORRECTED FUNCTION ###
216
+ def run_full_analysis(file_obj: gr.File, api_key: str) -> list:
217
+ """
218
+ Orchestrates the entire analysis pipeline upon button click.
219
+ Returns a list of values to update all relevant UI components.
220
+ """
221
  if file_obj is None:
222
  raise gr.Error("CRITICAL: No file uploaded. Please select a CSV file.")
223
  if not api_key:
 
235
  meta = analyzer.metadata
236
  all_cols, num_cols = meta['columns'], meta['numeric_cols']
237
 
238
+ # Return a LIST of values in the same order as the 'outputs' list
239
+ return [
240
+ analyzer,
241
+ ai_report,
242
+ gr.Button(visible=True),
243
+ missing_df,
244
+ num_df,
245
+ cat_df,
246
+ fig_types,
247
+ fig_missing,
248
+ fig_corr,
249
+ gr.Dropdown(choices=num_cols, label="Select Numeric Column", visible=True),
250
+ gr.Dropdown(choices=num_cols, label="X-Axis (Numeric)", visible=True),
251
+ gr.Dropdown(choices=num_cols, label="Y-Axis (Numeric)", visible=True),
252
+ gr.Dropdown(choices=all_cols, label="Color By (Optional)", visible=True),
253
+ gr.Dropdown(choices=all_cols, label="Select Column to Analyze", visible=True)
254
+ ]
255
  except Exception as e:
256
  logging.error(f"A critical error occurred during file processing: {e}", exc_info=True)
257
  raise gr.Error(f"Analysis Failed! The process stopped due to: {str(e)}")
258
 
259
+ # (No changes to other functions)
260
  def download_report_file(analyzer: DataAnalyzer, ai_report_text: str) -> Optional[str]:
 
261
  if not analyzer:
262
  logging.warning("Download attempted without a valid analyzer object.")
263
  return None
 
264
  filename = f"CognitiveEDA_Report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md"
265
  meta = analyzer.metadata
266
+ full_report = f"# CognitiveEDA - Data Discovery Report\n**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n## Dataset Overview\n- **Shape:** {meta['shape'][0]} rows x {meta['shape'][1]} columns\n- **Memory Footprint:** {meta['memory_usage_mb']} MB\n- **Data Quality Score:** {meta['data_quality_score']}%\n\n---\n\n{ai_report_text}"
 
 
 
 
 
 
 
 
267
  with open(filename, "w", encoding="utf-8") as f:
268
  f.write(full_report)
269
  logging.info(f"Report file generated successfully: {filename}")
270
  return filename
271
 
272
  def perform_pre_flight_checks():
 
273
  logging.info("Performing pre-flight dependency checks...")
274
  required_packages = ["pandas", "gradio", "plotly", "google.generativeai", "tabulate"]
275
  missing_packages = [pkg for pkg in required_packages if importlib.util.find_spec(pkg) is None]
 
276
  if missing_packages:
277
  logging.critical(f"Missing critical packages: {', '.join(missing_packages)}")
278
+ print("\n" + "="*80 + "\nERROR: Your environment is missing critical dependencies.\n" + f"Missing package(s): {', '.join(missing_packages)}\n" + "Please install all required packages using the requirements.txt file:\n" + "pip install -r requirements.txt\n" + "="*80 + "\n")
 
 
 
 
 
279
  sys.exit(1)
280
  logging.info("All dependencies are satisfied. Proceeding with launch.")
281