mgbam committed on
Commit
c9ba3ae
·
verified ·
1 Parent(s): 60da408

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +367 -325
app.py CHANGED
@@ -1,84 +1,108 @@
1
  # -*- coding: utf-8 -*-
2
- """
3
- 🚀 AutoEDA: AI-Powered Exploratory Data Analysis Tool
4
-
5
- An advanced Gradio application for automated exploratory data analysis,
6
- data profiling, and AI-driven insights using Google's Gemini API.
7
-
8
- Key Features:
9
- - Unified Analysis Workflow: Upload a CSV and get a full report across all tabs.
10
- - AI-Powered Storytelling: Generates a narrative overview, use cases, and findings.
11
- - Actionable AI Suggestions: Provides data cleaning recommendations.
12
- - Interactive Visualizations: Users can select columns to generate plots dynamically.
13
- - In-depth Profiling: Detailed statistics for numeric and categorical data.
14
- - Column-Level Drilldown: Inspect individual features in detail.
15
- - Report Download: Export the AI-generated analysis as a Markdown file.
16
-
17
- Author: World-Class MCP Expert
18
- Version: 2.0
19
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  from __future__ import annotations
21
 
22
  import warnings
23
  import logging
24
  import os
25
- import pandas as pd
 
 
 
26
  import numpy as np
 
27
  import plotly.express as px
28
  import plotly.graph_objects as go
29
- from plotly.subplots import make_subplots
30
- import gradio as gr
31
  import google.generativeai as genai
32
- from typing import Optional, Dict, Any, Tuple, List
33
- from datetime import datetime
34
 
35
- # --- Configuration & Setup ---
36
 
37
- warnings.filterwarnings('ignore')
38
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
 
 
39
 
40
- # --- Core Analysis Logic (The "Engine") ---
 
 
 
 
 
 
 
41
 
42
  class DataAnalyzer:
43
  """
44
- A comprehensive class to encapsulate all data analysis operations.
45
- It holds the dataframe and provides methods for profiling, visualization,
46
- and AI-powered analysis, ensuring data is processed only once.
47
  """
48
  def __init__(self, df: pd.DataFrame):
49
  if not isinstance(df, pd.DataFrame):
50
  raise TypeError("Input must be a pandas DataFrame.")
51
  self.df = df
52
  self._metadata: Optional[Dict[str, Any]] = None
53
- logging.info(f"DataAnalyzer initialized with DataFrame of shape: {self.df.shape}")
54
 
55
  @property
56
  def metadata(self) -> Dict[str, Any]:
57
- """Lazy-loads and caches dataset metadata."""
58
  if self._metadata is None:
 
59
  self._metadata = self._extract_metadata()
60
  return self._metadata
61
 
62
  def _extract_metadata(self) -> Dict[str, Any]:
63
- """Extracts comprehensive metadata from the DataFrame."""
64
- logging.info("Extracting dataset metadata...")
65
  rows, cols = self.df.shape
66
  numeric_cols = self.df.select_dtypes(include=np.number).columns.tolist()
67
  categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns.tolist()
68
- datetime_cols = self.df.select_dtypes(include=['datetime64']).columns.tolist()
69
 
70
- # High correlation pairs
71
  high_corr_pairs = []
72
  if len(numeric_cols) > 1:
73
  corr_matrix = self.df[numeric_cols].corr().abs()
74
  upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
 
75
  high_corr_pairs = (
76
- upper_tri.stack()
77
  .reset_index()
78
- .rename(columns={'level_0': 'Var 1', 'level_1': 'Var 2', 0: 'Correlation'})
79
- .query('Correlation > 0.7')
80
- .sort_values('Correlation', ascending=False)
81
- .head(5)
82
  .to_dict('records')
83
  )
84
 
@@ -87,341 +111,359 @@ class DataAnalyzer:
87
  'columns': self.df.columns.tolist(),
88
  'numeric_cols': numeric_cols,
89
  'categorical_cols': categorical_cols,
90
- 'datetime_cols': datetime_cols,
91
- 'memory_usage': f"{self.df.memory_usage(deep=True).sum() / 1e6:.2f} MB",
92
  'total_missing': int(self.df.isnull().sum().sum()),
93
- 'data_quality_score': round((self.df.notna().sum().sum() / self.df.size) * 100, 1),
94
  'high_corr_pairs': high_corr_pairs,
95
  }
96
 
97
- def get_profiling_report(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
98
- """Generates detailed data profiling tables."""
99
- logging.info("Generating data profiling report.")
100
-
101
- # Missing data
102
  missing = self.df.isnull().sum()
103
  missing_df = pd.DataFrame({
104
- 'Missing Values': missing,
105
- 'Percentage (%)': (missing / len(self.df) * 100).round(2)
106
- }).reset_index().rename(columns={'index': 'Column'}).sort_values('Missing Values', ascending=False)
107
-
108
- # Numeric stats
109
- numeric_stats_df = self.df[self.metadata['numeric_cols']].describe().round(3).T.reset_index().rename(columns={'index': 'Column'})
110
-
111
- # Categorical stats
112
- cat_stats_list = []
113
- for col in self.metadata['categorical_cols']:
114
- stats = {
115
- 'Column': col,
116
- 'Unique Values': self.df[col].nunique(),
117
- 'Top Value': self.df[col].mode().iloc[0] if not self.df[col].mode().empty else 'N/A',
118
- 'Frequency': self.df[col].value_counts().iloc[0] if not self.df[col].value_counts().empty else 0
119
- }
120
- cat_stats_list.append(stats)
121
- categorical_stats_df = pd.DataFrame(cat_stats_list)
122
-
123
- return missing_df, numeric_stats_df, categorical_stats_df
124
 
125
def get_initial_visuals(self) -> Tuple[go.Figure, go.Figure, go.Figure]:
    """Build the three static overview figures.

    Returns:
        ``(dtype_fig, missing_fig, corr_fig)`` - a donut chart of column
        dtypes, a horizontal bar chart of null counts per column, and a
        correlation heatmap of the numeric columns. With fewer than two
        numeric columns the third figure is empty apart from its title.
    """
    logging.info("Generating initial overview visualizations.")

    # Donut chart: how many columns carry each dtype name.
    counts_by_dtype = self.df.dtypes.astype(str).value_counts()
    dtype_fig = px.pie(
        names=counts_by_dtype.index,
        values=counts_by_dtype.values,
        hole=0.3,
        title="📊 Data Type Distribution",
    )
    dtype_fig.update_traces(textinfo='percent+label', textposition='inside')

    # Horizontal bars: null count per column, category order "total ascending".
    null_counts = self.df.isnull().sum()
    missing_fig = px.bar(
        x=null_counts,
        y=self.df.columns,
        orientation='h',
        title="🕳️ Missing Values Overview",
        labels={'x': 'Number of Missing Values', 'y': 'Column'},
    )
    missing_fig = missing_fig.update_yaxes(categoryorder="total ascending")

    # Heatmap needs at least two numeric columns to be meaningful.
    numeric_cols = self.metadata['numeric_cols']
    if len(numeric_cols) <= 1:
        corr_fig = go.Figure()
        corr_fig.update_layout(title="🔗 Correlation Matrix (Not enough numeric columns)")
    else:
        corr_fig = px.imshow(
            self.df[numeric_cols].corr(),
            text_auto=".2f",
            aspect="auto",
            title="🔗 Correlation Matrix (Numeric Features)",
            color_continuous_scale='RdBu_r',
        )

    return dtype_fig, missing_fig, corr_fig
157
-
158
def generate_ai_report(self, api_key: str) -> str:
    """Generate a Markdown data story for the dataset using the Gemini API.

    The prompt embeds values from ``self.metadata`` (shape, column lists,
    quality score, missing count, high-correlation pairs) plus the first
    three rows of the DataFrame rendered with ``to_markdown()``.

    Args:
        api_key: Key passed to ``genai.configure``.

    Returns:
        The model's response text, or a Markdown error message if the API
        call raises. Exceptions from the call are logged, not propagated.
    """
    logging.info("Generating AI report with Gemini.")

    # NOTE(review): the prompt is built before the try block, so failures while
    # formatting it (e.g. inside to_markdown) are not caught here.
    prompt = f"""
As an expert data analyst and storyteller, your task is to analyze the provided dataset summary and generate a comprehensive, insightful, and accessible report.

**Dataset Metadata:**
- **Shape:** {self.metadata['shape'][0]} rows, {self.metadata['shape'][1]} columns.
- **Column Names:** {', '.join(self.metadata['columns'])}
- **Numeric Columns:** {', '.join(self.metadata['numeric_cols'])}
- **Categorical Columns:** {', '.join(self.metadata['categorical_cols'])}
- **Overall Data Quality:** {self.metadata['data_quality_score']}%
- **Total Missing Values:** {self.metadata['total_missing']:,}
- **Highly Correlated Pairs (>0.7):** {self.metadata['high_corr_pairs'] if self.metadata['high_corr_pairs'] else 'None detected.'}
- **Sample Data (First 3 Rows):**
{self.df.head(3).to_markdown()}

**Your Report Structure (Use Markdown):**

# 🚀 AI-Powered Data Analysis Report

## 📖 1. The Story of the Data
* **What is this dataset about?** (Deduce the purpose and subject matter of the data.)
* **What domain or industry does it belong to?** (e.g., E-commerce, Finance, Healthcare.)
* **Who might use this data?** (e.g., Marketers, Scientists, Financial Analysts.)

## 🎯 2. Key Insights & Interesting Findings
- **Finding 1:** (Describe a significant pattern, trend, or anomaly. Use emojis to highlight.)
- **Finding 2:** (Mention another interesting discovery, perhaps from correlations or categorical data.)
- **Finding 3:** (Highlight a potential business or research opportunity revealed by the data.)

## 🧹 3. Data Quality & Cleaning Recommendations
* **Overall Quality Assessment:** (Comment on the {self.metadata['data_quality_score']}% score and {self.metadata['total_missing']} missing values.)
* **Actionable Steps:**
- **Recommendation 1:** (e.g., "For column 'X' with Y% missing values, consider imputation using the mean/median/mode.")
- **Recommendation 2:** (e.g., "Columns 'A' and 'B' are highly correlated ({'e.g., ' + str(self.metadata['high_corr_pairs'][0]) if self.metadata['high_corr_pairs'] else ''}). Consider dropping one for modeling to avoid multicollinearity.")
- **Recommendation 3:** (e.g., "Column 'Z' is categorical but stored as a number. Recommend converting it to a category type.")

## 🔮 4. Potential Next Steps & Use Cases
- **Analysis Idea 1:** (e.g., "Build a predictive model for customer churn.")
- **Dashboard Idea 2:** (e.g., "Create a sales performance dashboard tracking KPIs over time.")
- **Research Question 3:** (e.g., "Investigate the factors influencing employee attrition.")
"""
    try:
        # Configure the client with the caller's key on every call.
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-flash-latest')
        response = model.generate_content(prompt)
        return response.text
    except Exception as e:
        # Any failure in the API round-trip becomes a user-visible message instead of a crash.
        logging.error(f"Gemini API call failed: {e}")
        return f"❌ **Error generating AI report.**\n**Reason:** {str(e)}\n\nPlease check your API key and network connection. A fallback analysis could not be generated."
 
 
 
 
 
 
 
 
210
 
211
  # --- Gradio UI & Event Handlers ---
212
 
213
def process_uploaded_file(file_obj: gr.File, api_key: str) -> Dict[Any, Any]:
    """Run the full analysis for an uploaded CSV and return every UI update at once.

    Args:
        file_obj: The uploaded file. Either an object exposing the file path
            as ``.name`` or a plain path string is accepted.
        api_key: Gemini API key forwarded to ``DataAnalyzer.generate_ai_report``.

    Returns:
        A dict mapping Gradio components to their new values/configurations
        (analyzer state, AI report, profiling tables, overview plots and the
        dropdowns populated with the dataset's column names).

    Raises:
        gr.Error: If no file or API key was supplied, or if reading/analyzing
            the file fails (the original exception is chained as the cause).
    """
    if file_obj is None:
        raise gr.Error("📁 Please upload a CSV file first!")
    if not api_key:
        raise gr.Error("🔑 Please enter your Gemini API key!")

    # Tolerate both upload objects (``.name`` holds the path) and bare path strings.
    csv_path = getattr(file_obj, "name", file_obj)

    try:
        df = pd.read_csv(csv_path)
        analyzer = DataAnalyzer(df)

        # Perform all analyses
        ai_report = analyzer.generate_ai_report(api_key)
        missing_df, num_stats, cat_stats = analyzer.get_profiling_report()
        dtype_fig, missing_fig, corr_fig = analyzer.get_initial_visuals()

        # Prepare UI updates
        all_cols = analyzer.metadata['columns']
        num_cols = analyzer.metadata['numeric_cols']
        cat_cols = analyzer.metadata['categorical_cols']

        # The return dictionary maps UI components to their new values/configurations
        return {
            state_analyzer: analyzer,
            # Overview Tab
            md_ai_report: ai_report,
            btn_download_report: gr.Button(visible=True),
            # Profiling Tab
            df_missing_data: missing_df,
            df_numeric_stats: num_stats,
            df_categorical_stats: cat_stats,
            # Visuals Tab
            plot_dtype: dtype_fig,
            plot_missing: missing_fig,
            plot_corr: corr_fig,
            # Interactive Visuals Tab
            dd_hist_col: gr.Dropdown(choices=num_cols, label="Select Numeric Column for Histogram", visible=True),
            dd_scatter_x: gr.Dropdown(choices=num_cols, label="Select X-axis (Numeric)", visible=True),
            dd_scatter_y: gr.Dropdown(choices=num_cols, label="Select Y-axis (Numeric)", visible=True),
            dd_scatter_color: gr.Dropdown(choices=all_cols, label="Select Color (Categorical/Numeric)", visible=True),
            dd_box_cat: gr.Dropdown(choices=cat_cols, label="Select Categorical Column for Box Plot", visible=True),
            dd_box_num: gr.Dropdown(choices=num_cols, label="Select Numeric Column for Box Plot", visible=True),
            # Column Drilldown Tab
            dd_drilldown_col: gr.Dropdown(choices=all_cols, label="Select Column to Analyze", visible=True),
        }

    except Exception as e:
        logging.error(f"An error occurred during file processing: {e}", exc_info=True)
        # Chain the cause so the original traceback stays attached to the UI error.
        raise gr.Error(f"Processing failed! Error: {str(e)}") from e
265
-
266
- # --- Interactive Plotting Functions ---
267
 
268
def create_histogram(analyzer: DataAnalyzer, col: str) -> go.Figure:
    """Return a histogram (with a marginal box plot) of ``col``.

    An empty figure is returned when no column is selected or when no
    analyzer is available yet, instead of failing on ``None.df``.
    """
    if analyzer is None or not col:
        return go.Figure()
    return px.histogram(analyzer.df, x=col, title=f"Distribution of {col}", marginal="box")
271
 
272
def create_scatterplot(analyzer: DataAnalyzer, x_col: str, y_col: str, color_col: str) -> go.Figure:
    """Return a scatter plot of ``y_col`` against ``x_col``, optionally colored by ``color_col``.

    An empty figure is returned until an analyzer and both axis columns are
    available. A blank color selection is treated as "no color".
    """
    if analyzer is None or not x_col or not y_col:
        return go.Figure()
    return px.scatter(
        analyzer.df,
        x=x_col,
        y=y_col,
        color=color_col or None,  # empty selection -> uncolored plot
        title=f"Scatter Plot: {x_col} vs. {y_col}",
    )
276
-
277
def create_boxplot(analyzer: DataAnalyzer, cat_col: str, num_col: str) -> go.Figure:
    """Return a box plot of ``num_col`` grouped by ``cat_col``.

    An empty figure is returned until an analyzer and both columns are available.
    """
    if analyzer is None or not cat_col or not num_col:
        return go.Figure()
    return px.box(analyzer.df, x=cat_col, y=num_col, title=f"Box Plot: {num_col} by {cat_col}")
280
-
281
def analyze_single_column(analyzer: DataAnalyzer, col: str) -> Tuple[str, go.Figure]:
    """Summarize one column as Markdown and pick a matching plot.

    Numeric columns get mean/median/std and a histogram; categorical or
    object columns get their five most frequent values and a bar chart.
    Other dtypes get only the generic statistics and an empty figure.

    Returns:
        ``(stats_markdown, figure)``; ``("", empty figure)`` when no column is
        selected or no analyzer is available.
    """
    if analyzer is None or not col:
        return "", go.Figure()

    col_series = analyzer.df[col]

    # Collect Markdown lines and join once at the end.
    lines = [
        f"### 🔎 Analysis of Column: `{col}`",
        f"- **Data Type:** `{col_series.dtype}`",
        f"- **Missing Values:** {col_series.isnull().sum()} ({col_series.isnull().mean():.2%})",
        f"- **Unique Values:** {col_series.nunique()}",
    ]

    fig = go.Figure()
    if pd.api.types.is_numeric_dtype(col_series):
        lines.append(f"- **Mean:** {col_series.mean():.2f}")
        lines.append(f"- **Median:** {col_series.median():.2f}")
        lines.append(f"- **Std Dev:** {col_series.std():.2f}")
        fig = create_histogram(analyzer, col)
    # isinstance check replaces the deprecated pd.api.types.is_categorical_dtype.
    elif isinstance(col_series.dtype, pd.CategoricalDtype) or pd.api.types.is_object_dtype(col_series):
        top5 = col_series.value_counts().head(5)
        lines.append("- **Top 5 Values:**")
        lines.extend(f" - `{val}`: {count} times" for val, count in top5.items())
        fig = px.bar(top5, x=top5.index, y=top5.values, title=f"Top 5 Value Counts for {col}")
        fig.update_xaxes(title=col)
        fig.update_yaxes(title="Count")

    return "\n".join(lines) + "\n", fig
309
-
310
def download_report(analyzer: DataAnalyzer, ai_report_text: str) -> Optional[str]:
    """Write the AI report, prefixed with a short header, to a Markdown file.

    The file is created in the current working directory and named
    ``AI_Report_<YYYYmmdd_HHMMSS>.md``.

    Args:
        analyzer: Source of the dataset shape shown in the header.
        ai_report_text: Report body appended after the header.

    Returns:
        The filename that was written, or ``None`` when there is no analyzer.
    """
    if not analyzer:
        return None

    # Take one timestamp so the filename and the "Date Generated" line always agree.
    generated_at = datetime.now()
    filename = f"AI_Report_{generated_at.strftime('%Y%m%d_%H%M%S')}.md"

    rows, cols = analyzer.metadata['shape'][0], analyzer.metadata['shape'][1]
    full_report = "".join([
        "# AutoEDA Analysis Report\n\n",
        f"**Date Generated:** {generated_at.strftime('%Y-%m-%d %H:%M:%S')}\n",
        f"**Dataset Shape:** {rows} rows x {cols} columns\n\n",
        "---\n\n",
        ai_report_text or "",  # tolerate a missing report body
    ])

    with open(filename, "w", encoding="utf-8") as f:
        f.write(full_report)

    logging.info(f"Generated download report: {filename}")
    return filename
328
 
329
- # --- Gradio Interface Definition ---
330
-
331
# Build the Gradio app: inputs row, five result tabs, then the event wiring.
# NOTE(review): nesting below is reconstructed from statement order; confirm
# the placement of plots relative to their enclosing rows/columns.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), title="🚀 AutoEDA Pro") as demo:
    # State object to hold the DataAnalyzer instance
    state_analyzer = gr.State()

    gr.Markdown("# 🚀 AutoEDA Pro: Your AI Data Science Assistant")
    gr.Markdown("Upload a CSV, enter your Gemini API key, and click 'Analyze!' to unlock a comprehensive, AI-powered report on your data.")

    # Input row: CSV upload, API key, and the button that triggers the analysis.
    with gr.Row():
        with gr.Column(scale=2):
            file_input = gr.File(label="📁 Upload your CSV File", file_types=[".csv"])
        with gr.Column(scale=2):
            api_key_input = gr.Textbox(label="🔑 Google Gemini API Key", type="password", placeholder="Enter your key here...")
        with gr.Column(scale=1, min_width=150):
            analyze_btn = gr.Button("✨ Analyze!", variant="primary", scale=1)

    with gr.Tabs():
        with gr.Tab("🤖 AI Report & Overview"):
            md_ai_report = gr.Markdown("Your AI-generated report will appear here...")
            # Created hidden; process_uploaded_file returns an update that makes it visible.
            btn_download_report = gr.Button("⬇️ Download Full Report", visible=False)

        with gr.Tab("📊 Data Profiling"):
            gr.Markdown("### Detailed Data Profile")
            gr.Markdown("**Missing Data Analysis**")
            df_missing_data = gr.DataFrame(interactive=False)
            gr.Markdown("**Numeric Feature Statistics**")
            df_numeric_stats = gr.DataFrame(interactive=False)
            gr.Markdown("**Categorical Feature Statistics**")
            df_categorical_stats = gr.DataFrame(interactive=False)

        with gr.Tab("📈 Overview Visuals"):
            gr.Markdown("### At-a-Glance Visualizations")
            with gr.Row():
                plot_dtype = gr.Plot()
                plot_missing = gr.Plot()
            with gr.Row():
                plot_corr = gr.Plot()

        # All dropdowns in the next two tabs start hidden and without choices;
        # process_uploaded_file supplies choices and visibility.
        with gr.Tab("🎨 Interactive Visuals"):
            gr.Markdown("### Explore Your Data Visually")
            with gr.Row():
                with gr.Column():
                    dd_hist_col = gr.Dropdown(label="Select Column", visible=False)
                    plot_hist = gr.Plot()
                with gr.Column():
                    dd_box_cat = gr.Dropdown(label="Select Category", visible=False)
                    dd_box_num = gr.Dropdown(label="Select Value", visible=False)
                    plot_box = gr.Plot()
            with gr.Row():
                gr.Markdown("#### Scatter Plot Explorer")
            with gr.Row():
                dd_scatter_x = gr.Dropdown(label="X-axis", visible=False)
                dd_scatter_y = gr.Dropdown(label="Y-axis", visible=False)
                dd_scatter_color = gr.Dropdown(label="Color", visible=False)
            plot_scatter = gr.Plot()

        with gr.Tab("🔍 Column Drilldown"):
            gr.Markdown("### Deep Dive into a Single Column")
            dd_drilldown_col = gr.Dropdown(label="Select Column", visible=False)
            with gr.Row():
                md_drilldown_stats = gr.Markdown()
                plot_drilldown = gr.Plot()

    # --- Event Listeners ---

    # Main analysis trigger
    # Every component that process_uploaded_file uses as a key in its returned dict is listed in outputs.
    analyze_btn.click(
        fn=process_uploaded_file,
        inputs=[file_input, api_key_input],
        outputs=[
            state_analyzer, md_ai_report, btn_download_report,
            df_missing_data, df_numeric_stats, df_categorical_stats,
            plot_dtype, plot_missing, plot_corr,
            dd_hist_col, dd_scatter_x, dd_scatter_y, dd_scatter_color,
            dd_box_cat, dd_box_num, dd_drilldown_col
        ]
    )

    # Interactive plot triggers
    # Each dropdown change re-renders its plot with the current analyzer state and selections.
    dd_hist_col.change(fn=create_histogram, inputs=[state_analyzer, dd_hist_col], outputs=plot_hist)
    dd_scatter_x.change(fn=create_scatterplot, inputs=[state_analyzer, dd_scatter_x, dd_scatter_y, dd_scatter_color], outputs=plot_scatter)
    dd_scatter_y.change(fn=create_scatterplot, inputs=[state_analyzer, dd_scatter_x, dd_scatter_y, dd_scatter_color], outputs=plot_scatter)
    dd_scatter_color.change(fn=create_scatterplot, inputs=[state_analyzer, dd_scatter_x, dd_scatter_y, dd_scatter_color], outputs=plot_scatter)
    dd_box_cat.change(fn=create_boxplot, inputs=[state_analyzer, dd_box_cat, dd_box_num], outputs=plot_box)
    dd_box_num.change(fn=create_boxplot, inputs=[state_analyzer, dd_box_cat, dd_box_num], outputs=plot_box)

    # Drilldown trigger
    dd_drilldown_col.change(fn=analyze_single_column, inputs=[state_analyzer, dd_drilldown_col], outputs=[md_drilldown_stats, plot_drilldown])

    # Download trigger
    # The output File component is created inline here rather than in the layout above.
    btn_download_report.click(fn=download_report, inputs=[state_analyzer, md_ai_report], outputs=gr.File(label="Download Report"))

    gr.Markdown("---")
    gr.Markdown("💡 **Tip**: Get your free Google Gemini API key from [Google AI Studio](https://aistudio.google.com/app/apikey).")
    gr.Markdown("MCP Expert System v2.0 - Analysis Complete.")

# Launch the app only when the module is executed as a script.
if __name__ == "__main__":
    demo.launch(debug=True)
 
 
1
  # -*- coding: utf-8 -*-
2
+ #
3
+ # PROJECT: CognitiveEDA - The AI-Augmented Data Discovery Platform
4
+ #
5
+ # DESCRIPTION: An enterprise-grade Gradio application that revolutionizes Exploratory
6
+ # Data Analysis (EDA). By integrating Google's Gemini LLM, this
7
+ # tool transcends traditional data profiling. It automates the generation
8
+ # of statistical summaries, interactive visualizations, and, most
9
+ # importantly, a rich, narrative-driven analysis. It delivers
10
+ # executive summaries, data quality assessments, actionable insights,
11
+ # and strategic recommendations in a single, streamlined workflow.
12
+ #
13
+ # ARCHITECTURE: The application is built upon a robust, object-oriented foundation.
14
+ # - DataAnalyzer (Core Engine): An encapsulated class that holds the
15
+ # DataFrame state and performs all statistical calculations and
16
+ # metadata extraction efficiently, ensuring data is processed once.
17
+ # - AI Integration: A dedicated module communicates with the Gemini API,
18
+ # using a sophisticated, structured prompt to ensure consistent,
19
+ # high-quality analytical narratives.
20
+ # - Gradio Interface (UI Layer): A multi-tabbed, interactive dashboard
21
+ # that logically separates the AI narrative, data profiling, static
22
+ # visuals, and interactive exploration tools. State is managed
23
+ # efficiently to provide a responsive user experience.
24
+ #
25
+ # FEATURES:
26
+ # - AI-Powered Executive Summary: Generates a high-level overview for stakeholders.
27
+ # - Automated Data Quality Audit: Provides a quality score and actionable cleaning steps.
28
+ # - Insight Discovery Engine: Uncovers hidden patterns, correlations, and anomalies.
29
+ # - Strategic Recommendations: Suggests next steps, modeling approaches, and business use cases.
30
+ # - Comprehensive Profiling: Detailed statistical tables for all data types.
31
+ # - Interactive Visualization Suite: Dynamic plots for deep-dive analysis.
32
+ # - One-Click Report Export: Downloads the complete AI-generated analysis as a Markdown file.
33
+ #
34
+ # AUTHOR: An MCP Expert in Data & AI Solutions
35
+ # VERSION: 3.0 (Enterprise Edition)
36
+ # LAST-UPDATE: 2023-10-27
37
+
38
  from __future__ import annotations
39
 
40
  import warnings
41
  import logging
42
  import os
43
+ from datetime import datetime
44
+ from typing import Any, Dict, List, Optional, Tuple
45
+
46
+ import gradio as gr
47
  import numpy as np
48
+ import pandas as pd
49
  import plotly.express as px
50
  import plotly.graph_objects as go
 
 
51
  import google.generativeai as genai
 
 
52
 
53
+ # --- Configuration & Constants ---
54
 
55
# Configure the root logger: INFO and above, each record tagged with
# timestamp, level, and the source file/line that emitted it.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - [%(levelname)s] - (%(filename)s:%(lineno)d) - %(message)s'
)
# Suppress FutureWarning only; this filter leaves other warning categories untouched.
warnings.filterwarnings('ignore', category=FutureWarning)
60
 
61
class Config:
    """Constants shared across the application."""

    # Application title string.
    APP_TITLE = "🚀 CognitiveEDA: AI-Augmented Data Discovery Platform"

    # Model name handed to genai.GenerativeModel.
    GEMINI_MODEL = 'gemini-1.5-flash-latest'

    # Threshold for highlighting high correlation.
    CORR_THRESHOLD = 0.75

    # For bar charts of categorical features.
    TOP_N_CATEGORIES = 10
+
68
+ # --- Core Analysis Engine ---
69
 
70
  class DataAnalyzer:
71
  """
72
+ Encapsulates all data analysis logic, acting as the single source of truth
73
+ for the uploaded dataset and its derived metadata.
 
74
  """
75
  def __init__(self, df: pd.DataFrame):
76
  if not isinstance(df, pd.DataFrame):
77
  raise TypeError("Input must be a pandas DataFrame.")
78
  self.df = df
79
  self._metadata: Optional[Dict[str, Any]] = None
80
+ logging.info(f"DataAnalyzer instantiated with DataFrame of shape: {self.df.shape}")
81
 
82
  @property
83
  def metadata(self) -> Dict[str, Any]:
84
+ """Lazy-loads and caches comprehensive dataset metadata for efficient reuse."""
85
  if self._metadata is None:
86
+ logging.info("First access to metadata, performing extraction...")
87
  self._metadata = self._extract_metadata()
88
  return self._metadata
89
 
90
  def _extract_metadata(self) -> Dict[str, Any]:
91
+ """Performs a deep scan of the DataFrame to extract key characteristics."""
 
92
  rows, cols = self.df.shape
93
  numeric_cols = self.df.select_dtypes(include=np.number).columns.tolist()
94
  categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns.tolist()
 
95
 
96
+ # Advanced: High correlation pair detection
97
  high_corr_pairs = []
98
  if len(numeric_cols) > 1:
99
  corr_matrix = self.df[numeric_cols].corr().abs()
100
  upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
101
+ high_corr_series = upper_tri.stack()
102
  high_corr_pairs = (
103
+ high_corr_series[high_corr_series > Config.CORR_THRESHOLD]
104
  .reset_index()
105
+ .rename(columns={'level_0': 'Feature 1', 'level_1': 'Feature 2', 0: 'Correlation'})
 
 
 
106
  .to_dict('records')
107
  )
108
 
 
111
  'columns': self.df.columns.tolist(),
112
  'numeric_cols': numeric_cols,
113
  'categorical_cols': categorical_cols,
114
+ 'memory_usage_mb': f"{self.df.memory_usage(deep=True).sum() / 1e6:.2f}",
 
115
  'total_missing': int(self.df.isnull().sum().sum()),
116
+ 'data_quality_score': round((self.df.notna().sum().sum() / self.df.size) * 100, 2),
117
  'high_corr_pairs': high_corr_pairs,
118
  }
119
 
120
+ def get_profiling_tables(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
121
+ """Generates structured DataFrames for data profiling."""
122
+ logging.info("Generating profiling tables for missing, numeric, and categorical data.")
123
+ # Missing data profile
 
124
  missing = self.df.isnull().sum()
125
  missing_df = pd.DataFrame({
126
+ 'Missing Count': missing,
127
+ 'Missing Percentage (%)': (missing / len(self.df) * 100).round(2)
128
+ }).reset_index().rename(columns={'index': 'Column'}).sort_values('Missing Count', ascending=False)
129
+
130
+ # Numeric features profile
131
+ numeric_stats = self.df[self.metadata['numeric_cols']].describe(percentiles=[.01, .25, .5, .75, .99]).T
132
+ numeric_stats_df = numeric_stats.round(3).reset_index().rename(columns={'index': 'Column'})
133
+
134
+ # Categorical features profile
135
+ cat_stats = self.df[self.metadata['categorical_cols']].describe(include=['object', 'category']).T
136
+ cat_stats_df = cat_stats.reset_index().rename(columns={'index': 'Column'})
137
+
138
+ return missing_df, numeric_stats_df, cat_stats_df
 
 
 
 
 
 
 
139
 
140
def get_overview_visuals(self) -> Tuple[go.Figure, go.Figure, go.Figure]:
    """Build the high-level overview figures.

    Returns:
        ``(fig_types, fig_missing, fig_corr)`` - a donut chart of column
        dtypes, a bar chart of columns that have at least one missing value,
        and a correlation heatmap of the numeric columns. With fewer than
        two numeric columns the heatmap is an empty, titled figure.
    """
    logging.info("Generating overview visualizations (types, missing data, correlation).")
    meta = self.metadata

    # Donut chart: number of columns per dtype name.
    counts_by_dtype = self.df.dtypes.astype(str).value_counts()
    fig_types = px.pie(
        names=counts_by_dtype.index,
        values=counts_by_dtype.values,
        hole=0.4,
        title="<b>📊 Data Type Composition</b>",
        color_discrete_sequence=px.colors.qualitative.Pastel,
    )
    fig_types.update_traces(textinfo='percent+label', textposition='outside')

    # Bar chart restricted to columns with a non-zero null count.
    null_counts = self.df.isnull().sum().reset_index(name='count')
    missing_df = null_counts.query('count > 0')
    fig_missing = px.bar(
        missing_df,
        x='index',
        y='count',
        title="<b>🕳️ Missing Values Distribution</b>",
        labels={'index': 'Column Name', 'count': 'Number of Missing Values'},
    )
    fig_missing = fig_missing.update_xaxes(categoryorder="total descending")

    numeric_cols = meta['numeric_cols']
    if len(numeric_cols) <= 1:
        fig_corr = go.Figure()
        fig_corr.update_layout(title="<b>🔗 Correlation Matrix (Insufficient Numeric Data)</b>")
    else:
        # Color scale is pinned to the full [-1, 1] correlation range.
        fig_corr = px.imshow(
            self.df[numeric_cols].corr(),
            text_auto=".2f",
            aspect="auto",
            title=f"<b>🔗 Correlation Matrix (Threshold > {Config.CORR_THRESHOLD})</b>",
            color_continuous_scale='RdBu_r',
            zmin=-1,
            zmax=1,
        )

    return fig_types, fig_missing, fig_corr
171
+
172
def generate_ai_narrative(self, api_key: str) -> str:
    """Generate the full AI-driven Markdown report using Gemini.

    The prompt embeds values from ``self.metadata`` (shape, numeric and
    categorical column names, quality score, missing count, high-correlation
    pairs), ``Config.CORR_THRESHOLD``, and the first five rows of the
    DataFrame rendered with ``to_markdown(index=False)``.

    Args:
        api_key: Key passed to ``genai.configure``.

    Returns:
        The model's response text, or a Markdown error message with
        troubleshooting steps if the API call raises. Exceptions from the
        call are logged with traceback, not propagated.
    """
    logging.info("Generating AI narrative with the Gemini API.")
    meta = self.metadata

    # A more sophisticated, structured prompt for a better report
    # NOTE(review): the prompt is built before the try block, so failures while
    # formatting it (e.g. inside to_markdown) are not caught here.
    prompt = f"""
As "Cognitive Analyst," an elite AI data scientist, your task is to generate a comprehensive, multi-part data discovery report.
Analyze the following dataset context and produce a professional, insightful, and clear analysis in Markdown format.

**DATASET CONTEXT:**
- **Shape:** {meta['shape'][0]} rows, {meta['shape'][1]} columns.
- **Column Schema:**
- Numeric: {', '.join(meta['numeric_cols']) if meta['numeric_cols'] else 'None'}
- Categorical: {', '.join(meta['categorical_cols']) if meta['categorical_cols'] else 'None'}
- **Data Quality Score:** {meta['data_quality_score']}% (Percentage of non-missing cells)
- **Total Missing Values:** {meta['total_missing']:,}
- **High-Correlation Pairs (>{Config.CORR_THRESHOLD}):** {meta['high_corr_pairs'] if meta['high_corr_pairs'] else 'None detected.'}
- **Data Snippet (First 5 Rows):**
{self.df.head(5).to_markdown(index=False)}

**REQUIRED REPORT STRUCTURE (Strictly use this Markdown format):**

# 🚀 AI Data Discovery Report

## 📄 1. Executive Summary
* **Primary Objective:** (Deduce the most likely purpose of this dataset. What problem is it trying to solve?)
* **Key Finding:** (State the single most interesting or impactful insight you've discovered.)
* **Overall State:** (Briefly comment on the data's quality and readiness for analysis.)

## 🧐 2. Data Profile & Quality Assessment
* **First Impression:** (Describe the dataset's structure, size, and composition.)
* **Data Quality Audit:** (Elaborate on the **{meta['data_quality_score']}%** quality score. Are the **{meta['total_missing']}** missing values concentrated in specific columns? Is this a major concern?)
* **Redundancy Check:** (Comment on the detected high-correlation pairs. Is there a risk of multicollinearity in modeling?)

## 💡 3. Key Insights & Potential Stories
* **Insight 1 (e.g., Anomaly Detected 🕵️):** (Describe a surprising pattern, outlier, or distribution in a key numeric column.)
* **Insight 2 (e.g., Categorical Trend 📊):** (Analyze a key categorical column. What does its distribution reveal? Is there a dominant category?)
* **Insight 3 (e.g., Relationship Hint 🔗):** (Speculate on a potential relationship between two or more columns, even if not highly correlated.)

## 🛠️ 4. Actionable Recommendations
* **Data Cleaning:**
- **Step 1:** (Provide a specific recommendation for handling missing data, e.g., "For `column_name`, with X% missing, consider imputation using the median due to its skewed distribution.")
- **Step 2:** (Suggest actions for correlated features, e.g., "Consider dropping `Feature A` or using dimensionality reduction (PCA) due to its high correlation with `Feature B`.")
* **Feature Engineering:**
- **Idea 1:** (Suggest creating a new feature, e.g., "Combine `year` and `month` into a `date` feature for time-series analysis.")
* **Next Analytical Steps:**
- **Hypothesis to Test:** (Propose a business or research question to investigate further, e.g., "Does `customer_segment` significantly impact `total_spend`?")
- **Modeling Potential:** (Suggest a suitable machine learning model, e.g., "This dataset is well-suited for a classification model to predict `is_churn`.")
"""
    try:
        # Configure the client with the caller's key on every call.
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(Config.GEMINI_MODEL)
        response = model.generate_content(prompt)
        return response.text
    except Exception as e:
        # Any failure in the API round-trip becomes a user-visible message instead of a crash.
        logging.error(f"Gemini API call failed: {e}", exc_info=True)
        error_message = (
            "❌ **AI Report Generation Failed**\n\n"
            f"**Error Details:** `{str(e)}`\n\n"
            "**Troubleshooting Steps:**\n"
            "1. Verify that your Google Gemini API key is correct and active.\n"
            "2. Check your network connection and firewall settings.\n"
            "3. Ensure the Gemini API is not experiencing an outage."
        )
        return error_message
238
 
239
  # --- Gradio UI & Event Handlers ---
240
 
241
def create_ui():
    """Define and build the Gradio user interface.

    Builds a ``gr.Blocks`` app with an upload/API-key header row, five tabs
    (AI narrative, profile tables, overview plots, interactive explorer and
    single-column deep dive) and wires every event listener.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """

    # --- Interactive Plotting Functions (scoped inside UI creation for clarity) ---
    def create_histogram(analyzer: DataAnalyzer, col: str) -> go.Figure:
        """Return a histogram (with box-plot marginal) of ``col``.

        An empty figure is returned when no analyzer or column is available.
        """
        if not col or not analyzer:
            return go.Figure()
        return px.histogram(
            analyzer.df,
            x=col,
            title=f"<b>Distribution of {col}</b>",
            marginal="box",
            template="plotly_white",
        )

    def create_scatterplot(analyzer: DataAnalyzer, x_col: str, y_col: str, color_col: str) -> go.Figure:
        """Return a scatter plot of ``x_col`` vs ``y_col``, optionally colored.

        An empty figure is returned until the analyzer and both axes are set;
        ``color_col`` is passed through to Plotly unchanged.
        """
        if not all([analyzer, x_col, y_col]):
            return go.Figure()
        return px.scatter(
            analyzer.df,
            x=x_col,
            y=y_col,
            color=color_col,
            title=f"<b>Scatter Plot: {x_col} vs. {y_col}</b>",
            template="plotly_white",
            color_continuous_scale=px.colors.sequential.Viridis,
        )

    def analyze_single_column(analyzer: DataAnalyzer, col: str) -> Tuple[str, go.Figure]:
        """Return Markdown statistics and a plot for a single column.

        Numeric columns get mean/std/median/min/max plus a histogram; all
        other columns get their most frequent value and a horizontal bar
        chart of the top ``Config.TOP_N_CATEGORIES`` values.
        """
        if not col or not analyzer:
            return "", go.Figure()

        series = analyzer.df[col]
        stats_md = f"### 🔎 **Deep Dive: `{col}`**\n"
        stats_md += f"- **Data Type:** `{series.dtype}`\n"
        stats_md += f"- **Unique Values:** `{series.nunique()}`\n"
        stats_md += f"- **Missing:** `{series.isnull().sum()}` ({series.isnull().mean():.2%})\n"

        fig = go.Figure()
        if pd.api.types.is_numeric_dtype(series):
            stats_md += f"- **Mean:** `{series.mean():.3f}` | **Std Dev:** `{series.std():.3f}`\n"
            stats_md += f"- **Median:** `{series.median():.3f}` | **Min:** `{series.min():.3f}` | **Max:** `{series.max():.3f}`\n"
            fig = create_histogram(analyzer, col)
        else:
            top_n = series.value_counts().nlargest(Config.TOP_N_CATEGORIES)
            if top_n.empty:
                # value_counts() drops nulls, so an all-null column yields an
                # empty result; indexing [0] would raise IndexError.
                stats_md += "- **Top Value:** _none (column has no non-null values)_\n"
                return stats_md, fig
            stats_md += f"- **Top Value:** `{top_n.index[0]}` ({top_n.iloc[0]} occurrences)\n"
            fig = px.bar(
                top_n, y=top_n.index, x=top_n.values, orientation='h',
                title=f"<b>Top {Config.TOP_N_CATEGORIES} Categories in `{col}`</b>",
                labels={'y': col, 'x': 'Count'}, template="plotly_white"
            ).update_yaxes(categoryorder="total ascending")

        return stats_md, fig

    # --- Main UI Blocks ---
    with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="cyan"), title=Config.APP_TITLE) as demo:
        # Store for the main DataAnalyzer object
        state_analyzer = gr.State()

        gr.Markdown(f"<h1>{Config.APP_TITLE}</h1>")
        gr.Markdown("Upload a CSV file, provide your Gemini API key, and receive an instant, AI-driven analysis of your data.")

        with gr.Row():
            with gr.Column(scale=3):
                upload_button = gr.File(label="1. Upload CSV File", file_types=[".csv"])
            with gr.Column(scale=2):
                api_key_input = gr.Textbox(label="2. Enter Google Gemini API Key", type="password")
            with gr.Column(scale=1, min_width=150):
                analyze_button = gr.Button("✨ Generate Analysis", variant="primary")

        with gr.Tabs():
            with gr.Tab("🤖 AI Narrative", id=0):
                ai_report_output = gr.Markdown("Your AI-generated report will appear here once analysis is complete...")
                # Hidden until an analysis has produced a report to download.
                download_report_button = gr.Button("⬇️ Download Full Report", visible=False)

            with gr.Tab(" Profile", id=1):
                gr.Markdown("### **Detailed Data Profile**")
                gr.Markdown("#### Missing Data Summary")
                profile_missing_df = gr.DataFrame(interactive=False, label="Missing Values")
                gr.Markdown("#### Numeric Features Summary")
                profile_numeric_df = gr.DataFrame(interactive=False, label="Numeric Stats")
                gr.Markdown("#### Categorical Features Summary")
                profile_categorical_df = gr.DataFrame(interactive=False, label="Categorical Stats")

            with gr.Tab("📈 Overview Visuals", id=2):
                gr.Markdown("### **At-a-Glance Visualizations**")
                with gr.Row():
                    plot_types = gr.Plot()
                    plot_missing = gr.Plot()
                plot_correlation = gr.Plot()

            with gr.Tab("🎨 Interactive Explorer", id=3):
                gr.Markdown("### **Visually Explore Feature Relationships**")
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("#### Univariate Analysis")
                        dd_hist_col = gr.Dropdown(label="Select Column for Histogram", visible=False)
                    with gr.Column(scale=2):
                        plot_histogram = gr.Plot()

                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("#### Bivariate Analysis (Scatter Plot)")
                        dd_scatter_x = gr.Dropdown(label="X-Axis (Numeric)", visible=False)
                        dd_scatter_y = gr.Dropdown(label="Y-Axis (Numeric)", visible=False)
                        dd_scatter_color = gr.Dropdown(label="Color By (Optional)", visible=False)
                    with gr.Column(scale=2):
                        plot_scatter = gr.Plot()

            with gr.Tab("🔍 Column Deep-Dive", id=4):
                gr.Markdown("### **Inspect a Single Column in Detail**")
                dd_drilldown_col = gr.Dropdown(label="Select Column to Analyze", visible=False)
                with gr.Row():
                    md_drilldown_stats = gr.Markdown()
                    plot_drilldown = gr.Plot()

        gr.HTML("""
        <div style="text-align: center; margin-top: 20px; font-family: sans-serif; color: #777;">
        <p>💡 Need an API key? Get one from <a href="https://aistudio.google.com/app/apikey" target="_blank">Google AI Studio</a>.</p>
        <p>CognitiveEDA v3.0 | An MCP Expert System</p>
        </div>
        """)

        # --- Event Listeners & Control Flow ---

        # The order of this list defines the order in which the main analysis
        # callback's outputs are applied to components.
        outputs_for_main_analysis = [
            state_analyzer, ai_report_output, download_report_button,
            profile_missing_df, profile_numeric_df, profile_categorical_df,
            plot_types, plot_missing, plot_correlation,
            dd_hist_col, dd_scatter_x, dd_scatter_y, dd_scatter_color, dd_drilldown_col
        ]

        analyze_button.click(
            fn=run_full_analysis,
            inputs=[upload_button, api_key_input],
            outputs=outputs_for_main_analysis
        )

        # Interactive plot triggers
        dd_hist_col.change(fn=create_histogram, inputs=[state_analyzer, dd_hist_col], outputs=plot_histogram)

        # Any of the three scatter dropdowns redraws the same plot.
        scatter_inputs = [state_analyzer, dd_scatter_x, dd_scatter_y, dd_scatter_color]
        dd_scatter_x.change(fn=create_scatterplot, inputs=scatter_inputs, outputs=plot_scatter)
        dd_scatter_y.change(fn=create_scatterplot, inputs=scatter_inputs, outputs=plot_scatter)
        dd_scatter_color.change(fn=create_scatterplot, inputs=scatter_inputs, outputs=plot_scatter)

        dd_drilldown_col.change(
            fn=analyze_single_column,
            inputs=[state_analyzer, dd_drilldown_col],
            outputs=[md_drilldown_stats, plot_drilldown]
        )

        download_report_button.click(
            fn=download_report_file,
            inputs=[state_analyzer, ai_report_output],
            outputs=gr.File(label="Download Report")
        )

    return demo
387
+
388
+ # --- Main Application Logic ---
389
+
390
def run_full_analysis(file_obj: gr.File, api_key: str) -> Tuple[Any, ...]:
    """
    Orchestrates the entire analysis pipeline upon button click.

    Reads the uploaded CSV, builds a ``DataAnalyzer`` and collects the AI
    narrative, the profiling tables and the overview figures.

    Returns:
        A 14-item tuple of new component values, ordered exactly like
        ``outputs_for_main_analysis`` in ``create_ui``: analyzer state, AI
        report, download button, three profiling tables, three overview plots
        and five dropdown updates. Values are returned positionally because
        the UI components are local to ``create_ui`` and cannot be used as
        dictionary keys from this module-level function.

    Raises:
        gr.Error: If no file or API key was provided, or if any step of the
            analysis fails.
    """
    if file_obj is None:
        raise gr.Error("CRITICAL: No file uploaded. Please select a CSV file.")
    if not api_key:
        raise gr.Error("CRITICAL: Gemini API key is missing. Please provide your key.")

    try:
        # Accept either an object exposing ``.name`` or a plain path string.
        # NOTE(review): which one Gradio passes depends on its version — confirm.
        file_path = getattr(file_obj, "name", file_obj)
        logging.info(f"Processing uploaded file: {file_path}")
        df = pd.read_csv(file_path)
        analyzer = DataAnalyzer(df)

        # --- Execute all analysis tasks (sequentially) ---
        ai_report = analyzer.generate_ai_narrative(api_key)
        missing_df, num_df, cat_df = analyzer.get_profiling_tables()
        fig_types, fig_missing, fig_corr = analyzer.get_overview_visuals()

        # --- Prepare UI component updates ---
        meta = analyzer.metadata
        all_cols, num_cols = meta['columns'], meta['numeric_cols']

        return (
            # State & AI Report
            analyzer,
            ai_report,
            gr.Button(visible=True),
            # Profiling Tab
            missing_df,
            num_df,
            cat_df,
            # Overview Visuals Tab
            fig_types,
            fig_missing,
            fig_corr,
            # Interactive Explorer & Drilldown Dropdown Updates
            gr.Dropdown(choices=num_cols, label="Select Numeric Column", visible=True),
            gr.Dropdown(choices=num_cols, label="X-Axis (Numeric)", visible=True),
            gr.Dropdown(choices=num_cols, label="Y-Axis (Numeric)", visible=True),
            gr.Dropdown(choices=all_cols, label="Color By (Optional)", visible=True),
            gr.Dropdown(choices=all_cols, label="Select Column to Analyze", visible=True),
        )

    except Exception as e:
        logging.error(f"A critical error occurred during file processing: {e}", exc_info=True)
        raise gr.Error(f"Analysis Failed! The process stopped due to: {str(e)}") from e
 
 
439
 
 
 
 
440
 
441
def download_report_file(analyzer: DataAnalyzer, ai_report_text: str) -> str | None:
    """Generate a comprehensive Markdown report file for download.

    The report is a short dataset overview built from ``analyzer.metadata``
    (keys ``shape``, ``memory_usage_mb`` and ``data_quality_score``), a
    horizontal rule, and then the AI report text.

    Args:
        analyzer: Analyzer for the current dataset; may be falsy if no
            analysis has run yet.
        ai_report_text: Markdown text of the AI report. ``None`` is treated
            as an empty report.

    Returns:
        The name of the file written to the current working directory, or
        ``None`` when no analyzer is available.
    """
    if not analyzer:
        logging.warning("Download attempted without a valid analyzer object.")
        return None

    # One timestamp for both uses, so the filename and the "Generated" line
    # cannot disagree across a second boundary.
    generated_at = datetime.now()
    filename = f"CognitiveEDA_Report_{generated_at.strftime('%Y%m%d_%H%M%S')}.md"
    meta = analyzer.metadata

    # Assemble the full report
    full_report = "".join([
        "# CognitiveEDA - Data Discovery Report\n",
        f"**Generated:** {generated_at.strftime('%Y-%m-%d %H:%M:%S')}\n\n",
        "## Dataset Overview\n",
        f"- **Shape:** {meta['shape'][0]} rows x {meta['shape'][1]} columns\n",
        f"- **Memory Footprint:** {meta['memory_usage_mb']} MB\n",
        f"- **Data Quality Score:** {meta['data_quality_score']}%\n\n",
        "---\n\n",
        ai_report_text or "",
    ])

    with open(filename, "w", encoding="utf-8") as f:
        f.write(full_report)

    logging.info(f"Report file generated successfully: {filename}")
    return filename
465
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
466
 
467
if __name__ == "__main__":
    # Script entry point: build the Blocks UI, then serve it on all network
    # interfaces with Gradio's debug mode enabled.
    app_instance = create_ui()
    app_instance.launch(
        server_name="0.0.0.0",
        debug=True,
    )