mgbam committed · Commit 60da408 (verified) · Parent: a4f4e77

Update app.py

Files changed (1): app.py (+410 −233)

app.py CHANGED
@@ -1,250 +1,427 @@
1
- import gradio as gr
2
  import pandas as pd
3
  import numpy as np
4
  import plotly.express as px
5
- import io
6
- import json
7
- import warnings
8
  import google.generativeai as genai
9
- import os
10
- from typing import List, Dict, Any, Tuple, Optional
11
- import re
12
 
13
- # --- Configuration & Constants ---
14
  warnings.filterwarnings('ignore')
15
- MAX_DASHBOARD_PLOTS = 10
16
- CSS = """
17
- /* --- Phoenix UI Professional Dark CSS --- */
18
- #app-title { text-align: center; font-weight: 800; font-size: 2.5rem; color: #f9fafb; padding-top: 10px; }
19
- .stat-card { border-radius: 12px !important; padding: 20px !important; background: #1f2937 !important; border: 1px solid #374151 !important; text-align: center; transition: all 0.3s ease; }
20
- .stat-card:hover { transform: translateY(-5px); box-shadow: 0 10px 15px -3px rgba(0,0,0,0.1), 0 4px 6px -2px rgba(0,0,0,0.05); }
21
- .stat-card-title { font-size: 16px; font-weight: 500; color: #9ca3af !important; margin-bottom: 8px; }
22
- .stat-card-value { font-size: 32px; font-weight: 700; color: #f9fafb !important; }
23
- .sidebar { background-color: #111827 !important; padding: 15px; border-right: 1px solid #374151 !important; min-height: 100vh; }
24
- .sidebar .gr-button { width: 100%; text-align: left !important; background: none !important; border: none !important; box-shadow: none !important; color: #d1d5db !important; font-size: 16px !important; padding: 12px 10px !important; margin-bottom: 8px !important; border-radius: 8px !important; transition: background-color 0.2s ease; }
25
- .sidebar .gr-button:hover { background-color: #374151 !important; }
26
- .sidebar .gr-button.selected { background-color: #4f46e5 !important; font-weight: 600 !important; color: white !important; }
27
- .explanation-block { background-color: #1e3a8a !important; border-left: 4px solid #3b82f6 !important; padding: 12px; color: #e5e7eb !important; border-radius: 4px; }
28
- """
29
 
30
- class DataExplorerApp:
31
- """A professional-grade, AI-powered data exploration application."""
32
-
33
- def __init__(self):
34
- self.demo = self._build_ui()
35
-
36
- def _build_ui(self) -> gr.Blocks:
37
- with gr.Blocks(theme=gr.themes.Glass(primary_hue="indigo", secondary_hue="blue"), css=CSS, title="AI Data Explorer Pro") as demo:
38
- state_var = gr.State({})
39
-
40
- # --- Component Definition ---
41
- cockpit_btn = gr.Button("📊 Data Cockpit", elem_classes="selected", elem_id="cockpit")
42
- deep_dive_btn = gr.Button("🔍 Deep Dive Builder", elem_id="deep_dive")
43
- copilot_btn = gr.Button("🤖 Chief Data Scientist", elem_id="co-pilot")
44
-
45
- # UPDATED: File input now accepts multiple types
46
- file_input = gr.File(
47
- label="📁 Upload Data File",
48
- file_types=[".csv", ".txt", ".xls", ".xlsx"]
49
  )
50
-
51
- status_output = gr.Markdown("Status: Awaiting data...")
52
- api_key_input = gr.Textbox(label="🔑 Gemini API Key", type="password", placeholder="Enter key to enable AI...")
53
- suggestion_btn = gr.Button("Get Smart Suggestions", variant="secondary", interactive=False)
54
-
55
- rows_stat, cols_stat = gr.Textbox("0", interactive=False, show_label=False), gr.Textbox("0", interactive=False, show_label=False)
56
- quality_stat, time_cols_stat = gr.Textbox("0%", interactive=False, show_label=False), gr.Textbox("0", interactive=False, show_label=False)
57
- suggestion_buttons = [gr.Button(visible=False) for _ in range(5)]
58
-
59
- plot_type_dd = gr.Dropdown(['histogram', 'bar', 'scatter', 'box'], label="Plot Type", value='histogram')
60
- x_col_dd = gr.Dropdown([], label="X-Axis / Column", interactive=False)
61
- y_col_dd = gr.Dropdown([], label="Y-Axis (for Scatter/Box)", visible=False, interactive=False)
62
- add_plot_btn, clear_plots_btn = gr.Button("Add to Dashboard", variant="primary", interactive=False), gr.Button("Clear Dashboard", interactive=False)
63
- dashboard_plots = [gr.Plot(visible=False) for _ in range(MAX_DASHBOARD_PLOTS)]
64
-
65
- chatbot = gr.Chatbot(height=500, label="Conversation", show_copy_button=True, avatar_images=(None, "bot.png"))
66
- copilot_explanation, copilot_code = gr.Markdown(visible=False, elem_classes="explanation-block"), gr.Code(language="python", visible=False, label="Executed Code")
67
- copilot_plot, copilot_table = gr.Plot(visible=False, label="Generated Visualization"), gr.Dataframe(visible=False, label="Generated Table", wrap=True)
68
- chat_input, chat_submit_btn = gr.Textbox(label="Your Question", placeholder="e.g., 'What is the relationship between age and salary?'", scale=4), gr.Button("Ask AI", variant="primary", interactive=False)
69
-
70
- # --- Layout Arrangement ---
71
- with gr.Row():
72
- with gr.Column(scale=1, elem_classes="sidebar"):
73
- gr.Markdown("## 🚀 AI Explorer Pro", elem_id="app-title"); cockpit_btn; deep_dive_btn; copilot_btn; gr.Markdown("---")
74
- file_input; status_output; gr.Markdown("---"); api_key_input; suggestion_btn
75
- with gr.Column(scale=4):
76
- welcome_page, cockpit_page, deep_dive_page, copilot_page = [gr.Column(visible=i==0) for i in range(4)]
77
- with welcome_page: gr.Markdown("# Welcome to the AI Data Explorer Pro\n> Please **upload a CSV, TXT, or Excel file** and **enter your Gemini API key** to begin your analysis.")
78
- with cockpit_page:
79
- gr.Markdown("## 📊 Data Cockpit: At-a-Glance Overview")
80
- with gr.Row():
81
- for title, stat_comp in [("Rows", rows_stat), ("Columns", cols_stat), ("Data Quality", quality_stat), ("Date/Time Cols", time_cols_stat)]:
82
- with gr.Column(elem_classes="stat-card"): gr.Markdown(f"<div class='stat-card-title'>{title}</div>"); stat_comp
83
- with gr.Accordion(label="✨ AI Smart Suggestions", open=True): [btn for btn in suggestion_buttons]
84
- with deep_dive_page:
85
- gr.Markdown("## 🔍 Deep Dive: Manual Dashboard Builder"); gr.Markdown("Construct visualizations to investigate specific relationships.")
86
- with gr.Row(): plot_type_dd; x_col_dd; y_col_dd
87
- with gr.Row(): add_plot_btn; clear_plots_btn
88
- with gr.Column(): [plot for plot in dashboard_plots]
89
- with copilot_page:
90
- gr.Markdown("## 🤖 Chief Data Scientist: Your AI Partner"); chatbot
91
- with gr.Accordion("AI's Detailed Response", open=True): copilot_explanation; copilot_code; copilot_plot; copilot_table
92
- with gr.Row(): chat_input; chat_submit_btn
93
-
94
- # --- Event Handlers Registration ---
95
- pages, nav_buttons = [welcome_page, cockpit_page, deep_dive_page, copilot_page], [cockpit_btn, deep_dive_btn, copilot_btn]
96
- for i, btn in enumerate(nav_buttons):
97
- btn.click(lambda id=btn.elem_id: self._switch_page(id, pages), outputs=pages).then(
98
- lambda i=i: [gr.update(elem_classes="selected" if j==i else "") for j in range(len(nav_buttons))], outputs=nav_buttons)
99
-
100
- file_input.upload(self.load_and_process_file, inputs=[file_input], outputs=[
101
- state_var, status_output, *pages, rows_stat, cols_stat, quality_stat, time_cols_stat, x_col_dd, y_col_dd, add_plot_btn])
102
-
103
- api_key_input.change(lambda x: gr.update(interactive=bool(x)), inputs=[api_key_input], outputs=[suggestion_btn])
104
- chat_input.change(lambda x: gr.update(interactive=bool(x.strip())), inputs=[chat_input], outputs=[chat_submit_btn])
105
-
106
- plot_type_dd.change(self._update_plot_controls, inputs=[plot_type_dd], outputs=[y_col_dd])
107
- add_plot_btn.click(self.add_plot_to_dashboard, inputs=[state_var, x_col_dd, y_col_dd, plot_type_dd], outputs=[state_var, clear_plots_btn, *dashboard_plots])
108
- clear_plots_btn.click(self.clear_dashboard, inputs=[state_var], outputs=[state_var, clear_plots_btn, *dashboard_plots])
109
-
110
- suggestion_btn.click(self.get_ai_suggestions, inputs=[state_var, api_key_input], outputs=suggestion_buttons)
111
- for btn in suggestion_buttons:
112
- btn.click(self.handle_suggestion_click, inputs=[btn], outputs=[*pages, chat_input])
113
-
114
- chat_submit_btn.click(self.respond_to_chat, [state_var, api_key_input, chat_input, chatbot], [chatbot, copilot_explanation, copilot_code, copilot_plot, copilot_table]).then(lambda: "", outputs=[chat_input])
115
- chat_input.submit(self.respond_to_chat, [state_var, api_key_input, chat_input, chatbot], [chatbot, copilot_explanation, copilot_code, copilot_plot, copilot_table]).then(lambda: "", outputs=[chat_input])
116
- return demo
117
-
118
- def launch(self): self.demo.launch(debug=True)
119
-
120
- def _switch_page(self, page_id: str, all_pages: List) -> List[gr.update]:
121
- visibility = {"welcome":0, "cockpit":1, "deep_dive":2, "co-pilot":3}
122
- return [gr.update(visible=i == visibility.get(page_id, 0)) for i in range(len(all_pages))]
123
 
124
- def _update_plot_controls(self, plot_type: str) -> gr.update: return gr.update(visible=plot_type in ['scatter', 'box'])
125
 
126
- def load_and_process_file(self, file_obj: Any) -> Tuple[Any, ...]:
127
- """Intelligently loads data from CSV, TXT, or Excel files."""
128
  try:
129
- filename = file_obj.name
130
- extension = os.path.splitext(filename)[1].lower()
131
-
132
- if extension == '.csv':
133
- df = pd.read_csv(filename)
134
- elif extension == '.txt':
135
- # Use sep=None to auto-detect the delimiter (tabs, spaces, etc.)
136
- df = pd.read_csv(filename, sep=None, engine='python')
137
- elif extension in ['.xls', '.xlsx']:
138
- df = pd.read_excel(filename)
139
- else:
140
- raise ValueError(f"Unsupported file type: {extension}")
141
-
142
- # Continue with processing once the DataFrame is loaded
143
- for col in df.select_dtypes(include=['object']).columns:
144
- try: df[col] = pd.to_datetime(df[col], errors='raise')
145
- except (ValueError, TypeError): continue
146
-
147
- metadata = self._extract_dataset_metadata(df)
148
- state = {'df': df, 'metadata': metadata, 'dashboard_plots': []}
149
- rows, cols, quality = metadata['shape'][0], metadata['shape'][1], metadata['data_quality']
150
- page_updates = self._switch_page("cockpit", [0,1,2,3])
151
- return (state, f"✅ **{os.path.basename(filename)}** loaded.", *page_updates, f"{rows:,}", f"{cols}", f"{quality}%", f"{len(metadata['datetime_cols'])}",
152
- gr.update(choices=metadata['columns'], interactive=True), gr.update(choices=metadata['columns'], interactive=True), gr.update(interactive=True))
153
  except Exception as e:
154
- gr.Error(f"File Load Error: {e}"); page_updates = self._switch_page("welcome", [0,1,2,3]);
155
- return {}, f"❌ Error: {e}", *page_updates, "0", "0", "0%", "0", gr.update(choices=[], interactive=False), gr.update(choices=[], interactive=False), gr.update(interactive=False)
156
-
157
- def _extract_dataset_metadata(self, df: pd.DataFrame) -> Dict[str, Any]:
158
- rows, cols = df.shape
159
- quality = round((df.notna().sum().sum() / df.size) * 100, 1) if df.size > 0 else 0
160
- return {'shape': (rows, cols), 'columns': df.columns.tolist(), 'numeric_cols': df.select_dtypes(include=np.number).columns.tolist(),
161
- 'categorical_cols': df.select_dtypes(include=['object', 'category']).columns.tolist(), 'datetime_cols': df.select_dtypes(include=['datetime64', 'datetime64[ns]']).columns.tolist(),
162
- 'dtypes_head': df.head(3).to_string(), 'data_quality': quality}
163
-
164
- def add_plot_to_dashboard(self, state: Dict, x_col: str, y_col: Optional[str], plot_type: str) -> List[Any]:
165
- dashboard_plots = state.get('dashboard_plots', [])
166
- if len(dashboard_plots) >= MAX_DASHBOARD_PLOTS:
167
- gr.Warning(f"Dashboard is full. Max {MAX_DASHBOARD_PLOTS} plots."); return [state, gr.update(interactive=True), *self._get_plot_updates(state)]
168
- if not x_col: gr.Warning("Please select an X-axis column."); return [state, gr.update(interactive=True), *self._get_plot_updates(state)]
169
- df, title = state.get('df'), f"{plot_type.capitalize()}: {y_col} by {x_col}" if y_col and plot_type in ['box', 'scatter'] else f"Distribution of {x_col}"
170
- try:
171
- fig=None;
172
- if plot_type == 'histogram': fig = px.histogram(df, x=x_col, title=title)
173
- elif plot_type == 'box': fig = px.box(df, x=x_col, y=y_col, title=title)
174
- elif plot_type == 'scatter': fig = px.scatter(df, x=x_col, y=y_col, title=title, trendline="ols")
175
- elif plot_type == 'bar': fig = px.bar(df[x_col].value_counts().nlargest(20), title=f"Top 20 for {x_col}")
176
- if fig:
177
- fig.update_layout(template="plotly_dark"); dashboard_plots.append(fig); gr.Info(f"Added '{title}' to dashboard.")
178
- return [state, gr.update(interactive=True), *self._get_plot_updates(state)]
179
- except Exception as e: gr.Error(f"Plotting Error: {e}"); return [state, gr.update(interactive=True), *self._get_plot_updates(state)]
180
 
181
- def _get_plot_updates(self, state: Dict) -> List[gr.update]:
182
- plots = state.get('dashboard_plots', [])
183
- return [gr.update(value=plots[i] if i < len(plots) else None, visible=i < len(plots)) for i in range(MAX_DASHBOARD_PLOTS)]
184
-
185
- def clear_dashboard(self, state: Dict) -> List[Any]:
186
- state['dashboard_plots'] = []; gr.Info("Dashboard cleared."); return [state, gr.update(interactive=False), *self._get_plot_updates(state)]
187
-
188
- def get_ai_suggestions(self, state: Dict, api_key: str) -> List[gr.update]:
189
- if not api_key: gr.Warning("API Key is required."); return [gr.update(visible=False)]*5
190
- if not state: gr.Warning("Please load data first."); return [gr.update(visible=False)]*5
191
- metadata, columns = state.get('metadata', {}), state.get('metadata', {}).get('columns', [])
192
- prompt = f"From columns {columns}, generate 4 impactful analytical questions. Return ONLY a JSON list of strings."
193
- try:
194
- genai.configure(api_key=api_key); suggestions = json.loads(genai.GenerativeModel('gemini-1.5-flash').generate_content(prompt).text)
195
- return [gr.Button(s, visible=True) for s in suggestions] + [gr.Button(visible=False)] * (5 - len(suggestions))
196
- except Exception as e: gr.Error(f"AI Suggestion Error: {e}"); return [gr.update(visible=False)]*5
197
-
198
- def handle_suggestion_click(self, question: str) -> Tuple[gr.update, ...]:
199
- return *self._switch_page("co-pilot", [0,1,2,3]), question
200
-
201
- def _sanitize_and_parse_json(self, raw_text: str) -> Dict:
202
- clean_text = re.sub(r'```json\n?|```', '', raw_text).strip()
203
- clean_text = re.sub(r'(?<!\\)\\(?!["\\/bfnrtu])', r'\\\\', clean_text)
204
- return json.loads(clean_text)
205
-
206
- def respond_to_chat(self, state: Dict, api_key: str, user_message: str, history: List) -> Any:
207
- if not user_message.strip(): return history, *[gr.update()]*4
208
- if not api_key or not state:
209
- history.append((user_message, "I need a Gemini API key and a dataset to work.")); return history, *[gr.update(visible=False)]*4
210
 
211
- history.append((user_message, "Thinking... 🤔")); yield history, *[gr.update(visible=False)]*4
212
 
213
- metadata = state.get('metadata', {}); dtypes_head = metadata.get('dtypes_head', 'No metadata available.')
214
- prompt = f"""You are 'Chief Data Scientist', an expert AI analyst...
215
- Respond ONLY with a single JSON object with keys: "plan", "code", "insight".
216
- Metadata: {dtypes_head}
217
- User Question: "{user_message}"
218
- """
219
- try:
220
- genai.configure(api_key=api_key); response_json = self._sanitize_and_parse_json(genai.GenerativeModel('gemini-1.5-flash').generate_content(prompt).text)
221
- plan, code, insight = response_json.get("plan"), response_json.get("code"), response_json.get("insight")
222
- stdout, fig, df_result, error = self._safe_exec(code, {'df': state['df'], 'px': px, 'pd': pd})
223
-
224
- history[-1] = (user_message, f"**Plan:** {plan}")
225
- explanation = f"**Insight:** {insight}"
226
- if stdout: explanation += f"\n\n**Console Output:**\n```\n{stdout}\n```"
227
- if error: gr.Error(f"AI Code Execution Failed: {error}")
228
-
229
- yield (history, gr.update(visible=bool(explanation), value=explanation), gr.update(visible=bool(code), value=code),
230
- gr.update(visible=bool(fig), value=fig), gr.update(visible=bool(df_result is not None), value=df_result))
231
- except Exception as e:
232
- history[-1] = (user_message, f"I encountered an error processing the AI response. Please rephrase your question.\n\n**Details:** `{str(e)}`")
233
- yield history, *[gr.update(visible=False)]*4
234
-
235
- def _safe_exec(self, code_string: str, local_vars: Dict) -> Tuple[Any, ...]:
236
- try:
237
- output_buffer = io.StringIO()
238
- with redirect_stdout(output_buffer): exec(code_string, globals(), local_vars)
239
- return output_buffer.getvalue(), local_vars.get('fig'), local_vars.get('result_df'), None
240
- except Exception as e: return None, None, None, str(e)
241
 
242
- if __name__ == "__main__":
243
- if not os.path.exists("bot.png"):
244
- try:
245
- from PIL import Image
246
- Image.new('RGB', (1, 1)).save('bot.png')
247
- except ImportError: print("Pillow not installed, cannot create dummy bot.png.")
248
 
249
- app = DataExplorerApp()
250
- app.launch()
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ 🚀 AutoEDA: AI-Powered Exploratory Data Analysis Tool
4
+
5
+ An advanced Gradio application for automated exploratory data analysis,
6
+ data profiling, and AI-driven insights using Google's Gemini API.
7
+
8
+ Key Features:
9
+ - Unified Analysis Workflow: Upload a CSV and get a full report across all tabs.
10
+ - AI-Powered Storytelling: Generates a narrative overview, use cases, and findings.
11
+ - Actionable AI Suggestions: Provides data cleaning recommendations.
12
+ - Interactive Visualizations: Users can select columns to generate plots dynamically.
13
+ - In-depth Profiling: Detailed statistics for numeric and categorical data.
14
+ - Column-Level Drilldown: Inspect individual features in detail.
15
+ - Report Download: Export the AI-generated analysis as a Markdown file.
16
+
17
+ Author: World-Class MCP Expert
18
+ Version: 2.0
19
+ """
20
+ from __future__ import annotations
21
+
22
+ import warnings
23
+ import logging
24
+ import os
25
  import pandas as pd
26
  import numpy as np
27
  import plotly.express as px
28
+ import plotly.graph_objects as go
29
+ from plotly.subplots import make_subplots
30
+ import gradio as gr
31
  import google.generativeai as genai
32
+ from typing import Optional, Dict, Any, Tuple, List
33
+ from datetime import datetime
34
+
35
+ # --- Configuration & Setup ---
36
 
37
  warnings.filterwarnings('ignore')
38
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
39
+
40
+ # --- Core Analysis Logic (The "Engine") ---
41
+
42
+ class DataAnalyzer:
43
+ """
44
+ A comprehensive class to encapsulate all data analysis operations.
45
+ It holds the dataframe and provides methods for profiling, visualization,
46
+ and AI-powered analysis, ensuring data is processed only once.
47
+ """
48
+ def __init__(self, df: pd.DataFrame):
49
+ if not isinstance(df, pd.DataFrame):
50
+ raise TypeError("Input must be a pandas DataFrame.")
51
+ self.df = df
52
+ self._metadata: Optional[Dict[str, Any]] = None
53
+ logging.info(f"DataAnalyzer initialized with DataFrame of shape: {self.df.shape}")
54
+
55
+ @property
56
+ def metadata(self) -> Dict[str, Any]:
57
+ """Lazy-loads and caches dataset metadata."""
58
+ if self._metadata is None:
59
+ self._metadata = self._extract_metadata()
60
+ return self._metadata
61
 
62
+ def _extract_metadata(self) -> Dict[str, Any]:
63
+ """Extracts comprehensive metadata from the DataFrame."""
64
+ logging.info("Extracting dataset metadata...")
65
+ rows, cols = self.df.shape
66
+ numeric_cols = self.df.select_dtypes(include=np.number).columns.tolist()
67
+ categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns.tolist()
68
+ datetime_cols = self.df.select_dtypes(include=['datetime64']).columns.tolist()
69
+
70
+ # High correlation pairs
71
+ high_corr_pairs = []
72
+ if len(numeric_cols) > 1:
73
+ corr_matrix = self.df[numeric_cols].corr().abs()
74
+ upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
75
+ high_corr_pairs = (
76
+ upper_tri.stack()
77
+ .reset_index()
78
+ .rename(columns={'level_0': 'Var 1', 'level_1': 'Var 2', 0: 'Correlation'})
79
+ .query('Correlation > 0.7')
80
+ .sort_values('Correlation', ascending=False)
81
+ .head(5)
82
+ .to_dict('records')
83
  )
84
+
85
+ return {
86
+ 'shape': (rows, cols),
87
+ 'columns': self.df.columns.tolist(),
88
+ 'numeric_cols': numeric_cols,
89
+ 'categorical_cols': categorical_cols,
90
+ 'datetime_cols': datetime_cols,
91
+ 'memory_usage': f"{self.df.memory_usage(deep=True).sum() / 1e6:.2f} MB",
92
+ 'total_missing': int(self.df.isnull().sum().sum()),
93
+ 'data_quality_score': round((self.df.notna().sum().sum() / self.df.size) * 100, 1),
94
+ 'high_corr_pairs': high_corr_pairs,
95
+ }
96
+
97
+ def get_profiling_report(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
98
+ """Generates detailed data profiling tables."""
99
+ logging.info("Generating data profiling report.")
100
+
101
+ # Missing data
102
+ missing = self.df.isnull().sum()
103
+ missing_df = pd.DataFrame({
104
+ 'Missing Values': missing,
105
+ 'Percentage (%)': (missing / len(self.df) * 100).round(2)
106
+ }).reset_index().rename(columns={'index': 'Column'}).sort_values('Missing Values', ascending=False)
107
+
108
+ # Numeric stats
109
+ numeric_stats_df = self.df[self.metadata['numeric_cols']].describe().round(3).T.reset_index().rename(columns={'index': 'Column'})
110
+
111
+ # Categorical stats
112
+ cat_stats_list = []
113
+ for col in self.metadata['categorical_cols']:
114
+ stats = {
115
+ 'Column': col,
116
+ 'Unique Values': self.df[col].nunique(),
117
+ 'Top Value': self.df[col].mode().iloc[0] if not self.df[col].mode().empty else 'N/A',
118
+ 'Frequency': self.df[col].value_counts().iloc[0] if not self.df[col].value_counts().empty else 0
119
+ }
120
+ cat_stats_list.append(stats)
121
+ categorical_stats_df = pd.DataFrame(cat_stats_list)
122
+
123
+ return missing_df, numeric_stats_df, categorical_stats_df
124
+
125
+ def get_initial_visuals(self) -> Tuple[go.Figure, go.Figure, go.Figure]:
126
+ """Creates a set of standard, non-interactive overview plots."""
127
+ logging.info("Generating initial overview visualizations.")
128
+
129
+ # Data type distribution
130
+ dtype_counts = self.df.dtypes.astype(str).value_counts()
131
+ dtype_fig = px.pie(
132
+ values=dtype_counts.values, names=dtype_counts.index,
133
+ title="📊 Data Type Distribution", hole=0.3
134
+ )
135
+ dtype_fig.update_traces(textposition='inside', textinfo='percent+label')
136
+
137
+ # Missing data overview
138
+ missing_fig = px.bar(
139
+ x=self.df.isnull().sum(), y=self.df.columns,
140
+ orientation='h', title="🕳️ Missing Values Overview",
141
+ labels={'x': 'Number of Missing Values', 'y': 'Column'},
142
+ ).update_yaxes(categoryorder="total ascending")
143
+
144
+ # Correlation heatmap
145
+ corr_fig = go.Figure()
146
+ if len(self.metadata['numeric_cols']) > 1:
147
+ corr_matrix = self.df[self.metadata['numeric_cols']].corr()
148
+ corr_fig = px.imshow(
149
+ corr_matrix, text_auto=".2f", aspect="auto",
150
+ title="🔗 Correlation Matrix (Numeric Features)",
151
+ color_continuous_scale='RdBu_r'
152
+ )
153
+ else:
154
+ corr_fig.update_layout(title="🔗 Correlation Matrix (Not enough numeric columns)")
155
+
156
+ return dtype_fig, missing_fig, corr_fig
157
 
158
+ def generate_ai_report(self, api_key: str) -> str:
159
+ """Generates a full data story and analysis using the Gemini API."""
160
+ logging.info("Generating AI report with Gemini.")
161
+
162
+ prompt = f"""
163
+ As an expert data analyst and storyteller, your task is to analyze the provided dataset summary and generate a comprehensive, insightful, and accessible report.
164
+
165
+ **Dataset Metadata:**
166
+ - **Shape:** {self.metadata['shape'][0]} rows, {self.metadata['shape'][1]} columns.
167
+ - **Column Names:** {', '.join(self.metadata['columns'])}
168
+ - **Numeric Columns:** {', '.join(self.metadata['numeric_cols'])}
169
+ - **Categorical Columns:** {', '.join(self.metadata['categorical_cols'])}
170
+ - **Overall Data Quality:** {self.metadata['data_quality_score']}%
171
+ - **Total Missing Values:** {self.metadata['total_missing']:,}
172
+ - **Highly Correlated Pairs (>0.7):** {self.metadata['high_corr_pairs'] if self.metadata['high_corr_pairs'] else 'None detected.'}
173
+ - **Sample Data (First 3 Rows):**
174
+ {self.df.head(3).to_markdown()}
175
 
176
+ **Your Report Structure (Use Markdown):**
177
+
178
+ # 🚀 AI-Powered Data Analysis Report
179
+
180
+ ## 📖 1. The Story of the Data
181
+ * **What is this dataset about?** (Deduce the purpose and subject matter of the data.)
182
+ * **What domain or industry does it belong to?** (e.g., E-commerce, Finance, Healthcare.)
183
+ * **Who might use this data?** (e.g., Marketers, Scientists, Financial Analysts.)
184
+
185
+ ## 🎯 2. Key Insights & Interesting Findings
186
+ - **Finding 1:** (Describe a significant pattern, trend, or anomaly. Use emojis to highlight.)
187
+ - **Finding 2:** (Mention another interesting discovery, perhaps from correlations or categorical data.)
188
+ - **Finding 3:** (Highlight a potential business or research opportunity revealed by the data.)
189
+
190
+ ## 🧹 3. Data Quality & Cleaning Recommendations
191
+ * **Overall Quality Assessment:** (Comment on the {self.metadata['data_quality_score']}% score and {self.metadata['total_missing']} missing values.)
192
+ * **Actionable Steps:**
193
+ - **Recommendation 1:** (e.g., "For column 'X' with Y% missing values, consider imputation using the mean/median/mode.")
194
+ - **Recommendation 2:** (e.g., "Columns 'A' and 'B' are highly correlated ({'e.g., ' + str(self.metadata['high_corr_pairs'][0]) if self.metadata['high_corr_pairs'] else ''}). Consider dropping one for modeling to avoid multicollinearity.")
195
+ - **Recommendation 3:** (e.g., "Column 'Z' is categorical but stored as a number. Recommend converting it to a category type.")
196
+
197
+ ## 🔮 4. Potential Next Steps & Use Cases
198
+ - **Analysis Idea 1:** (e.g., "Build a predictive model for customer churn.")
199
+ - **Dashboard Idea 2:** (e.g., "Create a sales performance dashboard tracking KPIs over time.")
200
+ - **Research Question 3:** (e.g., "Investigate the factors influencing employee attrition.")
201
+ """
202
  try:
203
+ genai.configure(api_key=api_key)
204
+ model = genai.GenerativeModel('gemini-1.5-flash-latest')
205
+ response = model.generate_content(prompt)
206
+ return response.text
207
  except Exception as e:
208
+ logging.error(f"Gemini API call failed: {e}")
209
+ return f"❌ **Error generating AI report.**\n**Reason:** {str(e)}\n\nPlease check your API key and network connection. A fallback analysis could not be generated."
210
+
211
+ # --- Gradio UI & Event Handlers ---
212
+
213
+ def process_uploaded_file(file_obj: gr.File, api_key: str) -> tuple:
214
+ """
215
+ Main function to process the uploaded file. It runs all analyses
216
+ and returns updates for all UI components in one go.
217
+ """
218
+ if file_obj is None:
219
+ raise gr.Error("📁 Please upload a CSV file first!")
220
+ if not api_key:
221
+ raise gr.Error("🔑 Please enter your Gemini API key!")
222
+
223
+ try:
224
+ df = pd.read_csv(file_obj.name)
225
+ analyzer = DataAnalyzer(df)
226
+
227
+ # Perform all analyses
228
+ ai_report = analyzer.generate_ai_report(api_key)
229
+ missing_df, num_stats, cat_stats = analyzer.get_profiling_report()
230
+ dtype_fig, missing_fig, corr_fig = analyzer.get_initial_visuals()
231
+
232
+ # Prepare UI updates
233
+ all_cols = analyzer.metadata['columns']
234
+ num_cols = analyzer.metadata['numeric_cols']
235
+ cat_cols = analyzer.metadata['categorical_cols']
236
+
237
+ # The return dictionary maps UI components to their new values/configurations
238
+ return {
239
+ state_analyzer: analyzer,
240
+ # Overview Tab
241
+ md_ai_report: ai_report,
242
+ btn_download_report: gr.Button(visible=True),
243
+ # Profiling Tab
244
+ df_missing_data: missing_df,
245
+ df_numeric_stats: num_stats,
246
+ df_categorical_stats: cat_stats,
247
+ # Visuals Tab
248
+ plot_dtype: dtype_fig,
249
+ plot_missing: missing_fig,
250
+ plot_corr: corr_fig,
251
+ # Interactive Visuals Tab
252
+ dd_hist_col: gr.Dropdown(choices=num_cols, label="Select Numeric Column for Histogram", visible=True),
253
+ dd_scatter_x: gr.Dropdown(choices=num_cols, label="Select X-axis (Numeric)", visible=True),
254
+ dd_scatter_y: gr.Dropdown(choices=num_cols, label="Select Y-axis (Numeric)", visible=True),
255
+ dd_scatter_color: gr.Dropdown(choices=all_cols, label="Select Color (Categorical/Numeric)", visible=True),
256
+ dd_box_cat: gr.Dropdown(choices=cat_cols, label="Select Categorical Column for Box Plot", visible=True),
257
+ dd_box_num: gr.Dropdown(choices=num_cols, label="Select Numeric Column for Box Plot", visible=True),
258
+ # Column Drilldown Tab
259
+ dd_drilldown_col: gr.Dropdown(choices=all_cols, label="Select Column to Analyze", visible=True),
260
+ }
261
+
262
+ except Exception as e:
263
+ logging.error(f"An error occurred during file processing: {e}", exc_info=True)
264
+ raise gr.Error(f"Processing failed! Error: {str(e)}")
265
+
266
+ # --- Interactive Plotting Functions ---
267
+
268
+ def create_histogram(analyzer: DataAnalyzer, col: str) -> go.Figure:
269
+ if not col: return go.Figure()
270
+ return px.histogram(analyzer.df, x=col, title=f"Distribution of {col}", marginal="box")
271
+
272
+ def create_scatterplot(analyzer: DataAnalyzer, x_col: str, y_col: str, color_col: str) -> go.Figure:
273
+ if not x_col or not y_col: return go.Figure()
274
+ return px.scatter(analyzer.df, x=x_col, y=y_col, color=color_col,
275
+ title=f"Scatter Plot: {x_col} vs. {y_col}")
276
+
277
+ def create_boxplot(analyzer: DataAnalyzer, cat_col: str, num_col: str) -> go.Figure:
278
+ if not cat_col or not num_col: return go.Figure()
279
+ return px.box(analyzer.df, x=cat_col, y=num_col, title=f"Box Plot: {num_col} by {cat_col}")
280
 
281
+ def analyze_single_column(analyzer: DataAnalyzer, col: str) -> Tuple[str, go.Figure]:
282
+ if not col: return "", go.Figure()
283
+
284
+ col_series = analyzer.df[col]
285
+
286
+ # Generate stats markdown
287
+ stats_md = f"### 🔎 Analysis of Column: `{col}`\n"
288
+ stats_md += f"- **Data Type:** `{col_series.dtype}`\n"
289
+ stats_md += f"- **Missing Values:** {col_series.isnull().sum()} ({col_series.isnull().mean():.2%})\n"
290
+ stats_md += f"- **Unique Values:** {col_series.nunique()}\n"
291
+
292
+ # Generate plot based on type
293
+ fig = go.Figure()
294
+ if pd.api.types.is_numeric_dtype(col_series):
295
+ stats_md += f"- **Mean:** {col_series.mean():.2f}\n"
296
+ stats_md += f"- **Median:** {col_series.median():.2f}\n"
297
+ stats_md += f"- **Std Dev:** {col_series.std():.2f}\n"
298
+ fig = create_histogram(analyzer, col)
299
+ elif pd.api.types.is_categorical_dtype(col_series) or pd.api.types.is_object_dtype(col_series):
300
+ top5 = col_series.value_counts().head(5)
301
+ stats_md += f"- **Top 5 Values:**\n"
302
+ for val, count in top5.items():
303
+ stats_md += f" - `{val}`: {count} times\n"
304
+ fig = px.bar(top5, x=top5.index, y=top5.values, title=f"Top 5 Value Counts for {col}")
305
+ fig.update_xaxes(title=col)
306
+ fig.update_yaxes(title="Count")
307
+
308
+ return stats_md, fig
309
+
310
+ def download_report(analyzer: DataAnalyzer, ai_report_text: str) -> str:
311
+ """Saves the AI report and basic stats to a markdown file for download."""
312
+ if not analyzer: return None
313
+
314
+ filename = f"AI_Report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md"
315
+
316
+ # Create the full report content
317
+ full_report = f"# AutoEDA Analysis Report\n\n"
318
+ full_report += f"**Date Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
319
+ full_report += f"**Dataset Shape:** {analyzer.metadata['shape'][0]} rows x {analyzer.metadata['shape'][1]} columns\n\n"
320
+ full_report += "---\n\n"
321
+ full_report += ai_report_text
322
+
323
+ with open(filename, "w", encoding="utf-8") as f:
324
+ f.write(full_report)
325
 
326
+ logging.info(f"Generated download report: {filename}")
327
+ return filename
328
+
329
+ # --- Gradio Interface Definition ---
330
+
331
+ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), title="🚀 AutoEDA Pro") as demo:
332
+ # State object to hold the DataAnalyzer instance
333
+ state_analyzer = gr.State()
334
+
335
+ gr.Markdown("# 🚀 AutoEDA Pro: Your AI Data Science Assistant")
336
+ gr.Markdown("Upload a CSV, enter your Gemini API key, and click 'Analyze!' to unlock a comprehensive, AI-powered report on your data.")
337
+
338
+ with gr.Row():
339
+ with gr.Column(scale=2):
340
+ file_input = gr.File(label="📁 Upload your CSV File", file_types=[".csv"])
341
+ with gr.Column(scale=2):
342
+ api_key_input = gr.Textbox(label="🔑 Google Gemini API Key", type="password", placeholder="Enter your key here...")
343
+ with gr.Column(scale=1, min_width=150):
344
+ analyze_btn = gr.Button("✨ Analyze!", variant="primary", scale=1)
345
+
346
+ with gr.Tabs():
347
+ with gr.Tab("🤖 AI Report & Overview"):
348
+ md_ai_report = gr.Markdown("Your AI-generated report will appear here...")
349
+ btn_download_report = gr.Button("⬇️ Download Full Report", visible=False)
350
+
351
+ with gr.Tab("📊 Data Profiling"):
352
+ gr.Markdown("### Detailed Data Profile")
353
+ gr.Markdown("**Missing Data Analysis**")
354
+ df_missing_data = gr.DataFrame(interactive=False)
355
+ gr.Markdown("**Numeric Feature Statistics**")
356
+ df_numeric_stats = gr.DataFrame(interactive=False)
357
+ gr.Markdown("**Categorical Feature Statistics**")
358
+ df_categorical_stats = gr.DataFrame(interactive=False)
359
+
360
+ with gr.Tab("📈 Overview Visuals"):
361
+ gr.Markdown("### At-a-Glance Visualizations")
362
+ with gr.Row():
363
+ plot_dtype = gr.Plot()
364
+ plot_missing = gr.Plot()
365
+ with gr.Row():
366
+ plot_corr = gr.Plot()
367
+
368
+ with gr.Tab("🎨 Interactive Visuals"):
369
+ gr.Markdown("### Explore Your Data Visually")
370
+ with gr.Row():
371
+ with gr.Column():
372
+ dd_hist_col = gr.Dropdown(label="Select Column", visible=False)
373
+ plot_hist = gr.Plot()
374
+ with gr.Column():
375
+ dd_box_cat = gr.Dropdown(label="Select Category", visible=False)
376
+ dd_box_num = gr.Dropdown(label="Select Value", visible=False)
377
+ plot_box = gr.Plot()
378
+ with gr.Row():
379
+ gr.Markdown("#### Scatter Plot Explorer")
380
+ with gr.Row():
381
+ dd_scatter_x = gr.Dropdown(label="X-axis", visible=False)
382
+ dd_scatter_y = gr.Dropdown(label="Y-axis", visible=False)
383
+ dd_scatter_color = gr.Dropdown(label="Color", visible=False)
384
+ plot_scatter = gr.Plot()
385
 
386
+ with gr.Tab("🔍 Column Drilldown"):
387
+ gr.Markdown("### Deep Dive into a Single Column")
388
+ dd_drilldown_col = gr.Dropdown(label="Select Column", visible=False)
389
+ with gr.Row():
390
+ md_drilldown_stats = gr.Markdown()
391
+ plot_drilldown = gr.Plot()
392
 
393
+ # --- Event Listeners ---
394
+
395
+ # Main analysis trigger
396
+ analyze_btn.click(
397
+ fn=process_uploaded_file,
398
+ inputs=[file_input, api_key_input],
399
+ outputs=[
400
+ state_analyzer, md_ai_report, btn_download_report,
401
+ df_missing_data, df_numeric_stats, df_categorical_stats,
402
+ plot_dtype, plot_missing, plot_corr,
403
+ dd_hist_col, dd_scatter_x, dd_scatter_y, dd_scatter_color,
404
+ dd_box_cat, dd_box_num, dd_drilldown_col
405
+ ]
406
+ )
407
 
408
+ # Interactive plot triggers
409
+ dd_hist_col.change(fn=create_histogram, inputs=[state_analyzer, dd_hist_col], outputs=plot_hist)
410
+ dd_scatter_x.change(fn=create_scatterplot, inputs=[state_analyzer, dd_scatter_x, dd_scatter_y, dd_scatter_color], outputs=plot_scatter)
411
+ dd_scatter_y.change(fn=create_scatterplot, inputs=[state_analyzer, dd_scatter_x, dd_scatter_y, dd_scatter_color], outputs=plot_scatter)
412
+ dd_scatter_color.change(fn=create_scatterplot, inputs=[state_analyzer, dd_scatter_x, dd_scatter_y, dd_scatter_color], outputs=plot_scatter)
413
+ dd_box_cat.change(fn=create_boxplot, inputs=[state_analyzer, dd_box_cat, dd_box_num], outputs=plot_box)
414
+ dd_box_num.change(fn=create_boxplot, inputs=[state_analyzer, dd_box_cat, dd_box_num], outputs=plot_box)
415
+
416
+ # Drilldown trigger
417
+ dd_drilldown_col.change(fn=analyze_single_column, inputs=[state_analyzer, dd_drilldown_col], outputs=[md_drilldown_stats, plot_drilldown])
418
+
419
+ # Download trigger
420
+ btn_download_report.click(fn=download_report, inputs=[state_analyzer, md_ai_report], outputs=gr.File(label="Download Report"))
421
+
422
+ gr.Markdown("---")
423
+ gr.Markdown("💡 **Tip**: Get your free Google Gemini API key from [Google AI Studio](https://aistudio.google.com/app/apikey).")
424
+ gr.Markdown("MCP Expert System v2.0 - Analysis Complete.")
425
+
426
+ if __name__ == "__main__":
427
+ demo.launch(debug=True)
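
For readers who want to try the new `DataAnalyzer` engine from this commit outside the Gradio UI, the sketch below exercises its non-AI paths (metadata extraction, profiling tables, overview figures) on a small synthetic DataFrame. This is a minimal illustration, not part of the commit: the synthetic column names and the `from app import DataAnalyzer` import path are assumptions, and the `generate_ai_report` step is skipped because it requires a Gemini API key.

```python
# Minimal sketch (not part of the commit): smoke-testing DataAnalyzer's
# non-AI methods on a synthetic DataFrame. The import path and column
# names below are hypothetical.
import numpy as np
import pandas as pd

from app import DataAnalyzer  # assumed import path for the class in this commit

rng = np.random.default_rng(0)
df = pd.DataFrame({
    "age": rng.integers(18, 65, size=200),
    "salary": rng.normal(50_000, 12_000, size=200),
    "department": rng.choice(["sales", "eng", "hr"], size=200),
})
# Inject ~5% missing salaries so the missing-data report has something to show.
df.loc[df.sample(frac=0.05, random_state=0).index, "salary"] = np.nan

analyzer = DataAnalyzer(df)
print(analyzer.metadata["shape"])               # (200, 3)
print(analyzer.metadata["data_quality_score"])  # percentage of non-null cells

missing_df, numeric_stats, categorical_stats = analyzer.get_profiling_report()
print(missing_df.head())

dtype_fig, missing_fig, corr_fig = analyzer.get_initial_visuals()
dtype_fig.show()  # opens the data-type pie chart in a browser
```

The interactive helpers (`create_histogram`, `create_scatterplot`, `create_boxplot`, `analyze_single_column`) take the same `analyzer` instance plus column names, so they can be smoke-tested the same way before wiring them to the dropdown `change` events.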