mgbam committed
Commit d9ea3f9 · verified · 1 Parent(s): d1943e0

Update app.py

Files changed (1):
  1. app.py +28 -259
app.py CHANGED
@@ -1,280 +1,49 @@
+# app.py
+
 # -*- coding: utf-8 -*-
 #
-# PROJECT: CognitiveEDA - The Adaptive Intelligence Engine
+# PROJECT: CognitiveEDA v5.0 - The QuantumLeap Intelligence Platform
 #
-# DESCRIPTION: A world-class data discovery platform that provides a complete suite
-#              of standard EDA tools and intelligently unlocks specialized analysis
-#              modules for Time-Series, Text, and Clustering, offering a truly
-#              comprehensive and context-aware analytical experience.
+# DESCRIPTION: Main application entry point. This script initializes the UI,
+#              registers all event callbacks, and launches the Gradio server.
 #
 # SETUP: $ pip install -r requirements.txt
 #
-# AUTHOR: An MCP Expert in Data & AI Solutions
-# VERSION: 4.2 (Bugfix Edition: AI Narrative Engine Restored)
-# LAST-UPDATE: 2023-10-29 (Fixed critical bug where AI was not being called)
-
-from __future__ import annotations
+# AUTHOR: An MCP & PhD Expert in Data & AI Solutions
+# VERSION: 5.0 (QuantumLeap Edition: Asynchronous, Modular, & Resilient)
+# LAST-UPDATE: 2023-10-30 (Complete architectural overhaul)
 
 import warnings
 import logging
 import os
-from datetime import datetime
-from typing import Any, Dict, Optional, Tuple
-
-import gradio as gr
-import numpy as np
-import pandas as pd
-import plotly.express as px
-import plotly.graph_objects as go
-import google.generativeai as genai
 
-# --- Local Adaptive Modules (Requires analysis_modules.py and requirements.txt) ---
-from analysis_modules import analyze_time_series, generate_word_cloud, perform_clustering
+from ui.layout import create_main_layout
+from ui.callbacks import register_callbacks
+from core.config import settings  # To access title
 
 # --- Configuration & Setup ---
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - [%(levelname)s] - (%(filename)s:%(lineno)d) - %(message)s')
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - [%(levelname)s] - (%(filename)s:%(lineno)d) - %(message)s'
+)
 warnings.filterwarnings('ignore', category=FutureWarning)
 
-class Config:
-    APP_TITLE = "🚀 CognitiveEDA: The Adaptive Intelligence Engine"
-    GEMINI_MODEL = 'gemini-1.5-flash-latest'
-    MAX_UI_ROWS = 50000
-
-# --- Core Analysis Engine ---
-class DataAnalyzer:
-    """The complete DataAnalyzer class, now with a fully functional AI engine."""
-    def __init__(self, df: pd.DataFrame):
-        if not isinstance(df, pd.DataFrame): raise TypeError("Input must be a pandas DataFrame.")
-        self.df = df
-        self._metadata: Optional[Dict[str, Any]] = None
-        logging.info(f"DataAnalyzer instantiated with DataFrame of shape: {self.df.shape}")
-
-    @property
-    def metadata(self) -> Dict[str, Any]:
-        if self._metadata is None: self._metadata = self._extract_metadata()
-        return self._metadata
-
-    def _extract_metadata(self) -> Dict[str, Any]:
-        rows, cols = self.df.shape
-        numeric_cols = self.df.select_dtypes(include=np.number).columns.tolist()
-        categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns.tolist()
-        datetime_cols = self.df.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist()
-        text_cols = [col for col in categorical_cols if self.df[col].dropna().str.len().mean() > 50]
-
-        high_corr_pairs = []
-        if len(numeric_cols) > 1:
-            corr_matrix = self.df[numeric_cols].corr().abs()
-            upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
-            high_corr_series = upper_tri.stack()
-            high_corr_pairs = (high_corr_series[high_corr_series > 0.75].reset_index().rename(columns={'level_0': 'Feature 1', 'level_1': 'Feature 2', 0: 'Correlation'}).to_dict('records'))
-
-        return {
-            'shape': (rows, cols), 'columns': self.df.columns.tolist(),
-            'numeric_cols': numeric_cols, 'categorical_cols': categorical_cols,
-            'datetime_cols': datetime_cols, 'text_cols': text_cols,
-            'memory_usage_mb': f"{self.df.memory_usage(deep=True).sum() / 1e6:.2f}",
-            'total_missing': int(self.df.isnull().sum().sum()),
-            'data_quality_score': round((self.df.notna().sum().sum() / self.df.size) * 100, 2),
-            'high_corr_pairs': high_corr_pairs,
-        }
-
-    def get_profiling_tables(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
-        missing = self.df.isnull().sum()
-        missing_df = pd.DataFrame({'Missing Count': missing, 'Missing Percentage (%)': (missing / len(self.df) * 100).round(2)}).reset_index().rename(columns={'index': 'Column'}).sort_values('Missing Count', ascending=False)
-        numeric_stats = self.df[self.metadata['numeric_cols']].describe(percentiles=[.01, .25, .5, .75, .99]).T
-        numeric_stats_df = numeric_stats.round(3).reset_index().rename(columns={'index': 'Column'})
-        cat_stats = self.df[self.metadata['categorical_cols']].describe(include=['object', 'category']).T
-        cat_stats_df = cat_stats.reset_index().rename(columns={'index': 'Column'})
-        return missing_df, numeric_stats_df, cat_stats_df
-
-    def get_overview_visuals(self) -> Tuple[go.Figure, go.Figure, go.Figure]:
-        meta = self.metadata
-        dtype_counts = self.df.dtypes.astype(str).value_counts()
-        fig_types = px.pie(values=dtype_counts.values, names=dtype_counts.index, title="<b>📊 Data Type Composition</b>", hole=0.4, color_discrete_sequence=px.colors.qualitative.Pastel)
-        missing_df = self.df.isnull().sum().reset_index(name='count').query('count > 0')
-        fig_missing = px.bar(missing_df, x='index', y='count', title="<b>🕳️ Missing Values Distribution</b>", labels={'index': 'Column Name', 'count': 'Number of Missing Values'}).update_xaxes(categoryorder="total descending")
-        fig_corr = go.Figure()
-        if len(meta['numeric_cols']) > 1:
-            corr_matrix = self.df[meta['numeric_cols']].corr()
-            fig_corr = px.imshow(corr_matrix, text_auto=".2f", aspect="auto", title="<b>🔗 Correlation Matrix</b>", color_continuous_scale='RdBu_r', zmin=-1, zmax=1)
-        return fig_types, fig_missing, fig_corr
-
-    def generate_ai_narrative(self, api_key: str, context: Dict[str, Any]) -> str:
-        """Generates a context-aware AI narrative using the Gemini API."""
-        logging.info(f"Generating AI narrative with context: {list(context.keys())}")
-        meta = self.metadata
-        data_snippet_md = self.df.head(5).to_markdown(index=False)
-
-        context_prompt = "**PRIMARY ANALYSIS MODES:**\n"
-        if context.get('is_timeseries'):
-            context_prompt += "- **Time-Series Detected:** Focus on trends, seasonality, and stationarity. Suggest forecasting models.\n"
-        if context.get('has_text'):
-            context_prompt += "- **Long-Form Text Detected:** Note potential for NLP tasks like sentiment analysis or topic modeling.\n"
-        if not context.get('is_timeseries') and not context.get('has_text'):
-            context_prompt += "- **General Tabular Data:** Focus on distributions, correlations, and potential for classification/regression modeling.\n"
-
-        prompt = f"""
-        As "Cognitive Analyst," an elite AI data scientist, your task is to generate a comprehensive, multi-part data discovery report in Markdown format.
-
-        {context_prompt}
-        **DATASET METADATA:**
-        - **Shape:** {meta['shape'][0]} rows, {meta['shape'][1]} columns.
-        - **Data Quality Score:** {meta['data_quality_score']}%
-        - **Total Missing Values:** {meta['total_missing']:,}
-        - **Highly Correlated Pairs:** {meta['high_corr_pairs'] if meta['high_corr_pairs'] else 'None detected.'}
-        - **Data Snippet (First 5 Rows):**
-        {data_snippet_md}
-
-        **REQUIRED REPORT STRUCTURE:**
-
-        # 🚀 AI Data Discovery Report
-
-        ## 📄 1. Executive Summary
-        * **Primary Objective:** (Deduce the likely purpose of this dataset. What problem could it solve?)
-        * **Key Finding:** (State the single most interesting insight you've discovered.)
-        * **Overall State:** (Briefly comment on the data's quality and readiness for analysis.)
-
-        ## 🧐 2. Deep Dive & Quality Assessment
-        * **Structural Profile:** (Describe the dataset's composition: numeric, categorical, text, time-series features.)
-        * **Data Quality Audit:** (Elaborate on the quality score and missing values. Are they a major concern?)
-        * **Redundancy Check:** (Comment on the detected high-correlation pairs and any risks.)
-
-        ## 💡 3. Actionable Recommendations
-        * **Data Cleaning:** (Provide a specific recommendation for handling missing data or outliers.)
-        * **Feature Engineering:** (Suggest creating a new, valuable feature.)
-        * **Next Analytical Steps:** (Propose a specific hypothesis to test or a suitable ML model to build.)
-        """
-        try:
-            genai.configure(api_key=api_key)
-            model = genai.GenerativeModel(Config.GEMINI_MODEL)
-            response = model.generate_content(prompt)
-            if not response.parts:
-                blocked_reason = response.prompt_feedback.block_reason.name if response.prompt_feedback else "Unknown"
-                logging.warning(f"AI response blocked. Reason: {blocked_reason}")
-                return f"❌ **AI Report Generation Blocked by Safety Settings**\n**Reason:** `{blocked_reason}`."
-            return response.text
-        except Exception as e:
-            logging.error(f"Gemini API call failed: {e}", exc_info=True)
-            return f"❌ **AI Report Generation Failed**\n**Error:** `{str(e)}`"
-
-# --- UI Creation ---
-def create_ui():
-    """Defines the complete, integrated Gradio user interface."""
-    def create_histogram(analyzer: DataAnalyzer, col: str) -> go.Figure:
-        if not col or not analyzer: return go.Figure()
-        return px.histogram(analyzer.df, x=col, title=f"<b>Distribution of {col}</b>", marginal="box", template="plotly_white")
-    def create_scatterplot(analyzer: DataAnalyzer, x_col: str, y_col: str, color_col: str) -> go.Figure:
-        if not all([analyzer, x_col, y_col]): return go.Figure()
-        return px.scatter(analyzer.df, x=x_col, y=y_col, color=color_col, title=f"<b>Scatter Plot: {x_col} vs. {y_col}</b>", template="plotly_white")
-    def analyze_single_column(analyzer: DataAnalyzer, col: str) -> Tuple[str, go.Figure]:
-        if not col or not analyzer: return "", go.Figure()
-        series = analyzer.df[col]
-        stats_md = f"### 🔎 **Deep Dive: `{col}`**\n- **Data Type:** `{series.dtype}`\n- **Unique Values:** `{series.nunique()}`\n- **Missing:** `{series.isnull().sum()}` ({series.isnull().mean():.2%})\n"
-        if pd.api.types.is_numeric_dtype(series):
-            stats_md += f"- **Mean:** `{series.mean():.3f}` | **Median:** `{series.median():.3f}` | **Std Dev:** `{series.std():.3f}`"
-            fig = create_histogram(analyzer, col)
-        else:
-            stats_md += f"- **Top Value:** `{series.value_counts().index[0]}`"
-            top_n = series.value_counts().nlargest(10)
-            fig = px.bar(top_n, y=top_n.index, x=top_n.values, orientation='h', title=f"<b>Top 10 Categories in `{col}`</b>").update_yaxes(categoryorder="total ascending")
-        return stats_md, fig
-
-    with gr.Blocks(theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="blue"), title=Config.APP_TITLE) as demo:
-        state_analyzer = gr.State()
-        gr.Markdown(f"<h1>{Config.APP_TITLE}</h1>")
-        gr.Markdown("Upload your data to receive a complete standard analysis, plus specialized dashboards that unlock automatically based on your data's content.")
-        with gr.Row():
-            upload_button = gr.File(label="1. Upload Data File (CSV, Excel)", file_types=[".csv", ".xlsx", ".xls"], scale=3)
-            api_key_input = gr.Textbox(label="2. Enter Google Gemini API Key", type="password", scale=2)
-            analyze_button = gr.Button("✨ Build My Dashboard", variant="primary", scale=1)
 
-        with gr.Tabs():
-            with gr.Tab("🤖 AI Narrative"):
-                ai_report_output = gr.Markdown("### Your AI-generated report will appear here...")
-            with gr.Tab("📋 Profile"):
-                profile_missing_df, profile_numeric_df, profile_categorical_df = gr.DataFrame(), gr.DataFrame(), gr.DataFrame()
-            with gr.Tab("📊 Overview Visuals"):
-                with gr.Row(): plot_types, plot_missing = gr.Plot(), gr.Plot()
-                plot_correlation = gr.Plot()
-            with gr.Tab("🎨 Interactive Explorer"):
-                with gr.Row():
-                    with gr.Column(scale=1): dd_hist_col = gr.Dropdown(label="Select Column for Histogram", interactive=True)
-                    with gr.Column(scale=2): plot_histogram = gr.Plot()
-                with gr.Row():
-                    with gr.Column(scale=1):
-                        dd_scatter_x = gr.Dropdown(label="X-Axis (Numeric)", interactive=True)
-                        dd_scatter_y = gr.Dropdown(label="Y-Axis (Numeric)", interactive=True)
-                        dd_scatter_color = gr.Dropdown(label="Color By (Optional)", interactive=True)
-                    with gr.Column(scale=2): plot_scatter = gr.Plot()
-            with gr.Tab("🔍 Column Deep-Dive"):
-                dd_drilldown_col = gr.Dropdown(label="Select Column to Analyze", interactive=True)
-                with gr.Row(): md_drilldown_stats, plot_drilldown = gr.Markdown(), gr.Plot()
-            with gr.Tab("⌛ Time-Series Analysis", visible=False) as tab_timeseries:
-                with gr.Row():
-                    dd_ts_date = gr.Dropdown(label="Select Date/Time Column", interactive=True)
-                    dd_ts_value = gr.Dropdown(label="Select Value Column", interactive=True)
-                plot_ts_decomp, md_ts_stats = gr.Plot(), gr.Markdown()
-            with gr.Tab("📝 Text Analysis", visible=False) as tab_text:
-                dd_text_col = gr.Dropdown(label="Select Text Column", interactive=True)
-                html_word_cloud = gr.HTML()
-            with gr.Tab("🧩 Clustering (K-Means)", visible=False) as tab_cluster:
-                num_clusters = gr.Slider(minimum=2, maximum=10, value=4, step=1, label="Number of Clusters (K)", interactive=True)
-                plot_cluster, md_cluster_summary = gr.Plot(), gr.Markdown()
+def main():
+    """
+    Primary function to build and launch the Gradio application.
+    """
+    logging.info(f"Starting {settings.APP_TITLE}")
+
+    # 1. Build the UI from the layout module
+    demo, components = create_main_layout()
 
-        main_outputs = [
-            state_analyzer, ai_report_output,
-            profile_missing_df, profile_numeric_df, profile_categorical_df,
-            plot_types, plot_missing, plot_correlation,
-            dd_hist_col, dd_scatter_x, dd_scatter_y, dd_scatter_color, dd_drilldown_col,
-            tab_timeseries, dd_ts_date, dd_ts_value,
-            tab_text, dd_text_col,
-            tab_cluster, num_clusters]
-        analyze_button.click(fn=run_full_analysis, inputs=[upload_button, api_key_input], outputs=main_outputs, show_progress="full")
-        dd_hist_col.change(fn=create_histogram, inputs=[state_analyzer, dd_hist_col], outputs=plot_histogram)
-        scatter_inputs = [state_analyzer, dd_scatter_x, dd_scatter_y, dd_scatter_color]
-        for dd in [dd_scatter_x, dd_scatter_y, dd_scatter_color]: dd.change(fn=create_scatterplot, inputs=scatter_inputs, outputs=plot_scatter)
-        dd_drilldown_col.change(fn=analyze_single_column, inputs=[state_analyzer, dd_drilldown_col], outputs=[md_drilldown_stats, plot_drilldown])
-        ts_inputs = [state_analyzer, dd_ts_date, dd_ts_value]
-        for dd in [dd_ts_date, dd_ts_value]: dd.change(fn=lambda a, d, v: analyze_time_series(a.df, d, v), inputs=ts_inputs, outputs=[plot_ts_decomp, md_ts_stats])
-        dd_text_col.change(fn=lambda a, t: generate_word_cloud(a.df, t), inputs=[state_analyzer, dd_text_col], outputs=html_word_cloud)
-        num_clusters.change(fn=lambda a, k: perform_clustering(a.df, a.metadata['numeric_cols'], k), inputs=[state_analyzer, num_clusters], outputs=[plot_cluster, md_cluster_summary])
-    return demo
+    # 2. Register all event handlers from the callbacks module
+    register_callbacks(components)
 
-# --- Main Application Logic & Orchestration ---
-def run_full_analysis(file_obj: gr.File, api_key: str) -> list:
-    if file_obj is None: raise gr.Error("CRITICAL: No file uploaded.")
-    if not api_key: raise gr.Error("CRITICAL: Gemini API key is missing.")
-    try:
-        logging.info(f"Processing uploaded file: {file_obj.name}")
-        df = pd.read_csv(file_obj.name) if file_obj.name.endswith('.csv') else pd.read_excel(file_obj.name)
-        if len(df) > Config.MAX_UI_ROWS: df = df.sample(n=Config.MAX_UI_ROWS, random_state=42)
-        analyzer = DataAnalyzer(df)
-        meta = analyzer.metadata
-        ai_context = {'is_timeseries': bool(meta['datetime_cols']), 'has_text': bool(meta['text_cols'])}
-        ai_report = analyzer.generate_ai_narrative(api_key, context=ai_context)
-        missing_df, num_df, cat_df = analyzer.get_profiling_tables()
-        fig_types, fig_missing, fig_corr = analyzer.get_overview_visuals()
-
-        update_hist_dd = gr.Dropdown(choices=meta['numeric_cols'], value=meta['numeric_cols'][0] if meta['numeric_cols'] else None)
-        update_scatter_x = gr.Dropdown(choices=meta['numeric_cols'], value=meta['numeric_cols'][0] if meta['numeric_cols'] else None)
-        update_scatter_y = gr.Dropdown(choices=meta['numeric_cols'], value=meta['numeric_cols'][1] if len(meta['numeric_cols']) > 1 else None)
-        update_scatter_color = gr.Dropdown(choices=meta['columns'])
-        update_drill_dd = gr.Dropdown(choices=meta['columns'])
-
-        show_ts_tab = gr.Tab(visible=bool(meta['datetime_cols']))
-        update_ts_date_dd, update_ts_value_dd = gr.Dropdown(choices=meta['datetime_cols']), gr.Dropdown(choices=meta['numeric_cols'])
-        show_text_tab, update_text_dd = gr.Tab(visible=bool(meta['text_cols'])), gr.Dropdown(choices=meta['text_cols'])
-        show_cluster_tab, update_cluster_slider = gr.Tab(visible=len(meta['numeric_cols']) > 1), gr.Slider(visible=len(meta['numeric_cols']) > 1)
+    # 3. Launch the application
+    demo.launch(debug=True, server_name="0.0.0.0")
 
-        return [analyzer, ai_report, missing_df, num_df, cat_df, fig_types, fig_missing, fig_corr,
-                update_hist_dd, update_scatter_x, update_scatter_y, update_scatter_color, update_drill_dd,
-                show_ts_tab, update_ts_date_dd, update_ts_value_dd,
-                show_text_tab, update_text_dd,
-                show_cluster_tab, update_cluster_slider]
-    except Exception as e:
-        logging.error(f"A critical error occurred: {e}", exc_info=True)
-        raise gr.Error(f"Analysis Failed! Error: {str(e)}")
 
 if __name__ == "__main__":
-    app_instance = create_ui()
-    app_instance.launch(debug=True, server_name="0.0.0.0")
+    main()
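
Note on the new architecture: app.py now only wires together three modules that this commit does not include (core/config.py, ui/layout.py, ui/callbacks.py). The sketch below is a minimal, hypothetical shape for those modules, inferred solely from the imports above; only the names create_main_layout, register_callbacks, and settings.APP_TITLE are confirmed by the diff, and every body shown here is a stand-in for illustration, not the repository's actual implementation.

# Hypothetical module contracts, concatenated into one file for illustration.

# --- core/config.py (assumed) ---
from dataclasses import dataclass

@dataclass(frozen=True)
class Settings:
    # Placeholder value; the real title lives in the (unshown) core/config.py.
    APP_TITLE: str = "CognitiveEDA v5.0 - The QuantumLeap Intelligence Platform"

settings = Settings()

# --- ui/layout.py (assumed) ---
import gradio as gr

def create_main_layout():
    """Build the Blocks UI; return it with a dict of components to wire up."""
    with gr.Blocks(title=settings.APP_TITLE) as demo:
        upload = gr.File(label="Upload Data File (CSV, Excel)")
        analyze = gr.Button("Build My Dashboard", variant="primary")
        report = gr.Markdown()
    # Include the Blocks object itself so the callback layer can re-enter it.
    components = {"demo": demo, "upload": upload, "analyze": analyze, "report": report}
    return demo, components

# --- ui/callbacks.py (assumed) ---
def register_callbacks(components):
    """Attach event handlers to the named components; the handler is a stub."""
    def on_analyze(file_obj):
        name = getattr(file_obj, "name", None)
        return f"Received: {name}" if name else "No file uploaded."

    # Gradio only allows event wiring inside a Blocks context, so re-enter it.
    with components["demo"]:
        components["analyze"].click(
            fn=on_analyze,
            inputs=[components["upload"]],
            outputs=[components["report"]],
        )

With modules of this shape, the main() added in this commit runs as written, and future layout or callback changes no longer touch the entry point, which is the stated goal of the 5.0 overhaul.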