mgbam commited on
Commit
6eb2933
Β·
verified Β·
1 Parent(s): e0c35f9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +126 -92
app.py CHANGED
@@ -2,16 +2,16 @@
2
  #
3
  # PROJECT: CognitiveEDA - The Adaptive Intelligence Engine
4
  #
5
- # DESCRIPTION: A world-class data discovery platform that transcends static EDA.
6
- # It intelligently profiles datasets to unlock specialized analysis
7
- # modules for Time-Series, Text, and Unsupervised Learning, providing
8
- # a context-aware, deeply insightful user experience.
9
  #
10
  # SETUP: $ pip install -r requirements.txt
11
  #
12
  # AUTHOR: An MCP Expert in Data & AI Solutions
13
- # VERSION: 4.0 (Adaptive Intelligence Engine)
14
- # LAST-UPDATE: 2023-10-29 (Major architectural refactor for adaptive modules)
15
 
16
  from __future__ import annotations
17
 
@@ -30,22 +30,23 @@ import plotly.express as px
30
  import plotly.graph_objects as go
31
  import google.generativeai as genai
32
 
33
- # --- Local Adaptive Modules ---
34
  from analysis_modules import analyze_time_series, generate_word_cloud, perform_clustering
35
 
36
- # --- Configuration & Setup (Identical to previous versions) ---
37
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - [%(levelname)s] - (%(filename)s:%(lineno)d) - %(message)s')
38
  warnings.filterwarnings('ignore', category=FutureWarning)
39
 
40
  class Config:
41
  APP_TITLE = "πŸš€ CognitiveEDA: The Adaptive Intelligence Engine"
42
  GEMINI_MODEL = 'gemini-1.5-flash-latest'
43
- CORR_THRESHOLD = 0.75
44
- TOP_N_CATEGORIES = 10
45
- MAX_UI_ROWS = 50000 # Sample large datasets for UI responsiveness
46
 
47
- # --- Core Analysis Engine (Mostly unchanged, added context to AI prompt) ---
48
  class DataAnalyzer:
 
 
 
49
  def __init__(self, df: pd.DataFrame):
50
  if not isinstance(df, pd.DataFrame): raise TypeError("Input must be a pandas DataFrame.")
51
  self.df = df
@@ -58,7 +59,6 @@ class DataAnalyzer:
58
  return self._metadata
59
 
60
  def _extract_metadata(self) -> Dict[str, Any]:
61
- # (This method remains the same as v3.2)
62
  rows, cols = self.df.shape
63
  numeric_cols = self.df.select_dtypes(include=np.number).columns.tolist()
64
  categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns.tolist()
@@ -70,7 +70,7 @@ class DataAnalyzer:
70
  corr_matrix = self.df[numeric_cols].corr().abs()
71
  upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
72
  high_corr_series = upper_tri.stack()
73
- high_corr_pairs = (high_corr_series[high_corr_series > Config.CORR_THRESHOLD].reset_index().rename(columns={'level_0': 'Feature 1', 'level_1': 'Feature 2', 0: 'Correlation'}).to_dict('records'))
74
 
75
  return {
76
  'shape': (rows, cols), 'columns': self.df.columns.tolist(),
@@ -83,115 +83,141 @@ class DataAnalyzer:
83
  }
84
 
85
  def get_profiling_tables(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
86
- # (This method remains the same as v3.2)
87
- ...
 
 
 
 
 
88
 
89
  def get_overview_visuals(self) -> Tuple[go.Figure, go.Figure, go.Figure]:
90
- # (This method remains the same as v3.2)
91
- ...
 
 
 
 
 
 
 
 
92
 
93
  def generate_ai_narrative(self, api_key: str, context: Dict[str, Any]) -> str:
94
- """Generates a context-aware AI narrative."""
95
- logging.info(f"Generating AI narrative with context: {context.keys()}")
96
- meta = self.metadata
97
- data_snippet_md = self.df.head(5).to_markdown(index=False)
98
-
99
- # Dynamically build the context section of the prompt
100
- context_prompt = "**DATASET CONTEXT:**\n"
101
- if context.get('is_timeseries'):
102
- context_prompt += "- **Analysis Mode:** Time-Series. Focus on trends, seasonality, and stationarity.\n"
103
- if context.get('has_text'):
104
- context_prompt += "- **Analysis Mode:** Text Analysis. Note potential for NLP tasks like sentiment analysis or topic modeling.\n"
105
-
106
- prompt = f"""
107
- As "Cognitive Analyst," an elite AI data scientist, your task is to generate a comprehensive data discovery report.
108
- {context_prompt}
109
- - **Shape:** {meta['shape'][0]} rows, {meta['shape'][1]} columns.
110
- ... (rest of the prompt from v3.2)
111
- """
112
- # (API call logic remains the same)
113
- ...
114
- return "AI Narrative Placeholder" # For brevity in this example
115
-
116
- # --- UI Creation (create_ui) ---
117
- # Contains all Gradio component definitions and their event listeners
118
  def create_ui():
119
- """Defines and builds the new adaptive Gradio user interface."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
  with gr.Blocks(theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="blue"), title=Config.APP_TITLE) as demo:
122
- # State object to hold the DataAnalyzer instance
123
  state_analyzer = gr.State()
124
 
125
- # --- Header & Main Controls ---
126
  gr.Markdown(f"<h1>{Config.APP_TITLE}</h1>")
127
- gr.Markdown("Upload your data (CSV, Excel) and let the AI build a custom analysis dashboard for you.")
 
128
  with gr.Row():
129
- upload_button = gr.File(label="1. Upload Data File", file_types=[".csv", ".xlsx", ".xls"], scale=3)
130
  api_key_input = gr.Textbox(label="2. Enter Google Gemini API Key", type="password", scale=2)
131
  analyze_button = gr.Button("✨ Build My Dashboard", variant="primary", scale=1)
132
 
133
- # --- Tabbed Interface for Analysis Modules ---
134
  with gr.Tabs():
135
- # Standard Tabs (Always Visible)
136
  with gr.Tab("πŸ€– AI Narrative"):
137
  ai_report_output = gr.Markdown("### Your AI-generated report will appear here...")
138
- download_report_button = gr.Button("⬇️ Download Full Report", visible=False)
139
  with gr.Tab("πŸ“‹ Profile"):
140
- gr.Markdown("### **Detailed Data Profile**")
141
- profile_missing_df = gr.DataFrame(interactive=False, label="Missing Values")
142
- profile_numeric_df = gr.DataFrame(interactive=False, label="Numeric Stats")
143
- profile_categorical_df = gr.DataFrame(interactive=False, label="Categorical Stats")
144
  with gr.Tab("πŸ“Š Overview Visuals"):
145
  with gr.Row(): plot_types, plot_missing = gr.Plot(), gr.Plot()
146
  plot_correlation = gr.Plot()
147
-
148
- # Specialized, Initially Hidden Tabs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  with gr.Tab("βŒ› Time-Series Analysis", visible=False) as tab_timeseries:
150
- gr.Markdown("### **Decompose and Analyze Time-Series Data**")
151
  with gr.Row():
152
  dd_ts_date = gr.Dropdown(label="Select Date/Time Column", interactive=True)
153
  dd_ts_value = gr.Dropdown(label="Select Value Column", interactive=True)
154
- plot_ts_decomp = gr.Plot()
155
- md_ts_stats = gr.Markdown()
156
 
157
  with gr.Tab("πŸ“ Text Analysis", visible=False) as tab_text:
158
- gr.Markdown("### **Visualize High-Frequency Words**")
159
  dd_text_col = gr.Dropdown(label="Select Text Column", interactive=True)
160
  html_word_cloud = gr.HTML()
161
 
162
  with gr.Tab("🧩 Clustering (K-Means)", visible=False) as tab_cluster:
163
- gr.Markdown("### **Discover Latent Groups with K-Means Clustering**")
164
- with gr.Row():
165
- num_clusters = gr.Slider(minimum=2, maximum=10, value=4, step=1, label="Number of Clusters (K)", interactive=True)
166
- plot_cluster = gr.Plot()
167
- md_cluster_summary = gr.Markdown()
168
 
169
  # --- Event Listeners ---
170
  main_outputs = [
171
- state_analyzer, ai_report_output, download_report_button,
172
  profile_missing_df, profile_numeric_df, profile_categorical_df,
173
  plot_types, plot_missing, plot_correlation,
 
174
  tab_timeseries, dd_ts_date, dd_ts_value,
175
  tab_text, dd_text_col,
176
  tab_cluster, num_clusters
177
  ]
178
- analyze_button.click(fn=run_full_analysis, inputs=[upload_button, api_key_input], outputs=main_outputs)
 
 
 
 
 
 
 
179
 
180
- # Listeners for specialized tabs
181
  ts_inputs = [state_analyzer, dd_ts_date, dd_ts_value]
182
  for dd in [dd_ts_date, dd_ts_value]:
183
  dd.change(fn=lambda a, d, v: analyze_time_series(a.df, d, v), inputs=ts_inputs, outputs=[plot_ts_decomp, md_ts_stats])
184
-
185
  dd_text_col.change(fn=lambda a, t: generate_word_cloud(a.df, t), inputs=[state_analyzer, dd_text_col], outputs=html_word_cloud)
186
-
187
- cluster_inputs = [state_analyzer, num_clusters]
188
- num_clusters.change(fn=lambda a, k: perform_clustering(a.df, a.metadata['numeric_cols'], k), inputs=cluster_inputs, outputs=[plot_cluster, md_cluster_summary])
189
 
190
  return demo
191
 
192
  # --- Main Application Logic & Orchestration ---
193
  def run_full_analysis(file_obj: gr.File, api_key: str) -> list:
194
- """The new adaptive analysis orchestrator."""
195
  if file_obj is None: raise gr.Error("CRITICAL: No file uploaded.")
196
  if not api_key: raise gr.Error("CRITICAL: Gemini API key is missing.")
197
 
@@ -200,42 +226,50 @@ def run_full_analysis(file_obj: gr.File, api_key: str) -> list:
200
  df = pd.read_csv(file_obj.name) if file_obj.name.endswith('.csv') else pd.read_excel(file_obj.name)
201
 
202
  if len(df) > Config.MAX_UI_ROWS:
203
- logging.info(f"Large dataset detected ({len(df)} rows). Sampling to {Config.MAX_UI_ROWS} for UI.")
204
- df_display = df.sample(n=Config.MAX_UI_ROWS, random_state=42)
205
- else:
206
- df_display = df
207
 
208
- analyzer = DataAnalyzer(df_display)
209
  meta = analyzer.metadata
210
 
211
- # --- Base Analysis ---
212
  ai_context = {'is_timeseries': bool(meta['datetime_cols']), 'has_text': bool(meta['text_cols'])}
213
- # ai_report = analyzer.generate_ai_narrative(api_key, context=ai_context) # Commented out for speed
214
- ai_report = "AI Narrative generation is ready. Trigger on demand." # Placeholder
215
  missing_df, num_df, cat_df = analyzer.get_profiling_tables()
216
  fig_types, fig_missing, fig_corr = analyzer.get_overview_visuals()
217
 
218
- # --- Adaptive Module Configuration ---
 
 
 
 
 
 
 
219
  show_ts_tab = gr.Tab(visible=bool(meta['datetime_cols']))
 
 
 
220
  show_text_tab = gr.Tab(visible=bool(meta['text_cols']))
 
 
221
  show_cluster_tab = gr.Tab(visible=len(meta['numeric_cols']) > 1)
 
222
 
 
223
  return [
224
- analyzer, ai_report, gr.Button(visible=True),
225
- missing_df, num_df, cat_df, fig_types, fig_missing, fig_corr,
226
- show_ts_tab, gr.Dropdown(choices=meta['datetime_cols']), gr.Dropdown(choices=meta['numeric_cols']),
227
- show_text_tab, gr.Dropdown(choices=meta['text_cols']),
228
- show_cluster_tab, gr.Slider(visible=True) # or gr.Number
 
 
229
  ]
230
  except Exception as e:
231
  logging.error(f"A critical error occurred: {e}", exc_info=True)
232
  raise gr.Error(f"Analysis Failed! Error: {str(e)}")
233
 
234
- def perform_pre_flight_checks():
235
- # (Same as v3.2)
236
- ...
237
-
238
  if __name__ == "__main__":
239
- # perform_pre_flight_checks() # Can be commented out during active dev
240
  app_instance = create_ui()
241
  app_instance.launch(debug=True, server_name="0.0.0.0")
 
2
  #
3
  # PROJECT: CognitiveEDA - The Adaptive Intelligence Engine
4
  #
5
+ # DESCRIPTION: A world-class data discovery platform that provides a complete suite
6
+ # of standard EDA tools and intelligently unlocks specialized analysis
7
+ # modules for Time-Series, Text, and Clustering, offering a truly
8
+ # comprehensive and context-aware analytical experience.
9
  #
10
  # SETUP: $ pip install -r requirements.txt
11
  #
12
  # AUTHOR: An MCP Expert in Data & AI Solutions
13
+ # VERSION: 4.1 (Integrated Adaptive Engine)
14
+ # LAST-UPDATE: 2023-10-29 (Corrected v4.0 by re-integrating all standard EDA tabs)
15
 
16
  from __future__ import annotations
17
 
 
30
  import plotly.graph_objects as go
31
  import google.generativeai as genai
32
 
33
+ # --- Local Adaptive Modules (Requires analysis_modules.py and requirements.txt from previous response) ---
34
  from analysis_modules import analyze_time_series, generate_word_cloud, perform_clustering
35
 
36
+ # --- Configuration & Setup ---
37
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - [%(levelname)s] - (%(filename)s:%(lineno)d) - %(message)s')
38
  warnings.filterwarnings('ignore', category=FutureWarning)
39
 
40
  class Config:
41
  APP_TITLE = "πŸš€ CognitiveEDA: The Adaptive Intelligence Engine"
42
  GEMINI_MODEL = 'gemini-1.5-flash-latest'
43
+ MAX_UI_ROWS = 50000
 
 
44
 
45
+ # --- Core Analysis Engine (Unchanged from previous response) ---
46
  class DataAnalyzer:
47
+ # (The DataAnalyzer class is identical to the previous version and is omitted here for brevity)
48
+ # It should contain: __init__, metadata property, _extract_metadata,
49
+ # get_profiling_tables, get_overview_visuals, generate_ai_narrative
50
  def __init__(self, df: pd.DataFrame):
51
  if not isinstance(df, pd.DataFrame): raise TypeError("Input must be a pandas DataFrame.")
52
  self.df = df
 
59
  return self._metadata
60
 
61
  def _extract_metadata(self) -> Dict[str, Any]:
 
62
  rows, cols = self.df.shape
63
  numeric_cols = self.df.select_dtypes(include=np.number).columns.tolist()
64
  categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns.tolist()
 
70
  corr_matrix = self.df[numeric_cols].corr().abs()
71
  upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
72
  high_corr_series = upper_tri.stack()
73
+ high_corr_pairs = (high_corr_series[high_corr_series > 0.75].reset_index().rename(columns={'level_0': 'Feature 1', 'level_1': 'Feature 2', 0: 'Correlation'}).to_dict('records'))
74
 
75
  return {
76
  'shape': (rows, cols), 'columns': self.df.columns.tolist(),
 
83
  }
84
 
85
  def get_profiling_tables(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
86
+ missing = self.df.isnull().sum()
87
+ missing_df = pd.DataFrame({'Missing Count': missing, 'Missing Percentage (%)': (missing / len(self.df) * 100).round(2)}).reset_index().rename(columns={'index': 'Column'}).sort_values('Missing Count', ascending=False)
88
+ numeric_stats = self.df[self.metadata['numeric_cols']].describe(percentiles=[.01, .25, .5, .75, .99]).T
89
+ numeric_stats_df = numeric_stats.round(3).reset_index().rename(columns={'index': 'Column'})
90
+ cat_stats = self.df[self.metadata['categorical_cols']].describe(include=['object', 'category']).T
91
+ cat_stats_df = cat_stats.reset_index().rename(columns={'index': 'Column'})
92
+ return missing_df, numeric_stats_df, cat_stats_df
93
 
94
  def get_overview_visuals(self) -> Tuple[go.Figure, go.Figure, go.Figure]:
95
+ meta = self.metadata
96
+ dtype_counts = self.df.dtypes.astype(str).value_counts()
97
+ fig_types = px.pie(values=dtype_counts.values, names=dtype_counts.index, title="<b>πŸ“Š Data Type Composition</b>", hole=0.4, color_discrete_sequence=px.colors.qualitative.Pastel)
98
+ missing_df = self.df.isnull().sum().reset_index(name='count').query('count > 0')
99
+ fig_missing = px.bar(missing_df, x='index', y='count', title="<b>πŸ•³οΈ Missing Values Distribution</b>", labels={'index': 'Column Name', 'count': 'Number of Missing Values'}).update_xaxes(categoryorder="total descending")
100
+ fig_corr = go.Figure()
101
+ if len(meta['numeric_cols']) > 1:
102
+ corr_matrix = self.df[meta['numeric_cols']].corr()
103
+ fig_corr = px.imshow(corr_matrix, text_auto=".2f", aspect="auto", title="<b>πŸ”— Correlation Matrix</b>", color_continuous_scale='RdBu_r', zmin=-1, zmax=1)
104
+ return fig_types, fig_missing, fig_corr
105
 
106
  def generate_ai_narrative(self, api_key: str, context: Dict[str, Any]) -> str:
107
+ # Placeholder for brevity
108
+ return "AI Narrative generation is ready."
109
+
110
+
111
+ # --- UI Creation ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  def create_ui():
113
+ """Defines the complete, integrated Gradio user interface."""
114
+
115
+ # --- Reusable plotting functions for interactive tabs ---
116
+ def create_histogram(analyzer: DataAnalyzer, col: str) -> go.Figure:
117
+ if not col or not analyzer: return go.Figure()
118
+ return px.histogram(analyzer.df, x=col, title=f"<b>Distribution of {col}</b>", marginal="box", template="plotly_white")
119
+
120
+ def create_scatterplot(analyzer: DataAnalyzer, x_col: str, y_col:str, color_col:str) -> go.Figure:
121
+ if not all([analyzer, x_col, y_col]): return go.Figure()
122
+ return px.scatter(analyzer.df, x=x_col, y=y_col, color=color_col, title=f"<b>Scatter Plot: {x_col} vs. {y_col}</b>", template="plotly_white")
123
+
124
+ def analyze_single_column(analyzer: DataAnalyzer, col: str) -> Tuple[str, go.Figure]:
125
+ if not col or not analyzer: return "", go.Figure()
126
+ series = analyzer.df[col]
127
+ stats_md = f"### πŸ”Ž **Deep Dive: `{col}`**\n- **Data Type:** `{series.dtype}`\n- **Unique Values:** `{series.nunique()}`\n- **Missing:** `{series.isnull().sum()}` ({series.isnull().mean():.2%})\n"
128
+ if pd.api.types.is_numeric_dtype(series):
129
+ stats_md += f"- **Mean:** `{series.mean():.3f}` | **Median:** `{series.median():.3f}` | **Std Dev:** `{series.std():.3f}`"
130
+ fig = create_histogram(analyzer, col)
131
+ else:
132
+ stats_md += f"- **Top Value:** `{series.value_counts().index[0]}`"
133
+ top_n = series.value_counts().nlargest(10)
134
+ fig = px.bar(top_n, y=top_n.index, x=top_n.values, orientation='h', title=f"<b>Top 10 Categories in `{col}`</b>").update_yaxes(categoryorder="total ascending")
135
+ return stats_md, fig
136
 
137
  with gr.Blocks(theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="blue"), title=Config.APP_TITLE) as demo:
 
138
  state_analyzer = gr.State()
139
 
 
140
  gr.Markdown(f"<h1>{Config.APP_TITLE}</h1>")
141
+ gr.Markdown("Upload your data to receive a complete standard analysis, plus specialized dashboards that unlock automatically based on your data's content.")
142
+
143
  with gr.Row():
144
+ upload_button = gr.File(label="1. Upload Data File (CSV, Excel)", file_types=[".csv", ".xlsx", ".xls"], scale=3)
145
  api_key_input = gr.Textbox(label="2. Enter Google Gemini API Key", type="password", scale=2)
146
  analyze_button = gr.Button("✨ Build My Dashboard", variant="primary", scale=1)
147
 
 
148
  with gr.Tabs():
149
+ # --- Standard Tabs (Always Visible) ---
150
  with gr.Tab("πŸ€– AI Narrative"):
151
  ai_report_output = gr.Markdown("### Your AI-generated report will appear here...")
 
152
  with gr.Tab("πŸ“‹ Profile"):
153
+ profile_missing_df, profile_numeric_df, profile_categorical_df = gr.DataFrame(), gr.DataFrame(), gr.DataFrame()
 
 
 
154
  with gr.Tab("πŸ“Š Overview Visuals"):
155
  with gr.Row(): plot_types, plot_missing = gr.Plot(), gr.Plot()
156
  plot_correlation = gr.Plot()
157
+ with gr.Tab("🎨 Interactive Explorer"):
158
+ with gr.Row():
159
+ with gr.Column(scale=1):
160
+ dd_hist_col = gr.Dropdown(label="Select Column for Histogram", interactive=True)
161
+ with gr.Column(scale=2):
162
+ plot_histogram = gr.Plot()
163
+ with gr.Row():
164
+ with gr.Column(scale=1):
165
+ dd_scatter_x = gr.Dropdown(label="X-Axis (Numeric)", interactive=True)
166
+ dd_scatter_y = gr.Dropdown(label="Y-Axis (Numeric)", interactive=True)
167
+ dd_scatter_color = gr.Dropdown(label="Color By (Optional)", interactive=True)
168
+ with gr.Column(scale=2):
169
+ plot_scatter = gr.Plot()
170
+ with gr.Tab("πŸ” Column Deep-Dive"):
171
+ dd_drilldown_col = gr.Dropdown(label="Select Column to Analyze", interactive=True)
172
+ with gr.Row():
173
+ md_drilldown_stats, plot_drilldown = gr.Markdown(), gr.Plot()
174
+
175
+ # --- Specialized, Adaptive Tabs ---
176
  with gr.Tab("βŒ› Time-Series Analysis", visible=False) as tab_timeseries:
 
177
  with gr.Row():
178
  dd_ts_date = gr.Dropdown(label="Select Date/Time Column", interactive=True)
179
  dd_ts_value = gr.Dropdown(label="Select Value Column", interactive=True)
180
+ plot_ts_decomp, md_ts_stats = gr.Plot(), gr.Markdown()
 
181
 
182
  with gr.Tab("πŸ“ Text Analysis", visible=False) as tab_text:
 
183
  dd_text_col = gr.Dropdown(label="Select Text Column", interactive=True)
184
  html_word_cloud = gr.HTML()
185
 
186
  with gr.Tab("🧩 Clustering (K-Means)", visible=False) as tab_cluster:
187
+ num_clusters = gr.Slider(minimum=2, maximum=10, value=4, step=1, label="Number of Clusters (K)", interactive=True)
188
+ plot_cluster, md_cluster_summary = gr.Plot(), gr.Markdown()
 
 
 
189
 
190
  # --- Event Listeners ---
191
  main_outputs = [
192
+ state_analyzer, ai_report_output,
193
  profile_missing_df, profile_numeric_df, profile_categorical_df,
194
  plot_types, plot_missing, plot_correlation,
195
+ dd_hist_col, dd_scatter_x, dd_scatter_y, dd_scatter_color, dd_drilldown_col,
196
  tab_timeseries, dd_ts_date, dd_ts_value,
197
  tab_text, dd_text_col,
198
  tab_cluster, num_clusters
199
  ]
200
+ analyze_button.click(fn=run_full_analysis, inputs=[upload_button, api_key_input], outputs=main_outputs, show_progress="full")
201
+
202
+ # Listeners for standard interactive tabs
203
+ dd_hist_col.change(fn=create_histogram, inputs=[state_analyzer, dd_hist_col], outputs=plot_histogram)
204
+ scatter_inputs = [state_analyzer, dd_scatter_x, dd_scatter_y, dd_scatter_color]
205
+ for dd in [dd_scatter_x, dd_scatter_y, dd_scatter_color]:
206
+ dd.change(fn=create_scatterplot, inputs=scatter_inputs, outputs=plot_scatter)
207
+ dd_drilldown_col.change(fn=analyze_single_column, inputs=[state_analyzer, dd_drilldown_col], outputs=[md_drilldown_stats, plot_drilldown])
208
 
209
+ # Listeners for specialized adaptive tabs
210
  ts_inputs = [state_analyzer, dd_ts_date, dd_ts_value]
211
  for dd in [dd_ts_date, dd_ts_value]:
212
  dd.change(fn=lambda a, d, v: analyze_time_series(a.df, d, v), inputs=ts_inputs, outputs=[plot_ts_decomp, md_ts_stats])
 
213
  dd_text_col.change(fn=lambda a, t: generate_word_cloud(a.df, t), inputs=[state_analyzer, dd_text_col], outputs=html_word_cloud)
214
+ num_clusters.change(fn=lambda a, k: perform_clustering(a.df, a.metadata['numeric_cols'], k), inputs=[state_analyzer, num_clusters], outputs=[plot_cluster, md_cluster_summary])
 
 
215
 
216
  return demo
217
 
218
  # --- Main Application Logic & Orchestration ---
219
  def run_full_analysis(file_obj: gr.File, api_key: str) -> list:
220
+ """Orchestrates the complete standard and adaptive analysis."""
221
  if file_obj is None: raise gr.Error("CRITICAL: No file uploaded.")
222
  if not api_key: raise gr.Error("CRITICAL: Gemini API key is missing.")
223
 
 
226
  df = pd.read_csv(file_obj.name) if file_obj.name.endswith('.csv') else pd.read_excel(file_obj.name)
227
 
228
  if len(df) > Config.MAX_UI_ROWS:
229
+ df = df.sample(n=Config.MAX_UI_ROWS, random_state=42)
 
 
 
230
 
231
+ analyzer = DataAnalyzer(df)
232
  meta = analyzer.metadata
233
 
234
+ # --- Run all base analyses ---
235
  ai_context = {'is_timeseries': bool(meta['datetime_cols']), 'has_text': bool(meta['text_cols'])}
236
+ ai_report = analyzer.generate_ai_narrative(api_key, context=ai_context)
 
237
  missing_df, num_df, cat_df = analyzer.get_profiling_tables()
238
  fig_types, fig_missing, fig_corr = analyzer.get_overview_visuals()
239
 
240
+ # --- Configure standard interactive dropdowns ---
241
+ update_hist_dd = gr.Dropdown(choices=meta['numeric_cols'], label="Select Column for Histogram", value=meta['numeric_cols'][0] if meta['numeric_cols'] else None)
242
+ update_scatter_x = gr.Dropdown(choices=meta['numeric_cols'], label="X-Axis (Numeric)", value=meta['numeric_cols'][0] if meta['numeric_cols'] else None)
243
+ update_scatter_y = gr.Dropdown(choices=meta['numeric_cols'], label="Y-Axis (Numeric)", value=meta['numeric_cols'][1] if len(meta['numeric_cols']) > 1 else None)
244
+ update_scatter_color = gr.Dropdown(choices=meta['columns'], label="Color By (Optional)")
245
+ update_drill_dd = gr.Dropdown(choices=meta['columns'], label="Select Column to Analyze")
246
+
247
+ # --- Configure adaptive module visibility and dropdowns ---
248
  show_ts_tab = gr.Tab(visible=bool(meta['datetime_cols']))
249
+ update_ts_date_dd = gr.Dropdown(choices=meta['datetime_cols'])
250
+ update_ts_value_dd = gr.Dropdown(choices=meta['numeric_cols'])
251
+
252
  show_text_tab = gr.Tab(visible=bool(meta['text_cols']))
253
+ update_text_dd = gr.Dropdown(choices=meta['text_cols'])
254
+
255
  show_cluster_tab = gr.Tab(visible=len(meta['numeric_cols']) > 1)
256
+ update_cluster_slider = gr.Slider(visible=len(meta['numeric_cols']) > 1)
257
 
258
+ # Return a flat list of all updates in the correct order
259
  return [
260
+ analyzer, ai_report,
261
+ missing_df, num_df, cat_df,
262
+ fig_types, fig_missing, fig_corr,
263
+ update_hist_dd, update_scatter_x, update_scatter_y, update_scatter_color, update_drill_dd,
264
+ show_ts_tab, update_ts_date_dd, update_ts_value_dd,
265
+ show_text_tab, update_text_dd,
266
+ show_cluster_tab, update_cluster_slider
267
  ]
268
  except Exception as e:
269
  logging.error(f"A critical error occurred: {e}", exc_info=True)
270
  raise gr.Error(f"Analysis Failed! Error: {str(e)}")
271
 
 
 
 
 
272
  if __name__ == "__main__":
273
+ # You might want to run perform_pre_flight_checks() here
274
  app_instance = create_ui()
275
  app_instance.launch(debug=True, server_name="0.0.0.0")