mgbam committed
Commit fe02df7 · verified · 1 Parent(s): c039984

Update ui/callbacks.py

Files changed (1)
  1. ui/callbacks.py +48 -79
ui/callbacks.py CHANGED
@@ -2,12 +2,11 @@
 
 # -*- coding: utf-8 -*-
 #
-# PROJECT: CognitiveEDA v5.5 - The QuantumLeap Intelligence Platform
+# PROJECT: CognitiveEDA v5.6 - The QuantumLeap Intelligence Platform
 #
 # DESCRIPTION: This module contains the core logic for all Gradio event handlers.
-#              It is designed to be completely decoupled from the UI definition.
-#              Functions here return values in a specific order (often as tuples)
-#              that correspond to a list of output components defined in app.py.
+#              The main analysis pipeline now includes a strategic feature
+#              engineering step before analysis.
 
 import gradio as gr
 import pandas as pd
@@ -17,7 +16,9 @@ from threading import Thread
 import plotly.graph_objects as go
 import plotly.express as px
 
-from core.analyzer import DataAnalyzer
+# --- MODIFIED IMPORT ---
+# Import both the analyzer class and the new feature engineering function
+from core.analyzer import DataAnalyzer, engineer_features
 from core.llm import GeminiNarrativeGenerator
 from core.config import settings
 from core.exceptions import DataProcessingError
@@ -28,16 +29,9 @@ from modules.clustering import perform_clustering
 
 def run_initial_analysis(file_obj, progress=gr.Progress(track_tqdm=True)):
     """
-    Phase 1: Fast, synchronous tasks.
-    Validates inputs, loads data, and creates the core DataAnalyzer object. This
-    function updates the gr.State object, which then triggers the next phase.
-
-    Args:
-        file_obj: The uploaded file object from Gradio.
-        progress: The Gradio progress tracker.
-
-    Returns:
-        The instantiated DataAnalyzer object, or None if processing fails.
+    Phase 1: Now includes the strategic feature engineering step.
+    Validates inputs, loads raw data, applies feature engineering, and then
+    creates the core DataAnalyzer object on the transformed data.
     """
     if file_obj is None:
         raise gr.Error("No file uploaded. Please upload a CSV or Excel file.")
@@ -48,15 +42,23 @@ def run_initial_analysis(file_obj, progress=gr.Progress(track_tqdm=True)):
         raise gr.Error("CRITICAL: GOOGLE_API_KEY is not configured. Please add it as a secret.")
 
     try:
-        progress(0.2, desc="Loading and parsing data file...")
-        df = pd.read_csv(file_obj.name) if file_obj.name.endswith('.csv') else pd.read_excel(file_obj.name)
-        if len(df) > settings.MAX_UI_ROWS:
-            df = df.sample(n=settings.MAX_UI_ROWS, random_state=42)
+        progress(0.1, desc="Loading raw data...")
+        df_raw = pd.read_csv(file_obj.name) if file_obj.name.endswith('.csv') else pd.read_excel(file_obj.name)
+        if len(df_raw) > settings.MAX_UI_ROWS:
+            df_raw = df_raw.sample(n=settings.MAX_UI_ROWS, random_state=42)
             logging.info(f"DataFrame sampled down to {settings.MAX_UI_ROWS} rows.")
 
-        progress(0.7, desc="Instantiating analysis engine...")
-        analyzer = DataAnalyzer(df)
-        progress(1.0, desc="Initial analysis complete. Generating reports...")
+        # --- INTEGRATION POINT ---
+        # Apply the feature engineering function immediately after loading
+        progress(0.5, desc="Applying strategic feature engineering...")
+        df_engineered = engineer_features(df_raw)
+        # -------------------------
+
+        progress(0.8, desc="Instantiating analysis engine on engineered data...")
+        # The analyzer now works with the transformed, high-value dataset
+        analyzer = DataAnalyzer(df_engineered)
+
+        progress(1.0, desc="Analysis complete. Generating reports...")
         return analyzer
     except Exception as e:
         logging.error(f"A critical error occurred during initial analysis: {e}", exc_info=True)
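
Note: the `engineer_features` helper imported in the hunk above lives in core/analyzer.py, which is not part of this commit, so its actual transformations are not visible here. A minimal sketch of the assumed contract only — a function that accepts the raw DataFrame and returns a transformed copy — with purely illustrative feature logic:

import pandas as pd

# Hypothetical sketch; the real engineer_features in core/analyzer.py may differ.
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    # Illustrative: expand datetime columns into year/month components.
    for col in out.select_dtypes(include=["datetime64[ns]"]).columns:
        out[f"{col}_year"] = out[col].dt.year
        out[f"{col}_month"] = out[col].dt.month
    # Illustrative: add missing-value indicator flags for numeric columns.
    for col in out.select_dtypes(include="number").columns:
        if out[col].isna().any():
            out[f"{col}_is_missing"] = out[col].isna().astype(int)
    return out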
@@ -66,26 +68,15 @@ def run_initial_analysis(file_obj, progress=gr.Progress(track_tqdm=True)):
 def generate_reports_and_visuals(analyzer, progress=gr.Progress(track_tqdm=True)):
     """
     Phase 2: Slower, multi-stage report and visual generation.
-    This generator function yields tuples of UI updates. The order of the yielded
-    tuple is CRITICAL and must exactly match the `main_outputs` list in `app.py`.
-
-    Args:
-        analyzer: The DataAnalyzer object from the gr.State.
-        progress: The Gradio progress tracker.
-
-    Yields:
-        A tuple of gr.update() objects to populate the dashboard.
+    Yields tuples of UI updates based on the *engineered* data.
     """
     if not isinstance(analyzer, DataAnalyzer):
         logging.warning("generate_reports_and_visuals called without a valid analyzer. Clearing UI.")
-        # Return a tuple of Nones matching the output length to clear/reset the UI.
-        # There are 14 components in the `main_outputs` list in app.py.
         yield (None,) * 14
         return
 
-    # 1. Start AI narrative generation in a background thread
     progress(0, desc="Spawning AI report thread...")
-    ai_report_queue = [""] # Use a mutable list to pass string by reference
+    ai_report_queue = [""]
     def generate_ai_report_threaded(analyzer_instance):
         narrative_generator = GeminiNarrativeGenerator(api_key=settings.GOOGLE_API_KEY)
         ai_report_queue[0] = narrative_generator.generate_narrative(analyzer_instance)
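
Note: the `ai_report_queue = [""]` single-element list in the hunk above is a simple way for a worker thread to hand its result back to the enclosing function: the nested thread target can mutate the shared list, whereas it could not rebind an outer string variable without `nonlocal`. A standalone sketch of the same pattern, with a sleep standing in for the slow narrative generation (names are illustrative):

from threading import Thread
import time

def run_with_background_result():
    result_holder = [""]          # mutable container shared with the worker

    def worker():
        time.sleep(0.1)           # stands in for the slow LLM call
        result_holder[0] = "report text"

    t = Thread(target=worker)
    t.start()
    # ...fast work (e.g. yielding dashboard updates) happens here...
    t.join()                      # wait for the slow result
    return result_holder[0]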
@@ -93,74 +84,52 @@ def generate_reports_and_visuals(analyzer, progress=gr.Progress(track_tqdm=True)
     thread = Thread(target=generate_ai_report_threaded, args=(analyzer,))
     thread.start()
 
-    # 2. Generate standard reports and visuals
-    progress(0.4, desc="Generating data profiles and visuals...")
+    progress(0.4, desc="Generating reports and visuals...")
     meta = analyzer.metadata
     missing_df, num_df, cat_df = analyzer.get_profiling_reports()
     fig_types, fig_missing, fig_corr = analyzer.get_overview_visuals()
 
-    # 3. Yield the first set of updates to populate the main dashboard immediately.
-    # The order of this tuple MUST match the `main_outputs` list in `app.py`.
     initial_updates = (
-        gr.update(value="⏳ Generating AI-powered report in the background... The main dashboard is ready now."), # 0: ai_report_output
-        gr.update(value=missing_df), # 1: profile_missing_df
-        gr.update(value=num_df), # 2: profile_numeric_df
-        gr.update(value=cat_df), # 3: profile_categorical_df
-        gr.update(value=fig_types), # 4: plot_types
-        gr.update(value=fig_missing), # 5: plot_missing
-        gr.update(value=fig_corr), # 6: plot_correlation
-        gr.update(choices=meta['numeric_cols'], value=meta['numeric_cols'][0] if meta['numeric_cols'] else None), # 7: dd_hist_col
-        gr.update(choices=meta['numeric_cols'], value=meta['numeric_cols'][0] if meta['numeric_cols'] else None), # 8: dd_scatter_x
-        gr.update(choices=meta['numeric_cols'], value=meta['numeric_cols'][1] if len(meta['numeric_cols']) > 1 else None), # 9: dd_scatter_y
-        gr.update(choices=meta['columns']), # 10: dd_scatter_color
-        gr.update(visible=bool(meta['datetime_cols'])), # 11: tab_timeseries
-        gr.update(visible=bool(meta['text_cols'])), # 12: tab_text
-        gr.update(visible=len(meta['numeric_cols']) > 1) # 13: tab_cluster
+        gr.update(value="⏳ Generating AI report... Dashboard is ready."),
+        gr.update(value=missing_df),
+        gr.update(value=num_df),
+        gr.update(value=cat_df),
+        gr.update(value=fig_types),
+        gr.update(value=fig_missing),
+        gr.update(value=fig_corr),
+        gr.update(choices=meta['numeric_cols'], value=meta['numeric_cols'][0] if meta['numeric_cols'] else None),
+        gr.update(choices=meta['numeric_cols'], value=meta['numeric_cols'][0] if meta['numeric_cols'] else None),
+        gr.update(choices=meta['numeric_cols'], value=meta['numeric_cols'][1] if len(meta['numeric_cols']) > 1 else None),
+        gr.update(choices=meta['columns']),
+        gr.update(visible=bool(meta['datetime_cols'])),
+        gr.update(visible=bool(meta['text_cols'])),
+        gr.update(visible=len(meta['numeric_cols']) > 1)
     )
     yield initial_updates
 
-    # 4. Wait for the AI thread to complete
     thread.join()
     progress(1.0, desc="AI Report complete!")
 
-    # 5. Yield the final update. We create a mutable list from the initial tuple,
-    # update the AI report element, and convert it back to a tuple to yield.
     final_updates_list = list(initial_updates)
     final_updates_list[0] = gr.update(value=ai_report_queue[0])
     yield tuple(final_updates_list)
 
 
-# --- Interactive Explorer Callbacks ---
+# --- Interactive Explorer & Module Callbacks ---
 
 def create_histogram(analyzer, col):
-    """Generates a histogram for a selected numeric column."""
     if not isinstance(analyzer, DataAnalyzer) or not col:
-        return go.Figure().update_layout(title="Select a column to generate a histogram")
-    return px.histogram(analyzer.df, x=col, title=f"<b>Distribution of {col}</b>", marginal="box", template="plotly_white")
+        return go.Figure()
+    return px.histogram(analyzer.df, x=col, title=f"<b>Distribution of {col}</b>", marginal="box")
 
 def create_scatterplot(analyzer, x_col, y_col, color_col):
-    """Generates a scatter plot for selected X, Y, and optional color columns."""
     if not isinstance(analyzer, DataAnalyzer) or not x_col or not y_col:
-        return go.Figure().update_layout(title="Select X and Y axes to generate a scatter plot")
-
-    # Use a subset for performance on large datasets
-    df_sample = analyzer.df
-    if len(analyzer.df) > 10000:
-        df_sample = analyzer.df.sample(n=10000, random_state=42)
-
-    return px.scatter(
-        df_sample, x=x_col, y=y_col, color=color_col if color_col else None,
-        title=f"<b>Scatter Plot: {x_col} vs. {y_col}</b>", template="plotly_white"
-    )
-
-
-# --- Specialized Module Callbacks ---
+        return go.Figure()
+    df_sample = analyzer.df.sample(n=min(len(analyzer.df), 10000))
+    return px.scatter(df_sample, x=x_col, y=y_col, color=color_col if color_col else None)
 
 def update_clustering(analyzer, k):
-    """Callback for the clustering module. Returns a tuple of three updates."""
     if not isinstance(analyzer, DataAnalyzer):
-        return gr.update(), gr.update(), gr.update(value="Run analysis first.")
-
-    # Delegate the heavy lifting to the specialized module
+        return gr.update(), gr.update(), gr.update()
     fig_cluster, fig_elbow, summary = perform_clustering(analyzer.df, analyzer.metadata['numeric_cols'], k)
     return fig_cluster, fig_elbow, summary
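
Note: both `yield (None,) * 14` and the `initial_updates` tuple rely on Gradio distributing each yielded tuple positionally across the event listener's outputs list, which is defined in app.py and not included in this commit. A minimal, self-contained sketch of that contract, using two illustrative components rather than the real 14-item `main_outputs` list:

import gradio as gr

def staged_updates():
    # First yield: populate both outputs immediately.
    yield "dashboard ready", "⏳ report pending..."
    # Second yield: update the same two outputs again, in the same positional order.
    yield "dashboard ready", "final AI report"

with gr.Blocks() as demo:
    status_box = gr.Textbox(label="Status")
    report_box = gr.Textbox(label="AI Report")
    run_btn = gr.Button("Run")
    # The order of this outputs list must match the order of each yielded tuple.
    run_btn.click(staged_updates, inputs=None, outputs=[status_box, report_box])

demo.launch()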