mgbam committed on
Commit
57f807e
·
verified ·
1 Parent(s): 05e596d

Update ui/callbacks.py

Browse files
Files changed (1) hide show
  1. ui/callbacks.py +79 -46
ui/callbacks.py CHANGED
@@ -2,10 +2,11 @@
2
 
3
  # -*- coding: utf-8 -*-
4
  #
5
- # PROJECT: CognitiveEDA v5.7 - The QuantumLeap Intelligence Platform
6
  #
7
- # DESCRIPTION: This module contains the core logic for all Gradio event handlers.
8
- # The clustering callback is now updated to include persona profiling.
 
9
 
10
  import gradio as gr
11
  import pandas as pd
@@ -18,13 +19,11 @@ import plotly.express as px
18
  from core.analyzer import DataAnalyzer, engineer_features
19
  from core.llm import GeminiNarrativeGenerator
20
  from core.config import settings
21
- from core.exceptions import DataProcessingError
22
  from modules.clustering import perform_clustering
23
- # --- NEW IMPORT ---
24
  from modules.profiling import profile_clusters
25
 
 
26
 
27
- # --- Primary Analysis Chain (Unchanged) ---
28
  def run_initial_analysis(file_obj, progress=gr.Progress(track_tqdm=True)):
29
  if file_obj is None: raise gr.Error("No file uploaded.")
30
  progress(0, desc="Validating configuration...")
@@ -45,81 +44,115 @@ def run_initial_analysis(file_obj, progress=gr.Progress(track_tqdm=True)):
45
  raise gr.Error(f"Analysis Failed: {str(e)}")
46
 
47
  def generate_reports_and_visuals(analyzer, progress=gr.Progress(track_tqdm=True)):
 
 
 
48
  if not isinstance(analyzer, DataAnalyzer):
49
- yield (None,) * 14
50
  return
 
51
  progress(0, desc="Spawning AI report thread...")
52
  ai_report_queue = [""]
53
- def generate_ai_report_threaded(a):
54
- narrative_generator = GeminiNarrativeGenerator(settings.GOOGLE_API_KEY)
55
- ai_report_queue[0] = narrative_generator.generate_narrative(a)
56
  thread = Thread(target=generate_ai_report_threaded, args=(analyzer,))
57
  thread.start()
 
58
  progress(0.4, desc="Generating reports and visuals...")
59
  meta = analyzer.metadata
60
  missing_df, num_df, cat_df = analyzer.get_profiling_reports()
61
  fig_types, fig_missing, fig_corr = analyzer.get_overview_visuals()
 
 
 
 
 
 
 
 
 
62
  initial_updates = (
63
  gr.update(value="⏳ Generating AI report..."), gr.update(value=missing_df),
64
  gr.update(value=num_df), gr.update(value=cat_df), gr.update(value=fig_types),
65
  gr.update(value=fig_missing), gr.update(value=fig_corr),
66
- gr.update(choices=meta['numeric_cols'], value=meta['numeric_cols'][0] if meta['numeric_cols'] else None),
67
- gr.update(choices=meta['numeric_cols'], value=meta['numeric_cols'][0] if meta['numeric_cols'] else None),
68
- gr.update(choices=meta['numeric_cols'], value=meta['numeric_cols'][1] if len(meta['numeric_cols']) > 1 else None),
69
- gr.update(choices=meta['columns']), gr.update(visible=bool(meta['datetime_cols'])),
70
- gr.update(visible=bool(meta['text_cols'])), gr.update(visible=len(meta['numeric_cols']) > 1)
 
71
  )
72
  yield initial_updates
 
73
  thread.join()
74
  progress(1.0, desc="AI Report complete!")
75
  final_updates_list = list(initial_updates)
76
  final_updates_list[0] = gr.update(value=ai_report_queue[0])
77
  yield tuple(final_updates_list)
78
 
79
- # --- Interactive Explorer Callbacks (Unchanged) ---
80
- def create_histogram(analyzer, col):
81
- if not isinstance(analyzer, DataAnalyzer) or not col: return go.Figure()
82
- return px.histogram(analyzer.df, x=col, title=f"<b>Distribution of {col}</b>", marginal="box")
83
-
84
- def create_scatterplot(analyzer, x_col, y_col, color_col):
85
- if not isinstance(analyzer, DataAnalyzer) or not x_col or not y_col: return go.Figure()
86
- df_sample = analyzer.df.sample(n=min(len(analyzer.df), 10000))
87
- return px.scatter(df_sample, x=x_col, y=y_col, color=color_col if color_col else None)
88
 
 
 
 
 
 
 
 
 
 
 
89
 
90
- # --- MODIFIED CLUSTERING CALLBACK ---
91
- def update_clustering(analyzer, k):
92
  """
93
- Orchestrates the full clustering workflow:
94
- 1. Runs K-Means clustering.
95
- 2. Receives cluster labels.
96
- 3. Calls the profiling module to analyze the segments.
97
- 4. Returns all results to the UI.
98
  """
99
  if not isinstance(analyzer, DataAnalyzer):
100
- # Return empty updates for all 5 clustering output components
101
  return go.Figure(), go.Figure(), "", "", go.Figure()
102
 
103
- # Step 1: Perform Clustering to get visuals and labels
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  fig_cluster, fig_elbow, summary, cluster_labels = perform_clustering(
105
- analyzer.df, analyzer.metadata['numeric_cols'], k
106
  )
107
 
108
  if cluster_labels.empty:
109
- # Handle cases where clustering fails (e.g., not enough data)
110
- return fig_cluster, fig_elbow, summary, "Clustering failed. No personas to profile.", go.Figure()
111
 
112
- # Step 2: Profile the resulting clusters
113
- numeric_to_profile = ['Total_Revenue', 'Quantity_Ordered', 'Hour']
114
- cats_to_profile = ['City', 'Product', 'Day_of_Week']
115
-
116
- # Filter to only use columns that actually exist in the engineered dataframe
117
- numeric_to_profile = [c for c in numeric_to_profile if c in analyzer.df.columns]
118
- cats_to_profile = [c for c in cats_to_profile if c in analyzer.df.columns]
119
 
120
  md_personas, fig_profile = profile_clusters(
121
- analyzer.df, cluster_labels, numeric_to_profile, cats_to_profile
122
  )
 
 
 
 
 
 
123
 
124
- # Step 3: Return all 5 results in the correct order for the UI
125
- return fig_cluster, fig_elbow, summary, md_personas, fig_profile
 
 
 
 
 
 
 
 
2
 
3
  # -*- coding: utf-8 -*-
4
  #
5
+ # PROJECT: CognitiveEDA v5.9 - The QuantumLeap Intelligence Platform
6
  #
7
+ # DESCRIPTION: This module is updated with a generic, data-agnostic
8
+ # stratification engine. It dynamically identifies candidate
9
+ # features for filtering and updates the UI accordingly.
10
 
11
  import gradio as gr
12
  import pandas as pd
 
19
  from core.analyzer import DataAnalyzer, engineer_features
20
  from core.llm import GeminiNarrativeGenerator
21
  from core.config import settings
 
22
  from modules.clustering import perform_clustering
 
23
  from modules.profiling import profile_clusters
24
 
25
+ # --- Primary Analysis Chain ---
26
 
 
27
  def run_initial_analysis(file_obj, progress=gr.Progress(track_tqdm=True)):
28
  if file_obj is None: raise gr.Error("No file uploaded.")
29
  progress(0, desc="Validating configuration...")
 
44
  raise gr.Error(f"Analysis Failed: {str(e)}")
45
 
46
  def generate_reports_and_visuals(analyzer, progress=gr.Progress(track_tqdm=True)):
47
+ """
48
+ Phase 2: Now populates the generic 'Stratify By' dropdown with candidate columns.
49
+ """
50
  if not isinstance(analyzer, DataAnalyzer):
51
+ yield (None,) * 15
52
  return
53
+
54
  progress(0, desc="Spawning AI report thread...")
55
  ai_report_queue = [""]
56
+ def generate_ai_report_threaded(a): ai_report_queue[0] = GeminiNarrativeGenerator(settings.GOOGLE_API_KEY).generate_narrative(a)
 
 
57
  thread = Thread(target=generate_ai_report_threaded, args=(analyzer,))
58
  thread.start()
59
+
60
  progress(0.4, desc="Generating reports and visuals...")
61
  meta = analyzer.metadata
62
  missing_df, num_df, cat_df = analyzer.get_profiling_reports()
63
  fig_types, fig_missing, fig_corr = analyzer.get_overview_visuals()
64
+
65
+ # --- Dynamically identify candidate columns for stratification ---
66
+ candidate_cols = ["(Do not stratify)"]
67
+ if 'categorical_cols' in meta:
68
+ for col in meta['categorical_cols']:
69
+ # A good candidate has more than 1 but fewer than 50 unique values (heuristic)
70
+ if analyzer.df[col].dtype.name != 'object' or (1 < analyzer.df[col].nunique() < 50):
71
+ candidate_cols.append(col)
72
+
73
  initial_updates = (
74
  gr.update(value="⏳ Generating AI report..."), gr.update(value=missing_df),
75
  gr.update(value=num_df), gr.update(value=cat_df), gr.update(value=fig_types),
76
  gr.update(value=fig_missing), gr.update(value=fig_corr),
77
+ gr.update(choices=meta.get('numeric_cols', [])),
78
+ gr.update(choices=meta.get('numeric_cols', [])),
79
+ gr.update(choices=meta.get('numeric_cols', [])),
80
+ gr.update(choices=meta.get('columns', [])), gr.update(visible=bool(meta.get('datetime_cols'))),
81
+ gr.update(visible=bool(meta.get('text_cols'))), gr.update(visible=len(meta.get('numeric_cols', [])) > 1),
82
+ gr.update(choices=candidate_cols, value="(Do not stratify)") # dd_stratify_by_col
83
  )
84
  yield initial_updates
85
+
86
  thread.join()
87
  progress(1.0, desc="AI Report complete!")
88
  final_updates_list = list(initial_updates)
89
  final_updates_list[0] = gr.update(value=ai_report_queue[0])
90
  yield tuple(final_updates_list)
91
 
92
+ # --- Stratification Callbacks ---
 
 
 
 
 
 
 
 
93
 
94
+ def update_filter_dropdown(analyzer, stratify_col):
95
+ """
96
+ When the user selects a feature to stratify by, this function populates
97
+ the second dropdown with the unique values of that feature.
98
+ """
99
+ if not isinstance(analyzer, DataAnalyzer) or not stratify_col or stratify_col == "(Do not stratify)":
100
+ return gr.update(choices=[], value=None, interactive=False)
101
+
102
+ values = ["(Global Analysis)"] + sorted(analyzer.df[stratify_col].unique().tolist())
103
+ return gr.update(choices=values, value="(Global Analysis)", interactive=True)
104
 
105
+ def update_stratified_clustering(analyzer, stratify_col, filter_value, k):
 
106
  """
107
+ Orchestrates the full clustering workflow on a dataset that is generically
108
+ filtered based on user selections.
 
 
 
109
  """
110
  if not isinstance(analyzer, DataAnalyzer):
 
111
  return go.Figure(), go.Figure(), "", "", go.Figure()
112
 
113
+ logging.info(f"Updating clustering. Stratify by: '{stratify_col}', Filter: '{filter_value}', K={k}")
114
+
115
+ # Step 1: Stratify the DataFrame based on user selection
116
+ analysis_df = analyzer.df
117
+ report_title_prefix = "Global Analysis: "
118
+
119
+ if stratify_col and stratify_col != "(Do not stratify)" and filter_value and filter_value != "(Global Analysis)":
120
+ analysis_df = analyzer.df[analyzer.df[stratify_col] == filter_value]
121
+ report_title_prefix = f"Analysis for '{stratify_col}' = '{filter_value}': "
122
+
123
+ if len(analysis_df) < k:
124
+ error_msg = f"Not enough data ({len(analysis_df)} rows) to form {k} clusters for the selected filter."
125
+ return go.Figure(), go.Figure(), error_msg, error_msg, go.Figure()
126
+
127
+ # Step 2: Perform Clustering
128
+ numeric_cols = [c for c in analyzer.metadata['numeric_cols'] if c in analysis_df.columns]
129
  fig_cluster, fig_elbow, summary, cluster_labels = perform_clustering(
130
+ analysis_df, numeric_cols, k
131
  )
132
 
133
  if cluster_labels.empty:
134
+ return fig_cluster, fig_elbow, summary, "Clustering failed.", go.Figure()
 
135
 
136
+ # Step 3: Profile the resulting clusters
137
+ cats_to_profile = [c for c in analyzer.metadata['categorical_cols'] if c in analysis_df.columns]
138
+ numeric_to_profile = [c for c in numeric_cols if c not in ['Month', 'Day_of_Week', 'Is_Weekend', 'Hour']]
 
 
 
 
139
 
140
  md_personas, fig_profile = profile_clusters(
141
+ analysis_df, cluster_labels, numeric_to_profile, cats_to_profile
142
  )
143
+
144
+ summary = f"**{report_title_prefix}**" + summary
145
+ md_personas = f"**{report_title_prefix}**" + md_personas
146
+
147
+ # Step 4: Return all results
148
+ return fig_cluster, fig_elbow, summary, md_personas, fig_profile
149
 
150
+ # --- Other Callbacks ---
151
+ def create_histogram(analyzer, col):
152
+ if not isinstance(analyzer, DataAnalyzer) or not col: return go.Figure()
153
+ return px.histogram(analyzer.df, x=col, title=f"<b>Distribution of {col}</b>", marginal="box")
154
+
155
+ def create_scatterplot(analyzer, x_col, y_col, color_col):
156
+ if not isinstance(analyzer, DataAnalyzer) or not x_col or not y_col: return go.Figure()
157
+ df_sample = analyzer.df.sample(n=min(len(analyzer.df), 10000))
158
+ return px.scatter(df_sample, x=x_col, y=y_col, color=color_col if color_col else None)