mgbam committed
Commit 1dae368 · verified · 1 Parent(s): 41ceb78

Update ui/callbacks.py

Files changed (1):
1. ui/callbacks.py +56 -66
ui/callbacks.py CHANGED
@@ -2,11 +2,10 @@
 
 # -*- coding: utf-8 -*-
 #
-# PROJECT: CognitiveEDA v5.6 - The QuantumLeap Intelligence Platform
+# PROJECT: CognitiveEDA v5.7 - The QuantumLeap Intelligence Platform
 #
 # DESCRIPTION: This module contains the core logic for all Gradio event handlers.
-#              The main analysis pipeline now includes a strategic feature
-#              engineering step before analysis.
+#              The clustering callback is now updated to include persona profiling.
 
 import gradio as gr
 import pandas as pd
@@ -16,120 +15,111 @@ from threading import Thread
 import plotly.graph_objects as go
 import plotly.express as px
 
-# --- MODIFIED IMPORT ---
-# Import both the analyzer class and the new feature engineering function
 from core.analyzer import DataAnalyzer, engineer_features
 from core.llm import GeminiNarrativeGenerator
 from core.config import settings
 from core.exceptions import DataProcessingError
 from modules.clustering import perform_clustering
+# --- NEW IMPORT ---
+from modules.profiling import profile_clusters
 
 
-# --- Primary Analysis Chain ---
-
+# --- Primary Analysis Chain (Unchanged) ---
 def run_initial_analysis(file_obj, progress=gr.Progress(track_tqdm=True)):
-    """
-    Phase 1: Now includes the strategic feature engineering step.
-    Validates inputs, loads raw data, applies feature engineering, and then
-    creates the core DataAnalyzer object on the transformed data.
-    """
-    if file_obj is None:
-        raise gr.Error("No file uploaded. Please upload a CSV or Excel file.")
-
+    if file_obj is None: raise gr.Error("No file uploaded.")
     progress(0, desc="Validating configuration...")
-    if not settings.GOOGLE_API_KEY:
-        logging.error("Analysis attempted without GOOGLE_API_KEY set.")
-        raise gr.Error("CRITICAL: GOOGLE_API_KEY is not configured. Please add it as a secret.")
-
+    if not settings.GOOGLE_API_KEY: raise gr.Error("CRITICAL: GOOGLE_API_KEY is not configured.")
     try:
         progress(0.1, desc="Loading raw data...")
         df_raw = pd.read_csv(file_obj.name) if file_obj.name.endswith('.csv') else pd.read_excel(file_obj.name)
         if len(df_raw) > settings.MAX_UI_ROWS:
             df_raw = df_raw.sample(n=settings.MAX_UI_ROWS, random_state=42)
-            logging.info(f"DataFrame sampled down to {settings.MAX_UI_ROWS} rows.")
-
-        # --- INTEGRATION POINT ---
-        # Apply the feature engineering function immediately after loading
         progress(0.5, desc="Applying strategic feature engineering...")
         df_engineered = engineer_features(df_raw)
-        # -------------------------
-
-        progress(0.8, desc="Instantiating analysis engine on engineered data...")
-        # The analyzer now works with the transformed, high-value dataset
+        progress(0.8, desc="Instantiating analysis engine...")
         analyzer = DataAnalyzer(df_engineered)
-
         progress(1.0, desc="Analysis complete. Generating reports...")
         return analyzer
    except Exception as e:
-        logging.error(f"A critical error occurred during initial analysis: {e}", exc_info=True)
-        raise gr.Error(f"Analysis Failed! An unexpected error occurred: {str(e)}")
-
+        logging.error(f"Error in initial analysis: {e}", exc_info=True)
+        raise gr.Error(f"Analysis Failed: {str(e)}")
 
 def generate_reports_and_visuals(analyzer, progress=gr.Progress(track_tqdm=True)):
-    """
-    Phase 2: Slower, multi-stage report and visual generation.
-    Yields tuples of UI updates based on the *engineered* data.
-    """
     if not isinstance(analyzer, DataAnalyzer):
-        logging.warning("generate_reports_and_visuals called without a valid analyzer. Clearing UI.")
         yield (None,) * 14
         return
-
     progress(0, desc="Spawning AI report thread...")
     ai_report_queue = [""]
-    def generate_ai_report_threaded(analyzer_instance):
-        narrative_generator = GeminiNarrativeGenerator(api_key=settings.GOOGLE_API_KEY)
-        ai_report_queue[0] = narrative_generator.generate_narrative(analyzer_instance)
-
+    def generate_ai_report_threaded(a):
+        narrative_generator = GeminiNarrativeGenerator(settings.GOOGLE_API_KEY)
+        ai_report_queue[0] = narrative_generator.generate_narrative(a)
     thread = Thread(target=generate_ai_report_threaded, args=(analyzer,))
     thread.start()
-
     progress(0.4, desc="Generating reports and visuals...")
     meta = analyzer.metadata
     missing_df, num_df, cat_df = analyzer.get_profiling_reports()
     fig_types, fig_missing, fig_corr = analyzer.get_overview_visuals()
-
     initial_updates = (
-        gr.update(value="⏳ Generating AI report... Dashboard is ready."),
-        gr.update(value=missing_df),
-        gr.update(value=num_df),
-        gr.update(value=cat_df),
-        gr.update(value=fig_types),
-        gr.update(value=fig_missing),
-        gr.update(value=fig_corr),
+        gr.update(value="⏳ Generating AI report..."), gr.update(value=missing_df),
+        gr.update(value=num_df), gr.update(value=cat_df), gr.update(value=fig_types),
+        gr.update(value=fig_missing), gr.update(value=fig_corr),
         gr.update(choices=meta['numeric_cols'], value=meta['numeric_cols'][0] if meta['numeric_cols'] else None),
         gr.update(choices=meta['numeric_cols'], value=meta['numeric_cols'][0] if meta['numeric_cols'] else None),
        gr.update(choices=meta['numeric_cols'], value=meta['numeric_cols'][1] if len(meta['numeric_cols']) > 1 else None),
-        gr.update(choices=meta['columns']),
-        gr.update(visible=bool(meta['datetime_cols'])),
-        gr.update(visible=bool(meta['text_cols'])),
-        gr.update(visible=len(meta['numeric_cols']) > 1)
+        gr.update(choices=meta['columns']), gr.update(visible=bool(meta['datetime_cols'])),
+        gr.update(visible=bool(meta['text_cols'])), gr.update(visible=len(meta['numeric_cols']) > 1)
     )
     yield initial_updates
-
     thread.join()
     progress(1.0, desc="AI Report complete!")
-
     final_updates_list = list(initial_updates)
     final_updates_list[0] = gr.update(value=ai_report_queue[0])
     yield tuple(final_updates_list)
 
-
-# --- Interactive Explorer & Module Callbacks ---
-
+# --- Interactive Explorer Callbacks (Unchanged) ---
 def create_histogram(analyzer, col):
-    if not isinstance(analyzer, DataAnalyzer) or not col:
-        return go.Figure()
+    if not isinstance(analyzer, DataAnalyzer) or not col: return go.Figure()
     return px.histogram(analyzer.df, x=col, title=f"<b>Distribution of {col}</b>", marginal="box")
 
 def create_scatterplot(analyzer, x_col, y_col, color_col):
-    if not isinstance(analyzer, DataAnalyzer) or not x_col or not y_col:
-        return go.Figure()
+    if not isinstance(analyzer, DataAnalyzer) or not x_col or not y_col: return go.Figure()
     df_sample = analyzer.df.sample(n=min(len(analyzer.df), 10000))
     return px.scatter(df_sample, x=x_col, y=y_col, color=color_col if color_col else None)
 
+
+# --- MODIFIED CLUSTERING CALLBACK ---
 def update_clustering(analyzer, k):
+    """
+    Orchestrates the full clustering workflow:
+    1. Runs K-Means clustering.
+    2. Receives cluster labels.
+    3. Calls the profiling module to analyze the segments.
+    4. Returns all results to the UI.
+    """
     if not isinstance(analyzer, DataAnalyzer):
-        return gr.update(), gr.update(), gr.update()
-    fig_cluster, fig_elbow, summary = perform_clustering(analyzer.df, analyzer.metadata['numeric_cols'], k)
-    return fig_cluster, fig_elbow, summary
+        # Return empty updates for all 5 clustering output components
+        return go.Figure(), go.Figure(), "", "", go.Figure()
+
+    # Step 1: Perform Clustering to get visuals and labels
+    fig_cluster, fig_elbow, summary, cluster_labels = perform_clustering(
+        analyzer.df, analyzer.metadata['numeric_cols'], k
+    )
+
+    if cluster_labels.empty:
+        # Handle cases where clustering fails (e.g., not enough data)
+        return fig_cluster, fig_elbow, summary, "Clustering failed. No personas to profile.", go.Figure()
+
+    # Step 2: Profile the resulting clusters
+    numeric_to_profile = ['Total_Revenue', 'Quantity_Ordered', 'Hour']
+    cats_to_profile = ['City', 'Product', 'Day_of_Week']
+
+    # Filter to only use columns that actually exist in the engineered dataframe
+    numeric_to_profile = [c for c in numeric_to_profile if c in analyzer.df.columns]
+    cats_to_profile = [c for c in cats_to_profile if c in analyzer.df.columns]
+
+    md_personas, fig_profile = profile_clusters(
+        analyzer.df, cluster_labels, numeric_to_profile, cats_to_profile
+    )
+
+    # Step 3: Return all 5 results in the correct order for the UI
+    return fig_cluster, fig_elbow, summary, md_personas, fig_profile
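
Note on the changed contract: the callback now unpacks four values from perform_clustering and tests cluster_labels.empty, so modules/clustering.py (not part of this diff) must return per-row labels as a pandas Series alongside the two figures and the summary. A minimal sketch of that assumed contract follows; the internals are illustrative, not the project's actual code.

# Hypothetical sketch of the perform_clustering return contract assumed above.
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

def perform_clustering(df, numeric_cols, k):
    data = df[numeric_cols].dropna()
    if len(numeric_cols) < 2 or len(data) < k:
        # Failure path: an empty Series tells the caller there is nothing to profile.
        return go.Figure(), go.Figure(), "Not enough data to cluster.", pd.Series(dtype=int)

    scaled = StandardScaler().fit_transform(data)
    labels = pd.Series(
        KMeans(n_clusters=k, n_init=10, random_state=42).fit_predict(scaled),
        index=data.index, name="Cluster",
    )

    # Elbow curve over a small range of candidate k values.
    ks = list(range(2, min(9, len(data))))
    inertias = [KMeans(n_clusters=i, n_init=10, random_state=42).fit(scaled).inertia_ for i in ks]
    fig_elbow = px.line(x=ks, y=inertias, labels={"x": "k", "y": "Inertia"}, title="Elbow Method")

    fig_cluster = px.scatter(
        data, x=numeric_cols[0], y=numeric_cols[1],
        color=labels.astype(str), title=f"K-Means Clusters (k={k})",
    )
    summary = f"K-Means assigned {len(data)} rows to {k} clusters."
    return fig_cluster, fig_elbow, summary, labels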
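
Likewise, the new modules/profiling.py is only referenced here. From the call site, profile_clusters(df, labels, numeric_cols, cat_cols) must return a Markdown persona summary plus a Plotly figure. A hypothetical sketch under those assumptions:

# Hypothetical sketch of modules/profiling.profile_clusters as the callback uses it.
import pandas as pd
import plotly.express as px

def profile_clusters(df, labels, numeric_cols, cat_cols):
    # Align rows with their cluster labels (labels may cover a dropna'd subset).
    profiled = df.loc[labels.index].assign(Cluster=labels.values)

    lines = ["### Cluster Personas"]
    for cluster_id, group in profiled.groupby("Cluster"):
        lines.append(f"\n**Cluster {cluster_id}** ({len(group)} rows)")
        for col in numeric_cols:
            lines.append(f"- Mean {col}: {group[col].mean():.2f}")
        for col in cat_cols:
            top = group[col].mode()
            if not top.empty:
                lines.append(f"- Most common {col}: {top.iloc[0]}")
    md_personas = "\n".join(lines)

    # Grouped bars: mean of each numeric column per cluster.
    means = profiled.groupby("Cluster")[numeric_cols].mean().reset_index()
    long = means.melt(id_vars="Cluster", var_name="Metric", value_name="Mean")
    fig_profile = px.bar(long, x="Cluster", y="Mean", color="Metric",
                         barmode="group", title="Cluster Profiles (Numeric Means)")
    return md_personas, fig_profile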