Spaces:
Sleeping
Sleeping
Update ui/callbacks.py
Browse files- ui/callbacks.py +56 -66
ui/callbacks.py
CHANGED
@@ -2,11 +2,10 @@
|
|
2 |
|
3 |
# -*- coding: utf-8 -*-
|
4 |
#
|
5 |
-
# PROJECT: CognitiveEDA v5.
|
6 |
#
|
7 |
# DESCRIPTION: This module contains the core logic for all Gradio event handlers.
|
8 |
-
# The
|
9 |
-
# engineering step before analysis.
|
10 |
|
11 |
import gradio as gr
|
12 |
import pandas as pd
|
@@ -16,120 +15,111 @@ from threading import Thread
|
|
16 |
import plotly.graph_objects as go
|
17 |
import plotly.express as px
|
18 |
|
19 |
-
# --- MODIFIED IMPORT ---
|
20 |
-
# Import both the analyzer class and the new feature engineering function
|
21 |
from core.analyzer import DataAnalyzer, engineer_features
|
22 |
from core.llm import GeminiNarrativeGenerator
|
23 |
from core.config import settings
|
24 |
from core.exceptions import DataProcessingError
|
25 |
from modules.clustering import perform_clustering
|
|
|
|
|
26 |
|
27 |
|
28 |
-
# --- Primary Analysis Chain ---
|
29 |
-
|
30 |
def run_initial_analysis(file_obj, progress=gr.Progress(track_tqdm=True)):
|
31 |
-
""
|
32 |
-
Phase 1: Now includes the strategic feature engineering step.
|
33 |
-
Validates inputs, loads raw data, applies feature engineering, and then
|
34 |
-
creates the core DataAnalyzer object on the transformed data.
|
35 |
-
"""
|
36 |
-
if file_obj is None:
|
37 |
-
raise gr.Error("No file uploaded. Please upload a CSV or Excel file.")
|
38 |
-
|
39 |
progress(0, desc="Validating configuration...")
|
40 |
-
if not settings.GOOGLE_API_KEY:
|
41 |
-
logging.error("Analysis attempted without GOOGLE_API_KEY set.")
|
42 |
-
raise gr.Error("CRITICAL: GOOGLE_API_KEY is not configured. Please add it as a secret.")
|
43 |
-
|
44 |
try:
|
45 |
progress(0.1, desc="Loading raw data...")
|
46 |
df_raw = pd.read_csv(file_obj.name) if file_obj.name.endswith('.csv') else pd.read_excel(file_obj.name)
|
47 |
if len(df_raw) > settings.MAX_UI_ROWS:
|
48 |
df_raw = df_raw.sample(n=settings.MAX_UI_ROWS, random_state=42)
|
49 |
-
logging.info(f"DataFrame sampled down to {settings.MAX_UI_ROWS} rows.")
|
50 |
-
|
51 |
-
# --- INTEGRATION POINT ---
|
52 |
-
# Apply the feature engineering function immediately after loading
|
53 |
progress(0.5, desc="Applying strategic feature engineering...")
|
54 |
df_engineered = engineer_features(df_raw)
|
55 |
-
|
56 |
-
|
57 |
-
progress(0.8, desc="Instantiating analysis engine on engineered data...")
|
58 |
-
# The analyzer now works with the transformed, high-value dataset
|
59 |
analyzer = DataAnalyzer(df_engineered)
|
60 |
-
|
61 |
progress(1.0, desc="Analysis complete. Generating reports...")
|
62 |
return analyzer
|
63 |
except Exception as e:
|
64 |
-
logging.error(f"
|
65 |
-
raise gr.Error(f"Analysis Failed
|
66 |
-
|
67 |
|
68 |
def generate_reports_and_visuals(analyzer, progress=gr.Progress(track_tqdm=True)):
|
69 |
-
"""
|
70 |
-
Phase 2: Slower, multi-stage report and visual generation.
|
71 |
-
Yields tuples of UI updates based on the *engineered* data.
|
72 |
-
"""
|
73 |
if not isinstance(analyzer, DataAnalyzer):
|
74 |
-
logging.warning("generate_reports_and_visuals called without a valid analyzer. Clearing UI.")
|
75 |
yield (None,) * 14
|
76 |
return
|
77 |
-
|
78 |
progress(0, desc="Spawning AI report thread...")
|
79 |
ai_report_queue = [""]
|
80 |
-
def generate_ai_report_threaded(
|
81 |
-
narrative_generator = GeminiNarrativeGenerator(
|
82 |
-
ai_report_queue[0] = narrative_generator.generate_narrative(
|
83 |
-
|
84 |
thread = Thread(target=generate_ai_report_threaded, args=(analyzer,))
|
85 |
thread.start()
|
86 |
-
|
87 |
progress(0.4, desc="Generating reports and visuals...")
|
88 |
meta = analyzer.metadata
|
89 |
missing_df, num_df, cat_df = analyzer.get_profiling_reports()
|
90 |
fig_types, fig_missing, fig_corr = analyzer.get_overview_visuals()
|
91 |
-
|
92 |
initial_updates = (
|
93 |
-
gr.update(value="⏳ Generating AI report...
|
94 |
-
gr.update(value=
|
95 |
-
gr.update(value=
|
96 |
-
gr.update(value=cat_df),
|
97 |
-
gr.update(value=fig_types),
|
98 |
-
gr.update(value=fig_missing),
|
99 |
-
gr.update(value=fig_corr),
|
100 |
gr.update(choices=meta['numeric_cols'], value=meta['numeric_cols'][0] if meta['numeric_cols'] else None),
|
101 |
gr.update(choices=meta['numeric_cols'], value=meta['numeric_cols'][0] if meta['numeric_cols'] else None),
|
102 |
gr.update(choices=meta['numeric_cols'], value=meta['numeric_cols'][1] if len(meta['numeric_cols']) > 1 else None),
|
103 |
-
gr.update(choices=meta['columns']),
|
104 |
-
gr.update(visible=bool(meta['
|
105 |
-
gr.update(visible=bool(meta['text_cols'])),
|
106 |
-
gr.update(visible=len(meta['numeric_cols']) > 1)
|
107 |
)
|
108 |
yield initial_updates
|
109 |
-
|
110 |
thread.join()
|
111 |
progress(1.0, desc="AI Report complete!")
|
112 |
-
|
113 |
final_updates_list = list(initial_updates)
|
114 |
final_updates_list[0] = gr.update(value=ai_report_queue[0])
|
115 |
yield tuple(final_updates_list)
|
116 |
|
117 |
-
|
118 |
-
# --- Interactive Explorer & Module Callbacks ---
|
119 |
-
|
120 |
def create_histogram(analyzer, col):
|
121 |
-
if not isinstance(analyzer, DataAnalyzer) or not col:
|
122 |
-
return go.Figure()
|
123 |
return px.histogram(analyzer.df, x=col, title=f"<b>Distribution of {col}</b>", marginal="box")
|
124 |
|
125 |
def create_scatterplot(analyzer, x_col, y_col, color_col):
|
126 |
-
if not isinstance(analyzer, DataAnalyzer) or not x_col or not y_col:
|
127 |
-
return go.Figure()
|
128 |
df_sample = analyzer.df.sample(n=min(len(analyzer.df), 10000))
|
129 |
return px.scatter(df_sample, x=x_col, y=y_col, color=color_col if color_col else None)
|
130 |
|
|
|
|
|
131 |
def update_clustering(analyzer, k):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
132 |
if not isinstance(analyzer, DataAnalyzer):
|
133 |
-
|
134 |
-
|
135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
# -*- coding: utf-8 -*-
|
4 |
#
|
5 |
+
# PROJECT: CognitiveEDA v5.7 - The QuantumLeap Intelligence Platform
|
6 |
#
|
7 |
# DESCRIPTION: This module contains the core logic for all Gradio event handlers.
|
8 |
+
# The clustering callback is now updated to include persona profiling.
|
|
|
9 |
|
10 |
import gradio as gr
|
11 |
import pandas as pd
|
|
|
15 |
import plotly.graph_objects as go
|
16 |
import plotly.express as px
|
17 |
|
|
|
|
|
18 |
from core.analyzer import DataAnalyzer, engineer_features
|
19 |
from core.llm import GeminiNarrativeGenerator
|
20 |
from core.config import settings
|
21 |
from core.exceptions import DataProcessingError
|
22 |
from modules.clustering import perform_clustering
|
23 |
+
# --- NEW IMPORT ---
|
24 |
+
from modules.profiling import profile_clusters
|
25 |
|
26 |
|
27 |
+
# --- Primary Analysis Chain (Unchanged) ---
|
|
|
28 |
def run_initial_analysis(file_obj, progress=gr.Progress(track_tqdm=True)):
|
29 |
+
if file_obj is None: raise gr.Error("No file uploaded.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
progress(0, desc="Validating configuration...")
|
31 |
+
if not settings.GOOGLE_API_KEY: raise gr.Error("CRITICAL: GOOGLE_API_KEY is not configured.")
|
|
|
|
|
|
|
32 |
try:
|
33 |
progress(0.1, desc="Loading raw data...")
|
34 |
df_raw = pd.read_csv(file_obj.name) if file_obj.name.endswith('.csv') else pd.read_excel(file_obj.name)
|
35 |
if len(df_raw) > settings.MAX_UI_ROWS:
|
36 |
df_raw = df_raw.sample(n=settings.MAX_UI_ROWS, random_state=42)
|
|
|
|
|
|
|
|
|
37 |
progress(0.5, desc="Applying strategic feature engineering...")
|
38 |
df_engineered = engineer_features(df_raw)
|
39 |
+
progress(0.8, desc="Instantiating analysis engine...")
|
|
|
|
|
|
|
40 |
analyzer = DataAnalyzer(df_engineered)
|
|
|
41 |
progress(1.0, desc="Analysis complete. Generating reports...")
|
42 |
return analyzer
|
43 |
except Exception as e:
|
44 |
+
logging.error(f"Error in initial analysis: {e}", exc_info=True)
|
45 |
+
raise gr.Error(f"Analysis Failed: {str(e)}")
|
|
|
46 |
|
47 |
def generate_reports_and_visuals(analyzer, progress=gr.Progress(track_tqdm=True)):
|
|
|
|
|
|
|
|
|
48 |
if not isinstance(analyzer, DataAnalyzer):
|
|
|
49 |
yield (None,) * 14
|
50 |
return
|
|
|
51 |
progress(0, desc="Spawning AI report thread...")
|
52 |
ai_report_queue = [""]
|
53 |
+
def generate_ai_report_threaded(a):
|
54 |
+
narrative_generator = GeminiNarrativeGenerator(settings.GOOGLE_API_KEY)
|
55 |
+
ai_report_queue[0] = narrative_generator.generate_narrative(a)
|
|
|
56 |
thread = Thread(target=generate_ai_report_threaded, args=(analyzer,))
|
57 |
thread.start()
|
|
|
58 |
progress(0.4, desc="Generating reports and visuals...")
|
59 |
meta = analyzer.metadata
|
60 |
missing_df, num_df, cat_df = analyzer.get_profiling_reports()
|
61 |
fig_types, fig_missing, fig_corr = analyzer.get_overview_visuals()
|
|
|
62 |
initial_updates = (
|
63 |
+
gr.update(value="⏳ Generating AI report..."), gr.update(value=missing_df),
|
64 |
+
gr.update(value=num_df), gr.update(value=cat_df), gr.update(value=fig_types),
|
65 |
+
gr.update(value=fig_missing), gr.update(value=fig_corr),
|
|
|
|
|
|
|
|
|
66 |
gr.update(choices=meta['numeric_cols'], value=meta['numeric_cols'][0] if meta['numeric_cols'] else None),
|
67 |
gr.update(choices=meta['numeric_cols'], value=meta['numeric_cols'][0] if meta['numeric_cols'] else None),
|
68 |
gr.update(choices=meta['numeric_cols'], value=meta['numeric_cols'][1] if len(meta['numeric_cols']) > 1 else None),
|
69 |
+
gr.update(choices=meta['columns']), gr.update(visible=bool(meta['datetime_cols'])),
|
70 |
+
gr.update(visible=bool(meta['text_cols'])), gr.update(visible=len(meta['numeric_cols']) > 1)
|
|
|
|
|
71 |
)
|
72 |
yield initial_updates
|
|
|
73 |
thread.join()
|
74 |
progress(1.0, desc="AI Report complete!")
|
|
|
75 |
final_updates_list = list(initial_updates)
|
76 |
final_updates_list[0] = gr.update(value=ai_report_queue[0])
|
77 |
yield tuple(final_updates_list)
|
78 |
|
79 |
+
# --- Interactive Explorer Callbacks (Unchanged) ---
|
|
|
|
|
80 |
def create_histogram(analyzer, col):
|
81 |
+
if not isinstance(analyzer, DataAnalyzer) or not col: return go.Figure()
|
|
|
82 |
return px.histogram(analyzer.df, x=col, title=f"<b>Distribution of {col}</b>", marginal="box")
|
83 |
|
84 |
def create_scatterplot(analyzer, x_col, y_col, color_col):
|
85 |
+
if not isinstance(analyzer, DataAnalyzer) or not x_col or not y_col: return go.Figure()
|
|
|
86 |
df_sample = analyzer.df.sample(n=min(len(analyzer.df), 10000))
|
87 |
return px.scatter(df_sample, x=x_col, y=y_col, color=color_col if color_col else None)
|
88 |
|
89 |
+
|
90 |
+
# --- MODIFIED CLUSTERING CALLBACK ---
|
91 |
def update_clustering(analyzer, k):
|
92 |
+
"""
|
93 |
+
Orchestrates the full clustering workflow:
|
94 |
+
1. Runs K-Means clustering.
|
95 |
+
2. Receives cluster labels.
|
96 |
+
3. Calls the profiling module to analyze the segments.
|
97 |
+
4. Returns all results to the UI.
|
98 |
+
"""
|
99 |
if not isinstance(analyzer, DataAnalyzer):
|
100 |
+
# Return empty updates for all 5 clustering output components
|
101 |
+
return go.Figure(), go.Figure(), "", "", go.Figure()
|
102 |
+
|
103 |
+
# Step 1: Perform Clustering to get visuals and labels
|
104 |
+
fig_cluster, fig_elbow, summary, cluster_labels = perform_clustering(
|
105 |
+
analyzer.df, analyzer.metadata['numeric_cols'], k
|
106 |
+
)
|
107 |
+
|
108 |
+
if cluster_labels.empty:
|
109 |
+
# Handle cases where clustering fails (e.g., not enough data)
|
110 |
+
return fig_cluster, fig_elbow, summary, "Clustering failed. No personas to profile.", go.Figure()
|
111 |
+
|
112 |
+
# Step 2: Profile the resulting clusters
|
113 |
+
numeric_to_profile = ['Total_Revenue', 'Quantity_Ordered', 'Hour']
|
114 |
+
cats_to_profile = ['City', 'Product', 'Day_of_Week']
|
115 |
+
|
116 |
+
# Filter to only use columns that actually exist in the engineered dataframe
|
117 |
+
numeric_to_profile = [c for c in numeric_to_profile if c in analyzer.df.columns]
|
118 |
+
cats_to_profile = [c for c in cats_to_profile if c in analyzer.df.columns]
|
119 |
+
|
120 |
+
md_personas, fig_profile = profile_clusters(
|
121 |
+
analyzer.df, cluster_labels, numeric_to_profile, cats_to_profile
|
122 |
+
)
|
123 |
+
|
124 |
+
# Step 3: Return all 5 results in the correct order for the UI
|
125 |
+
return fig_cluster, fig_elbow, summary, md_personas, fig_profile
|