Spaces:
Sleeping
Sleeping
Update ui/callbacks.py
Browse files- ui/callbacks.py +79 -46
ui/callbacks.py
CHANGED
@@ -2,10 +2,11 @@
|
|
2 |
|
3 |
# -*- coding: utf-8 -*-
|
4 |
#
|
5 |
-
# PROJECT: CognitiveEDA v5.
|
6 |
#
|
7 |
-
# DESCRIPTION: This module
|
8 |
-
#
|
|
|
9 |
|
10 |
import gradio as gr
|
11 |
import pandas as pd
|
@@ -18,13 +19,11 @@ import plotly.express as px
|
|
18 |
from core.analyzer import DataAnalyzer, engineer_features
|
19 |
from core.llm import GeminiNarrativeGenerator
|
20 |
from core.config import settings
|
21 |
-
from core.exceptions import DataProcessingError
|
22 |
from modules.clustering import perform_clustering
|
23 |
-
# --- NEW IMPORT ---
|
24 |
from modules.profiling import profile_clusters
|
25 |
|
|
|
26 |
|
27 |
-
# --- Primary Analysis Chain (Unchanged) ---
|
28 |
def run_initial_analysis(file_obj, progress=gr.Progress(track_tqdm=True)):
|
29 |
if file_obj is None: raise gr.Error("No file uploaded.")
|
30 |
progress(0, desc="Validating configuration...")
|
@@ -45,81 +44,115 @@ def run_initial_analysis(file_obj, progress=gr.Progress(track_tqdm=True)):
|
|
45 |
raise gr.Error(f"Analysis Failed: {str(e)}")
|
46 |
|
47 |
def generate_reports_and_visuals(analyzer, progress=gr.Progress(track_tqdm=True)):
|
|
|
|
|
|
|
48 |
if not isinstance(analyzer, DataAnalyzer):
|
49 |
-
yield (None,) *
|
50 |
return
|
|
|
51 |
progress(0, desc="Spawning AI report thread...")
|
52 |
ai_report_queue = [""]
|
53 |
-
def generate_ai_report_threaded(a):
|
54 |
-
narrative_generator = GeminiNarrativeGenerator(settings.GOOGLE_API_KEY)
|
55 |
-
ai_report_queue[0] = narrative_generator.generate_narrative(a)
|
56 |
thread = Thread(target=generate_ai_report_threaded, args=(analyzer,))
|
57 |
thread.start()
|
|
|
58 |
progress(0.4, desc="Generating reports and visuals...")
|
59 |
meta = analyzer.metadata
|
60 |
missing_df, num_df, cat_df = analyzer.get_profiling_reports()
|
61 |
fig_types, fig_missing, fig_corr = analyzer.get_overview_visuals()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
initial_updates = (
|
63 |
gr.update(value="⏳ Generating AI report..."), gr.update(value=missing_df),
|
64 |
gr.update(value=num_df), gr.update(value=cat_df), gr.update(value=fig_types),
|
65 |
gr.update(value=fig_missing), gr.update(value=fig_corr),
|
66 |
-
gr.update(choices=meta
|
67 |
-
gr.update(choices=meta
|
68 |
-
gr.update(choices=meta
|
69 |
-
gr.update(choices=meta
|
70 |
-
gr.update(visible=bool(meta
|
|
|
71 |
)
|
72 |
yield initial_updates
|
|
|
73 |
thread.join()
|
74 |
progress(1.0, desc="AI Report complete!")
|
75 |
final_updates_list = list(initial_updates)
|
76 |
final_updates_list[0] = gr.update(value=ai_report_queue[0])
|
77 |
yield tuple(final_updates_list)
|
78 |
|
79 |
-
# ---
|
80 |
-
def create_histogram(analyzer, col):
|
81 |
-
if not isinstance(analyzer, DataAnalyzer) or not col: return go.Figure()
|
82 |
-
return px.histogram(analyzer.df, x=col, title=f"<b>Distribution of {col}</b>", marginal="box")
|
83 |
-
|
84 |
-
def create_scatterplot(analyzer, x_col, y_col, color_col):
|
85 |
-
if not isinstance(analyzer, DataAnalyzer) or not x_col or not y_col: return go.Figure()
|
86 |
-
df_sample = analyzer.df.sample(n=min(len(analyzer.df), 10000))
|
87 |
-
return px.scatter(df_sample, x=x_col, y=y_col, color=color_col if color_col else None)
|
88 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
|
90 |
-
|
91 |
-
def update_clustering(analyzer, k):
|
92 |
"""
|
93 |
-
Orchestrates the full clustering workflow
|
94 |
-
|
95 |
-
2. Receives cluster labels.
|
96 |
-
3. Calls the profiling module to analyze the segments.
|
97 |
-
4. Returns all results to the UI.
|
98 |
"""
|
99 |
if not isinstance(analyzer, DataAnalyzer):
|
100 |
-
# Return empty updates for all 5 clustering output components
|
101 |
return go.Figure(), go.Figure(), "", "", go.Figure()
|
102 |
|
103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
fig_cluster, fig_elbow, summary, cluster_labels = perform_clustering(
|
105 |
-
|
106 |
)
|
107 |
|
108 |
if cluster_labels.empty:
|
109 |
-
|
110 |
-
return fig_cluster, fig_elbow, summary, "Clustering failed. No personas to profile.", go.Figure()
|
111 |
|
112 |
-
# Step
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
# Filter to only use columns that actually exist in the engineered dataframe
|
117 |
-
numeric_to_profile = [c for c in numeric_to_profile if c in analyzer.df.columns]
|
118 |
-
cats_to_profile = [c for c in cats_to_profile if c in analyzer.df.columns]
|
119 |
|
120 |
md_personas, fig_profile = profile_clusters(
|
121 |
-
|
122 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
123 |
|
124 |
-
|
125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
# -*- coding: utf-8 -*-
|
4 |
#
|
5 |
+
# PROJECT: CognitiveEDA v5.9 - The QuantumLeap Intelligence Platform
|
6 |
#
|
7 |
+
# DESCRIPTION: This module is updated with a generic, data-agnostic
|
8 |
+
# stratification engine. It dynamically identifies candidate
|
9 |
+
# features for filtering and updates the UI accordingly.
|
10 |
|
11 |
import gradio as gr
|
12 |
import pandas as pd
|
|
|
19 |
from core.analyzer import DataAnalyzer, engineer_features
|
20 |
from core.llm import GeminiNarrativeGenerator
|
21 |
from core.config import settings
|
|
|
22 |
from modules.clustering import perform_clustering
|
|
|
23 |
from modules.profiling import profile_clusters
|
24 |
|
25 |
+
# --- Primary Analysis Chain ---
|
26 |
|
|
|
27 |
def run_initial_analysis(file_obj, progress=gr.Progress(track_tqdm=True)):
|
28 |
if file_obj is None: raise gr.Error("No file uploaded.")
|
29 |
progress(0, desc="Validating configuration...")
|
|
|
44 |
raise gr.Error(f"Analysis Failed: {str(e)}")
|
45 |
|
46 |
def generate_reports_and_visuals(analyzer, progress=gr.Progress(track_tqdm=True)):
    """
    Phase 2: build the profiling reports and visuals and refresh the UI controls.

    This is a generator so the UI is updated twice: first with everything that
    is computed locally while the AI narrative is still being produced on a
    background thread, then again once that thread has finished.

    Args:
        analyzer: The DataAnalyzer to report on. Any other value short-circuits
            with a tuple of 15 ``None`` placeholders.
        progress: Gradio progress tracker used to report the current stage.

    Yields:
        A 15-tuple of component updates, in this order: AI report text,
        missing-values table, numeric table, categorical table, the three
        overview figures, three choice lists fed with the numeric columns, one
        choice list fed with all columns, three visibility flags (datetime
        columns present, text columns present, more than one numeric column)
        and the 'Stratify By' dropdown.
    """
    if not isinstance(analyzer, DataAnalyzer):
        # One placeholder per output component (must match the tuple below).
        yield (None,) * 15
        return

    progress(0, desc="Spawning AI report thread...")
    # Single-slot list used as a mutable cell the worker thread writes into.
    ai_report_queue = [""]

    def generate_ai_report_threaded(a):
        # An exception escaping a worker thread never reaches this generator;
        # without the handler the placeholder would be replaced by an empty
        # report. Surface the failure in the report slot instead.
        try:
            narrative_generator = GeminiNarrativeGenerator(settings.GOOGLE_API_KEY)
            ai_report_queue[0] = narrative_generator.generate_narrative(a)
        except Exception as exc:  # thread boundary: report, do not crash silently
            ai_report_queue[0] = f"❌ AI report generation failed: {exc}"

    thread = Thread(target=generate_ai_report_threaded, args=(analyzer,))
    thread.start()

    progress(0.4, desc="Generating reports and visuals...")
    meta = analyzer.metadata
    missing_df, num_df, cat_df = analyzer.get_profiling_reports()
    fig_types, fig_missing, fig_corr = analyzer.get_overview_visuals()

    # --- Dynamically identify candidate columns for stratification ---
    # Object-dtype columns qualify only when they have more than 1 and fewer
    # than 50 unique values (heuristic); categorical columns of any other
    # dtype are always offered.
    candidate_cols = ["(Do not stratify)"]
    for col in meta.get('categorical_cols', []):
        column = analyzer.df[col]
        if column.dtype.name != 'object' or (1 < column.nunique() < 50):
            candidate_cols.append(col)

    numeric_cols = meta.get('numeric_cols', [])
    initial_updates = (
        gr.update(value="⏳ Generating AI report..."),
        gr.update(value=missing_df),
        gr.update(value=num_df),
        gr.update(value=cat_df),
        gr.update(value=fig_types),
        gr.update(value=fig_missing),
        gr.update(value=fig_corr),
        gr.update(choices=numeric_cols),
        gr.update(choices=numeric_cols),
        gr.update(choices=numeric_cols),
        gr.update(choices=meta.get('columns', [])),
        gr.update(visible=bool(meta.get('datetime_cols'))),
        gr.update(visible=bool(meta.get('text_cols'))),
        gr.update(visible=len(numeric_cols) > 1),
        gr.update(choices=candidate_cols, value="(Do not stratify)"),  # dd_stratify_by_col
    )
    yield initial_updates

    # Wait for the narrative, then re-emit the same updates with slot 0 filled.
    thread.join()
    progress(1.0, desc="AI Report complete!")
    final_updates_list = list(initial_updates)
    final_updates_list[0] = gr.update(value=ai_report_queue[0])
    yield tuple(final_updates_list)
|
91 |
|
92 |
+
# --- Stratification Callbacks ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
|
94 |
+
def update_filter_dropdown(analyzer, stratify_col):
    """
    Populate the filter-value dropdown for the selected stratification feature.

    Args:
        analyzer: The current DataAnalyzer; anything else disables the dropdown.
        stratify_col: Name of the column chosen in the 'Stratify By' dropdown,
            or the "(Do not stratify)" sentinel.

    Returns:
        A Gradio update. When no usable column is selected the dropdown is
        emptied and made non-interactive; otherwise its choices are
        "(Global Analysis)" followed by the column's distinct non-missing
        values, with "(Global Analysis)" preselected.
    """
    if not isinstance(analyzer, DataAnalyzer) or not stratify_col or stratify_col == "(Do not stratify)":
        return gr.update(choices=[], value=None, interactive=False)

    # A stale selection (column absent from the current frame) would raise
    # KeyError below; treat it like "nothing selected".
    if stratify_col not in analyzer.df.columns:
        return gr.update(choices=[], value=None, interactive=False)

    # Missing values are dropped: NaN is not a selectable filter value, and a
    # float NaN next to strings makes sorted() raise TypeError.
    unique_values = analyzer.df[stratify_col].dropna().unique().tolist()
    try:
        ordered_values = sorted(unique_values)
    except TypeError:
        # Mixed, mutually unorderable types: fall back to ordering by text.
        ordered_values = sorted(unique_values, key=str)

    values = ["(Global Analysis)"] + ordered_values
    return gr.update(choices=values, value="(Global Analysis)", interactive=True)
|
104 |
|
105 |
+
def update_stratified_clustering(analyzer, stratify_col, filter_value, k):
    """
    Run the clustering and profiling workflow on an optionally filtered dataset.

    Args:
        analyzer: The current DataAnalyzer; anything else yields empty outputs.
        stratify_col: Column to filter on, or the "(Do not stratify)" sentinel.
        filter_value: Value of ``stratify_col`` to keep, or the
            "(Global Analysis)" sentinel to use every row.
        k: Requested number of clusters.

    Returns:
        A 5-tuple ``(fig_cluster, fig_elbow, summary, md_personas, fig_profile)``.
        On failure the figures are empty and the text slots carry the message.
    """
    if not isinstance(analyzer, DataAnalyzer):
        return go.Figure(), go.Figure(), "", "", go.Figure()

    logging.info(
        "Updating clustering. Stratify by: '%s', Filter: '%s', K=%s",
        stratify_col, filter_value, k,
    )

    # Step 1: Stratify the DataFrame based on user selection
    analysis_df = analyzer.df
    report_title_prefix = "Global Analysis:"

    column_selected = bool(stratify_col) and stratify_col != "(Do not stratify)"
    # Compare against the "nothing chosen" markers explicitly: a plain
    # truthiness test would discard legitimate filter values such as 0 or False.
    value_selected = (
        filter_value is not None
        and filter_value != ""
        and filter_value != "(Global Analysis)"
    )

    if column_selected and value_selected:
        if stratify_col not in analyzer.df.columns:
            # Stale selection; avoid a KeyError and tell the user instead.
            error_msg = f"Column '{stratify_col}' is not available in the current dataset."
            return go.Figure(), go.Figure(), error_msg, error_msg, go.Figure()
        analysis_df = analyzer.df[analyzer.df[stratify_col] == filter_value]
        report_title_prefix = f"Analysis for '{stratify_col}' = '{filter_value}':"

    if len(analysis_df) < k:
        error_msg = f"Not enough data ({len(analysis_df)} rows) to form {k} clusters for the selected filter."
        return go.Figure(), go.Figure(), error_msg, error_msg, go.Figure()

    # Step 2: Perform Clustering
    numeric_cols = [c for c in analyzer.metadata.get('numeric_cols', []) if c in analysis_df.columns]
    fig_cluster, fig_elbow, summary, cluster_labels = perform_clustering(
        analysis_df, numeric_cols, k
    )

    if cluster_labels.empty:
        return fig_cluster, fig_elbow, summary, "Clustering failed.", go.Figure()

    # Step 3: Profile the resulting clusters
    cats_to_profile = [c for c in analyzer.metadata.get('categorical_cols', []) if c in analysis_df.columns]
    # These calendar-style columns are used for clustering but left out of the
    # numeric persona profile.
    excluded_from_profile = ('Month', 'Day_of_Week', 'Is_Weekend', 'Hour')
    numeric_to_profile = [c for c in numeric_cols if c not in excluded_from_profile]

    md_personas, fig_profile = profile_clusters(
        analysis_df, cluster_labels, numeric_to_profile, cats_to_profile
    )

    # The separating space sits outside the ** markers: Markdown does not
    # close bold on a delimiter preceded by whitespace.
    summary = f"**{report_title_prefix}** " + summary
    md_personas = f"**{report_title_prefix}** " + md_personas

    # Step 4: Return all results
    return fig_cluster, fig_elbow, summary, md_personas, fig_profile
|
149 |
|
150 |
+
# --- Other Callbacks ---
|
151 |
+
def create_histogram(analyzer, col):
    """
    Build a histogram with a box-plot marginal for one column.

    Returns an empty figure when `analyzer` is not a DataAnalyzer or when no
    column has been selected.
    """
    if isinstance(analyzer, DataAnalyzer) and col:
        chart_title = f"<b>Distribution of {col}</b>"
        return px.histogram(analyzer.df, x=col, title=chart_title, marginal="box")
    return go.Figure()
|
154 |
+
|
155 |
+
def create_scatterplot(analyzer, x_col, y_col, color_col):
    """
    Build a scatter plot of `y_col` against `x_col`, optionally coloured.

    At most 10000 randomly sampled rows are plotted. An empty figure is
    returned when `analyzer` is not a DataAnalyzer or either axis is unset.
    """
    if isinstance(analyzer, DataAnalyzer) and x_col and y_col:
        # Cap the number of plotted points at 10000 rows.
        sample_size = min(len(analyzer.df), 10000)
        plotted_rows = analyzer.df.sample(n=sample_size)
        # An unset colour selection is passed on as None.
        return px.scatter(plotted_rows, x=x_col, y=y_col, color=color_col or None)
    return go.Figure()
|