Spaces:

mgbam
/

CognitiveEDA

Sleeping

App Files Files Community

mgbam committed 9 days ago

Commit

57f807e

verified ·

1 Parent(s): 05e596d

Update ui/callbacks.py

Browse files

Files changed (1) hide show

ui/callbacks.py +79 -46

ui/callbacks.py CHANGED Viewed

@@ -2,10 +2,11 @@
 # -*- coding: utf-8 -*-
 #
-# PROJECT:      CognitiveEDA v5.7 - The QuantumLeap Intelligence Platform
 #
-# DESCRIPTION:  This module contains the core logic for all Gradio event handlers.
-#               The clustering callback is now updated to include persona profiling.
 import gradio as gr
 import pandas as pd
@@ -18,13 +19,11 @@ import plotly.express as px
 from core.analyzer import DataAnalyzer, engineer_features
 from core.llm import GeminiNarrativeGenerator
 from core.config import settings
-from core.exceptions import DataProcessingError
 from modules.clustering import perform_clustering
-# --- NEW IMPORT ---
 from modules.profiling import profile_clusters
-# --- Primary Analysis Chain (Unchanged) ---
 def run_initial_analysis(file_obj, progress=gr.Progress(track_tqdm=True)):
     if file_obj is None: raise gr.Error("No file uploaded.")
     progress(0, desc="Validating configuration...")
@@ -45,81 +44,115 @@ def run_initial_analysis(file_obj, progress=gr.Progress(track_tqdm=True)):
         raise gr.Error(f"Analysis Failed: {str(e)}")
 def generate_reports_and_visuals(analyzer, progress=gr.Progress(track_tqdm=True)):
     if not isinstance(analyzer, DataAnalyzer):
-        yield (None,) * 14
         return
     progress(0, desc="Spawning AI report thread...")
     ai_report_queue = [""]
-    def generate_ai_report_threaded(a):
-        narrative_generator = GeminiNarrativeGenerator(settings.GOOGLE_API_KEY)
-        ai_report_queue[0] = narrative_generator.generate_narrative(a)
     thread = Thread(target=generate_ai_report_threaded, args=(analyzer,))
     thread.start()
     progress(0.4, desc="Generating reports and visuals...")
     meta = analyzer.metadata
     missing_df, num_df, cat_df = analyzer.get_profiling_reports()
     fig_types, fig_missing, fig_corr = analyzer.get_overview_visuals()
     initial_updates = (
         gr.update(value="⏳ Generating AI report..."), gr.update(value=missing_df),
         gr.update(value=num_df), gr.update(value=cat_df), gr.update(value=fig_types),
         gr.update(value=fig_missing), gr.update(value=fig_corr),
-        gr.update(choices=meta['numeric_cols'], value=meta['numeric_cols'][0] if meta['numeric_cols'] else None),
-        gr.update(choices=meta['numeric_cols'], value=meta['numeric_cols'][0] if meta['numeric_cols'] else None),
-        gr.update(choices=meta['numeric_cols'], value=meta['numeric_cols'][1] if len(meta['numeric_cols']) > 1 else None),
-        gr.update(choices=meta['columns']), gr.update(visible=bool(meta['datetime_cols'])),
-        gr.update(visible=bool(meta['text_cols'])), gr.update(visible=len(meta['numeric_cols']) > 1)
     )
     yield initial_updates
     thread.join()
     progress(1.0, desc="AI Report complete!")
     final_updates_list = list(initial_updates)
     final_updates_list[0] = gr.update(value=ai_report_queue[0])
     yield tuple(final_updates_list)
-# --- Interactive Explorer Callbacks (Unchanged) ---
-def create_histogram(analyzer, col):
-    if not isinstance(analyzer, DataAnalyzer) or not col: return go.Figure()
-    return px.histogram(analyzer.df, x=col, title=f"<b>Distribution of {col}</b>", marginal="box")
-def create_scatterplot(analyzer, x_col, y_col, color_col):
-    if not isinstance(analyzer, DataAnalyzer) or not x_col or not y_col: return go.Figure()
-    df_sample = analyzer.df.sample(n=min(len(analyzer.df), 10000))
-    return px.scatter(df_sample, x=x_col, y=y_col, color=color_col if color_col else None)
-# --- MODIFIED CLUSTERING CALLBACK ---
-def update_clustering(analyzer, k):
     """
-    Orchestrates the full clustering workflow:
-    1. Runs K-Means clustering.
-    2. Receives cluster labels.
-    3. Calls the profiling module to analyze the segments.
-    4. Returns all results to the UI.
     """
     if not isinstance(analyzer, DataAnalyzer):
-        # Return empty updates for all 5 clustering output components
         return go.Figure(), go.Figure(), "", "", go.Figure()
-    # Step 1: Perform Clustering to get visuals and labels
     fig_cluster, fig_elbow, summary, cluster_labels = perform_clustering(
-        analyzer.df, analyzer.metadata['numeric_cols'], k
     )
     if cluster_labels.empty:
-        # Handle cases where clustering fails (e.g., not enough data)
-        return fig_cluster, fig_elbow, summary, "Clustering failed. No personas to profile.", go.Figure()
-    # Step 2: Profile the resulting clusters
-    numeric_to_profile = ['Total_Revenue', 'Quantity_Ordered', 'Hour']
-    cats_to_profile = ['City', 'Product', 'Day_of_Week']
-    # Filter to only use columns that actually exist in the engineered dataframe
-    numeric_to_profile = [c for c in numeric_to_profile if c in analyzer.df.columns]
-    cats_to_profile = [c for c in cats_to_profile if c in analyzer.df.columns]
     md_personas, fig_profile = profile_clusters(
-        analyzer.df, cluster_labels, numeric_to_profile, cats_to_profile
     )
-    # Step 3: Return all 5 results in the correct order for the UI
-    return fig_cluster, fig_elbow, summary, md_personas, fig_profile

 # -*- coding: utf-8 -*-
 #
+# PROJECT:      CognitiveEDA v5.9 - The QuantumLeap Intelligence Platform
 #
+# DESCRIPTION:  This module is updated with a generic, data-agnostic
+#               stratification engine. It dynamically identifies candidate
+#               features for filtering and updates the UI accordingly.
 import gradio as gr
 import pandas as pd
 from core.analyzer import DataAnalyzer, engineer_features
 from core.llm import GeminiNarrativeGenerator
 from core.config import settings
 from modules.clustering import perform_clustering
 from modules.profiling import profile_clusters
+# --- Primary Analysis Chain ---
 def run_initial_analysis(file_obj, progress=gr.Progress(track_tqdm=True)):
     if file_obj is None: raise gr.Error("No file uploaded.")
     progress(0, desc="Validating configuration...")
         raise gr.Error(f"Analysis Failed: {str(e)}")
 def generate_reports_and_visuals(analyzer, progress=gr.Progress(track_tqdm=True)):
+    """
+    Phase 2: Build the profiling reports and overview visuals, refresh the
+    column choices (including the generic 'Stratify By' candidates), and
+    stream in the AI narrative.
+
+    This is a generator. It yields a 15-element tuple of updates twice: first
+    with a placeholder in the AI-report slot, then again with the finished
+    narrative (or an error message) once the background thread completes.
+    If `analyzer` is not a DataAnalyzer it yields fifteen `None` values once
+    and stops.
+    """
     if not isinstance(analyzer, DataAnalyzer):
+        yield (None,) * 15
         return
     progress(0, desc="Spawning AI report thread...")
     ai_report_queue = [""]
+    def generate_ai_report_threaded(a):
+        # An exception raised inside a worker thread never propagates to this
+        # generator, so catch it here and surface it in the report slot
+        # instead of leaving that slot silently empty.
+        try:
+            ai_report_queue[0] = GeminiNarrativeGenerator(settings.GOOGLE_API_KEY).generate_narrative(a)
+        except Exception as e:
+            logging.exception("AI report generation failed")
+            ai_report_queue[0] = f"❌ AI report generation failed: {str(e)}"
     thread = Thread(target=generate_ai_report_threaded, args=(analyzer,))
     thread.start()
     progress(0.4, desc="Generating reports and visuals...")
     meta = analyzer.metadata
     missing_df, num_df, cat_df = analyzer.get_profiling_reports()
     fig_types, fig_missing, fig_corr = analyzer.get_overview_visuals()
+    # --- Dynamically identify candidate columns for stratification ---
+    candidate_cols = ["(Do not stratify)"]
+    for col in meta.get('categorical_cols', []):
+        # Heuristic: columns whose dtype is not 'object' are always offered;
+        # 'object' columns are offered only when they have more than 1 but
+        # fewer than 50 unique values.
+        if analyzer.df[col].dtype.name != 'object' or (1 < analyzer.df[col].nunique() < 50):
+            candidate_cols.append(col)
     initial_updates = (
         gr.update(value="⏳ Generating AI report..."), gr.update(value=missing_df),
         gr.update(value=num_df), gr.update(value=cat_df), gr.update(value=fig_types),
         gr.update(value=fig_missing), gr.update(value=fig_corr),
+        gr.update(choices=meta.get('numeric_cols', [])),
+        gr.update(choices=meta.get('numeric_cols', [])),
+        gr.update(choices=meta.get('numeric_cols', [])),
+        gr.update(choices=meta.get('columns', [])), gr.update(visible=bool(meta.get('datetime_cols'))),
+        gr.update(visible=bool(meta.get('text_cols'))), gr.update(visible=len(meta.get('numeric_cols', [])) > 1),
+        gr.update(choices=candidate_cols, value="(Do not stratify)") # dd_stratify_by_col
     )
     yield initial_updates
     thread.join()
     progress(1.0, desc="AI Report complete!")
     final_updates_list = list(initial_updates)
     final_updates_list[0] = gr.update(value=ai_report_queue[0])
     yield tuple(final_updates_list)
+# --- Stratification Callbacks ---
+def update_filter_dropdown(analyzer, stratify_col):
+    """
+    Populate the value-filter dropdown for the selected stratification feature.
+
+    Args:
+        analyzer: The active DataAnalyzer; anything else disables the dropdown.
+        stratify_col: Column chosen in the 'Stratify By' dropdown, or the
+            "(Do not stratify)" sentinel.
+
+    Returns:
+        A gr.update for the filter dropdown. With a usable column it lists
+        "(Global Analysis)" followed by that column's distinct non-missing
+        values; otherwise the dropdown is cleared and made non-interactive.
+    """
+    disabled = gr.update(choices=[], value=None, interactive=False)
+    if not isinstance(analyzer, DataAnalyzer) or not stratify_col or stratify_col == "(Do not stratify)":
+        return disabled
+    # A column name that is not in the current frame must not raise KeyError.
+    if stratify_col not in analyzer.df.columns:
+        return disabled
+    # Missing values are dropped: NaN is a float and cannot be ordered against
+    # strings, and an equality filter on NaN would never match a row anyway.
+    unique_values = analyzer.df[stratify_col].dropna().unique().tolist()
+    try:
+        ordered_values = sorted(unique_values)
+    except TypeError:
+        # Mixed-type columns (e.g. str and int) have no natural ordering.
+        ordered_values = sorted(unique_values, key=str)
+    values = ["(Global Analysis)"] + ordered_values
+    return gr.update(choices=values, value="(Global Analysis)", interactive=True)
+def update_stratified_clustering(analyzer, stratify_col, filter_value, k):
     """
+    Orchestrates the full clustering workflow on a dataset that is generically
+    filtered based on user selections.
+
+    Args:
+        analyzer: The active DataAnalyzer; anything else yields empty outputs.
+        stratify_col: Column to filter on, or "(Do not stratify)".
+        filter_value: Value of `stratify_col` to keep, or "(Global Analysis)".
+        k: Requested number of clusters.
+
+    Returns:
+        A 5-tuple: cluster figure, elbow figure, summary text, persona text,
+        profile figure.
     """
     if not isinstance(analyzer, DataAnalyzer):
         return go.Figure(), go.Figure(), "", "", go.Figure()
+    logging.info(f"Updating clustering. Stratify by: '{stratify_col}', Filter: '{filter_value}', K={k}")
+    # Step 1: Stratify the DataFrame based on user selection
+    analysis_df = analyzer.df
+    report_title_prefix = "Global Analysis: "
+    # An unknown column falls back to the global analysis instead of raising KeyError.
+    has_column = bool(stratify_col) and stratify_col != "(Do not stratify)" and stratify_col in analyzer.df.columns
+    # Compare against None/"" explicitly: legitimate category values such as
+    # 0 or False are falsy and would otherwise be silently ignored.
+    has_value = filter_value is not None and filter_value != "" and filter_value != "(Global Analysis)"
+    if has_column and has_value:
+        analysis_df = analyzer.df[analyzer.df[stratify_col] == filter_value]
+        report_title_prefix = f"Analysis for '{stratify_col}' = '{filter_value}': "
+    if len(analysis_df) < k:
+        error_msg = f"Not enough data ({len(analysis_df)} rows) to form {k} clusters for the selected filter."
+        return go.Figure(), go.Figure(), error_msg, error_msg, go.Figure()
+    # Step 2: Perform Clustering
+    numeric_cols = [c for c in analyzer.metadata.get('numeric_cols', []) if c in analysis_df.columns]
     fig_cluster, fig_elbow, summary, cluster_labels = perform_clustering(
+        analysis_df, numeric_cols, k
     )
     if cluster_labels.empty:
+        return fig_cluster, fig_elbow, summary, "Clustering failed.", go.Figure()
+    # Step 3: Profile the resulting clusters
+    cats_to_profile = [c for c in analyzer.metadata.get('categorical_cols', []) if c in analysis_df.columns]
+    # These four names are left out of the numeric profile (presumably
+    # calendar features added by engineer_features -- TODO confirm).
+    numeric_to_profile = [c for c in numeric_cols if c not in ['Month', 'Day_of_Week', 'Is_Weekend', 'Hour']]
     md_personas, fig_profile = profile_clusters(
+        analysis_df, cluster_labels, numeric_to_profile, cats_to_profile
     )
+    # The prefix ends in ": "; whitespace directly before a closing "**" stops
+    # Markdown from treating it as bold, so strip it and re-add the space after.
+    heading = f"**{report_title_prefix.strip()}** "
+    summary = heading + summary
+    md_personas = heading + md_personas
+    # Step 4: Return all results
+    return fig_cluster, fig_elbow, summary, md_personas, fig_profile
+# --- Other Callbacks ---
+def create_histogram(analyzer, col):
+    """Return a histogram of `col` with a marginal box plot, or an empty figure when there is nothing to plot."""
+    has_input = isinstance(analyzer, DataAnalyzer) and bool(col)
+    if not has_input:
+        return go.Figure()
+    chart_title = f"<b>Distribution of {col}</b>"
+    return px.histogram(analyzer.df, x=col, title=chart_title, marginal="box")
+def create_scatterplot(analyzer, x_col, y_col, color_col):
+    """Scatter `y_col` against `x_col` on a random sample of at most 10,000 rows, optionally coloured by `color_col`."""
+    if not (isinstance(analyzer, DataAnalyzer) and x_col and y_col):
+        return go.Figure()
+    # Cap the number of plotted rows at 10,000.
+    sample_size = min(len(analyzer.df), 10000)
+    plotted_rows = analyzer.df.sample(n=sample_size)
+    return px.scatter(plotted_rows, x=x_col, y=y_col, color=color_col or None)