mgbam committed
Commit fe02df7 · verified · 1 Parent(s): c039984

Update ui/callbacks.py

Files changed (1)
  1. ui/callbacks.py +48 -79
ui/callbacks.py CHANGED
@@ -2,12 +2,11 @@
 
 # -*- coding: utf-8 -*-
 #
-# PROJECT: CognitiveEDA v5.5 - The QuantumLeap Intelligence Platform
+# PROJECT: CognitiveEDA v5.6 - The QuantumLeap Intelligence Platform
 #
 # DESCRIPTION: This module contains the core logic for all Gradio event handlers.
-#              It is designed to be completely decoupled from the UI definition.
-#              Functions here return values in a specific order (often as tuples)
-#              that correspond to a list of output components defined in app.py.
+#              The main analysis pipeline now includes a strategic feature
+#              engineering step before analysis.
 
 import gradio as gr
 import pandas as pd
@@ -17,7 +16,9 @@ from threading import Thread
 import plotly.graph_objects as go
 import plotly.express as px
 
-from core.analyzer import DataAnalyzer
+# --- MODIFIED IMPORT ---
+# Import both the analyzer class and the new feature engineering function
+from core.analyzer import DataAnalyzer, engineer_features
 from core.llm import GeminiNarrativeGenerator
 from core.config import settings
 from core.exceptions import DataProcessingError
@@ -28,16 +29,9 @@ from modules.clustering import perform_clustering
 
 def run_initial_analysis(file_obj, progress=gr.Progress(track_tqdm=True)):
     """
-    Phase 1: Fast, synchronous tasks.
-    Validates inputs, loads data, and creates the core DataAnalyzer object. This
-    function updates the gr.State object, which then triggers the next phase.
-
-    Args:
-        file_obj: The uploaded file object from Gradio.
-        progress: The Gradio progress tracker.
-
-    Returns:
-        The instantiated DataAnalyzer object, or None if processing fails.
+    Phase 1: Now includes the strategic feature engineering step.
+    Validates inputs, loads raw data, applies feature engineering, and then
+    creates the core DataAnalyzer object on the transformed data.
     """
     if file_obj is None:
         raise gr.Error("No file uploaded. Please upload a CSV or Excel file.")
@@ -48,15 +42,23 @@ def run_initial_analysis(file_obj, progress=gr.Progress(track_tqdm=True)):
         raise gr.Error("CRITICAL: GOOGLE_API_KEY is not configured. Please add it as a secret.")
 
     try:
-        progress(0.2, desc="Loading and parsing data file...")
-        df = pd.read_csv(file_obj.name) if file_obj.name.endswith('.csv') else pd.read_excel(file_obj.name)
-        if len(df) > settings.MAX_UI_ROWS:
-            df = df.sample(n=settings.MAX_UI_ROWS, random_state=42)
+        progress(0.1, desc="Loading raw data...")
+        df_raw = pd.read_csv(file_obj.name) if file_obj.name.endswith('.csv') else pd.read_excel(file_obj.name)
+        if len(df_raw) > settings.MAX_UI_ROWS:
+            df_raw = df_raw.sample(n=settings.MAX_UI_ROWS, random_state=42)
             logging.info(f"DataFrame sampled down to {settings.MAX_UI_ROWS} rows.")
 
-        progress(0.7, desc="Instantiating analysis engine...")
-        analyzer = DataAnalyzer(df)
-        progress(1.0, desc="Initial analysis complete. Generating reports...")
+        # --- INTEGRATION POINT ---
+        # Apply the feature engineering function immediately after loading
+        progress(0.5, desc="Applying strategic feature engineering...")
+        df_engineered = engineer_features(df_raw)
+        # -------------------------
+
+        progress(0.8, desc="Instantiating analysis engine on engineered data...")
+        # The analyzer now works with the transformed, high-value dataset
+        analyzer = DataAnalyzer(df_engineered)
+
+        progress(1.0, desc="Analysis complete. Generating reports...")
         return analyzer
     except Exception as e:
         logging.error(f"A critical error occurred during initial analysis: {e}", exc_info=True)
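
Note: the `engineer_features` helper imported in the hunk above lives in core/analyzer.py, which is not part of this commit, so its actual transformations are not visible here. A minimal sketch of the assumed contract only — a function that accepts the raw DataFrame and returns a transformed copy — with purely illustrative feature logic:

import pandas as pd

# Hypothetical sketch; the real engineer_features in core/analyzer.py may differ.
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    # Illustrative: expand datetime columns into year/month components.
    for col in out.select_dtypes(include=["datetime64[ns]"]).columns:
        out[f"{col}_year"] = out[col].dt.year
        out[f"{col}_month"] = out[col].dt.month
    # Illustrative: add missing-value indicator flags for numeric columns.
    for col in out.select_dtypes(include="number").columns:
        if out[col].isna().any():
            out[f"{col}_is_missing"] = out[col].isna().astype(int)
    return out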
@@ -66,26 +68,15 @@ def run_initial_analysis(file_obj, progress=gr.Progress(track_tqdm=True)):
 def generate_reports_and_visuals(analyzer, progress=gr.Progress(track_tqdm=True)):
     """
     Phase 2: Slower, multi-stage report and visual generation.
-    This generator function yields tuples of UI updates. The order of the yielded
-    tuple is CRITICAL and must exactly match the `main_outputs` list in `app.py`.
-
-    Args:
-        analyzer: The DataAnalyzer object from the gr.State.
-        progress: The Gradio progress tracker.
-
-    Yields:
-        A tuple of gr.update() objects to populate the dashboard.
+    Yields tuples of UI updates based on the *engineered* data.
     """
     if not isinstance(analyzer, DataAnalyzer):
         logging.warning("generate_reports_and_visuals called without a valid analyzer. Clearing UI.")
-        # Return a tuple of Nones matching the output length to clear/reset the UI.
-        # There are 14 components in the `main_outputs` list in app.py.
         yield (None,) * 14
         return
 
-    # 1. Start AI narrative generation in a background thread
     progress(0, desc="Spawning AI report thread...")
-    ai_report_queue = [""] # Use a mutable list to pass string by reference
+    ai_report_queue = [""]
     def generate_ai_report_threaded(analyzer_instance):
         narrative_generator = GeminiNarrativeGenerator(api_key=settings.GOOGLE_API_KEY)
         ai_report_queue[0] = narrative_generator.generate_narrative(analyzer_instance)
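
Note: the `ai_report_queue = [""]` single-element list in the hunk above is a simple way for a worker thread to hand its result back to the enclosing function: the nested thread target can mutate the shared list, whereas it could not rebind an outer string variable without `nonlocal`. A standalone sketch of the same pattern, with a sleep standing in for the slow narrative generation (names are illustrative):

from threading import Thread
import time

def run_with_background_result():
    result_holder = [""]          # mutable container shared with the worker

    def worker():
        time.sleep(0.1)           # stands in for the slow LLM call
        result_holder[0] = "report text"

    t = Thread(target=worker)
    t.start()
    # ...fast work (e.g. yielding dashboard updates) happens here...
    t.join()                      # wait for the slow result
    return result_holder[0]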
@@ -93,74 +84,52 @@ def generate_reports_and_visuals(analyzer, progress=gr.Progress(track_tqdm=True)
     thread = Thread(target=generate_ai_report_threaded, args=(analyzer,))
     thread.start()
 
-    # 2. Generate standard reports and visuals
-    progress(0.4, desc="Generating data profiles and visuals...")
+    progress(0.4, desc="Generating reports and visuals...")
     meta = analyzer.metadata
     missing_df, num_df, cat_df = analyzer.get_profiling_reports()
     fig_types, fig_missing, fig_corr = analyzer.get_overview_visuals()
 
-    # 3. Yield the first set of updates to populate the main dashboard immediately.
-    # The order of this tuple MUST match the `main_outputs` list in `app.py`.
     initial_updates = (
-        gr.update(value="⏳ Generating AI-powered report in the background... The main dashboard is ready now."), # 0: ai_report_output
-        gr.update(value=missing_df), # 1: profile_missing_df
-        gr.update(value=num_df), # 2: profile_numeric_df
-        gr.update(value=cat_df), # 3: profile_categorical_df
-        gr.update(value=fig_types), # 4: plot_types
-        gr.update(value=fig_missing), # 5: plot_missing
-        gr.update(value=fig_corr), # 6: plot_correlation
-        gr.update(choices=meta['numeric_cols'], value=meta['numeric_cols'][0] if meta['numeric_cols'] else None), # 7: dd_hist_col
-        gr.update(choices=meta['numeric_cols'], value=meta['numeric_cols'][0] if meta['numeric_cols'] else None), # 8: dd_scatter_x
-        gr.update(choices=meta['numeric_cols'], value=meta['numeric_cols'][1] if len(meta['numeric_cols']) > 1 else None), # 9: dd_scatter_y
-        gr.update(choices=meta['columns']), # 10: dd_scatter_color
-        gr.update(visible=bool(meta['datetime_cols'])), # 11: tab_timeseries
-        gr.update(visible=bool(meta['text_cols'])), # 12: tab_text
-        gr.update(visible=len(meta['numeric_cols']) > 1) # 13: tab_cluster
+        gr.update(value="⏳ Generating AI report... Dashboard is ready."),
+        gr.update(value=missing_df),
+        gr.update(value=num_df),
+        gr.update(value=cat_df),
+        gr.update(value=fig_types),
+        gr.update(value=fig_missing),
+        gr.update(value=fig_corr),
+        gr.update(choices=meta['numeric_cols'], value=meta['numeric_cols'][0] if meta['numeric_cols'] else None),
+        gr.update(choices=meta['numeric_cols'], value=meta['numeric_cols'][0] if meta['numeric_cols'] else None),
+        gr.update(choices=meta['numeric_cols'], value=meta['numeric_cols'][1] if len(meta['numeric_cols']) > 1 else None),
+        gr.update(choices=meta['columns']),
+        gr.update(visible=bool(meta['datetime_cols'])),
+        gr.update(visible=bool(meta['text_cols'])),
+        gr.update(visible=len(meta['numeric_cols']) > 1)
     )
     yield initial_updates
 
-    # 4. Wait for the AI thread to complete
     thread.join()
     progress(1.0, desc="AI Report complete!")
 
-    # 5. Yield the final update. We create a mutable list from the initial tuple,
-    # update the AI report element, and convert it back to a tuple to yield.
     final_updates_list = list(initial_updates)
     final_updates_list[0] = gr.update(value=ai_report_queue[0])
     yield tuple(final_updates_list)
 
 
-# --- Interactive Explorer Callbacks ---
+# --- Interactive Explorer & Module Callbacks ---
 
 def create_histogram(analyzer, col):
-    """Generates a histogram for a selected numeric column."""
     if not isinstance(analyzer, DataAnalyzer) or not col:
-        return go.Figure().update_layout(title="Select a column to generate a histogram")
-    return px.histogram(analyzer.df, x=col, title=f"<b>Distribution of {col}</b>", marginal="box", template="plotly_white")
+        return go.Figure()
+    return px.histogram(analyzer.df, x=col, title=f"<b>Distribution of {col}</b>", marginal="box")
 
 def create_scatterplot(analyzer, x_col, y_col, color_col):
-    """Generates a scatter plot for selected X, Y, and optional color columns."""
     if not isinstance(analyzer, DataAnalyzer) or not x_col or not y_col:
-        return go.Figure().update_layout(title="Select X and Y axes to generate a scatter plot")
-
-    # Use a subset for performance on large datasets
-    df_sample = analyzer.df
-    if len(analyzer.df) > 10000:
-        df_sample = analyzer.df.sample(n=10000, random_state=42)
-
-    return px.scatter(
-        df_sample, x=x_col, y=y_col, color=color_col if color_col else None,
-        title=f"<b>Scatter Plot: {x_col} vs. {y_col}</b>", template="plotly_white"
-    )
-
-
-# --- Specialized Module Callbacks ---
+        return go.Figure()
+    df_sample = analyzer.df.sample(n=min(len(analyzer.df), 10000))
+    return px.scatter(df_sample, x=x_col, y=y_col, color=color_col if color_col else None)
 
 def update_clustering(analyzer, k):
-    """Callback for the clustering module. Returns a tuple of three updates."""
     if not isinstance(analyzer, DataAnalyzer):
-        return gr.update(), gr.update(), gr.update(value="Run analysis first.")
-
-    # Delegate the heavy lifting to the specialized module
+        return gr.update(), gr.update(), gr.update()
     fig_cluster, fig_elbow, summary = perform_clustering(analyzer.df, analyzer.metadata['numeric_cols'], k)
     return fig_cluster, fig_elbow, summary
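
Note: both `yield (None,) * 14` and the `initial_updates` tuple rely on Gradio distributing each yielded tuple positionally across the event listener's outputs list, which is defined in app.py and not included in this commit. A minimal, self-contained sketch of that contract, using two illustrative components rather than the real 14-item `main_outputs` list:

import gradio as gr

def staged_updates():
    # First yield: populate both outputs immediately.
    yield "dashboard ready", "⏳ report pending..."
    # Second yield: update the same two outputs again, in the same positional order.
    yield "dashboard ready", "final AI report"

with gr.Blocks() as demo:
    status_box = gr.Textbox(label="Status")
    report_box = gr.Textbox(label="AI Report")
    run_btn = gr.Button("Run")
    # The order of this outputs list must match the order of each yielded tuple.
    run_btn.click(staged_updates, inputs=None, outputs=[status_box, report_box])

demo.launch()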