# ui/callbacks.py # -*- coding: utf-8 -*- # # PROJECT: CognitiveEDA v5.2 - The QuantumLeap Intelligence Platform # # DESCRIPTION: The "Controller" logic of the application. This module contains # the Python functions that execute when Gradio events are triggered. # It is designed to be completely decoupled from the UI definition # and event attachment process. import gradio as gr import pandas as pd import logging from threading import Thread import plotly.graph_objects as go import plotly.express as px from core.analyzer import DataAnalyzer from core.llm import GeminiNarrativeGenerator from core.config import settings from core.exceptions import DataProcessingError from modules.clustering import perform_clustering from modules.text import generate_word_cloud from modules.timeseries import analyze_time_series # --- Primary Analysis Chain --- def run_initial_analysis(file_obj, progress=gr.Progress(track_tqdm=True)): """ Phase 1 of the analysis: Fast, synchronous tasks. Validates inputs, loads data, and creates the core DataAnalyzer object. Args: file_obj: The uploaded file object from Gradio. progress: The Gradio progress tracker. Returns: The instantiated DataAnalyzer object, which will update the gr.State. Returns None if any validation or processing fails. """ # 1. Input Validation if file_obj is None: raise gr.Error("No file uploaded. Please upload a CSV or Excel file.") # 2. Runtime Configuration Validation progress(0, desc="Validating configuration...") if not settings.GOOGLE_API_KEY: logging.error("Analysis attempted without GOOGLE_API_KEY set.") raise gr.Error( "CRITICAL: GOOGLE_API_KEY is not configured. " "Please add it to your .env file or as a platform secret and restart." ) try: # 3. Data Loading progress(0.2, desc="Loading and parsing data file...") df = pd.read_csv(file_obj.name) if file_obj.name.endswith('.csv') else pd.read_excel(file_obj.name) if len(df) > settings.MAX_UI_ROWS: df = df.sample(n=settings.MAX_UI_ROWS, random_state=42) logging.info(f"DataFrame sampled down to {settings.MAX_UI_ROWS} rows.") # 4. Core Analyzer Instantiation progress(0.7, desc="Instantiating analysis engine...") analyzer = DataAnalyzer(df) progress(1.0, desc="Initial analysis complete.") return analyzer except DataProcessingError as e: logging.error(f"User-facing data processing error: {e}", exc_info=True) raise gr.Error(str(e)) except Exception as e: logging.error(f"A critical unhandled error occurred during initial analysis: {e}", exc_info=True) raise gr.Error(f"Analysis Failed! An unexpected error occurred: {str(e)}") def generate_reports_and_visuals(analyzer, progress=gr.Progress(track_tqdm=True)): """ Phase 2 of the analysis: Slower, multi-stage tasks. This generator function yields UI updates as they become available. Args: analyzer: The DataAnalyzer object from the gr.State. progress: The Gradio progress tracker. Yields: A dictionary of Gradio updates to populate the dashboard. """ # Guard clause: Do nothing if the initial analysis failed. if not isinstance(analyzer, DataAnalyzer): logging.warning("generate_reports_and_visuals called without a valid analyzer. Aborting.") return {} # 1. Start AI narrative generation in a background thread progress(0, desc="Spawning AI report thread...") ai_report_queue = [""] # Use a mutable list to pass string by reference def generate_ai_report_threaded(analyzer_instance): narrative_generator = GeminiNarrativeGenerator(api_key=settings.GOOGLE_API_KEY) ai_report_queue[0] = narrative_generator.generate_narrative(analyzer_instance) thread = Thread(target=generate_ai_report_threaded, args=(analyzer,)) thread.start() # 2. Generate standard reports and visuals (this is fast) progress(0.4, desc="Generating data profiles and visuals...") meta = analyzer.metadata missing_df, num_df, cat_df = analyzer.get_profiling_reports() fig_types, fig_missing, fig_corr = analyzer.get_overview_visuals() # 3. Yield the first set of updates to populate the main dashboard immediately progress(0.8, desc="Building initial dashboard...") initial_updates = { "ai_report_output": gr.update(value="⏳ Generating AI-powered report in the background... The main dashboard is ready now."), "profile_missing_df": gr.update(value=missing_df), "profile_numeric_df": gr.update(value=num_df), "profile_categorical_df": gr.update(value=cat_df), "plot_types": gr.update(value=fig_types), "plot_missing": gr.update(value=fig_missing), "plot_correlation": gr.update(value=fig_corr), "dd_hist_col": gr.update(choices=meta['numeric_cols'], value=meta['numeric_cols'][0] if meta['numeric_cols'] else None), "dd_scatter_x": gr.update(choices=meta['numeric_cols'], value=meta['numeric_cols'][0] if meta['numeric_cols'] else None), "dd_scatter_y": gr.update(choices=meta['numeric_cols'], value=meta['numeric_cols'][1] if len(meta['numeric_cols']) > 1 else None), "dd_scatter_color": gr.update(choices=meta['columns']), "tab_timeseries": gr.update(visible=bool(meta['datetime_cols'])), "tab_text": gr.update(visible=bool(meta['text_cols'])), "tab_cluster": gr.update(visible=len(meta['numeric_cols']) > 1), } yield initial_updates # 4. Wait for the AI thread to complete thread.join() progress(1.0, desc="AI Report complete!") # 5. Yield the final update, now including the AI-generated report final_updates = initial_updates.copy() final_updates["ai_report_output"] = ai_report_queue[0] yield final_updates # --- Interactive Explorer Callbacks --- def create_histogram(analyzer, col): """Generates a histogram for a selected numeric column.""" if not isinstance(analyzer, DataAnalyzer) or not col: return go.Figure().update_layout(title="Select a column to generate a histogram") return px.histogram(analyzer.df, x=col, title=f"Distribution of {col}", marginal="box", template="plotly_white") def create_scatterplot(analyzer, x_col, y_col, color_col): """Generates a scatter plot for selected X, Y, and optional color columns.""" if not isinstance(analyzer, DataAnalyzer) or not x_col or not y_col: return go.Figure().update_layout(title="Select X and Y axes to generate a scatter plot") # Use a subset for performance on large datasets df_sample = analyzer.df if len(analyzer.df) > 10000: df_sample = analyzer.df.sample(n=10000, random_state=42) return px.scatter( df_sample, x=x_col, y=y_col, color=color_col if color_col else None, title=f"Scatter Plot: {x_col} vs. {y_col}", template="plotly_white" ) # --- Specialized Module Callbacks --- def update_clustering(analyzer, k): """Callback for the clustering module.""" if not isinstance(analyzer, DataAnalyzer): return gr.update(), gr.update(), gr.update(value="Run analysis first.") # Delegate the heavy lifting to the specialized module fig_cluster, fig_elbow, summary = perform_clustering(analyzer.df, analyzer.metadata['numeric_cols'], k) return fig_cluster, fig_elbow, summary # Add other specialized callbacks for text and time-series here if needed. # For example, if you add the dropdowns and plots to the layout: # # def update_timeseries(analyzer, date_col, value_col): # if not isinstance(analyzer, DataAnalyzer): # return gr.update(), gr.update(value="Run analysis first.") # fig, md = analyze_time_series(analyzer.df, date_col, value_col) # return fig, md # # def update_text(analyzer, text_col): # if not isinstance(analyzer, DataAnalyzer): # return gr.update() # return generate_word_cloud(analyzer.df, text_col)