# ui/callbacks.py
# -*- coding: utf-8 -*-
#
# PROJECT:      CognitiveEDA v5.2 - The QuantumLeap Intelligence Platform
#
# DESCRIPTION:  The "Controller" logic of the application. This module contains
#               the Python functions that execute when Gradio events are triggered.
#               It is designed to be completely decoupled from the UI definition
#               and event attachment process.

import gradio as gr
import pandas as pd
import logging
from threading import Thread

import plotly.graph_objects as go
import plotly.express as px

from core.analyzer import DataAnalyzer
from core.llm import GeminiNarrativeGenerator
from core.config import settings
from core.exceptions import DataProcessingError
from modules.clustering import perform_clustering
from modules.text import generate_word_cloud
from modules.timeseries import analyze_time_series


# --- Primary Analysis Chain ---
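
# The callbacks below are deliberately unaware of the layout. For orientation,
# the two-phase chain is expected to be wired up in the UI / event-attachment
# module roughly as follows (a sketch only -- `upload_button`, `analyzer_state`,
# and `dashboard_outputs` are assumed names, not part of this module):
#
#     upload_button.upload(
#         fn=run_initial_analysis,
#         inputs=upload_button,
#         outputs=analyzer_state,        # gr.State holding the DataAnalyzer
#     ).then(
#         fn=generate_reports_and_visuals,
#         inputs=analyzer_state,
#         outputs=dashboard_outputs,     # the components updated below
#     )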

def run_initial_analysis(file_obj, progress=gr.Progress(track_tqdm=True)):
    """
    Phase 1 of the analysis: Fast, synchronous tasks.
    Validates inputs, loads data, and creates the core DataAnalyzer object.

    Args:
        file_obj: The uploaded file object from Gradio.
        progress: The Gradio progress tracker.

    Returns:
        The instantiated DataAnalyzer object, which will update the gr.State.
        Returns None if any validation or processing fails.
    """
    # 1. Input Validation
    if file_obj is None:
        raise gr.Error("No file uploaded. Please upload a CSV or Excel file.")

    # 2. Runtime Configuration Validation
    progress(0, desc="Validating configuration...")
    if not settings.GOOGLE_API_KEY:
        logging.error("Analysis attempted without GOOGLE_API_KEY set.")
        raise gr.Error(
            "CRITICAL: GOOGLE_API_KEY is not configured. "
            "Please add it to your .env file or as a platform secret and restart."
        )
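    # (Assuming core.config reads environment variables from a .env file, the
    #  expected entry is a single line of the form: GOOGLE_API_KEY=<your-key>.)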

    try:
        # 3. Data Loading (restricted to the formats promised in the upload prompt)
        progress(0.2, desc="Loading and parsing data file...")
        if file_obj.name.endswith('.csv'):
            df = pd.read_csv(file_obj.name)
        elif file_obj.name.endswith(('.xls', '.xlsx')):
            df = pd.read_excel(file_obj.name)
        else:
            raise DataProcessingError("Unsupported file type. Please upload a .csv, .xls, or .xlsx file.")
        if len(df) > settings.MAX_UI_ROWS:
            df = df.sample(n=settings.MAX_UI_ROWS, random_state=42)
            logging.info(f"DataFrame sampled down to {settings.MAX_UI_ROWS} rows.")

        # 4. Core Analyzer Instantiation
        progress(0.7, desc="Instantiating analysis engine...")
        analyzer = DataAnalyzer(df)
        progress(1.0, desc="Initial analysis complete.")
        return analyzer

    except DataProcessingError as e:
        logging.error(f"User-facing data processing error: {e}", exc_info=True)
        raise gr.Error(str(e))
    except Exception as e:
        logging.error(f"A critical unhandled error occurred during initial analysis: {e}", exc_info=True)
        raise gr.Error(f"Analysis Failed! An unexpected error occurred: {str(e)}")


def generate_reports_and_visuals(analyzer, progress=gr.Progress(track_tqdm=True)):
    """
    Phase 2 of the analysis: Slower, multi-stage tasks.
    This generator function yields UI updates as they become available.

    Args:
        analyzer: The DataAnalyzer object from the gr.State.
        progress: The Gradio progress tracker.

    Yields:
        A dictionary of Gradio updates to populate the dashboard.
    """
    # Guard clause: Do nothing if the initial analysis failed.
    if not isinstance(analyzer, DataAnalyzer):
        logging.warning("generate_reports_and_visuals called without a valid analyzer. Aborting.")
        return  # A bare return ends this generator without yielding any updates.

    # 1. Start AI narrative generation in a background thread
    progress(0, desc="Spawning AI report thread...")
    ai_report_queue = [""]  # Use a mutable list to pass string by reference
    def generate_ai_report_threaded(analyzer_instance):
        narrative_generator = GeminiNarrativeGenerator(api_key=settings.GOOGLE_API_KEY)
        ai_report_queue[0] = narrative_generator.generate_narrative(analyzer_instance)
    
    thread = Thread(target=generate_ai_report_threaded, args=(analyzer,))
    thread.start()

    # 2. Generate standard reports and visuals (this is fast)
    progress(0.4, desc="Generating data profiles and visuals...")
    meta = analyzer.metadata
    missing_df, num_df, cat_df = analyzer.get_profiling_reports()
    fig_types, fig_missing, fig_corr = analyzer.get_overview_visuals()

    # 3. Yield the first set of updates to populate the main dashboard immediately
    progress(0.8, desc="Building initial dashboard...")
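    # NOTE: the keys below are component names, not component instances; the
    # layer that attaches this generator to its outputs is assumed to resolve
    # each name to the corresponding Gradio component.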
    initial_updates = {
        "ai_report_output": gr.update(value="⏳ Generating AI-powered report in the background... The main dashboard is ready now."),
        "profile_missing_df": gr.update(value=missing_df),
        "profile_numeric_df": gr.update(value=num_df),
        "profile_categorical_df": gr.update(value=cat_df),
        "plot_types": gr.update(value=fig_types),
        "plot_missing": gr.update(value=fig_missing),
        "plot_correlation": gr.update(value=fig_corr),
        "dd_hist_col": gr.update(choices=meta['numeric_cols'], value=meta['numeric_cols'][0] if meta['numeric_cols'] else None),
        "dd_scatter_x": gr.update(choices=meta['numeric_cols'], value=meta['numeric_cols'][0] if meta['numeric_cols'] else None),
        "dd_scatter_y": gr.update(choices=meta['numeric_cols'], value=meta['numeric_cols'][1] if len(meta['numeric_cols']) > 1 else None),
        "dd_scatter_color": gr.update(choices=meta['columns']),
        "tab_timeseries": gr.update(visible=bool(meta['datetime_cols'])),
        "tab_text": gr.update(visible=bool(meta['text_cols'])),
        "tab_cluster": gr.update(visible=len(meta['numeric_cols']) > 1),
    }
    yield initial_updates

    # 4. Wait for the AI thread to complete
    thread.join()
    progress(1.0, desc="AI Report complete!")

    # 5. Yield the final update, now including the AI-generated report
    final_updates = initial_updates.copy()
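    # Replace the placeholder text with the finished AI narrative.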
    final_updates["ai_report_output"] = ai_report_queue[0]
    yield final_updates


# --- Interactive Explorer Callbacks ---

def create_histogram(analyzer, col):
    """Generates a histogram for a selected numeric column."""
    if not isinstance(analyzer, DataAnalyzer) or not col:
        return go.Figure().update_layout(title="Select a column to generate a histogram")
    return px.histogram(analyzer.df, x=col, title=f"<b>Distribution of {col}</b>", marginal="box", template="plotly_white")

def create_scatterplot(analyzer, x_col, y_col, color_col):
    """Generates a scatter plot for selected X, Y, and optional color columns."""
    if not isinstance(analyzer, DataAnalyzer) or not x_col or not y_col:
        return go.Figure().update_layout(title="Select X and Y axes to generate a scatter plot")
    
    # Use a subset for performance on large datasets
    df_sample = analyzer.df
    if len(analyzer.df) > 10000:
        df_sample = analyzer.df.sample(n=10000, random_state=42)

    return px.scatter(
        df_sample, x=x_col, y=y_col, color=color_col if color_col else None,
        title=f"<b>Scatter Plot: {x_col} vs. {y_col}</b>", template="plotly_white"
    )
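
# Like the primary chain, these explorer callbacks are expected to be attached
# in the layout module, for example (the dropdown names match the keys used in
# `initial_updates` above; `analyzer_state` and the plot components are assumed):
#
#     dd_hist_col.change(
#         create_histogram, [analyzer_state, dd_hist_col], plot_histogram
#     )
#     for dd in (dd_scatter_x, dd_scatter_y, dd_scatter_color):
#         dd.change(
#             create_scatterplot,
#             [analyzer_state, dd_scatter_x, dd_scatter_y, dd_scatter_color],
#             plot_scatter,
#         )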


# --- Specialized Module Callbacks ---

def update_clustering(analyzer, k):
    """Callback for the clustering module."""
    if not isinstance(analyzer, DataAnalyzer):
        return gr.update(), gr.update(), gr.update(value="Run analysis first.")
    
    # Delegate the heavy lifting to the specialized module
    fig_cluster, fig_elbow, summary = perform_clustering(analyzer.df, analyzer.metadata['numeric_cols'], k)
    return fig_cluster, fig_elbow, summary
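
# A wiring sketch for the clustering module (slider and plot names are assumptions):
#
#     num_clusters.change(
#         update_clustering,
#         [analyzer_state, num_clusters],
#         [plot_cluster, plot_elbow, md_cluster_summary],
#     )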

# Add other specialized callbacks for text and time-series here if needed.
# For example, if you add the dropdowns and plots to the layout:
#
# def update_timeseries(analyzer, date_col, value_col):
#     if not isinstance(analyzer, DataAnalyzer):
#         return gr.update(), gr.update(value="Run analysis first.")
#     fig, md = analyze_time_series(analyzer.df, date_col, value_col)
#     return fig, md
#
# def update_text(analyzer, text_col):
#     if not isinstance(analyzer, DataAnalyzer):
#         return gr.update()
#     return generate_word_cloud(analyzer.df, text_col)