File size: 5,807 Bytes
d1943e0
12fa967
 
 
1dae368
12fa967
a00699a
1dae368
12fa967
d1943e0
 
 
 
 
7d40c30
 
 
fe02df7
d1943e0
 
12fa967
d1943e0
1dae368
 
7d40c30
 
1dae368
7d40c30
1dae368
fcc261b
1dae368
fcc261b
fe02df7
 
 
 
 
 
1dae368
fe02df7
 
7d40c30
fcc261b
1dae368
 
fcc261b
 
7d40c30
a00699a
 
7d40c30
fe02df7
1dae368
 
 
fcc261b
 
fe02df7
fcc261b
 
 
a00699a
1dae368
 
 
fe02df7
 
 
1dae368
 
a00699a
fcc261b
 
 
a00699a
 
 
fcc261b
1dae368
7d40c30
1dae368
fe02df7
7d40c30
 
1dae368
fe02df7
 
7d40c30
1dae368
 
fcc261b
1dae368
 
 
 
 
 
 
7d40c30
1dae368
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# ui/callbacks.py

# -*- coding: utf-8 -*-
#
# PROJECT:      CognitiveEDA v5.7 - The QuantumLeap Intelligence Platform
#
# DESCRIPTION:  This module contains the core logic for all Gradio event handlers.
#               The clustering callback is now updated to include persona profiling.

import gradio as gr
import pandas as pd
import logging
from threading import Thread

import plotly.graph_objects as go
import plotly.express as px

from core.analyzer import DataAnalyzer, engineer_features
from core.llm import GeminiNarrativeGenerator
from core.config import settings
from core.exceptions import DataProcessingError
from modules.clustering import perform_clustering
# --- NEW IMPORT ---
from modules.profiling import profile_clusters


# --- Primary Analysis Chain (Unchanged) ---
def run_initial_analysis(file_obj, progress=gr.Progress(track_tqdm=True)):
    """Load the uploaded file, engineer features, and return a DataAnalyzer.

    Raises gr.Error on missing upload, missing API key, or any processing
    failure (the underlying exception is logged with a traceback first).
    """
    if file_obj is None:
        raise gr.Error("No file uploaded.")
    progress(0, desc="Validating configuration...")
    if not settings.GOOGLE_API_KEY:
        raise gr.Error("CRITICAL: GOOGLE_API_KEY is not configured.")
    try:
        progress(0.1, desc="Loading raw data...")
        # Any non-.csv upload is handed to the Excel reader.
        is_csv = file_obj.name.endswith('.csv')
        df_raw = pd.read_csv(file_obj.name) if is_csv else pd.read_excel(file_obj.name)
        # Cap the row count for UI responsiveness; fixed seed keeps the
        # sample reproducible across runs.
        if len(df_raw) > settings.MAX_UI_ROWS:
            df_raw = df_raw.sample(n=settings.MAX_UI_ROWS, random_state=42)
        progress(0.5, desc="Applying strategic feature engineering...")
        df_engineered = engineer_features(df_raw)
        progress(0.8, desc="Instantiating analysis engine...")
        analyzer = DataAnalyzer(df_engineered)
        progress(1.0, desc="Analysis complete. Generating reports...")
        return analyzer
    except Exception as e:
        logging.error(f"Error in initial analysis: {e}", exc_info=True)
        raise gr.Error(f"Analysis Failed: {str(e)}")

def generate_reports_and_visuals(analyzer, progress=gr.Progress(track_tqdm=True)):
    """Yield the profiling reports/visuals immediately, then the AI narrative.

    First yield: 14 component updates with a placeholder in the report slot.
    Second yield: the same tuple with the completed AI narrative, emitted
    after the background generation thread finishes.
    """
    if not isinstance(analyzer, DataAnalyzer):
        yield (None,) * 14
        return
    progress(0, desc="Spawning AI report thread...")
    # One-slot mailbox the worker thread writes the finished narrative into.
    report_box = [""]

    def _write_narrative(a):
        generator = GeminiNarrativeGenerator(settings.GOOGLE_API_KEY)
        report_box[0] = generator.generate_narrative(a)

    worker = Thread(target=_write_narrative, args=(analyzer,))
    worker.start()
    progress(0.4, desc="Generating reports and visuals...")
    meta = analyzer.metadata
    missing_df, num_df, cat_df = analyzer.get_profiling_reports()
    fig_types, fig_missing, fig_corr = analyzer.get_overview_visuals()
    numeric_cols = meta['numeric_cols']
    first_numeric = numeric_cols[0] if numeric_cols else None
    second_numeric = numeric_cols[1] if len(numeric_cols) > 1 else None
    first_pass = (
        gr.update(value="⏳ Generating AI report..."),
        gr.update(value=missing_df),
        gr.update(value=num_df),
        gr.update(value=cat_df),
        gr.update(value=fig_types),
        gr.update(value=fig_missing),
        gr.update(value=fig_corr),
        gr.update(choices=numeric_cols, value=first_numeric),
        gr.update(choices=numeric_cols, value=first_numeric),
        gr.update(choices=numeric_cols, value=second_numeric),
        gr.update(choices=meta['columns']),
        gr.update(visible=bool(meta['datetime_cols'])),
        gr.update(visible=bool(meta['text_cols'])),
        gr.update(visible=len(numeric_cols) > 1),
    )
    yield first_pass
    worker.join()  # wait for the narrative before the final yield
    progress(1.0, desc="AI Report complete!")
    second_pass = list(first_pass)
    second_pass[0] = gr.update(value=report_box[0])
    yield tuple(second_pass)

# --- Interactive Explorer Callbacks (Unchanged) ---
def create_histogram(analyzer, col):
    """Histogram of one column with a marginal box plot; empty figure on bad input."""
    if isinstance(analyzer, DataAnalyzer) and col:
        return px.histogram(analyzer.df, x=col, title=f"<b>Distribution of {col}</b>", marginal="box")
    return go.Figure()

def create_scatterplot(analyzer, x_col, y_col, color_col):
    """Scatter plot of two columns, optionally colored by a third.

    Samples at most 10,000 rows to keep the plot responsive. A fixed
    random_state makes repeated renders of the same data identical,
    consistent with the seeded sampling in run_initial_analysis.
    Returns an empty figure when the analyzer or axis columns are missing.
    """
    if not isinstance(analyzer, DataAnalyzer) or not x_col or not y_col:
        return go.Figure()
    df_sample = analyzer.df.sample(n=min(len(analyzer.df), 10000), random_state=42)
    return px.scatter(df_sample, x=x_col, y=y_col, color=color_col if color_col else None)


# --- MODIFIED CLUSTERING CALLBACK ---
def update_clustering(analyzer, k):
    """
    End-to-end clustering workflow for the UI:
      1. Run K-Means clustering to obtain visuals and per-row labels.
      2. Profile the resulting segments into personas.
      3. Return the five outputs the clustering tab expects, in order.
    """
    if not isinstance(analyzer, DataAnalyzer):
        # Return empty updates for all 5 clustering output components
        return go.Figure(), go.Figure(), "", "", go.Figure()

    # Step 1: clustering produces the scatter, elbow plot, summary text,
    # and a label per row.
    fig_cluster, fig_elbow, summary, cluster_labels = perform_clustering(
        analyzer.df, analyzer.metadata['numeric_cols'], k
    )

    # Clustering can fail (e.g., not enough data) — surface that to the UI.
    if cluster_labels.empty:
        return fig_cluster, fig_elbow, summary, "Clustering failed. No personas to profile.", go.Figure()

    # Step 2: profile the clusters, using only candidate columns that the
    # engineered dataframe actually contains.
    available = set(analyzer.df.columns)
    numeric_candidates = ['Total_Revenue', 'Quantity_Ordered', 'Hour']
    categorical_candidates = ['City', 'Product', 'Day_of_Week']
    numeric_to_profile = [c for c in numeric_candidates if c in available]
    cats_to_profile = [c for c in categorical_candidates if c in available]

    md_personas, fig_profile = profile_clusters(
        analyzer.df, cluster_labels, numeric_to_profile, cats_to_profile
    )

    # Step 3: all 5 results in the order the UI components are wired.
    return fig_cluster, fig_elbow, summary, md_personas, fig_profile