File size: 3,534 Bytes
91d2a0a
23226ad
 
 
 
 
 
 
 
91d2a0a
 
 
 
 
 
 
 
 
23226ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91d2a0a
23226ad
 
91d2a0a
 
 
23226ad
 
91d2a0a
 
 
 
 
 
 
 
23226ad
91d2a0a
 
 
 
 
 
 
 
 
 
23226ad
 
91d2a0a
 
 
23226ad
 
 
 
91d2a0a
 
23226ad
91d2a0a
 
 
 
 
 
 
 
23226ad
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# modules/clustering.py

# -*- coding: utf-8 -*-
#
# PROJECT:      CognitiveEDA v5.7 - The QuantumLeap Intelligence Platform
#
# DESCRIPTION:  Specialized module for K-Means clustering. This version is
#               updated to return the cluster labels for downstream profiling.

import logging
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int):
    """
    Performs K-Means clustering, generates an Elbow plot for optimal K,
    visualizes the clusters via PCA, and returns the cluster labels.

    Args:
        df (pd.DataFrame): The input DataFrame.
        numeric_cols (list): A list of numeric columns to use for clustering.
        n_clusters (int): The number of clusters (k) to create.

    Returns:
        A tuple containing:
        - fig_cluster (go.Figure): Plot of the clustered data in 2D PCA space.
        - fig_elbow (go.Figure): The Elbow Method plot for determining optimal k.
        - summary (str): A markdown summary of the methodology.
        - labels (pd.Series): The cluster label assigned to each data point,
          indexed like the rows actually clustered (NaN rows are dropped).
    """
    # Explicit dtype avoids the pandas DeprecationWarning for an empty,
    # dtype-less Series and matches the integer labels KMeans produces.
    empty_labels = pd.Series(dtype=int)

    if len(numeric_cols) < 2:
        empty_fig = go.Figure()
        return empty_fig, empty_fig, "Clustering requires at least 2 numeric features.", empty_labels

    # Rows with NaN in any selected feature cannot be scaled/clustered.
    cluster_data = df[numeric_cols].dropna()
    if len(cluster_data) < n_clusters:
        empty_fig = go.Figure()
        return empty_fig, empty_fig, f"Not enough data ({len(cluster_data)}) for {n_clusters} clusters.", empty_labels

    # Standardize so no single feature dominates the Euclidean distances.
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(cluster_data)

    # --- Elbow Method Plot ---
    # Cap the candidate K range at the sample count: KMeans raises a
    # ValueError whenever n_clusters > n_samples, which the original
    # unconditional range(1, 11) hit for datasets with < 10 clean rows.
    wcss = []
    max_k = min(10, len(cluster_data))
    k_range = range(1, max_k + 1)
    for i in k_range:
        kmeans_elbow = KMeans(n_clusters=i, init='k-means++', random_state=42, n_init='auto')
        kmeans_elbow.fit(scaled_data)
        wcss.append(kmeans_elbow.inertia_)  # inertia_ == within-cluster SSE

    fig_elbow = go.Figure()
    fig_elbow.add_trace(go.Scatter(x=list(k_range), y=wcss, mode='lines+markers'))
    fig_elbow.update_layout(title='<b>💡 The Elbow Method for Optimal K</b>',
                          xaxis_title='Number of Clusters (K)',
                          yaxis_title='Within-Cluster Sum of Squares (WCSS)')

    # --- K-Means Clustering & Visualization ---
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42, n_init='auto').fit(scaled_data)
    # Preserve the original row index so labels can be joined back onto df.
    labels = pd.Series(kmeans.labels_, name='Cluster_Labels', index=cluster_data.index)

    # Project to 2D purely for visualization; clustering used full dimensionality.
    pca = PCA(n_components=2)
    components = pca.fit_transform(scaled_data)

    # Create a DataFrame for plotting
    plot_df = pd.DataFrame(components, columns=['PCA1', 'PCA2'], index=cluster_data.index)
    # Stringify so Plotly treats clusters as discrete categories, not a gradient.
    plot_df['Cluster'] = labels.astype(str)

    fig_cluster = px.scatter(
        plot_df, x='PCA1', y='PCA2', color='Cluster',
        title=f"<b>K-Means Clustering Visualization (K={n_clusters})</b>",
        labels={'PCA1': 'Principal Component 1', 'PCA2': 'Principal Component 2'},
        color_discrete_sequence=px.colors.qualitative.Vivid
    )

    explained_variance = pca.explained_variance_ratio_.sum() * 100
    summary = (f"**Features Used:** `{len(numeric_cols)}` | **Clusters (K):** `{n_clusters}`\n\n"
               f"PCA explains **{explained_variance:.2f}%** of variance.")

    # --- MODIFIED RETURN ---
    return fig_cluster, fig_elbow, summary, labels