Spaces:
Sleeping
Sleeping
Update modules/clustering.py
Browse files- modules/clustering.py +39 -8
modules/clustering.py
CHANGED
@@ -1,4 +1,12 @@
|
|
1 |
# modules/clustering.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
import logging
|
3 |
import pandas as pd
|
4 |
import plotly.express as px
|
@@ -8,12 +16,30 @@ from sklearn.decomposition import PCA
|
|
8 |
from sklearn.preprocessing import StandardScaler
|
9 |
|
10 |
def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
if len(numeric_cols) < 2:
|
12 |
-
|
|
|
13 |
|
14 |
cluster_data = df[numeric_cols].dropna()
|
15 |
if len(cluster_data) < n_clusters:
|
16 |
-
|
|
|
17 |
|
18 |
scaler = StandardScaler()
|
19 |
scaled_data = scaler.fit_transform(cluster_data)
|
@@ -22,7 +48,7 @@ def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int):
|
|
22 |
wcss = []
|
23 |
k_range = range(1, 11)
|
24 |
for i in k_range:
|
25 |
-
kmeans_elbow = KMeans(n_clusters=i, init='k-means++', random_state=42, n_init=
|
26 |
kmeans_elbow.fit(scaled_data)
|
27 |
wcss.append(kmeans_elbow.inertia_)
|
28 |
|
@@ -33,15 +59,18 @@ def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int):
|
|
33 |
yaxis_title='Within-Cluster Sum of Squares (WCSS)')
|
34 |
|
35 |
# --- K-Means Clustering & Visualization ---
|
36 |
-
kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42, n_init=
|
37 |
-
|
38 |
|
39 |
pca = PCA(n_components=2)
|
40 |
components = pca.fit_transform(scaled_data)
|
41 |
-
|
|
|
|
|
|
|
42 |
|
43 |
fig_cluster = px.scatter(
|
44 |
-
|
45 |
title=f"<b>K-Means Clustering Visualization (K={n_clusters})</b>",
|
46 |
labels={'PCA1': 'Principal Component 1', 'PCA2': 'Principal Component 2'},
|
47 |
color_discrete_sequence=px.colors.qualitative.Vivid
|
@@ -50,4 +79,6 @@ def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int):
|
|
50 |
explained_variance = pca.explained_variance_ratio_.sum() * 100
|
51 |
summary = (f"**Features Used:** `{len(numeric_cols)}` | **Clusters (K):** `{n_clusters}`\n\n"
|
52 |
f"PCA explains **{explained_variance:.2f}%** of variance.")
|
53 |
-
|
|
|
|
|
|
1 |
# modules/clustering.py
|
2 |
+
|
3 |
+
# -*- coding: utf-8 -*-
|
4 |
+
#
|
5 |
+
# PROJECT: CognitiveEDA v5.7 - The QuantumLeap Intelligence Platform
|
6 |
+
#
|
7 |
+
# DESCRIPTION: Specialized module for K-Means clustering. This version is
|
8 |
+
# updated to return the cluster labels for downstream profiling.
|
9 |
+
|
10 |
import logging
|
11 |
import pandas as pd
|
12 |
import plotly.express as px
|
|
|
16 |
from sklearn.preprocessing import StandardScaler
|
17 |
|
18 |
def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int):
    """
    Perform K-Means clustering, build an Elbow plot for choosing K,
    visualize the clusters in 2-D PCA space, and return the cluster labels.

    Args:
        df (pd.DataFrame): The input DataFrame.
        numeric_cols (list): Numeric column names to use for clustering.
        n_clusters (int): The number of clusters (k) to create.

    Returns:
        A tuple containing:
        - fig_cluster (go.Figure): Plot of the clustered data in 2D PCA space.
        - fig_elbow (go.Figure): The Elbow Method plot for determining optimal k.
        - summary (str): A markdown summary of the methodology.
        - labels (pd.Series): The cluster label assigned to each data point,
          indexed to match the rows actually clustered (NaN rows dropped).
    """
    # Guard: K-Means on fewer than 2 features is degenerate for a 2-D PCA view.
    # Return an empty *int64* Series (not bare pd.Series()) so the labels'
    # dtype and name are consistent with the success path and no
    # object-dtype DeprecationWarning is emitted by pandas.
    if len(numeric_cols) < 2:
        empty_fig = go.Figure()
        empty_labels = pd.Series(dtype="int64", name="Cluster_Labels")
        return empty_fig, empty_fig, "Clustering requires at least 2 numeric features.", empty_labels

    # Rows with any NaN in the selected features cannot be clustered.
    cluster_data = df[numeric_cols].dropna()
    if len(cluster_data) < n_clusters:
        empty_fig = go.Figure()
        empty_labels = pd.Series(dtype="int64", name="Cluster_Labels")
        return empty_fig, empty_fig, f"Not enough data ({len(cluster_data)}) for {n_clusters} clusters.", empty_labels

    # Standardize so every feature contributes equally to the distance metric.
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(cluster_data)

    # --- Elbow Method: WCSS for k = 1..10 ---
    wcss = []
    k_range = range(1, 11)
    for i in k_range:
        kmeans_elbow = KMeans(n_clusters=i, init='k-means++', random_state=42, n_init='auto')
        kmeans_elbow.fit(scaled_data)
        wcss.append(kmeans_elbow.inertia_)

    # NOTE(review): the elbow-figure construction was hidden by the diff
    # context (new-file lines 55-58); reconstructed here — confirm titles
    # against the original file. Only the yaxis_title line was visible.
    fig_elbow = px.line(x=list(k_range), y=wcss, markers=True,
                        title="<b>Elbow Method for Optimal K</b>")
    fig_elbow.update_layout(xaxis_title='Number of Clusters (K)',
                            yaxis_title='Within-Cluster Sum of Squares (WCSS)')

    # --- K-Means Clustering & Visualization ---
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42, n_init='auto').fit(scaled_data)
    # Keep cluster_data's index so labels can be joined back to df downstream.
    labels = pd.Series(kmeans.labels_, name='Cluster_Labels', index=cluster_data.index)

    # Project to 2-D for plotting only; clustering itself used all features.
    pca = PCA(n_components=2)
    components = pca.fit_transform(scaled_data)

    # Create a DataFrame for plotting
    plot_df = pd.DataFrame(components, columns=['PCA1', 'PCA2'], index=cluster_data.index)
    # Stringify labels so plotly treats clusters as discrete categories.
    plot_df['Cluster'] = labels.astype(str)

    fig_cluster = px.scatter(
        plot_df, x='PCA1', y='PCA2', color='Cluster',
        title=f"<b>K-Means Clustering Visualization (K={n_clusters})</b>",
        labels={'PCA1': 'Principal Component 1', 'PCA2': 'Principal Component 2'},
        color_discrete_sequence=px.colors.qualitative.Vivid
    )

    explained_variance = pca.explained_variance_ratio_.sum() * 100
    summary = (f"**Features Used:** `{len(numeric_cols)}` | **Clusters (K):** `{n_clusters}`\n\n"
               f"PCA explains **{explained_variance:.2f}%** of variance.")

    # --- MODIFIED RETURN ---
    return fig_cluster, fig_elbow, summary, labels
|