mgbam committed on
Commit
23226ad
·
verified ·
1 Parent(s): 91f1cc5

Update modules/clustering.py

Browse files
Files changed (1) hide show
  1. modules/clustering.py +39 -8
modules/clustering.py CHANGED
@@ -1,4 +1,12 @@
1
  # modules/clustering.py
 
 
 
 
 
 
 
 
2
  import logging
3
  import pandas as pd
4
  import plotly.express as px
@@ -8,12 +16,30 @@ from sklearn.decomposition import PCA
8
  from sklearn.preprocessing import StandardScaler
9
 
10
  def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  if len(numeric_cols) < 2:
12
- return go.Figure(), go.Figure(), "Clustering requires at least 2 numeric features."
 
13
 
14
  cluster_data = df[numeric_cols].dropna()
15
  if len(cluster_data) < n_clusters:
16
- return go.Figure(), go.Figure(), f"Not enough data ({len(cluster_data)}) for {n_clusters} clusters."
 
17
 
18
  scaler = StandardScaler()
19
  scaled_data = scaler.fit_transform(cluster_data)
@@ -22,7 +48,7 @@ def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int):
22
  wcss = []
23
  k_range = range(1, 11)
24
  for i in k_range:
25
- kmeans_elbow = KMeans(n_clusters=i, init='k-means++', random_state=42, n_init=10)
26
  kmeans_elbow.fit(scaled_data)
27
  wcss.append(kmeans_elbow.inertia_)
28
 
@@ -33,15 +59,18 @@ def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int):
33
  yaxis_title='Within-Cluster Sum of Squares (WCSS)')
34
 
35
  # --- K-Means Clustering & Visualization ---
36
- kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42, n_init=10).fit(scaled_data)
37
- cluster_data['Cluster'] = kmeans.labels_.astype(str)
38
 
39
  pca = PCA(n_components=2)
40
  components = pca.fit_transform(scaled_data)
41
- cluster_data['PCA1'], cluster_data['PCA2'] = components[:, 0], components[:, 1]
 
 
 
42
 
43
  fig_cluster = px.scatter(
44
- cluster_data, x='PCA1', y='PCA2', color='Cluster',
45
  title=f"<b>K-Means Clustering Visualization (K={n_clusters})</b>",
46
  labels={'PCA1': 'Principal Component 1', 'PCA2': 'Principal Component 2'},
47
  color_discrete_sequence=px.colors.qualitative.Vivid
@@ -50,4 +79,6 @@ def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int):
50
  explained_variance = pca.explained_variance_ratio_.sum() * 100
51
  summary = (f"**Features Used:** `{len(numeric_cols)}` | **Clusters (K):** `{n_clusters}`\n\n"
52
  f"PCA explains **{explained_variance:.2f}%** of variance.")
53
- return fig_cluster, fig_elbow, summary
 
 
 
1
  # modules/clustering.py
2
+
3
+ # -*- coding: utf-8 -*-
4
+ #
5
+ # PROJECT: CognitiveEDA v5.7 - The QuantumLeap Intelligence Platform
6
+ #
7
+ # DESCRIPTION: Specialized module for K-Means clustering. This version is
8
+ # updated to return the cluster labels for downstream profiling.
9
+
10
  import logging
11
  import pandas as pd
12
  import plotly.express as px
 
16
  from sklearn.preprocessing import StandardScaler
17
 
18
  def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int):
19
+ """
20
+ Performs K-Means clustering, generates an Elbow plot for optimal K,
21
+ visualizes the clusters via PCA, and returns the cluster labels.
22
+
23
+ Args:
24
+ df (pd.DataFrame): The input DataFrame.
25
+ numeric_cols (list): A list of numeric columns to use for clustering.
26
+ n_clusters (int): The number of clusters (k) to create.
27
+
28
+ Returns:
29
+ A tuple containing:
30
+ - fig_cluster (go.Figure): Plot of the clustered data in 2D PCA space.
31
+ - fig_elbow (go.Figure): The Elbow Method plot for determining optimal k.
32
+ - summary (str): A markdown summary of the methodology.
33
+ - labels (pd.Series): The cluster label for each row kept after dropping NaNs, indexed like those rows; an empty Series on the early-return paths.
34
+ """
35
  if len(numeric_cols) < 2:
36
+ empty_fig = go.Figure()
37
+ return empty_fig, empty_fig, "Clustering requires at least 2 numeric features.", pd.Series()
38
 
39
  cluster_data = df[numeric_cols].dropna()
40
  if len(cluster_data) < n_clusters:
41
+ empty_fig = go.Figure()
42
+ return empty_fig, empty_fig, f"Not enough data ({len(cluster_data)}) for {n_clusters} clusters.", pd.Series()
43
 
44
  scaler = StandardScaler()
45
  scaled_data = scaler.fit_transform(cluster_data)
 
48
  wcss = []
49
  k_range = range(1, 11)
50
  for i in k_range:
51
+ kmeans_elbow = KMeans(n_clusters=i, init='k-means++', random_state=42, n_init='auto')
52
  kmeans_elbow.fit(scaled_data)
53
  wcss.append(kmeans_elbow.inertia_)
54
 
 
59
  yaxis_title='Within-Cluster Sum of Squares (WCSS)')
60
 
61
  # --- K-Means Clustering & Visualization ---
62
+ kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42, n_init='auto').fit(scaled_data)
63
+ labels = pd.Series(kmeans.labels_, name='Cluster_Labels', index=cluster_data.index)
64
 
65
  pca = PCA(n_components=2)
66
  components = pca.fit_transform(scaled_data)
67
+
68
+ # Create a DataFrame for plotting
69
+ plot_df = pd.DataFrame(components, columns=['PCA1', 'PCA2'], index=cluster_data.index)
70
+ plot_df['Cluster'] = labels.astype(str)
71
 
72
  fig_cluster = px.scatter(
73
+ plot_df, x='PCA1', y='PCA2', color='Cluster',
74
  title=f"<b>K-Means Clustering Visualization (K={n_clusters})</b>",
75
  labels={'PCA1': 'Principal Component 1', 'PCA2': 'Principal Component 2'},
76
  color_discrete_sequence=px.colors.qualitative.Vivid
 
79
  explained_variance = pca.explained_variance_ratio_.sum() * 100
80
  summary = (f"**Features Used:** `{len(numeric_cols)}` | **Clusters (K):** `{n_clusters}`\n\n"
81
  f"PCA explains **{explained_variance:.2f}%** of variance.")
82
+
83
+ # --- MODIFIED RETURN ---
84
+ return fig_cluster, fig_elbow, summary, labels