Spaces:
Sleeping
Sleeping
Update modules/clustering.py
Browse files- modules/clustering.py +39 -8
modules/clustering.py
CHANGED
@@ -1,4 +1,12 @@
|
|
1 |
# modules/clustering.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
import logging
|
3 |
import pandas as pd
|
4 |
import plotly.express as px
|
@@ -8,12 +16,30 @@ from sklearn.decomposition import PCA
|
|
8 |
from sklearn.preprocessing import StandardScaler
|
9 |
|
10 |
def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
if len(numeric_cols) < 2:
|
12 |
-
|
|
|
13 |
|
14 |
cluster_data = df[numeric_cols].dropna()
|
15 |
if len(cluster_data) < n_clusters:
|
16 |
-
|
|
|
17 |
|
18 |
scaler = StandardScaler()
|
19 |
scaled_data = scaler.fit_transform(cluster_data)
|
@@ -22,7 +48,7 @@ def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int):
|
|
22 |
wcss = []
|
23 |
k_range = range(1, 11)
|
24 |
for i in k_range:
|
25 |
-
kmeans_elbow = KMeans(n_clusters=i, init='k-means++', random_state=42, n_init=
|
26 |
kmeans_elbow.fit(scaled_data)
|
27 |
wcss.append(kmeans_elbow.inertia_)
|
28 |
|
@@ -33,15 +59,18 @@ def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int):
|
|
33 |
yaxis_title='Within-Cluster Sum of Squares (WCSS)')
|
34 |
|
35 |
# --- K-Means Clustering & Visualization ---
|
36 |
-
kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42, n_init=
|
37 |
-
|
38 |
|
39 |
pca = PCA(n_components=2)
|
40 |
components = pca.fit_transform(scaled_data)
|
41 |
-
|
|
|
|
|
|
|
42 |
|
43 |
fig_cluster = px.scatter(
|
44 |
-
|
45 |
title=f"<b>K-Means Clustering Visualization (K={n_clusters})</b>",
|
46 |
labels={'PCA1': 'Principal Component 1', 'PCA2': 'Principal Component 2'},
|
47 |
color_discrete_sequence=px.colors.qualitative.Vivid
|
@@ -50,4 +79,6 @@ def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int):
|
|
50 |
explained_variance = pca.explained_variance_ratio_.sum() * 100
|
51 |
summary = (f"**Features Used:** `{len(numeric_cols)}` | **Clusters (K):** `{n_clusters}`\n\n"
|
52 |
f"PCA explains **{explained_variance:.2f}%** of variance.")
|
53 |
-
|
|
|
|
|
|
1 |
# modules/clustering.py
|
2 |
+
|
3 |
+
# -*- coding: utf-8 -*-
|
4 |
+
#
|
5 |
+
# PROJECT: CognitiveEDA v5.7 - The QuantumLeap Intelligence Platform
|
6 |
+
#
|
7 |
+
# DESCRIPTION: Specialized module for K-Means clustering. This version is
|
8 |
+
# updated to return the cluster labels for downstream profiling.
|
9 |
+
|
10 |
import logging
|
11 |
import pandas as pd
|
12 |
import plotly.express as px
|
|
|
16 |
from sklearn.preprocessing import StandardScaler
|
17 |
|
18 |
def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int):
    """
    Perform K-Means clustering, build an Elbow plot for choosing K,
    visualize the clusters in 2-D PCA space, and return the cluster labels.

    Args:
        df (pd.DataFrame): The input DataFrame.
        numeric_cols (list): Numeric column names to use for clustering.
        n_clusters (int): The number of clusters (k) to create.

    Returns:
        A tuple containing:
        - fig_cluster (go.Figure): Plot of the clustered data in 2D PCA space.
        - fig_elbow (go.Figure): The Elbow Method plot for determining optimal k.
        - summary (str): A markdown summary of the methodology.
        - labels (pd.Series): The cluster label assigned to each data point,
          indexed to match the rows actually clustered (NaN rows dropped).
    """
    # Guard: K-Means on fewer than 2 features is degenerate for a 2-D PCA view.
    # Return an empty *int64* Series (not bare pd.Series()) so the labels'
    # dtype and name are consistent with the success path and no
    # object-dtype DeprecationWarning is emitted by pandas.
    if len(numeric_cols) < 2:
        empty_fig = go.Figure()
        empty_labels = pd.Series(dtype="int64", name="Cluster_Labels")
        return empty_fig, empty_fig, "Clustering requires at least 2 numeric features.", empty_labels

    # Rows with any NaN in the selected features cannot be clustered.
    cluster_data = df[numeric_cols].dropna()
    if len(cluster_data) < n_clusters:
        empty_fig = go.Figure()
        empty_labels = pd.Series(dtype="int64", name="Cluster_Labels")
        return empty_fig, empty_fig, f"Not enough data ({len(cluster_data)}) for {n_clusters} clusters.", empty_labels

    # Standardize so every feature contributes equally to the distance metric.
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(cluster_data)

    # --- Elbow Method: WCSS for k = 1..10 ---
    wcss = []
    k_range = range(1, 11)
    for i in k_range:
        kmeans_elbow = KMeans(n_clusters=i, init='k-means++', random_state=42, n_init='auto')
        kmeans_elbow.fit(scaled_data)
        wcss.append(kmeans_elbow.inertia_)

    # NOTE(review): the elbow-figure construction was hidden by the diff
    # context (new-file lines 55-58); reconstructed here — confirm titles
    # against the original file. Only the yaxis_title line was visible.
    fig_elbow = px.line(x=list(k_range), y=wcss, markers=True,
                        title="<b>Elbow Method for Optimal K</b>")
    fig_elbow.update_layout(xaxis_title='Number of Clusters (K)',
                            yaxis_title='Within-Cluster Sum of Squares (WCSS)')

    # --- K-Means Clustering & Visualization ---
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42, n_init='auto').fit(scaled_data)
    # Keep cluster_data's index so labels can be joined back to df downstream.
    labels = pd.Series(kmeans.labels_, name='Cluster_Labels', index=cluster_data.index)

    # Project to 2-D for plotting only; clustering itself used all features.
    pca = PCA(n_components=2)
    components = pca.fit_transform(scaled_data)

    # Create a DataFrame for plotting
    plot_df = pd.DataFrame(components, columns=['PCA1', 'PCA2'], index=cluster_data.index)
    # Stringify labels so plotly treats clusters as discrete categories.
    plot_df['Cluster'] = labels.astype(str)

    fig_cluster = px.scatter(
        plot_df, x='PCA1', y='PCA2', color='Cluster',
        title=f"<b>K-Means Clustering Visualization (K={n_clusters})</b>",
        labels={'PCA1': 'Principal Component 1', 'PCA2': 'Principal Component 2'},
        color_discrete_sequence=px.colors.qualitative.Vivid
    )

    explained_variance = pca.explained_variance_ratio_.sum() * 100
    summary = (f"**Features Used:** `{len(numeric_cols)}` | **Clusters (K):** `{n_clusters}`\n\n"
               f"PCA explains **{explained_variance:.2f}%** of variance.")

    # --- MODIFIED RETURN ---
    return fig_cluster, fig_elbow, summary, labels
|