Spaces:
Sleeping
Sleeping
# modules/clustering.py | |
import logging | |
import pandas as pd | |
import plotly.express as px | |
import plotly.graph_objects as go | |
from sklearn.cluster import KMeans | |
from sklearn.decomposition import PCA | |
from sklearn.preprocessing import StandardScaler | |
def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int, *, max_k: int = 10):
    """Cluster rows with K-Means and build an elbow plot and a PCA scatter plot.

    Rows containing a missing value in any of ``numeric_cols`` are dropped,
    the remaining values are standardised, and K-Means is fitted with
    ``n_clusters`` clusters. The clustered rows are projected onto two
    principal components for plotting.

    Args:
        df: Source data frame.
        numeric_cols: Column names to cluster on; at least two are required.
        n_clusters: Number of clusters (K) for the final K-Means fit.
        max_k: Largest K evaluated for the elbow plot. The value actually
            used is capped at the number of usable rows, because K-Means
            cannot be fitted with more clusters than samples.

    Returns:
        A tuple ``(fig_cluster, fig_elbow, summary)``. On invalid input both
        figures are empty ``go.Figure()`` objects and ``summary`` holds the
        reason as plain text; otherwise ``summary`` is a Markdown string.
    """
    if len(numeric_cols) < 2:
        return go.Figure(), go.Figure(), "Clustering requires at least 2 numeric features."
    if n_clusters < 1:
        return go.Figure(), go.Figure(), "Number of clusters must be at least 1."

    # Explicit copy: result columns are added to this frame below, and it
    # must be independent of the caller's data.
    cluster_data = df[numeric_cols].dropna().copy()
    n_samples = len(cluster_data)
    if n_samples < n_clusters:
        return go.Figure(), go.Figure(), f"Not enough data ({n_samples}) for {n_clusters} clusters."
    if n_samples < 2:
        # The 2-component PCA projection below needs at least two rows.
        return go.Figure(), go.Figure(), "Clustering requires at least 2 complete rows of data."

    scaled_data = StandardScaler().fit_transform(cluster_data)

    # --- Elbow Method Plot ---
    # Never ask K-Means for more clusters than there are samples; small
    # data sets would otherwise fail here even though `n_clusters` is valid.
    elbow_ks = list(range(1, min(max_k, n_samples) + 1))
    wcss = [
        KMeans(n_clusters=k, init='k-means++', random_state=42, n_init=10)
        .fit(scaled_data)
        .inertia_
        for k in elbow_ks
    ]
    fig_elbow = go.Figure()
    fig_elbow.add_trace(go.Scatter(x=elbow_ks, y=wcss, mode='lines+markers'))
    # NOTE(review): the leading characters of the title look like a
    # mis-encoded emoji; kept unchanged because the intended glyph is unknown.
    fig_elbow.update_layout(title='<b>π‘ The Elbow Method for Optimal K</b>',
                            xaxis_title='Number of Clusters (K)',
                            yaxis_title='Within-Cluster Sum of Squares (WCSS)')

    # --- K-Means Clustering & Visualization ---
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42, n_init=10).fit(scaled_data)
    # String labels make plotly treat the cluster id as a discrete category.
    cluster_data['Cluster'] = kmeans.labels_.astype(str)

    pca = PCA(n_components=2)
    components = pca.fit_transform(scaled_data)
    cluster_data['PCA1'], cluster_data['PCA2'] = components[:, 0], components[:, 1]

    fig_cluster = px.scatter(
        cluster_data, x='PCA1', y='PCA2', color='Cluster',
        title=f"<b>K-Means Clustering Visualization (K={n_clusters})</b>",
        labels={'PCA1': 'Principal Component 1', 'PCA2': 'Principal Component 2'},
        color_discrete_sequence=px.colors.qualitative.Vivid
    )

    explained_variance = pca.explained_variance_ratio_.sum() * 100
    summary = (f"**Features Used:** `{len(numeric_cols)}` | **Clusters (K):** `{n_clusters}`\n\n"
               f"PCA explains **{explained_variance:.2f}%** of variance.")
    return fig_cluster, fig_elbow, summary