Create modules/clustering.py
modules/clustering.py  +53 -0
ADDED
@@ -0,0 +1,53 @@
# modules/clustering.py
import logging
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int):
    if len(numeric_cols) < 2:
        return go.Figure(), go.Figure(), "Clustering requires at least 2 numeric features."

    cluster_data = df[numeric_cols].dropna()
    if len(cluster_data) < n_clusters:
        return go.Figure(), go.Figure(), f"Not enough data ({len(cluster_data)}) for {n_clusters} clusters."

    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(cluster_data)

    # --- Elbow Method Plot ---
    wcss = []
    k_range = range(1, 11)
    for i in k_range:
        kmeans_elbow = KMeans(n_clusters=i, init='k-means++', random_state=42, n_init=10)
        kmeans_elbow.fit(scaled_data)
        wcss.append(kmeans_elbow.inertia_)

    fig_elbow = go.Figure()
    fig_elbow.add_trace(go.Scatter(x=list(k_range), y=wcss, mode='lines+markers'))
    fig_elbow.update_layout(title='<b>💡 The Elbow Method for Optimal K</b>',
                            xaxis_title='Number of Clusters (K)',
                            yaxis_title='Within-Cluster Sum of Squares (WCSS)')

    # --- K-Means Clustering & Visualization ---
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42, n_init=10).fit(scaled_data)
    cluster_data['Cluster'] = kmeans.labels_.astype(str)

    pca = PCA(n_components=2)
    components = pca.fit_transform(scaled_data)
    cluster_data['PCA1'], cluster_data['PCA2'] = components[:, 0], components[:, 1]

    fig_cluster = px.scatter(
        cluster_data, x='PCA1', y='PCA2', color='Cluster',
        title=f"<b>K-Means Clustering Visualization (K={n_clusters})</b>",
        labels={'PCA1': 'Principal Component 1', 'PCA2': 'Principal Component 2'},
        color_discrete_sequence=px.colors.qualitative.Vivid
    )

    explained_variance = pca.explained_variance_ratio_.sum() * 100
    summary = (f"**Features Used:** `{len(numeric_cols)}` | **Clusters (K):** `{n_clusters}`\n\n"
               f"PCA explains **{explained_variance:.2f}%** of variance.")
    return fig_cluster, fig_elbow, summary
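
For reference, a minimal usage sketch of the new module (not part of this commit): it calls perform_clustering on a small sample DataFrame and displays the returned Plotly figures. The Iris dataset and the way the figures are shown here are assumptions for illustration only.

# Hypothetical usage sketch -- not part of this commit.
from sklearn.datasets import load_iris

from modules.clustering import perform_clustering

iris = load_iris(as_frame=True)            # small illustrative dataset
df = iris.frame                            # numeric feature columns plus 'target'
numeric_cols = list(iris.feature_names)    # e.g. ['sepal length (cm)', ...]

fig_cluster, fig_elbow, summary = perform_clustering(df, numeric_cols, n_clusters=3)
print(summary)        # markdown summary string returned by the module
fig_elbow.show()      # elbow curve for choosing K
fig_cluster.show()    # 2-D PCA projection colored by cluster label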
|