mgbam committed on
Commit
91d2a0a
·
verified ·
1 Parent(s): b834ff0

Create modules/clustering.py

Browse files
Files changed (1) hide show
  1. modules/clustering.py +53 -0
modules/clustering.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # modules/clustering.py
2
+ import logging
3
+ import pandas as pd
4
+ import plotly.express as px
5
+ import plotly.graph_objects as go
6
+ from sklearn.cluster import KMeans
7
+ from sklearn.decomposition import PCA
8
+ from sklearn.preprocessing import StandardScaler
9
+
10
def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int,
                       max_elbow_k: int = 10) -> tuple:
    """Run K-Means on the selected numeric columns and build two plots.

    Rows containing a missing value in any of ``numeric_cols`` are dropped,
    the remaining data is standardised, clustered with K-Means, and projected
    onto two principal components for display.

    Args:
        df: Source data frame.
        numeric_cols: Names of the columns to cluster on; at least two are
            required.
        n_clusters: Number of clusters (K) for the final K-Means fit.
        max_elbow_k: Largest K tried for the elbow plot. The range is also
            capped at the number of usable rows.

    Returns:
        A ``(fig_cluster, fig_elbow, summary)`` tuple. When the inputs cannot
        be clustered, both figures are empty ``go.Figure()`` objects and
        ``summary`` holds the reason.
    """
    if len(numeric_cols) < 2:
        return go.Figure(), go.Figure(), "Clustering requires at least 2 numeric features."
    if n_clusters < 1:
        # KMeans rejects non-positive cluster counts; report it instead of raising.
        return go.Figure(), go.Figure(), "Number of clusters must be at least 1."

    # Explicit copy: result columns are added below and must never touch `df`.
    cluster_data = df[numeric_cols].dropna().copy()
    n_samples = len(cluster_data)
    if n_samples < n_clusters:
        return go.Figure(), go.Figure(), f"Not enough data ({len(cluster_data)}) for {n_clusters} clusters."
    if n_samples < 2:
        # A 2-component PCA needs at least two samples.
        return go.Figure(), go.Figure(), f"Not enough data ({n_samples}) for a 2-component PCA projection."

    scaled_data = StandardScaler().fit_transform(cluster_data)

    # --- Elbow Method Plot ---
    # KMeans requires n_samples >= n_clusters, so never try more clusters
    # than there are rows (the original fixed range 1..10 crashed on small data).
    k_values = list(range(1, min(max_elbow_k, n_samples) + 1))
    wcss = [
        KMeans(n_clusters=k, init='k-means++', random_state=42, n_init=10)
        .fit(scaled_data)
        .inertia_
        for k in k_values
    ]

    fig_elbow = go.Figure()
    fig_elbow.add_trace(go.Scatter(x=k_values, y=wcss, mode='lines+markers'))
    fig_elbow.update_layout(title='<b>💡 The Elbow Method for Optimal K</b>',
                            xaxis_title='Number of Clusters (K)',
                            yaxis_title='Within-Cluster Sum of Squares (WCSS)')

    # --- K-Means Clustering & Visualization ---
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42, n_init=10).fit(scaled_data)
    # String labels make plotly treat clusters as discrete categories.
    cluster_data['Cluster'] = kmeans.labels_.astype(str)

    pca = PCA(n_components=2)
    components = pca.fit_transform(scaled_data)
    cluster_data['PCA1'], cluster_data['PCA2'] = components[:, 0], components[:, 1]

    fig_cluster = px.scatter(
        cluster_data, x='PCA1', y='PCA2', color='Cluster',
        title=f"<b>K-Means Clustering Visualization (K={n_clusters})</b>",
        labels={'PCA1': 'Principal Component 1', 'PCA2': 'Principal Component 2'},
        color_discrete_sequence=px.colors.qualitative.Vivid
    )

    explained_variance = pca.explained_variance_ratio_.sum() * 100
    summary = (f"**Features Used:** `{len(numeric_cols)}` | **Clusters (K):** `{n_clusters}`\n\n"
               f"PCA explains **{explained_variance:.2f}%** of variance.")
    return fig_cluster, fig_elbow, summary