Bhupen committed on
Commit
4228d91
·
1 Parent(s): e783272

Clustering intro

Files changed (2)
  1. app.py +272 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,272 @@
+ import time
+
+ import streamlit as st
+ import numpy as np
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ from sklearn.datasets import make_blobs
+ from sklearn.cluster import KMeans
+
+ st.set_page_config(layout="wide")
+
+ st.markdown("#### Clustering in AI (Unsupervised Modeling)")
+
+ # Section 1: What is clustering?
+ with st.expander("🔍 What is clustering, and why is it relevant in business?"):
+     st.markdown("""
+ Clustering is an **unsupervised machine learning technique** that groups similar data points together.
+ It's commonly used in:
+ - **Customer segmentation** (e.g., marketing campaigns)
+ - **Anomaly detection** (e.g., fraud or system failures)
+ - **Document categorization**
+
+ Clustering helps discover **patterns** without labeled data, making it extremely useful in business scenarios where manual labeling is costly or infeasible.
+ """)
+
+ # Set plot style (imports are consolidated at the top of the file)
+ sns.set_theme(style="whitegrid")
+
+ # --- 1. Customer Segmentation ---
+ st.markdown("###### 📊 1. Customer Segmentation")
+ st.write("Imagine customers represented by their **age** and **spending score**. Clustering reveals distinct customer groups.")
+
+ X_seg, _ = make_blobs(n_samples=300, centers=4, cluster_std=1.0, random_state=42)
+ kmeans_seg = KMeans(n_clusters=4, n_init="auto", random_state=42).fit(X_seg)
+ labels_seg = kmeans_seg.labels_
+
+ fig1, ax1 = plt.subplots(figsize=(9, 4))
+ ax1.scatter(X_seg[:, 0], X_seg[:, 1], c=labels_seg, cmap='Accent')
+ ax1.set_xlabel("Age")
+ ax1.set_ylabel("Spending Score")
+ ax1.set_title("Customer Segmentation Clusters")
+ st.pyplot(fig1)
46
+
47
+ st.markdown("""
48
+ **Interpretation:**
49
+ Each cluster corresponds to a distinct customer segment, like:
50
+ - High spenders vs budget-conscious
51
+ - Young vs older demographics
52
+ This allows targeted marketing and better personalization.
53
+ """)
+
+ # --- 2. Anomaly Detection ---
+ st.markdown("###### 🚨 2. Anomaly Detection")
+ st.write("Let's simulate normal system activity with a few injected anomalies.")
+
+ X_anom, _ = make_blobs(n_samples=290, centers=1, cluster_std=1.0, random_state=42)
+ anomalies = np.random.uniform(low=-6, high=6, size=(10, 2))  # unseeded, so these vary between runs
+ X_anom_combined = np.vstack([X_anom, anomalies])
+
+ # Fit a single cluster, then flag the farthest 5% of points as anomalies
+ kmeans_anom = KMeans(n_clusters=1, n_init="auto", random_state=42).fit(X_anom_combined)
+ distances = np.linalg.norm(X_anom_combined - kmeans_anom.cluster_centers_[0], axis=1)
+ threshold = np.percentile(distances, 95)
+ outliers = distances > threshold
+
+ fig2, ax2 = plt.subplots(figsize=(9, 4))
+ ax2.scatter(X_anom_combined[~outliers, 0], X_anom_combined[~outliers, 1], label="Normal", alpha=0.6)
+ ax2.scatter(X_anom_combined[outliers, 0], X_anom_combined[outliers, 1], color='red', label="Anomaly")
+ ax2.set_title("Anomaly Detection using Clustering")
+ ax2.legend()
+ st.pyplot(fig2)
+
+ st.markdown("""
+ **Interpretation:**
+ Data points that are **far from the cluster center** are flagged as anomalies.
+ This is useful for:
+ - Fraud detection
+ - Network intrusion detection
+ - Fault detection in systems
+ """)
+
+ # --- 3. Document Categorization ---
+ st.markdown("###### 📚 3. Document Categorization")
+ st.write("Assume each document has been reduced to a 2D vector using techniques like TF-IDF followed by PCA.")
+
+ X_docs, _ = make_blobs(n_samples=300, centers=3, cluster_std=1.2, random_state=7)
+ kmeans_docs = KMeans(n_clusters=3, n_init="auto", random_state=7).fit(X_docs)
+
+ fig3, ax3 = plt.subplots(figsize=(9, 4))
+ ax3.scatter(X_docs[:, 0], X_docs[:, 1], c=kmeans_docs.labels_, cmap='Set2')
+ ax3.set_title("Clustering Documents into Categories")
+ ax3.set_xlabel("Topic Vector 1")
+ ax3.set_ylabel("Topic Vector 2")
+ st.pyplot(fig3)
+
+ st.markdown("""
+ **Interpretation:**
+ Clustering groups similar documents or articles (e.g., tech, sports, health) without prior labels.
+ It's used in:
+ - News aggregation
+ - Content recommendation
+ - Automated document organization
+
+ A sketch of how a real reduction pipeline could look follows below.
+ """)
+
+ # Section 2: Key characteristics
+ with st.expander("🧠 Key characteristics of clustering (Human-in-the-loop)"):
+     st.markdown("""
+ - No predefined labels: clustering is exploratory.
+ - Many algorithms, such as K-Means, require choosing the **number of clusters (K)** manually.
+ - Human input is essential for:
+   - **Interpreting cluster meanings**
+   - **Validating business relevance**
+   - **Tuning parameters like K or distance metrics**
+
+ This is where **"human-in-the-loop"** comes in: domain experts make sense of the clusters produced.
+ """)
+
+ # --- 1. Standard Numeric Dataset ---
+ st.markdown("###### 🧮 1. Standard Numeric Dataset (e.g., Customer Features)")
+
+ df_numeric = pd.DataFrame({
+     "Age": np.random.randint(18, 65, size=5),
+     "Annual Income ($)": np.random.randint(20000, 100000, size=5),
+     "Spending Score": np.random.randint(1, 100, size=5),
+     "Cluster_Label": ["" for _ in range(5)]
+ })
+ st.dataframe(df_numeric)
+
+ # --- 2. Text Dataset ---
+ st.markdown("###### ✍️ 2. Text Dataset (e.g., Customer Reviews)")
+
+ df_text = pd.DataFrame({
+     "Review_Text": [
+         "Great product, loved the quality!",
+         "Terrible support. Never buying again.",
+         "Okay-ish experience. Could be better.",
+         "Fast delivery and nice packaging.",
+         "Didn't meet my expectations."
+     ],
+     "Cluster_Label": ["" for _ in range(5)]
+ })
+ st.dataframe(df_text)
149
+
150
+ # --- 3. Image Dataset ---
151
+ st.markdown("###### ๐Ÿ–ผ๏ธ 3. Image Dataset (e.g., Pixel Vectors)")
152
+
153
+ df_image = pd.DataFrame(np.random.randint(0, 256, size=(5, 10)), columns=[f"Pixel_{i}" for i in range(10)])
154
+ df_image["Cluster_Label"] = ""
155
+ st.dataframe(df_image)
+
+ st.markdown("""
+ **Notice:**
+ There are **no predefined labels** (`Cluster_Label` is empty).
+ Clustering algorithms group the rows based on internal patterns, and **humans interpret what those groupings mean**, as sketched below.
+ """)
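+
+ # Illustrative sketch (added): the human-in-the-loop step. The algorithm
+ # assigns cluster ids; the names in name_map are hypothetical labels a
+ # domain expert might choose after inspecting each group.
+ features = df_numeric[["Age", "Annual Income ($)", "Spending Score"]]
+ km_demo = KMeans(n_clusters=2, n_init="auto", random_state=0).fit(features)
+ name_map = {0: "Value-focused", 1: "High spenders"}
+ df_numeric["Cluster_Label"] = [name_map[c] for c in km_demo.labels_]
+ st.dataframe(df_numeric)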
+
+ # Section 3: Custom K-Means visualization
+ with st.expander("📊 Visualizing K-Means Clustering (Custom Implementation)"):
+     st.markdown("A step-by-step K-Means demonstration, implemented from scratch.")
+
+ # Sidebar parameters
+ num_points = st.sidebar.slider("Number of points per cluster", 10, 100, 50)
+ cluster_sep = st.sidebar.slider("Cluster separation", 0.5, 5.0, 2.0)
+ sleep_interval = st.sidebar.slider("Sleep interval (seconds)", 0.1, 2.0, 0.5)
+ show_table = st.sidebar.checkbox("Show cluster table")
+
+ # Generate synthetic data (cached, so reruns with the same settings reuse it)
+ @st.cache_data
+ def generate_data(num_points, cluster_sep):
+     points, _ = make_blobs(n_samples=num_points * 3, centers=3,
+                            cluster_std=cluster_sep, n_features=2, random_state=42)
+     return points
+
+ points = generate_data(num_points, cluster_sep)
+
+ # Random initial centers
+ np.random.seed(42)
+ centers = np.column_stack((
+     np.random.uniform(-10, 10, 3),
+     np.random.uniform(-10, 5, 3)
+ ))
+
+ def calculate_distances(points, centers):
+     # Returns an (n_points, n_centers) matrix of Euclidean distances
+     return np.linalg.norm(points[:, np.newaxis] - centers, axis=2)
+
+ fig, axes = plt.subplots(4, 3, figsize=(12, 16))
+ num_iterations = 12
+
+ for iteration in range(num_iterations):
+     # Assignment step: attach each point to its nearest center
+     distances = calculate_distances(points, centers)
+     closest = np.argmin(distances, axis=1)
+     df = pd.DataFrame(points, columns=['x1', 'x2'])
+     for i in range(3):
+         df[f'dist_to_center_{i+1}'] = distances[:, i]
+     df['closest_center'] = closest
+
+     # Plot this iteration in its grid cell
+     row, col = divmod(iteration, 3)
+     ax = axes[row, col]
+     colors = ['red', 'green', 'blue']
+     for i in range(3):
+         cluster = df[df['closest_center'] == i]
+         ax.scatter(cluster['x1'], cluster['x2'], color=colors[i], s=5, label=f'Cluster {i+1}')
+         ax.scatter(centers[i][0], centers[i][1], color='black', marker='x', s=50, linewidths=2)
+     ax.set_title(f"Iteration {iteration + 1}", fontsize=8)
+     ax.set_xlabel("x1", fontsize=8)
+     ax.set_ylabel("x2", fontsize=8)
+     ax.tick_params(labelsize=6)
+     ax.legend(fontsize=6)
+
+     # Update step: move each center to the mean of its assigned points
+     centers = np.array([df[df['closest_center'] == i][['x1', 'x2']].mean() for i in range(3)])
+
+     # Pacing only; the full grid is rendered once after the loop
+     time.sleep(sleep_interval)
+
+ st.pyplot(fig)
+
+ if show_table:
+     def highlight_min(s):
+         return ['background-color: lightgreen' if v == s.min() else '' for v in s]
+     st.dataframe(df.style.apply(highlight_min, subset=[f'dist_to_center_{i+1}' for i in range(3)]))
+
+ # Section 4: Evaluating with the Elbow Method
+ with st.expander("📉 How do we know if clustering worked well (Elbow Method)?"):
+     st.markdown("""
+ The **Elbow Method** helps identify the optimal number of clusters (K).
+ - Plot the **inertia** (sum of squared distances from points to their cluster center) for each candidate K.
+ - The 'elbow' of the curve, a sharp drop followed by a plateau, marks a good choice of K.
+
+ This technique helps avoid both under- and over-clustering.
+ A complementary silhouette check is sketched after the plot below.
+ """)
+
+ X = generate_data(100, 1.5)
+ inertias = []
+ Ks = range(1, 10)
+ for k in Ks:
+     km = KMeans(n_clusters=k, n_init="auto", random_state=42)
+     km.fit(X)
+     inertias.append(km.inertia_)
+
+ fig_elbow, ax_elbow = plt.subplots()
+ ax_elbow.plot(Ks, inertias, marker='o')
+ ax_elbow.set_title("Elbow Method for Optimal K")
+ ax_elbow.set_xlabel("Number of Clusters (K)")
+ ax_elbow.set_ylabel("Inertia")
+ st.pyplot(fig_elbow)
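+
+ # Illustrative sketch (added): silhouette score is a common complementary
+ # check; values closer to 1 indicate tighter, better-separated clusters.
+ from sklearn.metrics import silhouette_score
+ sil_labels = KMeans(n_clusters=3, n_init="auto", random_state=42).fit_predict(X)
+ st.write(f"Silhouette score at K=3 (illustrative): {silhouette_score(X, sil_labels):.2f}")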
+
+ # Section 5: Challenges and Alternatives
+ with st.expander("⚠️ Challenges with K-Means & Alternatives"):
+     st.markdown("""
+ **K-Means limitations:**
+ - Requires choosing K manually
+ - Assumes clusters are spherical and equal-sized
+ - Sensitive to outliers and initial center placement
+
+ **Variants / Alternatives:**
+ - **K-Medoids**: more robust to outliers
+ - **DBSCAN**: density-based, no need to specify K (see the sketch below)
+ - **Hierarchical Clustering**: builds a tree of clusters
+ - **Gaussian Mixture Models (GMM)**: probabilistic soft clustering
+
+ The use case and data characteristics usually guide which method to choose.
+ """)
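+
+ # Illustrative sketch (added): DBSCAN on the same synthetic data used for
+ # the elbow plot. The eps/min_samples values are arbitrary starting points;
+ # points labeled -1 are treated as noise rather than forced into a cluster.
+ from sklearn.cluster import DBSCAN
+ db_labels = DBSCAN(eps=0.9, min_samples=5).fit_predict(X)
+ fig_db, ax_db = plt.subplots(figsize=(9, 4))
+ ax_db.scatter(X[:, 0], X[:, 1], c=db_labels, cmap='tab10', s=10)
+ ax_db.set_title("DBSCAN: clusters found without specifying K (-1 = noise)")
+ st.pyplot(fig_db)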
+
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ streamlit>=1.20.0
+ numpy>=1.24.0
+ pandas>=1.5.0
+ scikit-learn>=1.2.0
+ matplotlib>=3.6.0
+ seaborn>=0.12.0