# clustering_demo/app.py
import streamlit as st
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
import time
st.set_page_config(layout="wide")
st.markdown("#### Clustering in AI - (unsupervised modeling)")
# Section 1: What is clustering?
with st.expander("๐Ÿ” What is clustering, and why is it relevant in business?"):
st.markdown("""
Clustering is an **unsupervised machine learning technique** that groups similar data points together.
It's commonly used in:
- **Customer segmentation** (e.g., marketing campaigns)
- **Anomaly detection** (e.g., fraud or system failures)
- **Document categorization**
Clustering helps discover **patterns** without labeled data, making it extremely useful in business scenarios where manual labeling is costly or infeasible.
""")
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
# Set plot style
sns.set(style="whitegrid")
    # --- 1. Customer Segmentation ---
    st.markdown("###### 📊 1. Customer Segmentation")
    st.write("Imagine customers represented by their **age** and **spending score**. Clustering reveals distinct customer groups.")

    X_seg, _ = make_blobs(n_samples=300, centers=4, cluster_std=1.0, random_state=42)
    kmeans_seg = KMeans(n_clusters=4, n_init="auto", random_state=42).fit(X_seg)
    labels_seg = kmeans_seg.labels_

    fig1, ax1 = plt.subplots(figsize=(9, 4))
    ax1.scatter(X_seg[:, 0], X_seg[:, 1], c=labels_seg, cmap='Accent')
    ax1.set_xlabel("Age")
    ax1.set_ylabel("Spending Score")
    ax1.set_title("Customer Segmentation Clusters")
    st.pyplot(fig1)

st.markdown("""
**Interpretation:**
Each cluster corresponds to a distinct customer segment, like:
- High spenders vs budget-conscious
- Young vs older demographics
This allows targeted marketing and better personalization.
""")
    # --- 2. Anomaly Detection ---
    st.markdown("###### 🚨 2. Anomaly Detection")
    st.write("Let's simulate normal system activity with a few injected anomalies.")

    X_anom, _ = make_blobs(n_samples=290, centers=1, cluster_std=1.0, random_state=42)
    anomalies = np.random.uniform(low=-6, high=6, size=(10, 2))
    X_anom_combined = np.vstack([X_anom, anomalies])

    kmeans_anom = KMeans(n_clusters=1, n_init="auto", random_state=42).fit(X_anom_combined)
    distances = np.linalg.norm(X_anom_combined - kmeans_anom.cluster_centers_[0], axis=1)
    threshold = np.percentile(distances, 95)  # flag the farthest 5% of points
    outliers = distances > threshold

    fig2, ax2 = plt.subplots(figsize=(9, 4))
    ax2.scatter(X_anom_combined[~outliers, 0], X_anom_combined[~outliers, 1], label="Normal", alpha=0.6)
    ax2.scatter(X_anom_combined[outliers, 0], X_anom_combined[outliers, 1], color='red', label="Anomaly")
    ax2.set_title("Anomaly Detection using Clustering")
    ax2.legend()
    st.pyplot(fig2)

st.markdown("""
**Interpretation:**
Data points that are **far from the cluster center** are flagged as anomalies.
Great for:
- Fraud detection
- Network intrusion
- Fault detection in systems
""")
    # --- 3. Document Categorization ---
    st.markdown("###### 📚 3. Document Categorization")
    st.write("Assume each document has been reduced to a 2D space using techniques like TF-IDF + PCA.")

    # Synthetic blobs stand in for the reduced document vectors here;
    # an illustrative TF-IDF + PCA sketch follows at the end of this section.
    X_docs, _ = make_blobs(n_samples=300, centers=3, cluster_std=1.2, random_state=7)
    kmeans_docs = KMeans(n_clusters=3, n_init="auto", random_state=7).fit(X_docs)

    fig3, ax3 = plt.subplots(figsize=(9, 4))
    ax3.scatter(X_docs[:, 0], X_docs[:, 1], c=kmeans_docs.labels_, cmap='Set2')
    ax3.set_title("Clustering Documents into Categories")
    ax3.set_xlabel("Topic Vector 1")
    ax3.set_ylabel("Topic Vector 2")
    st.pyplot(fig3)

st.markdown("""
**Interpretation:**
Clustering helps group similar documents or articles (e.g., tech, sports, health) without prior labels.
It's used in:
- News aggregation
- Content recommendation
- Automated document organization
""")
# Section 2: Key characteristics
with st.expander("๐Ÿง  Key characteristics of clustering (Human-in-the-loop)"):
st.markdown("""
- No predefined labels โ€” clustering is exploratory.
- Requires defining **number of clusters (K)** manually in many algorithms like K-Means.
- Human input is essential for:
- **Interpreting cluster meanings**
- **Validating business relevance**
- **Tuning parameters like K or distance metrics**
This is where **"human-in-the-loop"** comes in โ€” domain experts make sense of the clusters produced.
""")
    # --- 1. Standard Numeric Dataset ---
    st.markdown("###### 🧮 1. Standard Numeric Dataset (e.g., Customer Features)")
    df_numeric = pd.DataFrame({
        "Age": np.random.randint(18, 65, size=5),
        "Annual Income ($)": np.random.randint(20000, 100000, size=5),
        "Spending Score": np.random.randint(1, 100, size=5),
        "Cluster_Label": ["" for _ in range(5)]
    })
    st.dataframe(df_numeric)

    # --- 2. Text Dataset ---
    st.markdown("###### ✍️ 2. Text Dataset (e.g., Customer Reviews)")
    df_text = pd.DataFrame({
        "Review_Text": [
            "Great product, loved the quality!",
            "Terrible support. Never buying again.",
            "Okay-ish experience. Could be better.",
            "Fast delivery and nice packaging.",
            "Didn't meet my expectations."
        ],
        "Cluster_Label": ["" for _ in range(5)]
    })
    st.dataframe(df_text)

    # --- 3. Image Dataset ---
    st.markdown("###### 🖼️ 3. Image Dataset (e.g., Pixel Vectors)")
    df_image = pd.DataFrame(np.random.randint(0, 256, size=(5, 10)), columns=[f"Pixel_{i}" for i in range(10)])
    df_image["Cluster_Label"] = ""
    st.dataframe(df_image)

st.markdown("""
**Notice:**
There are **no predefined labels** (`Cluster_Label` is empty).
Clustering algorithms group the rows based on internal patterns, and **humans interpret what those groupings mean**.
""")
# Section 3: Custom K-Means visualization
with st.expander("๐Ÿ“Š Visualizing K-Means Clustering (Custom Implementation)"):
st.markdown("K-Means Clustering Demonstration (Custom Implementation)")
# Sidebar parameters
num_points = st.sidebar.slider("Number of points per cluster", 10, 100, 50)
cluster_sep = st.sidebar.slider("Cluster separation", 0.5, 5.0, 2.0)
sleep_interval = st.sidebar.slider("Sleep interval (seconds)", 0.1, 2.0, 0.5)
show_table = st.sidebar.checkbox("Show cluster table")
# Generate synthetic data
@st.cache_data
def generate_data(num_points, cluster_sep):
points, _ = make_blobs(n_samples=num_points*3, centers=3, cluster_std=cluster_sep, n_features=2, random_state=42)
return points
points = generate_data(num_points, cluster_sep)
# Random centers
np.random.seed(42)
centers = np.column_stack((
np.random.uniform(-10, 10, 3),
np.random.uniform(-10, 5, 3)
))
def calculate_distances(points, centers):
return np.linalg.norm(points[:, np.newaxis] - centers, axis=2)
    fig, axes = plt.subplots(4, 3, figsize=(12, 16))
    num_iterations = 12

    for iteration in range(num_iterations):
        # Assignment step: attach each point to its nearest center
        distances = calculate_distances(points, centers)
        closest = np.argmin(distances, axis=1)

        df = pd.DataFrame(points, columns=['x1', 'x2'])
        for i in range(3):
            df[f'dist_to_center_{i+1}'] = distances[:, i]
        df['closest_center'] = closest

        # Plot this iteration in its own subplot
        row, col = divmod(iteration, 3)
        ax = axes[row, col]
        colors = ['red', 'green', 'blue']
        for i in range(3):
            cluster = df[df['closest_center'] == i]
            ax.scatter(cluster['x1'], cluster['x2'], color=colors[i], s=5, label=f'Cluster {i+1}')
            ax.scatter(centers[i][0], centers[i][1], color='black', marker='x', s=50, linewidths=2)
        ax.set_title(f"Iteration {iteration + 1}", fontsize=8)
        ax.set_xlabel("x1", fontsize=8)
        ax.set_ylabel("x2", fontsize=8)
        ax.tick_params(labelsize=6)
        ax.legend(fontsize=6)

        # Update step: move each center to the mean of its assigned points
        centers = np.array([df[df['closest_center'] == i][['x1', 'x2']].mean() for i in range(3)])
        time.sleep(sleep_interval)

    st.pyplot(fig)

    if show_table:
        def highlight_min(s):
            # Highlight the smallest distance in each row (the assigned center)
            return ['background-color: lightgreen' if v == s.min() else '' for v in s]

        st.dataframe(df.style.apply(highlight_min, axis=1, subset=[f'dist_to_center_{i+1}' for i in range(3)]))
# Section 4: Evaluating with the Elbow Method
with st.expander("๐Ÿ“‰ How do we know if clustering worked well (Elbow Method)?"):
st.markdown("""
The **Elbow Method** helps identify the optimal number of clusters (K).
- Plot the **inertia** (sum of squared distances from points to their cluster center) for different K.
- The 'elbow' point in the curve is the ideal number of clusters.
A sharp drop followed by a plateau indicates the elbow.
This technique avoids both under- and over-clustering.
""")
    # Reuse the cached blob generator from the previous section
    X = generate_data(100, 1.5)

    inertias = []
    Ks = range(1, 10)
    for k in Ks:
        km = KMeans(n_clusters=k, n_init="auto", random_state=42)
        km.fit(X)
        inertias.append(km.inertia_)

    fig_elbow, ax_elbow = plt.subplots()
    ax_elbow.plot(Ks, inertias, marker='o')
    ax_elbow.set_title("Elbow Method for Optimal K")
    ax_elbow.set_xlabel("Number of Clusters (K)")
    ax_elbow.set_ylabel("Inertia")
    st.pyplot(fig_elbow)
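
    # --- Illustrative sketch (assumption: not part of the original demo) ---
    # One simple heuristic for locating the elbow programmatically: find where the
    # inertia curve's slope changes most sharply (largest second difference).
    drops = np.diff(inertias)                          # inertia change for each extra cluster
    elbow_k = Ks[int(np.argmax(np.diff(drops))) + 1]   # K where the curve bends the most
    st.write(f"Second-difference heuristic suggests K = {elbow_k}; confirm against the plot above.")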
# Section 5: Challenges and Alternatives
with st.expander("โš ๏ธ Challenges with K-Means & Alternatives"):
st.markdown("""
**K-Means limitations:**
- Requires choosing K manually
- Assumes clusters are spherical and equal-sized
- Sensitive to outliers and initial center placement
**Variants / Alternatives:**
- **K-Medoids**: More robust to outliers
- **DBSCAN**: Density-based, no need to specify K
- **Hierarchical Clustering**: Builds a tree of clusters
- **Gaussian Mixture Models (GMM)**: Probabilistic soft clustering
Use-case and data characteristics often guide which method to choose.
""")