import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
st.set_page_config(layout="wide")
st.markdown("#### Clustering in AI - (unsupervised modeling)")

# Section 1: What is clustering?
with st.expander("📌 What is clustering, and why is it relevant in business?"):
    st.markdown("""
Clustering is an **unsupervised machine learning technique** that groups similar data points together.
It's commonly used in:
- **Customer segmentation** (e.g., marketing campaigns)
- **Anomaly detection** (e.g., fraud or system failures)
- **Document categorization**

Clustering helps discover **patterns** without labeled data, making it extremely useful in business scenarios where manual labeling is costly or infeasible.
""")
# Set plot style
sns.set_theme(style="whitegrid")
# --- 1. Customer Segmentation ---
st.markdown("###### 📊 1. Customer Segmentation")
st.write("Imagine customers represented by their **age** and **spending score**. Clustering reveals distinct customer groups.")
X_seg, _ = make_blobs(n_samples=300, centers=4, cluster_std=1.0, random_state=42)
kmeans_seg = KMeans(n_clusters=4, n_init="auto", random_state=42).fit(X_seg)
labels_seg = kmeans_seg.labels_
fig1, ax1 = plt.subplots(figsize=(9, 4))
ax1.scatter(X_seg[:, 0], X_seg[:, 1], c=labels_seg, cmap='Accent')
ax1.set_xlabel("Age")
ax1.set_ylabel("Spending Score")
ax1.set_title("Customer Segmentation Clusters")
st.pyplot(fig1)
st.markdown("""
**Interpretation:**
Each cluster corresponds to a distinct customer segment, such as:
- High spenders vs. budget-conscious shoppers
- Younger vs. older demographics

This enables targeted marketing and better personalization.
""")
# --- 2. Anomaly Detection ---
st.markdown("###### 🚨 2. Anomaly Detection")
st.write("Let's simulate normal system activity with a few injected anomalies.")
X_anom, _ = make_blobs(n_samples=290, centers=1, cluster_std=1.0, random_state=42)
rng = np.random.default_rng(42)  # seeded so the injected anomalies are reproducible
anomalies = rng.uniform(low=-6, high=6, size=(10, 2))
X_anom_combined = np.vstack([X_anom, anomalies])
kmeans_anom = KMeans(n_clusters=1, n_init="auto", random_state=42).fit(X_anom_combined)
# Distance of every point from the single cluster center
distances = np.linalg.norm(X_anom_combined - kmeans_anom.cluster_centers_[0], axis=1)
threshold = np.percentile(distances, 95)  # flag the farthest 5% of points
outliers = distances > threshold
fig2, ax2 = plt.subplots(figsize=(9, 4))
ax2.scatter(X_anom_combined[~outliers, 0], X_anom_combined[~outliers, 1], label="Normal", alpha=0.6)
ax2.scatter(X_anom_combined[outliers, 0], X_anom_combined[outliers, 1], color='red', label="Anomaly")
ax2.set_title("Anomaly Detection using Clustering")
ax2.legend()
st.pyplot(fig2)
st.markdown("""
**Interpretation:**
Data points that lie **far from the cluster center** are flagged as anomalies.
This is useful for:
- Fraud detection
- Network intrusion detection
- Fault detection in systems
""")
# --- 3. Document Categorization ---
st.markdown("###### 📄 3. Document Categorization")
st.write("Assume each document has been reduced to a 2D space using techniques like TF-IDF + PCA.")
X_docs, _ = make_blobs(n_samples=300, centers=3, cluster_std=1.2, random_state=7)
kmeans_docs = KMeans(n_clusters=3, n_init="auto", random_state=7).fit(X_docs)
fig3, ax3 = plt.subplots(figsize=(9, 4))
ax3.scatter(X_docs[:, 0], X_docs[:, 1], c=kmeans_docs.labels_, cmap='Set2')
ax3.set_title("Clustering Documents into Categories")
ax3.set_xlabel("Topic Vector 1")
ax3.set_ylabel("Topic Vector 2")
st.pyplot(fig3)
st.markdown("""
**Interpretation:**
Clustering helps group similar documents or articles (e.g., tech, sports, health) without prior labels.
It's used in:
- News aggregation
- Content recommendation
- Automated document organization
""")
# Section 2: Key characteristics
with st.expander("🧠 Key characteristics of clustering (Human-in-the-loop)"):
    st.markdown("""
- No predefined labels; clustering is exploratory.
- Many algorithms, such as K-Means, require choosing the **number of clusters (K)** manually.
- Human input is essential for:
  - **Interpreting cluster meanings**
  - **Validating business relevance**
  - **Tuning parameters like K or distance metrics**

This is where **"human-in-the-loop"** comes in: domain experts make sense of the clusters produced.
""")
# --- 1. Standard Numeric Dataset ---
st.markdown("###### 🧮 1. Standard Numeric Dataset (e.g., Customer Features)")
df_numeric = pd.DataFrame({
    "Age": np.random.randint(18, 65, size=5),
    "Annual Income ($)": np.random.randint(20000, 100000, size=5),
    "Spending Score": np.random.randint(1, 100, size=5),
    "Cluster_Label": ["" for _ in range(5)]
})
st.dataframe(df_numeric)
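# A minimal human-in-the-loop sketch (not from the original app): cluster the numeric
# table above, then let a hypothetical domain expert attach business-meaningful names.
km_demo = KMeans(n_clusters=2, n_init="auto", random_state=42)
df_labeled = df_numeric.drop(columns="Cluster_Label")
cluster_ids = km_demo.fit_predict(df_labeled)
expert_names = {0: "Value-focused", 1: "Premium"}  # hypothetical expert interpretation
df_labeled["Cluster_Label"] = [expert_names[c] for c in cluster_ids]
st.dataframe(df_labeled)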
# --- 2. Text Dataset ---
st.markdown("###### ✍️ 2. Text Dataset (e.g., Customer Reviews)")
df_text = pd.DataFrame({
    "Review_Text": [
        "Great product, loved the quality!",
        "Terrible support. Never buying again.",
        "Okay-ish experience. Could be better.",
        "Fast delivery and nice packaging.",
        "Didn't meet my expectations."
    ],
    "Cluster_Label": ["" for _ in range(5)]
})
st.dataframe(df_text)
# --- 3. Image Dataset ---
st.markdown("###### 🖼️ 3. Image Dataset (e.g., Pixel Vectors)")
df_image = pd.DataFrame(np.random.randint(0, 256, size=(5, 10)), columns=[f"Pixel_{i}" for i in range(10)])
df_image["Cluster_Label"] = ""
st.dataframe(df_image)
st.markdown("""
**Notice:**
There are **no predefined labels** (`Cluster_Label` is empty).
Clustering algorithms group the rows based on internal patterns, and **humans interpret what those groupings mean** (as sketched under the first table above).
""")
# Section 3: Custom K-Means visualization
with st.expander("📈 Visualizing K-Means Clustering (Custom Implementation)"):
    st.markdown("K-Means Clustering Demonstration (Custom Implementation)")

# Sidebar parameters
num_points = st.sidebar.slider("Number of points per cluster", 10, 100, 50)
cluster_sep = st.sidebar.slider("Cluster separation", 0.5, 5.0, 2.0)
sleep_interval = st.sidebar.slider("Sleep interval (seconds)", 0.1, 2.0, 0.5)
show_table = st.sidebar.checkbox("Show cluster table")

# Generate synthetic data
def generate_data(num_points, cluster_sep):
    points, _ = make_blobs(n_samples=num_points * 3, centers=3, cluster_std=cluster_sep, n_features=2, random_state=42)
    return points
points = generate_data(num_points, cluster_sep)

# Random initial centers
np.random.seed(42)
centers = np.column_stack((
    np.random.uniform(-10, 10, 3),
    np.random.uniform(-10, 5, 3)
))

def calculate_distances(points, centers):
    # Pairwise Euclidean distances: (n, 1, 2) minus (3, 2) broadcasts to (n, 3)
    return np.linalg.norm(points[:, np.newaxis] - centers, axis=2)
fig, axes = plt.subplots(4, 3, figsize=(12, 16))
num_iterations = 12
for iteration in range(num_iterations):
    # Assignment step: attach each point to its nearest center
    distances = calculate_distances(points, centers)
    closest = np.argmin(distances, axis=1)
    df = pd.DataFrame(points, columns=['x1', 'x2'])
    for i in range(3):
        df[f'dist_to_center_{i+1}'] = distances[:, i]
    df['closest_center'] = closest
    row, col = divmod(iteration, 3)
    ax = axes[row, col]
    colors = ['red', 'green', 'blue']
    for i in range(3):
        cluster = df[df['closest_center'] == i]
        ax.scatter(cluster['x1'], cluster['x2'], color=colors[i], s=5, label=f'Cluster {i+1}')
        ax.scatter(centers[i][0], centers[i][1], color='black', marker='x', s=50, linewidths=2)
    ax.set_title(f"Iteration {iteration + 1}", fontsize=8)
    ax.set_xlabel("x1", fontsize=8)
    ax.set_ylabel("x2", fontsize=8)
    ax.tick_params(labelsize=6)
    ax.legend(fontsize=6)
    # Update step: move each center to the mean of its assigned points,
    # keeping the old center if a cluster ends up empty (avoids NaN centers)
    new_centers = []
    for i in range(3):
        members = df[df['closest_center'] == i][['x1', 'x2']]
        new_centers.append(members.mean().to_numpy() if len(members) else centers[i])
    centers = np.array(new_centers)
    time.sleep(sleep_interval)
st.pyplot(fig)
if show_table:
    def highlight_min(s):
        # Highlight the smallest of the three distances in each row (the chosen center)
        return ['background-color: lightgreen' if v == s.min() else '' for v in s]
    st.dataframe(df.style.apply(highlight_min, subset=[f'dist_to_center_{i+1}' for i in range(3)], axis=1))
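# For comparison (a sketch, not part of the original demo): scikit-learn's KMeans runs
# the same assign/update loop internally, with smarter k-means++ initialization.
km_ref = KMeans(n_clusters=3, n_init="auto", random_state=42).fit(points)
st.caption(f"scikit-learn KMeans inertia on the same data: {km_ref.inertia_:.1f}")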
# Section 4: Evaluating with the Elbow Method
with st.expander("📉 How do we know if clustering worked well (Elbow Method)?"):
    st.markdown("""
The **Elbow Method** helps identify a good number of clusters (K):
- Plot the **inertia** (sum of squared distances from points to their cluster center) for different values of K.
- The 'elbow' of the curve, where a sharp drop gives way to a plateau, is the ideal number of clusters.

This technique avoids both under- and over-clustering.
""")
X = generate_data(100, 1.5)
inertias = []
Ks = range(1, 10)
for k in Ks:
    km = KMeans(n_clusters=k, n_init="auto", random_state=42)
    km.fit(X)
    inertias.append(km.inertia_)
fig2, ax2 = plt.subplots()
ax2.plot(Ks, inertias, marker='o')
ax2.set_title("Elbow Method for Optimal K")
ax2.set_xlabel("Number of Clusters (K)")
ax2.set_ylabel("Inertia")
st.pyplot(fig2)
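# A rough heuristic sketch (an assumption, not from the original app): estimate the
# elbow as the K where the drop in inertia slows most abruptly (largest second difference).
second_diff = np.diff(inertias, n=2)
elbow_k = Ks[int(np.argmax(second_diff)) + 1]
st.caption(f"Heuristic elbow estimate: K = {elbow_k}")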
# Section 5: Challenges and Alternatives
with st.expander("⚠️ Challenges with K-Means & Alternatives"):
    st.markdown("""
**K-Means limitations:**
- Requires choosing K manually
- Assumes clusters are spherical and similar in size
- Sensitive to outliers and initial center placement

**Variants / Alternatives:**
- **K-Medoids**: more robust to outliers
- **DBSCAN**: density-based, no need to specify K (see the sketch below)
- **Hierarchical Clustering**: builds a tree of clusters
- **Gaussian Mixture Models (GMM)**: probabilistic, soft cluster assignments

The use case and the characteristics of the data usually guide which method to choose.
""")