import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

st.set_page_config(layout="wide")
st.markdown("#### Clustering in AI (unsupervised modeling)")

# Set plot style
sns.set(style="whitegrid")

# Section 1: What is clustering?
with st.expander("🔍 What is clustering, and why is it relevant in business?"):
    st.markdown("""
Clustering is an **unsupervised machine learning technique** that groups similar data points together.
It's commonly used in:

- **Customer segmentation** (e.g., marketing campaigns)
- **Anomaly detection** (e.g., fraud or system failures)
- **Document categorization**

Clustering helps discover **patterns** without labeled data, making it extremely useful in business
scenarios where manual labeling is costly or infeasible.
""")

    # --- 1. Customer Segmentation ---
    st.markdown("###### 📊 1. Customer Segmentation")
    st.write("Imagine customers represented by their **age** and **spending score**. Clustering reveals distinct customer groups.")

    X_seg, _ = make_blobs(n_samples=300, centers=4, cluster_std=1.0, random_state=42)
    kmeans_seg = KMeans(n_clusters=4, n_init="auto", random_state=42).fit(X_seg)
    labels_seg = kmeans_seg.labels_

    fig1, ax1 = plt.subplots(figsize=(9, 4))
    ax1.scatter(X_seg[:, 0], X_seg[:, 1], c=labels_seg, cmap='Accent')
    ax1.set_xlabel("Age")
    ax1.set_ylabel("Spending Score")
    ax1.set_title("Customer Segmentation Clusters")
    st.pyplot(fig1)

    st.markdown("""
**Interpretation:** Each cluster corresponds to a distinct customer segment, such as:

- High spenders vs budget-conscious
- Young vs older demographics

This allows targeted marketing and better personalization.
""")

    # --- 2. Anomaly Detection ---
    st.markdown("###### 🚨 2. Anomaly Detection")
    st.write("Let's simulate normal system activity with a few injected anomalies.")

    X_anom, _ = make_blobs(n_samples=290, centers=1, cluster_std=1.0, random_state=42)
    anomalies = np.random.uniform(low=-6, high=6, size=(10, 2))
    X_anom_combined = np.vstack([X_anom, anomalies])

    kmeans_anom = KMeans(n_clusters=1, n_init="auto", random_state=42).fit(X_anom_combined)
    distances = np.linalg.norm(X_anom_combined - kmeans_anom.cluster_centers_[0], axis=1)
    threshold = np.percentile(distances, 95)
    outliers = distances > threshold

    fig2, ax2 = plt.subplots(figsize=(9, 4))
    ax2.scatter(X_anom_combined[~outliers, 0], X_anom_combined[~outliers, 1], label="Normal", alpha=0.6)
    ax2.scatter(X_anom_combined[outliers, 0], X_anom_combined[outliers, 1], color='red', label="Anomaly")
    ax2.set_title("Anomaly Detection using Clustering")
    ax2.legend()
    st.pyplot(fig2)

    st.markdown("""
**Interpretation:** Data points that are **far from the cluster center** are flagged as anomalies.
Useful for:

- Fraud detection
- Network intrusion
- Fault detection in systems
""")

    # --- 3. Document Categorization ---
    st.markdown("###### 📚 3. Document Categorization")
    st.write("Assume each document is reduced to 2D space using techniques like TF-IDF + PCA.")

    X_docs, _ = make_blobs(n_samples=300, centers=3, cluster_std=1.2, random_state=7)
    kmeans_docs = KMeans(n_clusters=3, n_init="auto", random_state=7).fit(X_docs)

    fig3, ax3 = plt.subplots(figsize=(9, 4))
    ax3.scatter(X_docs[:, 0], X_docs[:, 1], c=kmeans_docs.labels_, cmap='Set2')
    ax3.set_title("Clustering Documents into Categories")
    ax3.set_xlabel("Topic Vector 1")
    ax3.set_ylabel("Topic Vector 2")
    st.pyplot(fig3)

    st.markdown("""
**Interpretation:** Clustering helps group similar documents or articles (e.g., tech, sports, health)
without prior labels. It's used in:

- News aggregation
- Content recommendation
- Automated document organization
""")
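    # --- Optional: from raw text to the 2D space used above ---
    # The demo above uses make_blobs as a stand-in for document vectors. This is a
    # minimal sketch of how real text could reach that 2D space: TF-IDF vectorization
    # followed by PCA, then K-Means. The tiny corpus below is made up purely for
    # illustration and is not part of the original example.
    from sklearn.decomposition import PCA
    from sklearn.feature_extraction.text import TfidfVectorizer

    sample_docs = [
        "The team won the championship game last night",
        "New smartphone features a faster processor",
        "Doctors recommend regular exercise and a balanced diet",
        "The striker scored twice in the second half",
        "Chip makers announce next-generation GPUs",
        "A healthy breakfast improves concentration",
    ]
    tfidf = TfidfVectorizer(stop_words="english").fit_transform(sample_docs)
    docs_2d = PCA(n_components=2, random_state=7).fit_transform(tfidf.toarray())
    doc_clusters = KMeans(n_clusters=3, n_init="auto", random_state=7).fit_predict(docs_2d)
    st.dataframe(pd.DataFrame({"Document": sample_docs, "Cluster": doc_clusters}))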
Document Categorization") st.write("Assume each document is reduced to 2D space using techniques like TF-IDF + PCA.") X_docs, _ = make_blobs(n_samples=300, centers=3, cluster_std=1.2, random_state=7) kmeans_docs = KMeans(n_clusters=3, random_state=7).fit(X_docs) fig3, ax3 = plt.subplots(figsize=(9,4)) ax3.scatter(X_docs[:, 0], X_docs[:, 1], c=kmeans_docs.labels_, cmap='Set2') ax3.set_title("Clustering Documents into Categories") ax3.set_xlabel("Topic Vector 1") ax3.set_ylabel("Topic Vector 2") st.pyplot(fig3) st.markdown(""" **Interpretation:** Clustering helps group similar documents or articles (e.g., tech, sports, health) without prior labels. It's used in: - News aggregation - Content recommendation - Automated document organization """) # Section 2: Key characteristics with st.expander("🧠 Key characteristics of clustering (Human-in-the-loop)"): st.markdown(""" - No predefined labels — clustering is exploratory. - Requires defining **number of clusters (K)** manually in many algorithms like K-Means. - Human input is essential for: - **Interpreting cluster meanings** - **Validating business relevance** - **Tuning parameters like K or distance metrics** This is where **"human-in-the-loop"** comes in — domain experts make sense of the clusters produced. """) # --- 1. Standard Numeric Dataset --- st.markdown("###### 🧮 1. Standard Numeric Dataset (e.g., Customer Features)") import pandas as pd import numpy as np df_numeric = pd.DataFrame({ "Age": np.random.randint(18, 65, size=5), "Annual Income ($)": np.random.randint(20000, 100000, size=5), "Spending Score": np.random.randint(1, 100, size=5), "Cluster_Label": ["" for _ in range(5)] }) st.dataframe(df_numeric) # --- 2. Text Dataset --- st.markdown("###### ✍️ 2. Text Dataset (e.g., Customer Reviews)") df_text = pd.DataFrame({ "Review_Text": [ "Great product, loved the quality!", "Terrible support. Never buying again.", "Okay-ish experience. Could be better.", "Fast delivery and nice packaging.", "Didn't meet my expectations." ], "Cluster_Label": ["" for _ in range(5)] }) st.dataframe(df_text) # --- 3. Image Dataset --- st.markdown("###### 🖼️ 3. Image Dataset (e.g., Pixel Vectors)") df_image = pd.DataFrame(np.random.randint(0, 256, size=(5, 10)), columns=[f"Pixel_{i}" for i in range(10)]) df_image["Cluster_Label"] = "" st.dataframe(df_image) st.markdown(""" **Notice:** There are **no predefined labels** (`Cluster_Label` is empty). Clustering algorithms group the rows based on internal patterns, and **humans interpret what those groupings mean**. 
""") # Section 3: Custom K-Means visualization with st.expander("📊 Visualizing K-Means Clustering (Custom Implementation)"): st.markdown("K-Means Clustering Demonstration (Custom Implementation)") # Sidebar parameters num_points = st.sidebar.slider("Number of points per cluster", 10, 100, 50) cluster_sep = st.sidebar.slider("Cluster separation", 0.5, 5.0, 2.0) sleep_interval = st.sidebar.slider("Sleep interval (seconds)", 0.1, 2.0, 0.5) show_table = st.sidebar.checkbox("Show cluster table") # Generate synthetic data @st.cache_data def generate_data(num_points, cluster_sep): points, _ = make_blobs(n_samples=num_points*3, centers=3, cluster_std=cluster_sep, n_features=2, random_state=42) return points points = generate_data(num_points, cluster_sep) # Random centers np.random.seed(42) centers = np.column_stack(( np.random.uniform(-10, 10, 3), np.random.uniform(-10, 5, 3) )) def calculate_distances(points, centers): return np.linalg.norm(points[:, np.newaxis] - centers, axis=2) fig, axes = plt.subplots(4, 3, figsize=(12, 16)) num_iterations = 12 for iteration in range(num_iterations): distances = calculate_distances(points, centers) closest = np.argmin(distances, axis=1) df = pd.DataFrame(points, columns=['x1', 'x2']) for i in range(3): df[f'dist_to_center_{i+1}'] = distances[:, i] df['closest_center'] = closest row, col = divmod(iteration, 3) ax = axes[row, col] colors = ['red', 'green', 'blue'] for i in range(3): cluster = df[df['closest_center'] == i] ax.scatter(cluster['x1'], cluster['x2'], color=colors[i], s=5, label=f'Cluster {i+1}') ax.scatter(centers[i][0], centers[i][1], color='black', marker='x', s=50, linewidths=2) ax.set_title(f"Iteration {iteration + 1}", fontsize=8) ax.set_xlabel("x1", fontsize=8) ax.set_ylabel("x2", fontsize=8) ax.tick_params(labelsize=6) ax.legend(fontsize=6) # Update centers centers = np.array([df[df['closest_center'] == i][['x1', 'x2']].mean() for i in range(3)]) time.sleep(sleep_interval) st.pyplot(fig) if show_table: def highlight_min(s): return ['background-color: lightgreen' if v == s.min() else '' for v in s] st.dataframe(df.style.apply(highlight_min, subset=[f'dist_to_center_{i+1}' for i in range(3)])) # Section 4: Evaluating with the Elbow Method with st.expander("📉 How do we know if clustering worked well (Elbow Method)?"): st.markdown(""" The **Elbow Method** helps identify the optimal number of clusters (K). - Plot the **inertia** (sum of squared distances from points to their cluster center) for different K. - The 'elbow' point in the curve is the ideal number of clusters. A sharp drop followed by a plateau indicates the elbow. This technique avoids both under- and over-clustering. 
""") from sklearn.cluster import KMeans X = generate_data(100, 1.5) inertias = [] Ks = range(1, 10) for k in Ks: km = KMeans(n_clusters=k, n_init="auto", random_state=42) km.fit(X) inertias.append(km.inertia_) fig2, ax2 = plt.subplots() ax2.plot(Ks, inertias, marker='o') ax2.set_title("Elbow Method for Optimal K") ax2.set_xlabel("Number of Clusters (K)") ax2.set_ylabel("Inertia") st.pyplot(fig2) # Section 5: Challenges and Alternatives with st.expander("⚠️ Challenges with K-Means & Alternatives"): st.markdown(""" **K-Means limitations:** - Requires choosing K manually - Assumes clusters are spherical and equal-sized - Sensitive to outliers and initial center placement **Variants / Alternatives:** - **K-Medoids**: More robust to outliers - **DBSCAN**: Density-based, no need to specify K - **Hierarchical Clustering**: Builds a tree of clusters - **Gaussian Mixture Models (GMM)**: Probabilistic soft clustering Use-case and data characteristics often guide which method to choose. """)