Benjamin Bossan committed
Commit · 7c9b8f5 · Parent(s): 0415b11
Change layout to show all models at once
Using Blocks and grid to show the predictions of all models at once to
make it easier to compare them.
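
For context, the layout change below leans on Gradio's Blocks API: `gr.Row` and `gr.Column` are nested to lay the model plots out as a grid, one cell per model. Here is a minimal sketch of that grid pattern, detached from the commit (the `items` list and the empty placeholder `gr.Plot` cells are illustrative, not the committed code):

import math

import gradio as gr

N_COLS = 3
items = ["KMeans", "DBSCAN", "MeanShift", "OPTICS", "Birch"]  # one grid cell per model

with gr.Blocks() as demo:
    counter = 0
    # lay the cells out row by row, N_COLS columns per row
    for _ in range(math.ceil(len(items) / N_COLS)):
        with gr.Row():
            for _ in range(N_COLS):
                with gr.Column():
                    if counter >= len(items):
                        break
                    gr.Plot(label=items[counter])  # empty placeholder plot
                    counter += 1

demo.launch()

The commit uses the same counter-based loop; when the number of models is not a multiple of N_COLS, the trailing columns of the last row simply stay empty.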
app.py CHANGED

@@ -4,6 +4,9 @@ Derived from https://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_
 
 """
 
+import math
+from functools import partial
+
 import gradio as gr
 import matplotlib.pyplot as plt
 import numpy as np
@@ -22,6 +25,12 @@ plt.style.use('seaborn')
 SEED = 0
 MAX_CLUSTERS = 10
 N_SAMPLES = 1000
+N_COLS = 3
+FIGSIZE = 7, 7  # does not affect size in webpage
+COLORS = [
+    'blue', 'orange', 'green', 'red', 'purple', 'brown', 'pink', 'gray', 'olive', 'cyan'
+]
+assert len(COLORS) >= MAX_CLUSTERS, "Not enough different colors for all clusters"
 np.random.seed(SEED)
 
 
@@ -88,19 +97,29 @@ DATA_MAPPING = {
     'varied': get_varied,
 }
 
-def get_kmeans(X, n_clusters, **kwargs):
+
+def get_groundtruth_model(X, labels, n_clusters, **kwargs):
+    # dummy model to show true label distribution
+    class Dummy:
+        def __init__(self, y):
+            self.labels_ = labels
+
+    return Dummy(labels)
+
+
+def get_kmeans(X, labels, n_clusters, **kwargs):
     model = KMeans(init="k-means++", n_clusters=n_clusters, n_init=10, random_state=SEED)
     model.set_params(**kwargs)
     return model.fit(X)
 
 
-def get_dbscan(X, n_clusters, **kwargs):
+def get_dbscan(X, labels, n_clusters, **kwargs):
     model = DBSCAN(eps=0.3)
     model.set_params(**kwargs)
     return model.fit(X)
 
 
-def get_agglomerative(X, n_clusters, **kwargs):
+def get_agglomerative(X, labels, n_clusters, **kwargs):
     connectivity = kneighbors_graph(
         X, n_neighbors=n_clusters, include_self=False
     )
@@ -113,14 +132,14 @@ def get_agglomerative(X, n_clusters, **kwargs):
     return model.fit(X)
 
 
-def get_meanshift(X, n_clusters, **kwargs):
-    bandwidth = estimate_bandwidth(X, quantile=0.
+def get_meanshift(X, labels, n_clusters, **kwargs):
+    bandwidth = estimate_bandwidth(X, quantile=0.25)
     model = MeanShift(bandwidth=bandwidth, bin_seeding=True)
     model.set_params(**kwargs)
     return model.fit(X)
 
 
-def get_spectral(X, n_clusters, **kwargs):
+def get_spectral(X, labels, n_clusters, **kwargs):
     model = SpectralClustering(
         n_clusters=n_clusters,
         eigen_solver="arpack",
@@ -130,7 +149,7 @@ def get_spectral(X, n_clusters, **kwargs):
     return model.fit(X)
 
 
-def get_optics(X, n_clusters, **kwargs):
+def get_optics(X, labels, n_clusters, **kwargs):
     model = OPTICS(
         min_samples=7,
         xi=0.05,
@@ -140,13 +159,13 @@ def get_optics(X, n_clusters, **kwargs):
     return model.fit(X)
 
 
-def get_birch(X, n_clusters, **kwargs):
+def get_birch(X, labels, n_clusters, **kwargs):
     model = Birch(n_clusters=n_clusters)
     model.set_params(**kwargs)
     return model.fit(X)
 
 
-def get_gaussianmixture(X, n_clusters, **kwargs):
+def get_gaussianmixture(X, labels, n_clusters, **kwargs):
     model = GaussianMixture(
         n_components=n_clusters, covariance_type="full", random_state=SEED,
     )
@@ -155,25 +174,26 @@ def get_gaussianmixture(X, n_clusters, **kwargs):
 
 
 MODEL_MAPPING = {
+    'True labels': get_groundtruth_model,
     'KMeans': get_kmeans,
     'DBSCAN': get_dbscan,
-    'AgglomerativeClustering': get_agglomerative,
     'MeanShift': get_meanshift,
     'SpectralClustering': get_spectral,
     'OPTICS': get_optics,
     'Birch': get_birch,
     'GaussianMixture': get_gaussianmixture,
+    'AgglomerativeClustering': get_agglomerative,
 }
 
 
 def plot_clusters(ax, X, labels):
     set_clusters = set(labels)
     set_clusters.discard(-1)  # -1 signifies outliers, which we plot separately
-    for label in sorted(set_clusters):
+    for label, color in zip(sorted(set_clusters), COLORS):
         idx = labels == label
         if not sum(idx):
             continue
-        ax.scatter(X[idx, 0], X[idx, 1])
+        ax.scatter(X[idx, 0], X[idx, 1], color=color)
 
     # show outliers (if any)
     idx = labels == -1
@@ -186,26 +206,23 @@ def plot_clusters(ax, X, labels):
     return ax
 
 
-def cluster(clustering_algorithm: str, dataset: str, n_clusters: int):
-
+def cluster(dataset: str, n_clusters: int, clustering_algorithm: str):
+    if isinstance(n_clusters, dict):
+        n_clusters = n_clusters['value']
+    else:
+        n_clusters = int(n_clusters)
+
     X, labels = DATA_MAPPING[dataset](n_clusters)
-    model = MODEL_MAPPING[clustering_algorithm](X, n_clusters=n_clusters)
+    model = MODEL_MAPPING[clustering_algorithm](X, labels, n_clusters=n_clusters)
     if hasattr(model, "labels_"):
         y_pred = model.labels_.astype(int)
     else:
         y_pred = model.predict(X)
 
-    fig,
-
-    # show true labels in first panel
-    ax = axes[0]
-    plot_clusters(ax, X, labels)
-    ax.set_title("True clusters")
+    fig, ax = plt.subplots(figsize=FIGSIZE)
 
-    # show learned clusters in second panel
-    ax = axes[1]
     plot_clusters(ax, X, y_pred)
-    ax.set_title(clustering_algorithm)
+    ax.set_title(clustering_algorithm, fontsize=16)
 
     return fig
 
@@ -213,31 +230,42 @@ def cluster(clustering_algorithm: str, dataset: str, n_clusters: int):
 title = "Clustering with Scikit-learn"
 description = (
     "This example shows how different clustering algorithms work. Simply pick "
-    "the
-)
-demo = gr.Interface(
-    fn=cluster,
-    inputs=[
-        gr.Radio(
-            list(MODEL_MAPPING),
-            value="KMeans",
-            label="clustering algorithm"
-        ),
-        gr.Radio(
-            list(DATA_MAPPING),
-            value="regular",
-            label="dataset"
-        ),
-        gr.Slider(
-            minimum=1,
-            maximum=MAX_CLUSTERS,
-            value=4,
-            step=1,
-        )
-    ],
-    title=title,
-    description=description,
-    outputs=gr.Plot(),
+    "the dataset and the number of clusters to see how the clustering algorithms work."
 )
 
+with gr.Blocks(title=title) as demo:
+    gr.HTML(f"<b>{title}</b>")
+    gr.Markdown(description)
+
+    input_models = list(MODEL_MAPPING)
+    input_data = gr.Radio(
+        list(DATA_MAPPING),
+        value="regular",
+        label="dataset"
+    )
+    input_n_clusters = gr.Slider(
+        minimum=1,
+        maximum=MAX_CLUSTERS,
+        value=4,
+        step=1,
+        label='Number of clusters'
+    )
+    n_rows = int(math.ceil(len(input_models) / N_COLS))
+    counter = 0
+    # code below is not very elegant, maybe there is a better way?
+    for i in range(n_rows):
+        with gr.Row():
+            for j in range(N_COLS):
+                with gr.Column():
+                    if counter >= len(input_models):
+                        break
+
+                    input_model = input_models[counter]
+                    plot = gr.Plot(label=input_model)
+                    fn = partial(cluster, clustering_algorithm=input_model)
+                    input_data.change(fn=fn, inputs=[input_data, input_n_clusters], outputs=plot)
+                    input_n_clusters.change(fn=fn, inputs=[input_data, input_n_clusters], outputs=plot)
+                    counter += 1
+
+
 demo.launch()
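
Two details of the new wiring are worth spelling out. First, the 'True labels' panel reuses the same code path as the real models because get_groundtruth_model returns an object that merely exposes a labels_ attribute, which is all cluster() inspects. Second, a Gradio event handler only receives the components listed in inputs, so each panel's algorithm is baked into its callback with functools.partial. A stripped-down sketch of that wiring, with a hypothetical render function standing in for cluster():

from functools import partial

import gradio as gr

def render(dataset, n_clusters, algorithm):
    # hypothetical stand-in for cluster(); returns text instead of a matplotlib figure
    return f"{algorithm} on {dataset!r} with {int(n_clusters)} clusters"

with gr.Blocks() as demo:
    data = gr.Radio(["regular", "circles", "moons"], value="regular", label="dataset")
    k = gr.Slider(minimum=1, maximum=10, value=4, step=1, label="Number of clusters")
    for algorithm in ["KMeans", "DBSCAN"]:
        out = gr.Textbox(label=algorithm)
        fn = partial(render, algorithm=algorithm)  # bake this panel's algorithm into its callback
        # every panel listens to the same two inputs, so one change re-renders the whole grid
        data.change(fn=fn, inputs=[data, k], outputs=out)
        k.change(fn=fn, inputs=[data, k], outputs=out)

demo.launch()

Compared with the old gr.Interface version, where a single callback drove a single plot and the algorithm was itself an input, this gives each model its own plot that updates on the same events, which is what makes the side-by-side comparison possible.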