# Empirical evaluation of the impact of k-means initialization 📊

import gradio as gr
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import matplotlib.cm as cm
from sklearn.utils import shuffle
from sklearn.utils import check_random_state
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import KMeans

# Colour theme applied to the Gradio Blocks app built further down.
theme = gr.themes.Monochrome(primary_hue="indigo", secondary_hue="blue", neutral_hue="slate")

# Markdown blurb passed to gr.Markdown in the UI. A plain string literal is
# used because the text contains no placeholders to interpolate.
description = """
## Description
This demo can be used to evaluate the ability of k-means initializations strategies to make the algorithm convergence robust
"""

# TODO: Make the below parameters user passable
# Shared random number generator, seeded with 0.
random_state = np.random.RandomState(seed=0)

# Values of ``n_init`` to compare: k-means models can run several random
# initializations, trading CPU time for convergence robustness.
n_init_range = np.asarray((1, 5, 10, 15, 20))

# Synthetic dataset settings: centers sit on a grid_size x grid_size grid,
# each with n_samples_per_center points and Gaussian noise of std `scale`.
grid_size = 3
n_samples_per_center = 100
scale = 0.1
n_clusters = grid_size * grid_size

def make_data(random_state, n_samples_per_center, grid_size, scale):
    """Generate a shuffled 2-D dataset of noisy points around grid centers.

    Parameters
    ----------
    random_state : seed or random generator
        Normalized through ``check_random_state`` and used both for the
        noise draw and for the final shuffle.
    n_samples_per_center : int
        Number of points generated around each center.
    grid_size : int
        Centers are the integer points ``(i, j)`` with
        ``0 <= i, j < grid_size``.
    scale : float
        Standard deviation of the Gaussian noise.

    Returns
    -------
    The shuffled samples and their center indices, as returned by
    ``sklearn.utils.shuffle``.
    """
    rng = check_random_state(random_state)

    # Integer lattice of centers; the first coordinate varies slowest.
    lattice = np.array(
        [[row, col] for row in range(grid_size) for col in range(grid_size)]
    )
    n_centers, n_dims = lattice.shape

    # One noise cloud is drawn once and reused around every center.
    jitter = rng.normal(scale=scale, size=(n_samples_per_center, n_dims))

    points = np.vstack([center + jitter for center in lattice])
    labels = np.concatenate(
        [[label] * n_samples_per_center for label in range(n_centers)]
    )
    return shuffle(points, labels, random_state=rng)

def quant_evaluation(n_runs):
    """Plot mean inertia against ``n_init`` for several k-means variants.

    For each estimator/initialization pair in ``cases``, a model is fitted
    once per run and per value of the module-level ``n_init_range``. Run
    ``r`` uses a dataset from ``make_data`` seeded with ``r`` and an
    estimator with ``random_state=r``. The mean and standard deviation of
    the inertia across runs are drawn as one error-bar curve per pair.

    Parameters
    ----------
    n_runs : int
        Number of runs (seeds ``0 .. n_runs - 1``). The value is passed
        through ``int()`` so that a float, as a UI slider may deliver,
        is accepted as well.

    Returns
    -------
    module
        ``matplotlib.pyplot``; the figure created by this call is its
        current figure.
    """
    # range() and the array shape below both require a true integer.
    n_runs = int(n_runs)

    plt.figure()
    plots = []
    legends = []

    # (estimator class, init strategy, extra constructor kwargs, line format)
    cases = [
        (KMeans, "k-means++", {}, "^-"),
        (KMeans, "random", {}, "o-"),
        (MiniBatchKMeans, "k-means++", {"max_no_improvement": 3}, "x-"),
        (MiniBatchKMeans, "random", {"max_no_improvement": 3, "init_size": 500}, "d-"),
    ]

    for factory, init, params, fmt in cases:
        label = f"{factory.__name__} with {init} init"
        print(f"Evaluation of {label}")

        # inertia[i, r]: final inertia for n_init_range[i] on run r.
        inertia = np.empty((len(n_init_range), n_runs))

        for run_id in range(n_runs):
            X, _ = make_data(run_id, n_samples_per_center, grid_size, scale)
            for i, n_init in enumerate(n_init_range):
                km = factory(
                    n_clusters=n_clusters,
                    init=init,
                    random_state=run_id,
                    n_init=n_init,
                    **params,
                ).fit(X)
                inertia[i, run_id] = km.inertia_

        p = plt.errorbar(
            n_init_range, inertia.mean(axis=1), inertia.std(axis=1), fmt=fmt
        )
        # Keep only the data line of the errorbar container for the legend.
        plots.append(p[0])
        legends.append(label)

    plt.xlabel("n_init")
    plt.ylabel("inertia")
    plt.legend(plots, legends)
    plt.title(f"Mean inertia for various k-means init across {n_runs} runs")
    return plt

def qual_evaluation():
    """Plot one cluster allocation from a single random MiniBatchKMeans init.

    A dataset is generated with ``make_data`` from the module-level
    parameters and clustered by ``MiniBatchKMeans`` with ``init="random"``
    and ``n_init=1``. Each cluster's members are drawn as dots and its
    center as a black-edged circle of the same color.

    The module-level ``random_state`` is passed to both the data generation
    and the estimator.
    NOTE(review): that generator is shared between calls, so repeated calls
    presumably yield different plots — confirm this is intended.

    Returns
    -------
    module
        ``matplotlib.pyplot``; the figure created by this call is its
        current figure.
    """
    # The true labels are not needed for this plot.
    X, _ = make_data(random_state, n_samples_per_center, grid_size, scale)
    km = MiniBatchKMeans(
        n_clusters=n_clusters, init="random", n_init=1, random_state=random_state
    ).fit(X)

    plt.figure()
    for k in range(n_clusters):
        my_members = km.labels_ == k
        color = cm.nipy_spectral(float(k) / n_clusters, 1)
        plt.plot(X[my_members, 0], X[my_members, 1], ".", c=color)
        cluster_center = km.cluster_centers_[k]
        plt.plot(
            cluster_center[0],
            cluster_center[1],
            "o",
            markerfacecolor=color,
            markeredgecolor="k",
            markersize=6,
        )

    # The title does not depend on the cluster, so set it once after the loop.
    plt.title(
        "Example cluster allocation with a single random init\nwith MiniBatchKMeans"
    )
    return plt

# Assemble the UI: heading, description, run-count slider, two action buttons,
# and a row holding the two output plots.
with gr.Blocks(theme=theme) as demo:
    gr.Markdown(
        """
            <h1 style='text-align: center'>Empirical evaluation of the impact of k-means initialization 📊</h1>
        """
    )
    gr.Markdown(description)

    n_runs = gr.Slider(
        minimum=1,
        maximum=10,
        step=1,
        value=5,
        label="Number of Evaluation Runs",
    )
    run_button = gr.Button("Evaluate")
    run_button_qual = gr.Button("Generate Cluster Allocations")

    with gr.Row():
        plot_inertia = gr.Plot()
        plot_vis = gr.Plot()

    # Each button fills its own plot: the inertia curves take the slider
    # value as input, the cluster allocation takes no input.
    run_button.click(
        fn=quant_evaluation,
        inputs=[n_runs],
        outputs=plot_inertia,
    )
    run_button_qual.click(
        fn=qual_evaluation,
        inputs=[],
        outputs=plot_vis,
    )

demo.launch()