"""Gradio demo: empirical evaluation of the impact of k-means initialization."""

import gradio as gr
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.figure import Figure
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.model_selection import train_test_split
from sklearn.utils import check_random_state
from sklearn.utils import shuffle

theme = gr.themes.Monochrome(
    primary_hue="indigo",
    secondary_hue="blue",
    neutral_hue="slate",
)

description = """
## Description
This demo can be used to evaluate the ability of k-means initializations strategies to make the algorithm convergence robust
"""

# TODO: Make the below parameters user passable
random_state = np.random.RandomState(0)

# k-means models can do several random inits so as to be able to trade
# CPU time for convergence robustness
n_init_range = np.array([1, 5, 10, 15, 20])

# Datasets generation parameters
n_samples_per_center = 100
grid_size = 3
scale = 0.1
n_clusters = grid_size**2


def make_data(random_state, n_samples_per_center, grid_size, scale):
    """Generate shuffled 2-D blobs centred on a ``grid_size`` x ``grid_size`` grid.

    Parameters
    ----------
    random_state : int, RandomState instance or None
        Seed or generator, normalised through ``check_random_state``.
    n_samples_per_center : int
        Number of points generated around every grid node.
    grid_size : int
        Side length of the square grid of integer-valued centers.
    scale : float
        Standard deviation of the Gaussian noise added to each center.

    Returns
    -------
    X, y
        Points and the index of the center each point was generated from,
        shuffled consistently with the same random state.
    """
    random_state = check_random_state(random_state)
    centers = np.array([[i, j] for i in range(grid_size) for j in range(grid_size)])
    n_clusters_true = centers.shape[0]

    # A single noise sample is drawn and reused for every center, so all
    # blobs share the same shape and differ only by translation.
    noise = random_state.normal(
        scale=scale, size=(n_samples_per_center, centers.shape[1])
    )

    X = np.concatenate([c + noise for c in centers])
    y = np.concatenate([[i] * n_samples_per_center for i in range(n_clusters_true)])
    return shuffle(X, y, random_state=random_state)


def quant_evaluation(n_runs):
    """Plot mean final inertia against ``n_init`` for several k-means variants.

    For every (estimator, init strategy) pair, ``n_runs`` datasets are
    generated (seeded by the run index) and a model is fitted for each value
    in ``n_init_range``. The mean and standard deviation of the inertia over
    the runs are drawn as error bars.

    Parameters
    ----------
    n_runs : int
        Number of independent runs to average over. Coerced to ``int`` so a
        float-valued UI input does not break ``range``/``np.empty``.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding the error-bar plot.
    """
    n_runs = int(n_runs)

    # Build the figure without pyplot: pyplot keeps a global reference to
    # every figure it creates, which leaks memory in a long-running server
    # and is not safe when several requests draw at the same time.
    fig = Figure()
    ax = fig.subplots()
    plots = []
    legends = []

    cases = [
        (KMeans, "k-means++", {}, "^-"),
        (KMeans, "random", {}, "o-"),
        (MiniBatchKMeans, "k-means++", {"max_no_improvement": 3}, "x-"),
        (MiniBatchKMeans, "random", {"max_no_improvement": 3, "init_size": 500}, "d-"),
    ]

    for factory, init, params, fmt in cases:
        print(f"Evaluation of {factory.__name__} with {init} init")
        inertia = np.empty((len(n_init_range), n_runs))

        for run_id in range(n_runs):
            X, _ = make_data(run_id, n_samples_per_center, grid_size, scale)
            for i, n_init in enumerate(n_init_range):
                km = factory(
                    n_clusters=n_clusters,
                    init=init,
                    random_state=run_id,
                    n_init=n_init,
                    **params,
                ).fit(X)
                inertia[i, run_id] = km.inertia_

        p = ax.errorbar(
            n_init_range, inertia.mean(axis=1), inertia.std(axis=1), fmt=fmt
        )
        # The first element of the errorbar container is the data line,
        # which is what the legend should reference.
        plots.append(p[0])
        legends.append(f"{factory.__name__} with {init} init")

    ax.set_xlabel("n_init")
    ax.set_ylabel("inertia")
    ax.legend(plots, legends)
    ax.set_title(f"Mean inertia for various k-means init across {n_runs} runs")
    return fig


def qual_evaluation():
    """Plot the clusters found by one ``MiniBatchKMeans`` fit with random init.

    Uses the module-level ``random_state`` both to generate the data and to
    seed the estimator.
    NOTE(review): because that generator is shared and stateful, successive
    calls presumably yield different allocations - confirm this is intended.

    Returns
    -------
    matplotlib.figure.Figure
        Scatter plot of the samples coloured by predicted cluster, with the
        cluster centers overlaid.
    """
    X, _ = make_data(random_state, n_samples_per_center, grid_size, scale)
    km = MiniBatchKMeans(
        n_clusters=n_clusters, init="random", n_init=1, random_state=random_state
    ).fit(X)

    # Standalone figure (no pyplot global state); see quant_evaluation.
    fig = Figure()
    ax = fig.subplots()
    for k in range(n_clusters):
        my_members = km.labels_ == k
        color = cm.nipy_spectral(float(k) / n_clusters, 1)
        ax.plot(X[my_members, 0], X[my_members, 1], ".", c=color)
        cluster_center = km.cluster_centers_[k]
        ax.plot(
            cluster_center[0],
            cluster_center[1],
            "o",
            markerfacecolor=color,
            markeredgecolor="k",
            markersize=6,
        )
    ax.set_title(
        "Example cluster allocation with a single random init\nwith MiniBatchKMeans"
    )
    return fig


with gr.Blocks(theme=theme) as demo:
    gr.Markdown('''

Empirical evaluation of the impact of k-means initialization 📊

''')
    gr.Markdown(description)
    n_runs = gr.Slider(
        minimum=1, maximum=10, step=1, value=5, label="Number of Evaluation Runs"
    )
    run_button = gr.Button('Evaluate')
    run_button_qual = gr.Button('Generate Cluster Allocations')
    with gr.Row():
        plot_inertia = gr.Plot()
        plot_vis = gr.Plot()
    # Event wiring must happen inside the Blocks context.
    run_button.click(fn=quant_evaluation, inputs=[n_runs], outputs=plot_inertia)
    run_button_qual.click(fn=qual_evaluation, inputs=[], outputs=plot_vis)

demo.launch()