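# NOTE: the next line is header residue from the web file viewer this source
# was copied from; it is kept as a comment so the module parses as Python.
# merve (HF staff) - "Update app.py" - 135ab42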
import gradio as gr
import pandas as pd
import numpy as np
from time import time
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from huggingface_hub import login
from datasets import load_dataset
import matplotlib.pyplot as plt
# https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html#sphx-glr-auto-examples-cluster-plot-kmeans-digits-py
def display_plot(data, n_digits):
    """Plot K-means clusters of ``data`` projected onto two PCA components.

    The data is reduced to 2 dimensions with PCA, clustered with K-means
    (``k-means++`` initialisation), and the cluster regions, the samples and
    the centroids are drawn on a single figure.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Samples to project and cluster.
    n_digits : int or float
        Number of clusters. It is coerced with ``int`` so that a whole-number
        float coming from a UI slider is accepted, matching how the other
        ``KMeans`` instances in this file are built.

    Returns
    -------
    fig : matplotlib.figure.Figure
        The figure holding the plot. It is created without pyplot, so it is
        not kept alive by pyplot's global figure registry after the caller
        drops it.
    """
    # Local import: build the figure directly instead of through pyplot's
    # global "current figure" state, which leaks one figure per call and is
    # shared between concurrent requests.
    from matplotlib.figure import Figure

    reduced_data = PCA(n_components=2).fit_transform(data)
    kmeans = KMeans(init="k-means++", n_clusters=int(n_digits), n_init=4)
    kmeans.fit(reduced_data)

    # Step size of the mesh covering [x_min, x_max] x [y_min, y_max].
    # Decrease it to increase the resolution of the cluster regions.
    h = 0.02

    # Plot the decision boundary: every mesh point gets the colour of the
    # cluster it is assigned to.
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # Obtain a cluster label for each point of the mesh, then restore the
    # mesh shape so the labels can be drawn as an image.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    fig = Figure()
    ax = fig.subplots()
    ax.imshow(
        Z,
        interpolation="nearest",
        extent=(xx.min(), xx.max(), yy.min(), yy.max()),
        cmap="Paired",
        aspect="auto",
        origin="lower",
    )

    # Samples as small black dots.
    ax.plot(reduced_data[:, 0], reduced_data[:, 1], "k.", markersize=2)

    # Centroids as white crosses, drawn on top of everything else.
    centroids = kmeans.cluster_centers_
    ax.scatter(
        centroids[:, 0],
        centroids[:, 1],
        marker="x",
        s=169,
        linewidths=3,
        color="w",
        zorder=10,
    )
    ax.set_title(
        "K-means clustering on the digits dataset (PCA-reduced data)\n"
        "Centroids are marked with white cross"
    )
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    # The PCA axes have no meaningful units, so hide the ticks.
    ax.set_xticks(())
    ax.set_yticks(())
    return fig
def bench_k_means(kmeans, name, data, labels):
    """Time one KMeans configuration and score the clustering it produces.

    The estimator is fitted behind a ``StandardScaler`` and the resulting
    cluster labels are compared with the ground truth.

    Parameters
    ----------
    kmeans : KMeans instance
        A :class:`~sklearn.cluster.KMeans` instance whose initialization is
        already configured.
    name : str
        Label of the strategy; it becomes the first entry of the result.
    data : ndarray of shape (n_samples, n_features)
        The data to cluster.
    labels : ndarray of shape (n_samples,)
        Ground-truth labels used by the supervised clustering metrics.

    Returns
    -------
    list
        ``[name, fit_time, inertia, homogeneity, completeness, v_measure,
        adjusted_rand, adjusted_mutual_info, silhouette]``.
    """
    started = time()
    pipeline = make_pipeline(StandardScaler(), kmeans)
    pipeline.fit(data)
    elapsed = time() - started

    # The last pipeline step is the fitted KMeans estimator.
    fitted = pipeline[-1]
    row = [name, elapsed, fitted.inertia_]

    # Metrics that only need the true labels and the predicted labels.
    supervised_scores = (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    )
    for score in supervised_scores:
        row.append(score(labels, fitted.labels_))

    # The silhouette score needs the samples themselves; it is estimated on
    # a subsample of 300 points.
    silhouette = metrics.silhouette_score(
        data,
        fitted.labels_,
        metric="euclidean",
        sample_size=300,
    )
    row.append(silhouette)
    return row
# Page title; passed to gr.Blocks and rendered as the top-level heading.
title = "A demo of K-Means clustering on the handwritten digits data"
def do_submit(kmeans_n_digit, random_n_digit, pca_n_digit):
    """Benchmark three K-means initialisations on the digits data.

    Parameters
    ----------
    kmeans_n_digit : int or float
        Number of clusters for the ``k-means++`` run and for the plot.
    random_n_digit : int or float
        Number of clusters for the ``random`` initialisation run.
    pca_n_digit : int or float
        Number of PCA components; the components are used as the initial
        centres, so this is also the number of clusters of the
        ``PCA-based`` run.

    Returns
    -------
    fig
        The value returned by ``display_plot`` for the k-means++ cluster
        count.
    table : pandas.DataFrame
        One row per metric with the columns ``metrics``, ``k-means++``,
        ``random`` and ``PCA-based``; values are rounded to 3 decimals.
    """
    # Coerce every slider value once, so the plot and the benchmarks all
    # receive integer cluster counts.
    kmeans_k = int(kmeans_n_digit)
    random_k = int(random_n_digit)
    pca_k = int(pca_n_digit)

    # Load the dataset and convert it to pandas: the first 64 columns are
    # the features, column 64 holds the labels.
    dataset = load_dataset("sklearn-docs/digits", header=None)
    frame = dataset["train"].to_pandas()
    data = frame.iloc[:, :64]
    labels = frame.iloc[:, 64]

    rows = []

    kmeans = KMeans(init="k-means++", n_clusters=kmeans_k, n_init=4, random_state=0)
    rows.append(bench_k_means(kmeans=kmeans, name="k-means++", data=data, labels=labels))

    kmeans = KMeans(init="random", n_clusters=random_k, n_init=4, random_state=0)
    rows.append(bench_k_means(kmeans=kmeans, name="random", data=data, labels=labels))

    # The initial centres are given explicitly, so one initialisation run
    # (n_init=1) is enough.
    pca = PCA(n_components=pca_k).fit(data)
    kmeans = KMeans(init=pca.components_, n_clusters=pca_k, n_init=1)
    rows.append(bench_k_means(kmeans=kmeans, name="PCA-based", data=data, labels=labels))

    numeric_cols = ['time', 'inertia', 'homo', 'compl', 'v-meas', 'ARI', 'AMI', 'silhouette']
    table = pd.DataFrame(rows, columns=['init'] + numeric_cols)
    table[numeric_cols] = table[numeric_cols].astype(float).round(3)

    # Transpose for display: one row per metric, one column per strategy.
    table = table.set_index('init').T.reset_index()
    table.columns = ['metrics', 'k-means++', 'random', 'PCA-based']

    return display_plot(data, kmeans_k), table
# Theme adapted from https://huggingface.co/spaces/trl-lib/stack-llama/blob/main/app.py
# Monochrome base theme with custom hues, small corner radius and a font
# fallback chain starting with the "Open Sans" Google font; passed to gr.Blocks.
theme = gr.themes.Monochrome(
primary_hue="indigo",
secondary_hue="blue",
neutral_hue="slate",
radius_size=gr.themes.sizes.radius_sm,
font=[gr.themes.GoogleFont("Open Sans"), "ui-sans-serif", "system-ui", "sans-serif"],
)
# UI layout: explanatory text, then a row with the sliders and the plot on the
# left and the metrics table on the right, then a row with the submit button.
# NOTE(review): the source had lost its indentation; the nesting below was
# reconstructed from the component grouping - confirm against the deployed app.
with gr.Blocks(title=title, theme=theme) as demo:
    gr.Markdown(f"## {title}")
    gr.Markdown("This demo is based on this [scikit-learn example](https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html#sphx-glr-auto-examples-cluster-plot-kmeans-digits-py)")
    gr.Markdown("In this example we compare the various initialization strategies for K-means in terms of runtime and quality of the results.")
    gr.Markdown("As the ground truth is known here, we also apply different cluster quality metrics to judge the goodness of fit of the cluster labels to the ground truth.")
    # Adjacent literals instead of a backslash continuation, so the source
    # indentation cannot leak into the rendered text.
    gr.Markdown(
        "Cluster quality metrics evaluated (see [Clustering performance evaluation](https://scikit-learn.org/stable/modules/clustering.html#clustering-evaluation) "
        "for definitions and discussions of the metrics):"
    )
    gr.Markdown("---")
    gr.Markdown(
        " We will be utilizing [digits](https://huggingface.co/datasets/sklearn-docs/digits) dataset. This dataset contains handwritten digits from 0 to 9. "
        "In the context of clustering, one would like to group images such that the handwritten digits on the image are the same."
    )

    with gr.Row():
        # Equal integer scales give the same 50/50 split as the previous
        # fractional values while respecting Gradio's integer `scale`.
        with gr.Column(scale=1):
            kmeans_n_digit = gr.Slider(minimum=2, maximum=10, label="KMeans n_digits", info="n_digits is number of handwritten digits", step=1, value=10)
            random_n_digit = gr.Slider(minimum=2, maximum=10, label="Random n_digits", step=1, value=10)
            pca_n_digit = gr.Slider(minimum=2, maximum=10, label="PCA n_digits", step=1, value=10)
            plt_out = gr.Plot()
        with gr.Column(scale=1):
            # Zero-filled placeholder shown until the first submit.
            sample_df = pd.DataFrame(np.zeros((9, 4)), columns=['metrics', 'k-means++', 'random', 'PCA-based'])
            output = gr.Dataframe(sample_df, label="Clustering Metrics")

    with gr.Row():
        sub_btn = gr.Button("Submit")
        sub_btn.click(fn=do_submit, inputs=[kmeans_n_digit, random_n_digit, pca_n_digit], outputs=[plt_out, output])

demo.launch()