rashmi committed on
Commit
c487a91
·
1 Parent(s): 8abd0e1

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +183 -0
app.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import numpy as np
4
+ from time import time
5
+ from sklearn import metrics
6
+ from sklearn.pipeline import make_pipeline
7
+ from sklearn.preprocessing import StandardScaler
8
+ from sklearn.cluster import KMeans
9
+ from sklearn.decomposition import PCA
10
+ from huggingface_hub import login
11
+ from datasets import load_dataset
12
+ import matplotlib.pyplot as plt
13
+
14
+
15
+ # https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html#sphx-glr-auto-examples-cluster-plot-kmeans-digits-py
16
+
17
def display_plot(data, n_digits, mesh_step=0.02):
    """Plot K-means clusters of ``data`` after reducing it to 2-D with PCA.

    The data is projected onto its first two principal components, K-means is
    fitted on that projection, and the cluster assigned to every point of a
    regular mesh is drawn as a colored background. The projected samples are
    drawn as black dots and the centroids as white crosses.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        The data to project and cluster.
    n_digits : int or float
        Number of clusters; converted with ``int()`` before use.
    mesh_step : float, default=0.02
        Step size of the background mesh. Decrease it to increase the
        resolution of the drawn cluster regions.

    Returns
    -------
    matplotlib.figure.Figure
        The figure containing the plot.
    """
    # Local import: the file only imports ``matplotlib.pyplot`` at the top.
    from matplotlib.figure import Figure

    reduced_data = PCA(n_components=2).fit_transform(data)
    # Accept numeric values such as ``10.0``; KMeans needs an integer count.
    kmeans = KMeans(init="k-means++", n_clusters=int(n_digits), n_init=4)
    kmeans.fit(reduced_data)

    # Bounds of the mesh: the extent of the projected data plus a margin of 1.
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(
        np.arange(x_min, x_max, mesh_step),
        np.arange(y_min, y_max, mesh_step),
    )

    # Predict a cluster label for each mesh point and restore the mesh shape
    # so the labels can be shown as an image.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    # Build the figure with the object-oriented API instead of the global
    # pyplot state: the figure is not registered with pyplot (so it is
    # garbage-collected once dropped) and concurrent calls cannot draw into
    # each other's "current" figure.
    fig = Figure()
    ax = fig.subplots()
    ax.imshow(
        Z,
        interpolation="nearest",
        extent=(xx.min(), xx.max(), yy.min(), yy.max()),
        cmap=plt.cm.Paired,
        aspect="auto",
        origin="lower",
    )

    ax.plot(reduced_data[:, 0], reduced_data[:, 1], "k.", markersize=2)
    # Plot the centroids as a white X
    centroids = kmeans.cluster_centers_
    ax.scatter(
        centroids[:, 0],
        centroids[:, 1],
        marker="x",
        s=169,
        linewidths=3,
        color="w",
        zorder=10,
    )
    ax.set_title(
        "K-means clustering on the digits dataset (PCA-reduced data)\n"
        "Centroids are marked with white cross"
    )
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    ax.set_xticks(())
    ax.set_yticks(())
    return fig
69
+
70
def bench_k_means(kmeans, name, data, labels):
    """Benchmark to evaluate the KMeans initialization methods.

    Parameters
    ----------
    kmeans : KMeans instance
        A :class:`~sklearn.cluster.KMeans` instance with the initialization
        already set.
    name : str
        Name given to the strategy. It will be used to show the results in a
        table.
    data : ndarray of shape (n_samples, n_features)
        The data to cluster.
    labels : ndarray of shape (n_samples,)
        The labels used to compute the clustering metrics which requires some
        supervision.

    Returns
    -------
    list
        ``[name, fit_time, inertia, homogeneity, completeness, v_measure,
        adjusted_rand, adjusted_mutual_info, silhouette]``.
    """
    # Local import: a monotonic clock is the right tool for measuring a
    # duration; wall-clock ``time()`` can jump if the system clock is adjusted.
    from time import perf_counter

    start = perf_counter()
    estimator = make_pipeline(StandardScaler(), kmeans).fit(data)
    fit_time = perf_counter() - start

    # The last pipeline step is the fitted KMeans estimator.
    fitted_kmeans = estimator[-1]
    predicted = fitted_kmeans.labels_
    results = [name, fit_time, fitted_kmeans.inertia_]

    # Define the metrics which require only the true labels and estimator
    # labels
    clustering_metrics = [
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    ]
    results += [m(labels, predicted) for m in clustering_metrics]

    # The silhouette score requires the full dataset; ``sample_size=300``
    # limits the computation to a subsample of it.
    results.append(
        metrics.silhouette_score(
            data,
            predicted,
            metric="euclidean",
            sample_size=300,
        )
    )

    return results
114
+
115
title = "A demo of K-Means clustering on the handwritten digits data"


def do_submit(kmeans_n_digit, random_n_digit, pca_n_digit):
    """Benchmark three K-means initializations and plot the clustering.

    Loads the ``sklearn-docs/digits`` dataset, benchmarks the ``k-means++``,
    ``random`` and PCA-based initializations with :func:`bench_k_means`, and
    draws the PCA-reduced clustering with :func:`display_plot`.

    Parameters
    ----------
    kmeans_n_digit : int or float
        Number of clusters for the ``k-means++`` run and for the plot.
    random_n_digit : int or float
        Number of clusters for the ``random`` run.
    pca_n_digit : int or float
        Number of PCA components, and therefore clusters, for the PCA-based
        run.

    Returns
    -------
    tuple
        ``(figure, table)`` where ``figure`` is the value returned by
        :func:`display_plot` and ``table`` is a DataFrame with a ``metrics``
        column followed by one column per initialization strategy, rounded
        to three decimals.
    """
    # Load the dataset and convert it to pandas.
    dataset = load_dataset("sklearn-docs/digits", header=None)
    frame = dataset["train"].to_pandas()
    # The first 64 columns are used as features, column 64 as the label.
    data = frame.iloc[:, :64]
    labels = frame.iloc[:, 64]

    # Cast once so every consumer, including the plot, gets an integer.
    kmeans_k = int(kmeans_n_digit)
    random_k = int(random_n_digit)
    pca_k = int(pca_n_digit)

    # The PCA components serve as deterministic initial centers, hence a
    # single initialization (n_init=1) for that strategy.
    pca = PCA(n_components=pca_k).fit(data)
    strategies = [
        ("k-means++", KMeans(init="k-means++", n_clusters=kmeans_k, n_init=4, random_state=0)),
        ("random", KMeans(init="random", n_clusters=random_k, n_init=4, random_state=0)),
        ("PCA-based", KMeans(init=pca.components_, n_clusters=pca_k, n_init=1)),
    ]

    metric_names = ["time", "inertia", "homo", "compl", "v-meas", "ARI", "AMI", "silhouette"]
    # bench_k_means returns [name, *metrics]; drop the name and use it as the
    # column label so the table is built directly in display orientation.
    table = pd.DataFrame(
        {
            name: bench_k_means(kmeans=estimator, name=name, data=data, labels=labels)[1:]
            for name, estimator in strategies
        },
        index=metric_names,
    )
    table = table.astype(float).round(3)
    table = table.rename_axis("metrics").reset_index()

    return display_plot(data, kmeans_k), table
146
+
147
# Theme from - https://huggingface.co/spaces/trl-lib/stack-llama/blob/main/app.py
theme = gr.themes.Monochrome(
    primary_hue="indigo",
    secondary_hue="blue",
    neutral_hue="slate",
    radius_size=gr.themes.sizes.radius_sm,
    font=[gr.themes.GoogleFont("Open Sans"), "ui-sans-serif", "system-ui", "sans-serif"],
)

# Build the UI: explanatory text, three sliders and the plot in the left
# column, the metrics table in the right column, and a submit button that
# runs do_submit with the slider values.
with gr.Blocks(title=title, theme=theme) as demo:
    gr.Markdown(f"## {title}")
    gr.Markdown("[Scikit-learn Example](https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html#sphx-glr-auto-examples-cluster-plot-kmeans-digits-py)")
    gr.Markdown("In this example we compare the various initialization strategies for K-means in terms of runtime and quality of the results.")
    gr.Markdown("As the ground truth is known here, we also apply different cluster quality metrics to judge the goodness of fit of the cluster labels to the ground truth.")
    # Adjacent string literals instead of a backslash continuation, so the
    # source indentation does not end up inside the rendered text.
    gr.Markdown(
        "Cluster quality metrics evaluated (see [Clustering performance evaluation](https://scikit-learn.org/stable/modules/clustering.html#clustering-evaluation) "
        "for definitions and discussions of the metrics):"
    )
    # The sentence above announces a list; spell out the table's shorthands.
    gr.Markdown(
        "- `homo`: homogeneity score\n"
        "- `compl`: completeness score\n"
        "- `v-meas`: V measure\n"
        "- `ARI`: adjusted Rand index\n"
        "- `AMI`: adjusted mutual information\n"
        "- `silhouette`: silhouette coefficient"
    )

    with gr.Row():
        # ``scale`` is a relative integer weight; equal weights give the two
        # columns the same width.
        with gr.Column(scale=1):
            kmeans_n_digit = gr.Slider(minimum=2, maximum=10, label="KMeans n_digits", step=1, value=10)
            random_n_digit = gr.Slider(minimum=2, maximum=10, label="Random n_digits", step=1, value=10)
            pca_n_digit = gr.Slider(minimum=2, maximum=10, label="PCA n_digits", step=1, value=10)

            plt_out = gr.Plot()

        with gr.Column(scale=1):
            # Placeholder with the same rows and columns as the table that
            # do_submit returns, shown until the first submit.
            sample_df = pd.DataFrame(
                {
                    "metrics": ["time", "inertia", "homo", "compl", "v-meas", "ARI", "AMI", "silhouette"],
                    "k-means++": 0.0,
                    "random": 0.0,
                    "PCA-based": 0.0,
                }
            )

            output = gr.Dataframe(sample_df, label="Output Table")

    with gr.Row():
        sub_btn = gr.Button("Submit")
    sub_btn.click(fn=do_submit, inputs=[kmeans_n_digit, random_n_digit, pca_n_digit], outputs=[plt_out, output])

demo.launch()