# Natural Language Toolkit: K-Means Clusterer
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Trevor Cohn <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

import copy
import random
import sys

try:
    import numpy
except ImportError:
    pass

from nltk.cluster.util import VectorSpaceClusterer


class KMeansClusterer(VectorSpaceClusterer):
"""
The K-means clusterer starts with k arbitrary chosen means then allocates
each vector to the cluster with the closest mean. It then recalculates the
means of each cluster as the centroid of the vectors in the cluster. This
process repeats until the cluster memberships stabilise. This is a
hill-climbing algorithm which may converge to a local maximum. Hence the
clustering is often repeated with random initial means and the most
commonly occurring output means are chosen.
"""
def __init__(
self,
num_means,
distance,
repeats=1,
conv_test=1e-6,
initial_means=None,
normalise=False,
svd_dimensions=None,
rng=None,
avoid_empty_clusters=False,
):
"""
:param num_means: the number of means to use (may use fewer)
:type num_means: int
:param distance: measure of distance between two vectors
:type distance: function taking two vectors and returning a float
:param repeats: number of randomised clustering trials to use
:type repeats: int
        :param conv_test: maximum total change in the means between
            successive iterations before the clustering is deemed
            convergent
        :type conv_test: number
:param initial_means: set of k initial means
:type initial_means: sequence of vectors
:param normalise: should vectors be normalised to length 1
:type normalise: boolean
        :param svd_dimensions: number of dimensions to use in reducing vector
            dimensionality with SVD
:type svd_dimensions: int
:param rng: random number generator (or None)
:type rng: Random
:param avoid_empty_clusters: include current centroid in computation
of next one; avoids undefined behavior
when clusters become empty
:type avoid_empty_clusters: boolean
"""
VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
self._num_means = num_means
self._distance = distance
self._max_difference = conv_test
assert not initial_means or len(initial_means) == num_means
self._means = initial_means
assert repeats >= 1
assert not (initial_means and repeats > 1)
self._repeats = repeats
self._rng = rng if rng else random.Random()
        self._avoid_empty_clusters = avoid_empty_clusters

def cluster_vectorspace(self, vectors, trace=False):
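        # run ``repeats`` random-restart trials and keep the set of means
        # that is closest to the means found in the other trials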
if self._means and self._repeats > 1:
print("Warning: means will be discarded for subsequent trials")
meanss = []
for trial in range(self._repeats):
if trace:
print("k-means trial", trial)
            # draw fresh random means for every trial after the first
            if not self._means or trial > 0:
                self._means = self._rng.sample(list(vectors), self._num_means)
self._cluster_vectorspace(vectors, trace)
meanss.append(self._means)
if len(meanss) > 1:
            # sort the means first (so that different cluster numbering won't
            # affect the distance comparison)
for means in meanss:
means.sort(key=sum)
# find the set of means that's minimally different from the others
min_difference = min_means = None
for i in range(len(meanss)):
d = 0
for j in range(len(meanss)):
if i != j:
d += self._sum_distances(meanss[i], meanss[j])
if min_difference is None or d < min_difference:
min_difference, min_means = d, meanss[i]
# use the best means
        self._means = min_means

def _cluster_vectorspace(self, vectors, trace=False):
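        # a single k-means run: alternate cluster assignment and centroid
        # update until the total movement of the means drops below the
        # convergence threshold (conv_test)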
if self._num_means < len(vectors):
# perform k-means clustering
converged = False
while not converged:
# assign the tokens to clusters based on minimum distance to
# the cluster means
                clusters = [[] for _ in range(self._num_means)]
for vector in vectors:
index = self.classify_vectorspace(vector)
clusters[index].append(vector)
                if trace:
                    print("iteration")
                    # for i in range(self._num_means):
                    #     print("  mean", i, "allocated", len(clusters[i]), "vectors")
# recalculate cluster means by computing the centroid of each cluster
new_means = list(map(self._centroid, clusters, self._means))
# measure the degree of change from the previous step for convergence
difference = self._sum_distances(self._means, new_means)
if difference < self._max_difference:
converged = True
# remember the new means
                self._means = new_means

def classify_vectorspace(self, vector):
# finds the closest cluster centroid
# returns that cluster's index
        best_distance = best_index = None
        for index, mean in enumerate(self._means):
            dist = self._distance(vector, mean)
            if best_distance is None or dist < best_distance:
                best_index, best_distance = index, dist
        return best_index

def num_clusters(self):
if self._means:
return len(self._means)
else:
            return self._num_means

def means(self):
"""
The means used for clustering.
"""
        return self._means

def _sum_distances(self, vectors1, vectors2):
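        # total distance between corresponding vectors; used for both the
        # convergence test and the comparison of trial results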
difference = 0.0
for u, v in zip(vectors1, vectors2):
difference += self._distance(u, v)
        return difference

def _centroid(self, cluster, mean):
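        # with avoid_empty_clusters, the previous mean counts as one extra
        # point, so an empty cluster simply keeps its old mean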
if self._avoid_empty_clusters:
centroid = copy.copy(mean)
for vector in cluster:
centroid += vector
return centroid / (1 + len(cluster))
else:
if not len(cluster):
sys.stderr.write("Error: no centroid defined for empty cluster.\n")
sys.stderr.write(
"Try setting argument 'avoid_empty_clusters' to True\n"
)
assert False
centroid = copy.copy(cluster[0])
for vector in cluster[1:]:
centroid += vector
            return centroid / len(cluster)

def __repr__(self):
return "<KMeansClusterer means=%s repeats=%d>" % (self._means, self._repeats)
#################################################################################


def demo():
    # example from Figure 14.9, page 517, Manning and Schütze (1999)
from nltk.cluster import KMeansClusterer, euclidean_distance
vectors = [numpy.array(f) for f in [[2, 1], [1, 3], [4, 7], [6, 7]]]
means = [[4, 3], [5, 5]]
clusterer = KMeansClusterer(2, euclidean_distance, initial_means=means)
clusters = clusterer.cluster(vectors, True, trace=True)
print("Clustered:", vectors)
print("As:", clusters)
print("Means:", clusterer.means())
print()
vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
    # test k-means using the Euclidean distance metric, 2 means, and 10
    # clustering trials with random initial means
clusterer = KMeansClusterer(2, euclidean_distance, repeats=10)
clusters = clusterer.cluster(vectors, True)
print("Clustered:", vectors)
print("As:", clusters)
print("Means:", clusterer.means())
print()
# classify a new vector
vector = numpy.array([3, 3])
print("classify(%s):" % vector, end=" ")
print(clusterer.classify(vector))
print()
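
# A usage sketch (hypothetical helper, not from the original module): the
# ``distance`` argument may be any function taking two vectors and returning
# a float, e.g. a Manhattan distance, and ``avoid_empty_clusters=True``
# keeps an empty cluster at its previous mean.
def _custom_distance_demo():
    def manhattan_distance(u, v):
        return float(numpy.sum(numpy.abs(u - v)))

    vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0]]]
    clusterer = KMeansClusterer(
        2, manhattan_distance, repeats=5, avoid_empty_clusters=True
    )
    print("As:", clusterer.cluster(vectors, True))
    print("Means:", clusterer.means())
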
if __name__ == "__main__":
demo()