# Natural Language Toolkit: Expectation Maximization Clusterer
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Trevor Cohn <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

try:
    import numpy
except ImportError:
    pass

from nltk.cluster.util import VectorSpaceClusterer


class EMClusterer(VectorSpaceClusterer):
    """

    The Gaussian EM clusterer models the vectors as being produced by

    a mixture of k Gaussian sources. The parameters of these sources

    (prior probability, mean and covariance matrix) are then found to

    maximise the likelihood of the given data. This is done with the

    expectation maximisation algorithm. It starts with k arbitrarily

    chosen means, priors and covariance matrices. It then calculates

    the membership probabilities for each vector in each of the

    clusters; this is the 'E' step. The cluster parameters are then

    updated in the 'M' step using the maximum likelihood estimate from

    the cluster membership probabilities. This process continues until

    the likelihood of the data does not significantly increase.

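    A minimal usage sketch (the 2-D vectors and starting means below are
    arbitrary toy values, shown for illustration only):

    >>> import numpy
    >>> from nltk.cluster import EMClusterer
    >>> vectors = [numpy.array(f) for f in [[0.5, 0.5], [1.5, 0.5], [1, 3]]]
    >>> clusterer = EMClusterer([[4, 2], [4, 2.01]], bias=0.1)
    >>> clusters = clusterer.cluster(vectors, True)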
    """

    def __init__(
        self,
        initial_means,
        priors=None,
        covariance_matrices=None,
        conv_threshold=1e-6,
        bias=0.1,
        normalise=False,
        svd_dimensions=None,
    ):
        """

        Creates an EM clusterer with the given starting parameters,

        convergence threshold and vector mangling parameters.



        :param  initial_means: the means of the gaussian cluster centers

        :type   initial_means: [seq of] numpy array or seq of SparseArray

        :param  priors: the prior probability for each cluster

        :type   priors: numpy array or seq of float

        :param  covariance_matrices: the covariance matrix for each cluster

        :type   covariance_matrices: [seq of] numpy array

        :param  conv_threshold: maximum change in likelihood before deemed

                    convergent

        :type   conv_threshold: int or float

        :param  bias: variance bias used to ensure non-singular covariance

                      matrices

        :type   bias: float

        :param  normalise:  should vectors be normalised to length 1

        :type   normalise:  boolean

        :param  svd_dimensions: number of dimensions to use in reducing vector

                               dimensionsionality with SVD

        :type   svd_dimensions: int

        """
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
        self._means = numpy.array(initial_means, numpy.float64)
        self._num_clusters = len(initial_means)
        self._conv_threshold = conv_threshold
        self._covariance_matrices = covariance_matrices
        self._priors = priors
        self._bias = bias

    def num_clusters(self):
        return self._num_clusters

    def cluster_vectorspace(self, vectors, trace=False):
        assert len(vectors) > 0

        # set the parameters to initial values
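        # priors default to a uniform distribution and covariance matrices
        # default to identity matrices when they are not supplied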
        dimensions = len(vectors[0])
        means = self._means
        priors = self._priors
        if not priors:
            priors = self._priors = (
                numpy.ones(self._num_clusters, numpy.float64) / self._num_clusters
            )
        covariances = self._covariance_matrices
        if not covariances:
            covariances = self._covariance_matrices = [
                numpy.identity(dimensions, numpy.float64)
                for i in range(self._num_clusters)
            ]

        # do the E and M steps until the likelihood plateaus
        lastl = self._loglikelihood(vectors, priors, means, covariances)
        converged = False

        while not converged:
            if trace:
                print("iteration; loglikelihood", lastl)
            # E-step, calculate hidden variables, h[i,j]
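            # h[i, j] is the posterior responsibility P(cluster j | vector i):
            # the prior times the gaussian density, normalised over all clusters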
            h = numpy.zeros((len(vectors), self._num_clusters), numpy.float64)
            for i in range(len(vectors)):
                for j in range(self._num_clusters):
                    h[i, j] = priors[j] * self._gaussian(
                        means[j], covariances[j], vectors[i]
                    )
                h[i, :] /= sum(h[i, :])

            # M-step, update parameters - cvm, p, mean
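            # each cluster's parameters become responsibility-weighted maximum
            # likelihood estimates: the mean is the weighted average of the
            # vectors, the covariance is the weighted outer product of the
            # deviations from the current mean, and the prior is the average
            # responsibility over all vectors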
            for j in range(self._num_clusters):
                covariance_before = covariances[j]
                new_covariance = numpy.zeros((dimensions, dimensions), numpy.float64)
                new_mean = numpy.zeros(dimensions, numpy.float64)
                sum_hj = 0.0
                for i in range(len(vectors)):
                    delta = vectors[i] - means[j]
                    new_covariance += h[i, j] * numpy.multiply.outer(delta, delta)
                    sum_hj += h[i, j]
                    new_mean += h[i, j] * vectors[i]
                covariances[j] = new_covariance / sum_hj
                means[j] = new_mean / sum_hj
                priors[j] = sum_hj / len(vectors)

                # bias term to stop covariance matrix being singular
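                # (adds self._bias to each diagonal variance so the matrix
                # stays invertible when a cluster collapses onto few points)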
                covariances[j] += self._bias * numpy.identity(dimensions, numpy.float64)

            # calculate likelihood - FIXME: may be broken
            l = self._loglikelihood(vectors, priors, means, covariances)

            # check for convergence
            if abs(lastl - l) < self._conv_threshold:
                converged = True
            lastl = l

    def classify_vectorspace(self, vector):
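        # assign the vector to the cluster with the highest unnormalised
        # posterior, i.e. prior[j] * gaussian density under cluster j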
        best = None
        for j in range(self._num_clusters):
            p = self._priors[j] * self._gaussian(
                self._means[j], self._covariance_matrices[j], vector
            )
            if not best or p > best[0]:
                best = (p, j)
        return best[1]

    def likelihood_vectorspace(self, vector, cluster):
        cid = self.cluster_names().index(cluster)
        return self._priors[cid] * self._gaussian(
            self._means[cid], self._covariance_matrices[cid], vector
        )

    def _gaussian(self, mean, cvm, x):
        m = len(mean)
        assert cvm.shape == (m, m), "bad sized covariance matrix, %s" % str(cvm.shape)
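        # multivariate normal density:
        #   N(x; mean, cvm) = (2*pi)**(-m/2) * det(cvm)**(-1/2)
        #                     * exp(-0.5 * (x - mean)' inv(cvm) (x - mean))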
        try:
            det = numpy.linalg.det(cvm)
            inv = numpy.linalg.inv(cvm)
            a = det**-0.5 * (2 * numpy.pi) ** (-m / 2.0)
            dx = x - mean
            b = -0.5 * numpy.dot(numpy.dot(dx, inv), dx)
            return a * numpy.exp(b)
        except OverflowError:
            # happens when the exponent is negative infinity - i.e. b = 0
            # i.e. the inverse of cvm is huge (cvm is almost zero)
            return 0

    def _loglikelihood(self, vectors, priors, means, covariances):
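        # log likelihood of the data under the current mixture parameters:
        #   sum_i log( sum_j priors[j] * N(x_i; means[j], covariances[j]) )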
        llh = 0.0
        for vector in vectors:
            p = 0
            for j in range(len(priors)):
                p += priors[j] * self._gaussian(means[j], covariances[j], vector)
            llh += numpy.log(p)
        return llh

    def __repr__(self):
        return "<EMClusterer means=%s>" % list(self._means)


def demo():
    """

    Non-interactive demonstration of the clusterers with simple 2-D data.

    """

    from nltk import cluster

    # example from figure 14.10, page 519, Manning and Schutze

    vectors = [numpy.array(f) for f in [[0.5, 0.5], [1.5, 0.5], [1, 3]]]
    means = [[4, 2], [4, 2.01]]

    clusterer = cluster.EMClusterer(means, bias=0.1)
    clusters = clusterer.cluster(vectors, True, trace=True)

    print("Clustered:", vectors)
    print("As:       ", clusters)
    print()

    for c in range(2):
        print("Cluster:", c)
        print("Prior:  ", clusterer._priors[c])
        print("Mean:   ", clusterer._means[c])
        print("Covar:  ", clusterer._covariance_matrices[c])
        print()

    # classify a new vector
    vector = numpy.array([2, 2])
    print("classify(%s):" % vector, end=" ")
    print(clusterer.classify(vector))

    # show the classification probabilities
    vector = numpy.array([2, 2])
    print("classification_probdist(%s):" % vector)
    pdist = clusterer.classification_probdist(vector)
    for sample in pdist.samples():
        print(f"{sample} => {pdist.prob(sample) * 100:.0f}%")


if __name__ == "__main__":
    demo()