import numpy

from api import *
from util import *

class EM(VectorSpace):
    """
    The Gaussian EM clusterer models the vectors as being produced by
    a mixture of k Gaussian sources. The parameters of these sources
    (prior probability, mean and covariance matrix) are then found to
    maximise the likelihood of the given data. This is done with the
    expectation maximisation algorithm. It starts with k arbitrarily
    chosen means, priors and covariance matrices. It then calculates
    the membership probabilities for each vector in each of the
    clusters; this is the 'E' step. The cluster parameters are then
    updated in the 'M' step using the maximum likelihood estimate from
    the cluster membership probabilities. This process continues until
    the likelihood of the data does not significantly increase.
    """

    def __init__(self, initial_means, priors=None, covariance_matrices=None,
                       conv_threshold=1e-6, bias=0.1, normalise=False,
                       svd_dimensions=None):
        """
        Creates an EM clusterer with the given starting parameters,
        convergence threshold and vector mangling parameters.

        @param initial_means: the means of the gaussian cluster centers
        @type initial_means: [seq of] numpy array or seq of SparseArray
        @param priors: the prior probability for each cluster
        @type priors: numpy array or seq of float
        @param covariance_matrices: the covariance matrix for each cluster
        @type covariance_matrices: [seq of] numpy array
        @param conv_threshold: maximum change in likelihood before deemed
            convergent
        @type conv_threshold: int or float
        @param bias: variance bias used to ensure non-singular covariance
            matrices
        @type bias: float
        @param normalise: should vectors be normalised to length 1
        @type normalise: boolean
        @param svd_dimensions: number of dimensions to use in reducing vector
            dimensionality with SVD
        @type svd_dimensions: int
        """
        VectorSpace.__init__(self, normalise, svd_dimensions)
        self._means = numpy.array(initial_means, numpy.float64)
        self._num_clusters = len(initial_means)
        self._conv_threshold = conv_threshold
        self._covariance_matrices = covariance_matrices
        self._priors = priors
        self._bias = bias

    def num_clusters(self):
        return self._num_clusters

    def cluster_vectorspace(self, vectors, trace=False):
        assert len(vectors) > 0

        # set the parameters to initial values
        dimensions = len(vectors[0])
        means = self._means
        priors = self._priors
        if priors is None:
            priors = self._priors = numpy.ones(self._num_clusters,
                                        numpy.float64) / self._num_clusters
        covariances = self._covariance_matrices
        if covariances is None:
            covariances = self._covariance_matrices = \
                [ numpy.identity(dimensions, numpy.float64)
                  for i in range(self._num_clusters) ]

        # repeat the E and M steps until the likelihood plateaus
        lastl = self._loglikelihood(vectors, priors, means, covariances)
        converged = False

        while not converged:
            if trace: print 'iteration; loglikelihood', lastl

            # E-step: h[i,j] is the membership probability of vector i
            # in cluster j
            h = numpy.zeros((len(vectors), self._num_clusters),
                numpy.float64)
            for i in range(len(vectors)):
                for j in range(self._num_clusters):
                    h[i,j] = priors[j] * self._gaussian(means[j],
                                               covariances[j], vectors[i])
                h[i,:] /= sum(h[i,:])

            # M-step: update the priors, means and covariance matrices
            # with their maximum likelihood estimates
            for j in range(self._num_clusters):
                new_covariance = numpy.zeros((dimensions, dimensions),
                            numpy.float64)
                new_mean = numpy.zeros(dimensions, numpy.float64)
                sum_hj = 0.0
                for i in range(len(vectors)):
                    delta = vectors[i] - means[j]
                    new_covariance += h[i,j] * \
                        numpy.multiply.outer(delta, delta)
                    sum_hj += h[i,j]
                    new_mean += h[i,j] * vectors[i]
                covariances[j] = new_covariance / sum_hj
                means[j] = new_mean / sum_hj
                priors[j] = sum_hj / len(vectors)

                # bias term to stop the covariance matrix becoming singular
                covariances[j] += self._bias * \
                    numpy.identity(dimensions, numpy.float64)

            # calculate the likelihood of the data under the new parameters
            l = self._loglikelihood(vectors, priors, means, covariances)

            # check for convergence
            if abs(lastl - l) < self._conv_threshold:
                converged = True
            lastl = l

    def classify_vectorspace(self, vector):
        best = None
        for j in range(self._num_clusters):
            p = self._priors[j] * self._gaussian(self._means[j],
                                    self._covariance_matrices[j], vector)
            if not best or p > best[0]:
                best = (p, j)
        return best[1]

    def _gaussian(self, mean, cvm, x):
        # multivariate gaussian density of x with the given mean and
        # covariance matrix
        m = len(mean)
        assert cvm.shape == (m, m), \
            'bad sized covariance matrix, %s' % str(cvm.shape)
        try:
            det = numpy.linalg.det(cvm)
            inv = numpy.linalg.inv(cvm)
            a = det ** -0.5 * (2 * numpy.pi) ** (-m / 2.0)
            dx = x - mean
            b = -0.5 * numpy.dot(numpy.dot(dx, inv), dx)
            return a * numpy.exp(b)
        except OverflowError:
            # happens when the exponent overflows, i.e. the covariance
            # matrix is nearly singular
            return 0

    def _loglikelihood(self, vectors, priors, means, covariances):
        llh = 0.0
        for vector in vectors:
            p = 0
            for j in range(len(priors)):
                p += priors[j] * \
                    self._gaussian(means[j], covariances[j], vector)
            llh += numpy.log(p)
        return llh

    def __repr__(self):
        return '<EM Clusterer means=%s>' % list(self._means)

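# The helper below is a minimal standalone sketch of a single EM iteration on
# 1-D data, added only to illustrate the E and M steps described in the class
# docstring. It is not part of the clusterer API; the name _demo_em_step and
# its arguments are illustrative, and `vectors` is assumed to be a 1-D numpy
# array of scalar observations.
def _demo_em_step(vectors, priors, means, variances):
    k = len(priors)

    # E step: h[i,j] is the membership probability of vector i in cluster j,
    # proportional to priors[j] * N(vectors[i]; means[j], variances[j])
    h = numpy.zeros((len(vectors), k), numpy.float64)
    for i in range(len(vectors)):
        for j in range(k):
            a = (2 * numpy.pi * variances[j]) ** -0.5
            b = -0.5 * (vectors[i] - means[j]) ** 2 / variances[j]
            h[i, j] = priors[j] * a * numpy.exp(b)
        h[i, :] /= h[i, :].sum()

    # M step: maximum likelihood re-estimates of the priors, means and
    # variances from the membership probabilities
    new_priors = h.sum(axis=0) / len(vectors)
    new_means = [(h[:, j] * vectors).sum() / h[:, j].sum() for j in range(k)]
    new_variances = [(h[:, j] * (vectors - new_means[j]) ** 2).sum() /
                     h[:, j].sum() for j in range(k)]
    return new_priors, new_means, new_variances

# e.g. _demo_em_step(numpy.array([0.0, 0.2, 0.9, 1.1]),
#                    [0.5, 0.5], [0.0, 1.0], [1.0, 1.0])
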
170 """
171 Non-interactive demonstration of the clusterers with simple 2-D data.
172 """
173
174 from nltk import cluster
175
176
177
178 vectors = [numpy.array(f) for f in [[0.5, 0.5], [1.5, 0.5], [1, 3]]]
179 means = [[4, 2], [4, 2.01]]
180
181 clusterer = cluster.EM(means, bias=0.1)
182 clusters = clusterer.cluster(vectors, True, trace=True)
183
184 print 'Clustered:', vectors
185 print 'As: ', clusters
186 print
187
188 for c in range(2):
189 print 'Cluster:', c
190 print 'Prior: ', clusterer._priors[c]
191 print 'Mean: ', clusterer._means[c]
192 print 'Covar: ', clusterer._covariance_matrices[c]
193 print
194
195
196 vector = numpy.array([2, 2])
197 print 'classify(%s):' % vector,
198 print clusterer.classify(vector)
199
200
201 vector = numpy.array([2, 2])
202 print 'classification_probdist(%s):' % vector
203 pdist = clusterer.classification_probdist(vector)
204 for sample in pdist:
205 print '%s => %.0f%%' % (sample,
206 pdist.prob(sample) *100)
207
if __name__ == '__main__':
    demo()