
Source Code for Module nltk.cluster.em

# Natural Language Toolkit: Expectation Maximization Clusterer
#
# Copyright (C) 2001-2008 NLTK Project
# Author: Trevor Cohn <[email protected]>
# Porting: Steven Bird <[email protected]>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT

import numpy

from api import *
from util import *

class EM(VectorSpace):
    """
    The Gaussian EM clusterer models the vectors as being produced by
    a mixture of k Gaussian sources. The parameters of these sources
    (prior probability, mean and covariance matrix) are then found to
    maximise the likelihood of the given data. This is done with the
    expectation maximisation algorithm. It starts with k arbitrarily
    chosen means, priors and covariance matrices. It then calculates
    the membership probabilities for each vector in each of the
    clusters; this is the 'E' step. The cluster parameters are then
    updated in the 'M' step using the maximum likelihood estimate from
    the cluster membership probabilities. This process continues until
    the likelihood of the data does not significantly increase.
    """

    def __init__(self, initial_means, priors=None, covariance_matrices=None,
                 conv_threshold=1e-6, bias=0.1, normalise=False,
                 svd_dimensions=None):
        """
        Creates an EM clusterer with the given starting parameters,
        convergence threshold and vector mangling parameters.

        @param initial_means: the means of the gaussian cluster centers
        @type initial_means: [seq of] numpy array or seq of SparseArray
        @param priors: the prior probability for each cluster
        @type priors: numpy array or seq of float
        @param covariance_matrices: the covariance matrix for each cluster
        @type covariance_matrices: [seq of] numpy array
        @param conv_threshold: maximum change in likelihood before deemed
            convergent
        @type conv_threshold: int or float
        @param bias: variance bias used to ensure non-singular covariance
            matrices
        @type bias: float
        @param normalise: should vectors be normalised to length 1
        @type normalise: boolean
        @param svd_dimensions: number of dimensions to use in reducing vector
            dimensionality with SVD
        @type svd_dimensions: int
        """
        VectorSpace.__init__(self, normalise, svd_dimensions)
        self._means = numpy.array(initial_means, numpy.float64)
        self._num_clusters = len(initial_means)
        self._conv_threshold = conv_threshold
        self._covariance_matrices = covariance_matrices
        self._priors = priors
        self._bias = bias

    def num_clusters(self):
        return self._num_clusters

    def cluster_vectorspace(self, vectors, trace=False):
        assert len(vectors) > 0

        # set the parameters to initial values
        dimensions = len(vectors[0])
        means = self._means
        priors = self._priors
        if priors is None:
            priors = self._priors = numpy.ones(self._num_clusters,
                                        numpy.float64) / self._num_clusters
        covariances = self._covariance_matrices
        if covariances is None:
            covariances = self._covariance_matrices = \
                [ numpy.identity(dimensions, numpy.float64)
                  for i in range(self._num_clusters) ]

        # do the E and M steps until the likelihood plateaus
        lastl = self._loglikelihood(vectors, priors, means, covariances)
        converged = False

        while not converged:
            if trace: print 'iteration; loglikelihood', lastl
            # E-step, calculate hidden variables, h[i,j]
            h = numpy.zeros((len(vectors), self._num_clusters),
                            numpy.float64)
            for i in range(len(vectors)):
                for j in range(self._num_clusters):
                    h[i,j] = priors[j] * self._gaussian(means[j],
                                             covariances[j], vectors[i])
                h[i,:] /= sum(h[i,:])

            # M-step, update parameters - cvm, p, mean
            for j in range(self._num_clusters):
                covariance_before = covariances[j]
                new_covariance = numpy.zeros((dimensions, dimensions),
                                             numpy.float64)
                new_mean = numpy.zeros(dimensions, numpy.float64)
                sum_hj = 0.0
                for i in range(len(vectors)):
                    delta = vectors[i] - means[j]
                    new_covariance += h[i,j] * \
                        numpy.multiply.outer(delta, delta)
                    sum_hj += h[i,j]
                    new_mean += h[i,j] * vectors[i]
                covariances[j] = new_covariance / sum_hj
                means[j] = new_mean / sum_hj
                priors[j] = sum_hj / len(vectors)

                # bias term to stop covariance matrix being singular
                covariances[j] += self._bias * \
                    numpy.identity(dimensions, numpy.float64)

            # calculate likelihood - FIXME: may be broken
            l = self._loglikelihood(vectors, priors, means, covariances)

            # check for convergence
            if abs(lastl - l) < self._conv_threshold:
                converged = True
            lastl = l

    def classify_vectorspace(self, vector):
        best = None
        for j in range(self._num_clusters):
            p = self._priors[j] * self._gaussian(self._means[j],
                                      self._covariance_matrices[j], vector)
            if not best or p > best[0]:
                best = (p, j)
        return best[1]

    def likelihood_vectorspace(self, vector, cluster):
        cid = self.cluster_names().index(cluster)
        return self._priors[cid] * self._gaussian(self._means[cid],
                   self._covariance_matrices[cid], vector)

    def _gaussian(self, mean, cvm, x):
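        # Multivariate normal density evaluated at x:
        #   N(x | mean, cvm) = (2*pi)^(-m/2) * det(cvm)^(-1/2)
        #                      * exp(-0.5 * (x - mean)^T cvm^{-1} (x - mean))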
        m = len(mean)
        assert cvm.shape == (m, m), \
            'bad sized covariance matrix, %s' % str(cvm.shape)
        try:
            det = numpy.linalg.det(cvm)
            inv = numpy.linalg.inv(cvm)
            a = det ** -0.5 * (2 * numpy.pi) ** (-m / 2.0)
            dx = x - mean
            b = -0.5 * numpy.dot( numpy.dot(dx, inv), dx)
            return a * numpy.exp(b)
        except OverflowError:
            # happens when the exponent is negative infinity - i.e. b = 0
            # i.e. the inverse of cvm is huge (cvm is almost zero)
            return 0

    def _loglikelihood(self, vectors, priors, means, covariances):
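        # Log-likelihood of the data under the current mixture parameters:
        #   log L = sum_i log( sum_j P(j) * N(x_i | mu_j, Sigma_j) )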
        llh = 0.0
        for vector in vectors:
            p = 0
            for j in range(len(priors)):
                p += priors[j] * \
                    self._gaussian(means[j], covariances[j], vector)
            llh += numpy.log(p)
        return llh

    def __repr__(self):
        return '<EM Clusterer means=%s>' % list(self._means)

def demo():
170 """ 171 Non-interactive demonstration of the clusterers with simple 2-D data. 172 """ 173 174 from nltk import cluster 175 176 # example from figure 14.10, page 519, Manning and Schutze 177 178 vectors = [numpy.array(f) for f in [[0.5, 0.5], [1.5, 0.5], [1, 3]]] 179 means = [[4, 2], [4, 2.01]] 180 181 clusterer = cluster.EM(means, bias=0.1) 182 clusters = clusterer.cluster(vectors, True, trace=True) 183 184 print 'Clustered:', vectors 185 print 'As: ', clusters 186 print 187 188 for c in range(2): 189 print 'Cluster:', c 190 print 'Prior: ', clusterer._priors[c] 191 print 'Mean: ', clusterer._means[c] 192 print 'Covar: ', clusterer._covariance_matrices[c] 193 print 194 195 # classify a new vector 196 vector = numpy.array([2, 2]) 197 print 'classify(%s):' % vector, 198 print clusterer.classify(vector) 199 200 # show the classification probabilities 201 vector = numpy.array([2, 2]) 202 print 'classification_probdist(%s):' % vector 203 pdist = clusterer.classification_probdist(vector) 204 for sample in pdist: 205 print '%s => %.0f%%' % (sample, 206 pdist.prob(sample) *100)

#
# The following demo code is broken.
#
#     # use a set of tokens with 2D indices
#     vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

#     # test the EM clusterer with means given by k-means (2) and
#     # dimensionality reduction
#     clusterer = cluster.KMeans(2, euclidean_distance, svd_dimensions=1)
#     print 'Clusterer:', clusterer
#     clusters = clusterer.cluster(vectors)
#     means = clusterer.means()
#     print 'Means:', clusterer.means()
#     print

#     clusterer = cluster.EM(means, svd_dimensions=1)
#     clusters = clusterer.cluster(vectors, True)
#     print 'Clusterer:', clusterer
#     print 'Clustered:', str(vectors)[:60], '...'
#     print 'As:', str(clusters)[:60], '...'
#     print

#     # classify a new vector
#     vector = numpy.array([3, 3])
#     print 'classify(%s):' % vector,
#     print clusterer.classify(vector)
#     print

#     # show the classification probabilities
#     vector = numpy.array([2.2, 2])
#     print 'classification_probdist(%s)' % vector
#     pdist = clusterer.classification_probdist(vector)
#     for sample in pdist:
#         print '%s => %.0f%%' % (sample, pdist.prob(sample) * 100)

if __name__ == '__main__':
    demo()