
Source Code for Module nltk.cluster.em

# Natural Language Toolkit: Expectation Maximization Clusterer
#
# Copyright (C) 2001-2008 NLTK Project
# Author: Trevor Cohn <[email protected]>
# Porting: Steven Bird <[email protected]>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT

import numpy

from api import *
from util import *

class EM(VectorSpace):
    """
    The Gaussian EM clusterer models the vectors as being produced by
    a mixture of k Gaussian sources. The parameters of these sources
    (prior probability, mean and covariance matrix) are then found to
    maximise the likelihood of the given data. This is done with the
    expectation maximisation algorithm. It starts with k arbitrarily
    chosen means, priors and covariance matrices. It then calculates
    the membership probabilities for each vector in each of the
    clusters; this is the 'E' step. The cluster parameters are then
    updated in the 'M' step using the maximum likelihood estimate from
    the cluster membership probabilities. This process continues until
    the likelihood of the data does not significantly increase.
    """

    def __init__(self, initial_means, priors=None, covariance_matrices=None,
                 conv_threshold=1e-6, bias=0.1, normalise=False,
                 svd_dimensions=None):
        """
        Creates an EM clusterer with the given starting parameters,
        convergence threshold and vector mangling parameters.

        @param initial_means: the means of the gaussian cluster centers
        @type initial_means: [seq of] numpy array or seq of SparseArray
        @param priors: the prior probability for each cluster
        @type priors: numpy array or seq of float
        @param covariance_matrices: the covariance matrix for each cluster
        @type covariance_matrices: [seq of] numpy array
        @param conv_threshold: maximum change in likelihood before deemed
            convergent
        @type conv_threshold: int or float
        @param bias: variance bias used to ensure non-singular covariance
            matrices
        @type bias: float
        @param normalise: should vectors be normalised to length 1
        @type normalise: boolean
        @param svd_dimensions: number of dimensions to use in reducing vector
            dimensionality with SVD
        @type svd_dimensions: int
        """
        VectorSpace.__init__(self, normalise, svd_dimensions)
        self._means = numpy.array(initial_means, numpy.float64)
        self._num_clusters = len(initial_means)
        self._conv_threshold = conv_threshold
        self._covariance_matrices = covariance_matrices
        self._priors = priors
        self._bias = bias

    def num_clusters(self):
        return self._num_clusters

    def cluster_vectorspace(self, vectors, trace=False):
        assert len(vectors) > 0

        # set the parameters to initial values
        dimensions = len(vectors[0])
        means = self._means
        priors = self._priors
        if priors is None:
            priors = self._priors = numpy.ones(self._num_clusters,
                                        numpy.float64) / self._num_clusters
        covariances = self._covariance_matrices
        if covariances is None:
            covariances = self._covariance_matrices = \
                [ numpy.identity(dimensions, numpy.float64)
                  for i in range(self._num_clusters) ]

        # do the E and M steps until the likelihood plateaus
        lastl = self._loglikelihood(vectors, priors, means, covariances)
        converged = False

        while not converged:
            if trace: print 'iteration; loglikelihood', lastl
            # E-step, calculate hidden variables, h[i,j]
            h = numpy.zeros((len(vectors), self._num_clusters),
                            numpy.float64)
            for i in range(len(vectors)):
                for j in range(self._num_clusters):
                    h[i,j] = priors[j] * self._gaussian(means[j],
                                             covariances[j], vectors[i])
                h[i,:] /= sum(h[i,:])

            # M-step, update parameters - cvm, p, mean
            for j in range(self._num_clusters):
                covariance_before = covariances[j]
                new_covariance = numpy.zeros((dimensions, dimensions),
                                             numpy.float64)
                new_mean = numpy.zeros(dimensions, numpy.float64)
                sum_hj = 0.0
                for i in range(len(vectors)):
                    delta = vectors[i] - means[j]
                    new_covariance += h[i,j] * \
                        numpy.multiply.outer(delta, delta)
                    sum_hj += h[i,j]
                    new_mean += h[i,j] * vectors[i]
                covariances[j] = new_covariance / sum_hj
                means[j] = new_mean / sum_hj
                priors[j] = sum_hj / len(vectors)

                # bias term to stop covariance matrix being singular
                covariances[j] += self._bias * \
                    numpy.identity(dimensions, numpy.float64)

            # calculate likelihood - FIXME: may be broken
            l = self._loglikelihood(vectors, priors, means, covariances)

            # check for convergence
            if abs(lastl - l) < self._conv_threshold:
                converged = True
            lastl = l

    def classify_vectorspace(self, vector):
        best = None
        for j in range(self._num_clusters):
            p = self._priors[j] * self._gaussian(self._means[j],
                                      self._covariance_matrices[j], vector)
            if not best or p > best[0]:
                best = (p, j)
        return best[1]

    def likelihood_vectorspace(self, vector, cluster):
        cid = self.cluster_names().index(cluster)
        return self._priors[cid] * self._gaussian(self._means[cid],
                   self._covariance_matrices[cid], vector)

    def _gaussian(self, mean, cvm, x):
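        # Multivariate normal density evaluated at x:
        #   N(x | mean, cvm) = (2*pi)^(-m/2) * det(cvm)^(-1/2)
        #                      * exp(-0.5 * (x - mean)^T cvm^{-1} (x - mean))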
        m = len(mean)
        assert cvm.shape == (m, m), \
            'bad sized covariance matrix, %s' % str(cvm.shape)
        try:
            det = numpy.linalg.det(cvm)
            inv = numpy.linalg.inv(cvm)
            a = det ** -0.5 * (2 * numpy.pi) ** (-m / 2.0)
            dx = x - mean
            b = -0.5 * numpy.dot( numpy.dot(dx, inv), dx)
            return a * numpy.exp(b)
        except OverflowError:
            # happens when the exponent is negative infinity - i.e. b = 0
            # i.e. the inverse of cvm is huge (cvm is almost zero)
            return 0

    def _loglikelihood(self, vectors, priors, means, covariances):
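        # Log-likelihood of the data under the current mixture parameters:
        #   log L = sum_i log( sum_j P(j) * N(x_i | mu_j, Sigma_j) )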
        llh = 0.0
        for vector in vectors:
            p = 0
            for j in range(len(priors)):
                p += priors[j] * \
                    self._gaussian(means[j], covariances[j], vector)
            llh += numpy.log(p)
        return llh

    def __repr__(self):
        return '<EM Clusterer means=%s>' % list(self._means)

def demo():
170 """ 171 Non-interactive demonstration of the clusterers with simple 2-D data. 172 """ 173 174 from nltk import cluster 175 176 # example from figure 14.10, page 519, Manning and Schutze 177 178 vectors = [numpy.array(f) for f in [[0.5, 0.5], [1.5, 0.5], [1, 3]]] 179 means = [[4, 2], [4, 2.01]] 180 181 clusterer = cluster.EM(means, bias=0.1) 182 clusters = clusterer.cluster(vectors, True, trace=True) 183 184 print 'Clustered:', vectors 185 print 'As: ', clusters 186 print 187 188 for c in range(2): 189 print 'Cluster:', c 190 print 'Prior: ', clusterer._priors[c] 191 print 'Mean: ', clusterer._means[c] 192 print 'Covar: ', clusterer._covariance_matrices[c] 193 print 194 195 # classify a new vector 196 vector = numpy.array([2, 2]) 197 print 'classify(%s):' % vector, 198 print clusterer.classify(vector) 199 200 # show the classification probabilities 201 vector = numpy.array([2, 2]) 202 print 'classification_probdist(%s):' % vector 203 pdist = clusterer.classification_probdist(vector) 204 for sample in pdist: 205 print '%s => %.0f%%' % (sample, 206 pdist.prob(sample) *100)

#
# The following demo code is broken.
#
#     # use a set of tokens with 2D indices
#     vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

#     # test the EM clusterer with means given by k-means (2) and
#     # dimensionality reduction
#     clusterer = cluster.KMeans(2, euclidean_distance, svd_dimensions=1)
#     print 'Clusterer:', clusterer
#     clusters = clusterer.cluster(vectors)
#     means = clusterer.means()
#     print 'Means:', clusterer.means()
#     print

#     clusterer = cluster.EM(means, svd_dimensions=1)
#     clusters = clusterer.cluster(vectors, True)
#     print 'Clusterer:', clusterer
#     print 'Clustered:', str(vectors)[:60], '...'
#     print 'As:', str(clusters)[:60], '...'
#     print

#     # classify a new vector
#     vector = numpy.array([3, 3])
#     print 'classify(%s):' % vector,
#     print clusterer.classify(vector)
#     print

#     # show the classification probabilities
#     vector = numpy.array([2.2, 2])
#     print 'classification_probdist(%s)' % vector
#     pdist = clusterer.classification_probdist(vector)
#     for sample in pdist:
#         print '%s => %.0f%%' % (sample, pdist.prob(sample) * 100)

if __name__ == '__main__':
    demo()