
Source Code for Module nltk.cluster.kmeans

# Natural Language Toolkit: K-Means Clusterer
#
# Copyright (C) 2001-2008 NLTK Project
# Author: Trevor Cohn <[email protected]>
# Porting: Steven Bird <[email protected]>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT

import copy
import numpy
import random

from api import *
from util import *
class KMeans(VectorSpace):
    """
    The K-means clusterer starts with k arbitrarily chosen means, then
    allocates each vector to the cluster with the closest mean. It then
    recalculates the means of each cluster as the centroid of the vectors
    in the cluster. This process repeats until the cluster memberships
    stabilise. This is a hill-climbing algorithm which may converge to a
    local optimum. Hence the clustering is often repeated with random
    initial means and the most commonly occurring output means are chosen.
    """
    def __init__(self, num_means, distance, repeats=1,
                 conv_test=1e-6, initial_means=None,
                 normalise=False, svd_dimensions=None,
                 rng=None):
        """
        @param num_means: the number of means to use (may use fewer)
        @type num_means: int
        @param distance: measure of distance between two vectors
        @type distance: function taking two vectors and returning a float
        @param repeats: number of randomised clustering trials to use
        @type repeats: int
        @param conv_test: maximum variation in mean differences before
                          deemed convergent
        @type conv_test: number
        @param initial_means: set of k initial means
        @type initial_means: sequence of vectors
        @param normalise: should vectors be normalised to length 1
        @type normalise: boolean
        @param svd_dimensions: number of dimensions to use in reducing vector
                               dimensionality with SVD
        @type svd_dimensions: int
        @param rng: random number generator (or None)
        @type rng: Random
        """
        VectorSpace.__init__(self, normalise, svd_dimensions)
        self._num_means = num_means
        self._distance = distance
        self._max_difference = conv_test
        assert not initial_means or len(initial_means) == num_means
        self._means = initial_means
        assert repeats >= 1
        assert not (initial_means and repeats > 1)
        self._repeats = repeats
        if rng:
            self._rng = rng
        else:
            self._rng = random.Random()

    def cluster_vectorspace(self, vectors, trace=False):
        if self._means and self._repeats > 1:
            print 'Warning: means will be discarded for subsequent trials'

        meanss = []
        for trial in range(self._repeats):
            if trace: print 'k-means trial', trial
            # resample random initial means for every trial after the first
            if not self._means or trial > 0:
                self._means = self._rng.sample(vectors, self._num_means)
            self._cluster_vectorspace(vectors, trace)
            meanss.append(self._means)

        if len(meanss) > 1:
            # sort the means first (so that different cluster numbering won't
            # affect the distance comparison)
            for means in meanss:
                means.sort(cmp = _vector_compare)

            # find the set of means that's minimally different from the others
            min_difference = min_means = None
            for i in range(len(meanss)):
                d = 0
                for j in range(len(meanss)):
                    if i != j:
                        d += self._sum_distances(meanss[i], meanss[j])
                if min_difference is None or d < min_difference:
                    min_difference, min_means = d, meanss[i]

            # use the best means
            self._means = min_means
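
    # Note: with repeats=3, for instance, meanss holds three converged sets
    # of means A, B and C; the set kept is the one minimising its summed
    # distance to the others, i.e. the i minimising
    # sum(self._sum_distances(meanss[i], meanss[j]) for all j != i).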

    def _cluster_vectorspace(self, vectors, trace=False):
        if self._num_means < len(vectors):
            # perform k-means clustering
            converged = False
            while not converged:
                # assign the tokens to clusters based on minimum distance to
                # the cluster means
                clusters = [[] for m in range(self._num_means)]
                for vector in vectors:
                    index = self.classify_vectorspace(vector)
                    clusters[index].append(vector)

                if trace: print 'iteration'
                #for i in range(self._num_means):
                    #print '  mean', i, 'allocated', len(clusters[i]), 'vectors'

                # recalculate cluster means by computing the centroid of each cluster
                new_means = map(self._centroid, clusters)

                # measure the degree of change from the previous step for convergence
                difference = self._sum_distances(self._means, new_means)
                if difference < self._max_difference:
                    converged = True

                # remember the new means
                self._means = new_means
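
    # Note: "converged" here means the total movement of the means between
    # iterations (as measured by _sum_distances) has dropped below conv_test;
    # with the default conv_test=1e-6 the means have essentially stopped
    # moving. An empty cluster would trip the assertion in _centroid, a
    # limitation of this simple implementation.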

    def classify_vectorspace(self, vector):
        # finds the closest cluster centroid
        # returns that cluster's index
        best_distance = best_index = None
        for index in range(len(self._means)):
            mean = self._means[index]
            dist = self._distance(vector, mean)
            if best_distance is None or dist < best_distance:
                best_index, best_distance = index, dist
        return best_index
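
    # For example, with means [[0, 0], [10, 10]] and euclidean distance,
    # classify_vectorspace(numpy.array([1, 2])) would return 0, since
    # [1, 2] lies closer to [0, 0] than to [10, 10].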

    def num_clusters(self):
        if self._means:
            return len(self._means)
        else:
            return self._num_means

    def means(self):
138 """ 139 The means used for clustering. 140 """ 141 return self._means

    def _sum_distances(self, vectors1, vectors2):
        difference = 0.0
        for u, v in zip(vectors1, vectors2):
            difference += self._distance(u, v)
        return difference

    def _centroid(self, cluster):
        assert len(cluster) > 0
        centroid = copy.copy(cluster[0])
        for vector in cluster[1:]:
            centroid += vector
        return centroid / float(len(cluster))
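
    # For example, the centroid of [array([2, 1]), array([1, 3]), array([3, 2])]
    # is (array([2, 1]) + array([1, 3]) + array([3, 2])) / 3.0 = array([2., 2.]).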

    def __repr__(self):
        return '<KMeans Clusterer means=%s repeats=%d>' % \
               (self._means, self._repeats)

def _vector_compare(x, y):
    xs, ys = sum(x), sum(y)
    if xs < ys: return -1
    elif xs > ys: return 1
    else: return 0

#################################################################################

def demo():
    # example from figure 14.9, page 517, Manning and Schutze

    from nltk import cluster

    vectors = [numpy.array(f) for f in [[2, 1], [1, 3], [4, 7], [6, 7]]]
    means = [[4, 3], [5, 5]]

    clusterer = cluster.KMeans(2, euclidean_distance, initial_means=means)
    clusters = clusterer.cluster(vectors, True, trace=True)

    print 'Clustered:', vectors
    print 'As:', clusters
    print 'Means:', clusterer.means()
    print

    vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

    # test k-means using the euclidean distance metric, 2 means and repeat
    # clustering 10 times with random seeds
    clusterer = cluster.KMeans(2, euclidean_distance, repeats=10)
    clusters = clusterer.cluster(vectors, True)
    print 'Clustered:', vectors
    print 'As:', clusters
    print 'Means:', clusterer.means()
    print

    # classify a new vector
    vector = numpy.array([3, 3])
    print 'classify(%s):' % vector,
    print clusterer.classify(vector)
    print

if __name__ == '__main__':
    demo()