1
2
3
4
5
6
7
8
9 import numpy
10 import random
11
12 from api import *
13 from util import *
14
16 """
17 The K-means clusterer starts with k arbitrary chosen means then allocates
18 each vector to the cluster with the closest mean. It then recalculates the
19 means of each cluster as the centroid of the vectors in the cluster. This
20 process repeats until the cluster memberships stabilise. This is a
21 hill-climbing algorithm which may converge to a local maximum. Hence the
22 clustering is often repeated with random initial means and the most
23 commonly occurring output means are chosen.
24 """
25
def __init__(self, num_means, distance, repeats=1,
             conv_test=1e-6, initial_means=None,
             normalise=False, svd_dimensions=None,
             rng=None):
    """
    @param num_means: the number of means to use (may use fewer)
    @type num_means: int
    @param distance: measure of distance between two vectors
    @type distance: function taking two vectors and returning a float
    @param repeats: number of randomised clustering trials to use
    @type repeats: int
    @param conv_test: maximum variation in mean differences before
                      deemed convergent
    @type conv_test: number
    @param initial_means: set of k initial means
    @type initial_means: sequence of vectors
    @param normalise: should vectors be normalised to length 1
    @type normalise: boolean
    @param svd_dimensions: number of dimensions to use in reducing vector
                           dimensionality with SVD
    @type svd_dimensions: int
    @param rng: random number generator (or None)
    @type rng: Random
    """
    VectorSpace.__init__(self, normalise, svd_dimensions)
    self._num_means = num_means
    self._distance = distance
    self._max_difference = conv_test
    # initial means, when supplied, must match the requested cluster count
    assert not initial_means or len(initial_means) == num_means
    self._means = initial_means
    assert repeats >= 1
    # fixed initial means would make repeated randomised trials pointless
    assert not (initial_means and repeats > 1)
    self._repeats = repeats
    # fall back to a private RNG when the caller does not supply one
    self._rng = rng if rng else random.Random()
61
def cluster_vectorspace(self, vectors, trace=False):
    """
    Run self._repeats randomised k-means trials over vectors, then keep
    the set of means whose summed distance to the means of all other
    trials is smallest (the most "central" trial outcome).
    """
    if self._means and self._repeats > 1:
        print('Warning: means will be discarded for subsequent trials')

    meanss = []
    for trial in range(self._repeats):
        if trace: print('k-means trial', trial)
        # resample starting means for every trial after the first
        # (the original tested `trial > 1`, which made trial 1 silently
        # reuse trial 0's converged means - an off-by-one)
        if not self._means or trial > 0:
            self._means = self._rng.sample(vectors, self._num_means)
        self._cluster_vectorspace(vectors, trace)
        meanss.append(self._means)

    if len(meanss) > 1:
        # sort each trial's means into a canonical order (by component
        # sum, matching _vector_compare) so that corresponding means
        # line up when trials are compared
        for means in meanss:
            means.sort(key=sum)

        # pick the trial whose means are collectively closest to the
        # means found by every other trial
        min_difference = min_means = None
        for i in range(len(meanss)):
            d = 0
            for j in range(len(meanss)):
                if i != j:
                    d += self._sum_distances(meanss[i], meanss[j])
            if min_difference is None or d < min_difference:
                min_difference, min_means = d, meanss[i]

        self._means = min_means
92
94 if self._num_means < len(vectors):
95
96 converged = False
97 while not converged:
98
99
100 clusters = [[] for m in range(self._num_means)]
101 for vector in vectors:
102 index = self.classify_vectorspace(vector)
103 clusters[index].append(vector)
104
105 if trace: print 'iteration'
106
107
108
109
110 new_means = map(self._centroid, clusters)
111
112
113 difference = self._sum_distances(self._means, new_means)
114 if difference < self._max_difference:
115 converged = True
116
117
118 self._means = new_means
119
def classify_vectorspace(self, vector):
    """
    Return the index of the cluster whose mean is closest to vector,
    measured with self._distance.
    """
    best_distance = best_index = None
    for index, mean in enumerate(self._means):
        dist = self._distance(vector, mean)
        # `is None` rather than `== None`: comparing None to a numpy
        # scalar with == is unreliable
        if best_distance is None or dist < best_distance:
            best_index, best_distance = index, dist
    return best_index
130
def num_clusters(self):
    """
    Return the number of clusters: the actual number of means found,
    or the requested number if clustering has not yet been run.
    """
    if self._means:
        return len(self._means)
    return self._num_means
136
138 """
139 The means used for clustering.
140 """
141 return self._means
142
144 difference = 0.0
145 for u, v in zip(vectors1, vectors2):
146 difference += self._distance(u, v)
147 return difference
148
155
157 return '<KMeans Clusterer means=%s repeats=%d>' % \
158 (self._means, self._repeats)
159
161 xs, ys = sum(x), sum(y)
162 if xs < ys: return -1
163 elif xs > ys: return 1
164 else: return 0
165
166
167
def demo():
    """
    Non-interactive demonstration of the KMeans clusterer on small
    2-d integer vectors.
    """
    from nltk import cluster

    vectors = [numpy.array(f) for f in [[2, 1], [1, 3], [4, 7], [6, 7]]]
    means = [[4, 3], [5, 5]]

    clusterer = cluster.KMeans(2, euclidean_distance, initial_means=means)
    clusters = clusterer.cluster(vectors, True, trace=True)

    print('Clustered:', vectors)
    print('As:', clusters)
    print('Means:', clusterer.means())
    print()

    vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

    # cluster with 2 means, repeating the randomised trials 10 times
    clusterer = cluster.KMeans(2, euclidean_distance, repeats=10)
    clusters = clusterer.cluster(vectors, True)
    print('Clustered:', vectors)
    print('As:', clusters)
    print('Means:', clusterer.means())
    print()

    # classify a new, unseen vector against the learned means
    vector = numpy.array([3, 3])
    print('classify(%s):' % vector, end=' ')
    print(clusterer.classify(vector))
    print()
if __name__ == '__main__':
    demo()
204