1
2
3
4
5
6
7
8
9 import copy
10 import sys
11 import math
12 import numpy
13
14 from api import *
15
17 """
18 Abstract clusterer which takes tokens and maps them into a vector space.
19 Optionally performs singular value decomposition to reduce the
20 dimensionality.
21 """
22 - def __init__(self, normalise=False, svd_dimensions=None):
23 """
24 @param normalise: should vectors be normalised to length 1
25 @type normalise: boolean
26 @param svd_dimensions: number of dimensions to use in reducing vector
27 dimensionsionality with SVD
28 @type svd_dimensions: int
29 """
30 self._Tt = None
31 self._should_normalise = normalise
32 self._svd_dimensions = svd_dimensions
33
34 - def cluster(self, vectors, assign_clusters=False, trace=False):
35 assert len(vectors) > 0
36
37
38 if self._should_normalise:
39 vectors = map(self._normalise, vectors)
40
41
42 if self._svd_dimensions and self._svd_dimensions < len(vectors[0]):
43 [u, d, vt] = numpy.linalg.svd(numpy.transpose(array(vectors)))
44 S = d[:self._svd_dimensions] * \
45 numpy.identity(self._svd_dimensions, numpy.Float64)
46 T = u[:,:self._svd_dimensions]
47 Dt = vt[:self._svd_dimensions,:]
48 vectors = numpy.transpose(numpy.matrixmultiply(S, Dt))
49 self._Tt = numpy.transpose(T)
50
51
52 self.cluster_vectorspace(vectors, trace)
53
54
55 if assign_clusters:
56 print self._Tt, vectors
57 return [self.classify(vector) for vector in vectors]
58
60 """
61 Finds the clusters using the given set of vectors.
62 """
63 raise AssertionError()
64
72
74 """
75 Returns the index of the appropriate cluster for the vector.
76 """
77 raise AssertionError()
78
85
87 """
88 Returns the likelihood of the vector belonging to the cluster.
89 """
90 predicted = self.classify_vectorspace(vector)
91 if cluster == predicted: return 1.0
92 else: return 0.0
93
95 """
96 Returns the vector after normalisation and dimensionality reduction
97 """
98 if self._should_normalise:
99 vector = self._normalise(vector)
100 if self._Tt != None:
101 vector = numpy.matrixmultiply(self._Tt, vector)
102 return vector
103
109
111 """
112 Returns the euclidean distance between vectors u and v. This is equivalent
113 to the length of the vector (u - v).
114 """
115 diff = u - v
116 return math.sqrt(numpy.dot(diff, diff))
117
119 """
120 Returns the cosine of the angle between vectors v and u. This is equal to
121 u.v / |u||v|.
122 """
123 return numpy.dot(u, v) / (math.sqrt(numpy.dot(u, u)) * math.sqrt(numpy.dot(v, v)))
124
126 """ Tree node of a dendogram. """
127
129 self._value = value
130 self._children = children
131
132 - def leaves(self, values=True):
142
163
165 """
166 Represents a dendogram, a tree with a specified branching order. This
167 must be initialised with the leaf items, then iteratively call merge for
168 each branch. This class constructs a tree representing the order of calls
169 to the merge function.
170 """
171
173 """
174 @param items: the items at the leaves of the dendogram
175 @type items: sequence of (any)
176 """
177 self._items = [_DendogramNode(item) for item in items]
178 self._original_items = copy.copy(self._items)
179 self._merge = 1
180
181 - def merge(self, *indices):
182 """
183 Merges nodes at given indices in the dendogram. The nodes will be
184 combined which then replaces the first node specified. All other nodes
185 involved in the merge will be removed.
186
187 @param indices: indices of the items to merge (at least two)
188 @type indices: seq of int
189 """
190 assert len(indices) >= 2
191 node = _DendogramNode(self._merge, *[self._items[i] for i in indices])
192 self._merge += 1
193 self._items[indices[0]] = node
194 for i in indices[1:]:
195 del self._items[i]
196
198 """
199 Finds the n-groups of items (leaves) reachable from a cut at depth n.
200 @param n: number of groups
201 @type n: int
202 """
203 if len(self._items) > 1:
204 root = _DendogramNode(self._merge, *self._items)
205 else:
206 root = self._items[0]
207 return root.groups(n)
208
210 """
211 Print the dendogram in ASCII art to standard out.
212 """
213
214
215 JOIN, HLINK, VLINK = '+', '-', '|'
216
217
218 if len(self._items) > 1:
219 root = _DendogramNode(self._merge, *self._items)
220 else:
221 root = self._items[0]
222 leaves = self._original_items
223
224
225 last_row = [str(leaf._value) for leaf in leaves]
226 width = max(map(len, last_row)) + 1
227 lhalf = width / 2
228 rhalf = width - lhalf - 1
229
230
231 def format(centre, left=' ', right=' '):
232 return '%s%s%s' % (lhalf*left, centre, right*rhalf)
233 def display(str):
234 sys.stdout.write(str)
235
236
237 queue = [(root._value, root)]
238 verticals = [ format(' ') for leaf in leaves ]
239 while queue:
240 priority, node = queue.pop()
241 child_left_leaf = map(lambda c: c.leaves(False)[0], node._children)
242 indices = map(leaves.index, child_left_leaf)
243 if child_left_leaf:
244 min_idx = min(indices)
245 max_idx = max(indices)
246 for i in range(len(leaves)):
247 if leaves[i] in child_left_leaf:
248 if i == min_idx: display(format(JOIN, ' ', HLINK))
249 elif i == max_idx: display(format(JOIN, HLINK, ' '))
250 else: display(format(JOIN, HLINK, HLINK))
251 verticals[i] = format(VLINK)
252 elif min_idx <= i <= max_idx:
253 display(format(HLINK, HLINK, HLINK))
254 else:
255 display(verticals[i])
256 display('\n')
257 for child in node._children:
258 if child._children:
259 queue.append((child._value, child))
260 queue.sort()
261
262 for vertical in verticals:
263 display(vertical)
264 display('\n')
265
266
267 display(''.join(item.center(width) for item in last_row))
268 display('\n')
269
277