Package nltk :: Package cluster :: Module util
[hide private]
[frames] | no frames]

Source Code for Module nltk.cluster.util

  1  # Natural Language Toolkit: Clusterer Utilities 
  2  # 
  3  # Copyright (C) 2001-2008 NLTK Project 
  4  # Author: Trevor Cohn <[email protected]> 
  5  # Porting: Steven Bird <[email protected]> 
  6  # URL: <http://nltk.org> 
  7  # For license information, see LICENSE.TXT 
  8   
  9  import copy 
 10  import sys 
 11  import math 
 12  import numpy 
 13   
 14  from api import * 
 15   
16 -class VectorSpace(ClusterI):
17 """ 18 Abstract clusterer which takes tokens and maps them into a vector space. 19 Optionally performs singular value decomposition to reduce the 20 dimensionality. 21 """
22 - def __init__(self, normalise=False, svd_dimensions=None):
23 """ 24 @param normalise: should vectors be normalised to length 1 25 @type normalise: boolean 26 @param svd_dimensions: number of dimensions to use in reducing vector 27 dimensionsionality with SVD 28 @type svd_dimensions: int 29 """ 30 self._Tt = None 31 self._should_normalise = normalise 32 self._svd_dimensions = svd_dimensions
33
34 - def cluster(self, vectors, assign_clusters=False, trace=False):
35 assert len(vectors) > 0 36 37 # normalise the vectors 38 if self._should_normalise: 39 vectors = map(self._normalise, vectors) 40 41 # use SVD to reduce the dimensionality 42 if self._svd_dimensions and self._svd_dimensions < len(vectors[0]): 43 [u, d, vt] = numpy.linalg.svd(numpy.transpose(array(vectors))) 44 S = d[:self._svd_dimensions] * \ 45 numpy.identity(self._svd_dimensions, numpy.Float64) 46 T = u[:,:self._svd_dimensions] 47 Dt = vt[:self._svd_dimensions,:] 48 vectors = numpy.transpose(numpy.matrixmultiply(S, Dt)) 49 self._Tt = numpy.transpose(T) 50 51 # call abstract method to cluster the vectors 52 self.cluster_vectorspace(vectors, trace) 53 54 # assign the vectors to clusters 55 if assign_clusters: 56 print self._Tt, vectors 57 return [self.classify(vector) for vector in vectors]
58
59 - def cluster_vectorspace(self, vectors, trace):
60 """ 61 Finds the clusters using the given set of vectors. 62 """ 63 raise AssertionError()
64
65 - def classify(self, vector):
66 if self._should_normalise: 67 vector = self._normalise(vector) 68 if self._Tt != None: 69 vector = numpy.matrixmultiply(self._Tt, vector) 70 cluster = self.classify_vectorspace(vector) 71 return self.cluster_name(cluster)
72
73 - def classify_vectorspace(self, vector):
74 """ 75 Returns the index of the appropriate cluster for the vector. 76 """ 77 raise AssertionError()
78
79 - def likelihood(self, vector, label):
80 if self._should_normalise: 81 vector = self._normalise(vector) 82 if self._Tt != None: 83 vector = numpy.matrixmultiply(self._Tt, vector) 84 return self.likelihood_vectorspace(vector, label)
85
86 - def likelihood_vectorspace(self, vector, cluster):
87 """ 88 Returns the likelihood of the vector belonging to the cluster. 89 """ 90 predicted = self.classify_vectorspace(vector) 91 if cluster == predicted: return 1.0 92 else: return 0.0
93
94 - def vector(self, vector):
95 """ 96 Returns the vector after normalisation and dimensionality reduction 97 """ 98 if self._should_normalise: 99 vector = self._normalise(vector) 100 if self._Tt != None: 101 vector = numpy.matrixmultiply(self._Tt, vector) 102 return vector
103
104 - def _normalise(self, vector):
105 """ 106 Normalises the vector to unit length. 107 """ 108 return vector / math.sqrt(numpy.dot(vector, vector))
109
110 -def euclidean_distance(u, v):
111 """ 112 Returns the euclidean distance between vectors u and v. This is equivalent 113 to the length of the vector (u - v). 114 """ 115 diff = u - v 116 return math.sqrt(numpy.dot(diff, diff))
117
118 -def cosine_distance(u, v):
119 """ 120 Returns the cosine of the angle between vectors v and u. This is equal to 121 u.v / |u||v|. 122 """ 123 return numpy.dot(u, v) / (math.sqrt(numpy.dot(u, u)) * math.sqrt(numpy.dot(v, v)))
124
125 -class _DendogramNode:
126 """ Tree node of a dendogram. """ 127
128 - def __init__(self, value, *children):
129 self._value = value 130 self._children = children
131
132 - def leaves(self, values=True):
133 if self._children: 134 leaves = [] 135 for child in self._children: 136 leaves.extend(child.leaves(values)) 137 return leaves 138 elif values: 139 return [self._value] 140 else: 141 return [self]
142
143 - def groups(self, n):
144 queue = [(self._value, self)] 145 146 while len(queue) < n: 147 priority, node = queue.pop() 148 if not node._children: 149 queue.push((priority, node)) 150 break 151 for child in node._children: 152 if child._children: 153 queue.append((child._value, child)) 154 else: 155 queue.append((0, child)) 156 # makes the earliest merges at the start, latest at the end 157 queue.sort() 158 159 groups = [] 160 for priority, node in queue: 161 groups.append(node.leaves()) 162 return groups
163
164 -class Dendogram:
165 """ 166 Represents a dendogram, a tree with a specified branching order. This 167 must be initialised with the leaf items, then iteratively call merge for 168 each branch. This class constructs a tree representing the order of calls 169 to the merge function. 170 """ 171
172 - def __init__(self, items=[]):
173 """ 174 @param items: the items at the leaves of the dendogram 175 @type items: sequence of (any) 176 """ 177 self._items = [_DendogramNode(item) for item in items] 178 self._original_items = copy.copy(self._items) 179 self._merge = 1
180
181 - def merge(self, *indices):
182 """ 183 Merges nodes at given indices in the dendogram. The nodes will be 184 combined which then replaces the first node specified. All other nodes 185 involved in the merge will be removed. 186 187 @param indices: indices of the items to merge (at least two) 188 @type indices: seq of int 189 """ 190 assert len(indices) >= 2 191 node = _DendogramNode(self._merge, *[self._items[i] for i in indices]) 192 self._merge += 1 193 self._items[indices[0]] = node 194 for i in indices[1:]: 195 del self._items[i]
196
197 - def groups(self, n):
198 """ 199 Finds the n-groups of items (leaves) reachable from a cut at depth n. 200 @param n: number of groups 201 @type n: int 202 """ 203 if len(self._items) > 1: 204 root = _DendogramNode(self._merge, *self._items) 205 else: 206 root = self._items[0] 207 return root.groups(n)
208
209 - def show(self):
210 """ 211 Print the dendogram in ASCII art to standard out. 212 """ 213 214 # ASCII rendering characters 215 JOIN, HLINK, VLINK = '+', '-', '|' 216 217 # find the root (or create one) 218 if len(self._items) > 1: 219 root = _DendogramNode(self._merge, *self._items) 220 else: 221 root = self._items[0] 222 leaves = self._original_items 223 224 # find the bottom row and the best cell width 225 last_row = [str(leaf._value) for leaf in leaves] 226 width = max(map(len, last_row)) + 1 227 lhalf = width / 2 228 rhalf = width - lhalf - 1 229 230 # display functions 231 def format(centre, left=' ', right=' '): 232 return '%s%s%s' % (lhalf*left, centre, right*rhalf)
233 def display(str): 234 sys.stdout.write(str)
235 236 # for each merge, top down 237 queue = [(root._value, root)] 238 verticals = [ format(' ') for leaf in leaves ] 239 while queue: 240 priority, node = queue.pop() 241 child_left_leaf = map(lambda c: c.leaves(False)[0], node._children) 242 indices = map(leaves.index, child_left_leaf) 243 if child_left_leaf: 244 min_idx = min(indices) 245 max_idx = max(indices) 246 for i in range(len(leaves)): 247 if leaves[i] in child_left_leaf: 248 if i == min_idx: display(format(JOIN, ' ', HLINK)) 249 elif i == max_idx: display(format(JOIN, HLINK, ' ')) 250 else: display(format(JOIN, HLINK, HLINK)) 251 verticals[i] = format(VLINK) 252 elif min_idx <= i <= max_idx: 253 display(format(HLINK, HLINK, HLINK)) 254 else: 255 display(verticals[i]) 256 display('\n') 257 for child in node._children: 258 if child._children: 259 queue.append((child._value, child)) 260 queue.sort() 261 262 for vertical in verticals: 263 display(vertical) 264 display('\n') 265 266 # finally, display the last line 267 display(''.join(item.center(width) for item in last_row)) 268 display('\n') 269
270 - def __repr__(self):
271 if len(self._items) > 1: 272 root = _DendogramNode(self._merge, *self._items) 273 else: 274 root = self._items[0] 275 leaves = root.leaves(False) 276 return '<Dendogram with %d leaves>' % len(leaves)
277