Package nltk :: Package cluster :: Module gaac
[hide private]
[frames] | no frames]

Source Code for Module nltk.cluster.gaac

  1  # Natural Language Toolkit: Group Average Agglomerative Clusterer 
  2  # 
  3  # Copyright (C) 2001-2008 NLTK Project 
  4  # Author: Trevor Cohn <[email protected]> 
  5  # Porting: Steven Bird <[email protected]> 
  6  # URL: <http://nltk.org> 
  7  # For license information, see LICENSE.TXT 
  8   
  9  import numpy 
 10   
 11  from api import * 
 12  from util import * 
 13   
14 -class GroupAverageAgglomerative(VectorSpace):
15 """ 16 The GAAC clusterer starts with each of the N vectors as singleton 17 clusters. It then iteratively merges pairs of clusters which have the 18 closest centroids. This continues until there is only one cluster. The 19 order of merges gives rise to a dendogram: a tree with the earlier merges 20 lower than later merges. The membership of a given number of clusters c, 1 21 <= c <= N, can be found by cutting the dendogram at depth c. 22 23 This clusterer uses the cosine similarity metric only, which allows for 24 efficient speed-up in the clustering process. 25 """ 26
27 - def __init__(self, num_clusters=1, normalise=True, svd_dimensions=None):
28 VectorSpace.__init__(self, normalise, svd_dimensions) 29 self._num_clusters = num_clusters 30 self._dendogram = None 31 self._groups_values = None
32
33 - def cluster(self, vectors, assign_clusters=False, trace=False):
34 # stores the merge order 35 self._dendogram = Dendogram( 36 [numpy.array(vector, numpy.float64) for vector in vectors]) 37 return VectorSpace.cluster(self, vectors, assign_clusters, trace)
38
39 - def cluster_vectorspace(self, vectors, trace=False):
40 # create a cluster for each vector 41 clusters = [[vector] for vector in vectors] 42 43 # the sum vectors 44 vector_sum = copy.copy(vectors) 45 46 while len(clusters) > max(self._num_clusters, 1): 47 # find the two best candidate clusters to merge, based on their 48 # S(union c_i, c_j) 49 best = None 50 for i in range(len(clusters)): 51 for j in range(i + 1, len(clusters)): 52 sim = self._average_similarity( 53 vector_sum[i], len(clusters[i]), 54 vector_sum[j], len(clusters[j])) 55 if not best or sim > best[0]: 56 best = (sim, i, j) 57 58 # merge them and replace in cluster list 59 i, j = best[1:] 60 sum = clusters[i] + clusters[j] 61 if trace: print 'merging %d and %d' % (i, j) 62 63 clusters[i] = sum 64 del clusters[j] 65 vector_sum[i] = vector_sum[i] + vector_sum[j] 66 del vector_sum[j] 67 68 self._dendogram.merge(i, j) 69 70 self.update_clusters(self._num_clusters)
71
72 - def update_clusters(self, num_clusters):
73 clusters = self._dendogram.groups(num_clusters) 74 self._centroids = [] 75 for cluster in clusters: 76 assert len(cluster) > 0 77 if self._should_normalise: 78 centroid = self._normalise(cluster[0]) 79 else: 80 centroid = numpy.array(cluster[0]) 81 for vector in cluster[1:]: 82 if self._should_normalise: 83 centroid += self._normalise(vector) 84 else: 85 centroid += vector 86 centroid /= float(len(cluster)) 87 self._centroids.append(centroid) 88 self._num_clusters = len(self._centroids)
89
90 - def classify_vectorspace(self, vector):
91 best = None 92 for i in range(self._num_clusters): 93 centroid = self._centroids[i] 94 sim = self._average_similarity(vector, 1, centroid, 1) 95 if not best or sim > best[0]: 96 best = (sim, i) 97 return best[1]
98
99 - def dendogram(self):
100 """ 101 @return: The dendogram representing the current clustering 102 @rtype: Dendogram 103 """ 104 return self._dendogram
105
106 - def num_clusters(self):
107 return self._num_clusters
108
109 - def _average_similarity(self, v1, l1, v2, l2):
110 sum = v1 + v2 111 length = l1 + l2 112 return (numpy.dot(sum, sum) - length) / (length * (length - 1))
113
114 - def __repr__(self):
115 return '<GroupAverageAgglomerative Clusterer n=%d>' % self._num_clusters
116
117 -def demo():
118 """ 119 Non-interactive demonstration of the clusterers with simple 2-D data. 120 """ 121 122 from nltk import cluster 123 124 # use a set of tokens with 2D indices 125 vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]] 126 127 # test the GAAC clusterer with 4 clusters 128 clusterer = cluster.GroupAverageAgglomerative(4) 129 clusters = clusterer.cluster(vectors, True) 130 131 print 'Clusterer:', clusterer 132 print 'Clustered:', vectors 133 print 'As:', clusters 134 print 135 136 # show the dendogram 137 clusterer.dendogram().show() 138 139 # classify a new vector 140 vector = numpy.array([3, 3]) 141 print 'classify(%s):' % vector, 142 print clusterer.classify(vector) 143 print
144 145 146 if __name__ == '__main__': 147 demo() 148