1
2
3
4
5
6
7
8
9 import numpy
10
11 from api import *
12 from util import *
13
15 """
16 The GAAC clusterer starts with each of the N vectors as singleton
17 clusters. It then iteratively merges pairs of clusters which have the
18 closest centroids. This continues until there is only one cluster. The
19 order of merges gives rise to a dendogram: a tree with the earlier merges
20 lower than later merges. The membership of a given number of clusters c, 1
21 <= c <= N, can be found by cutting the dendogram at depth c.
22
23 This clusterer uses the cosine similarity metric only, which allows for
24 efficient speed-up in the clustering process.
25 """
26
27 - def __init__(self, num_clusters=1, normalise=True, svd_dimensions=None):
32
33 - def cluster(self, vectors, assign_clusters=False, trace=False):
38
40
41 clusters = [[vector] for vector in vectors]
42
43
44 vector_sum = copy.copy(vectors)
45
46 while len(clusters) > max(self._num_clusters, 1):
47
48
49 best = None
50 for i in range(len(clusters)):
51 for j in range(i + 1, len(clusters)):
52 sim = self._average_similarity(
53 vector_sum[i], len(clusters[i]),
54 vector_sum[j], len(clusters[j]))
55 if not best or sim > best[0]:
56 best = (sim, i, j)
57
58
59 i, j = best[1:]
60 sum = clusters[i] + clusters[j]
61 if trace: print 'merging %d and %d' % (i, j)
62
63 clusters[i] = sum
64 del clusters[j]
65 vector_sum[i] = vector_sum[i] + vector_sum[j]
66 del vector_sum[j]
67
68 self._dendogram.merge(i, j)
69
70 self.update_clusters(self._num_clusters)
71
89
91 best = None
92 for i in range(self._num_clusters):
93 centroid = self._centroids[i]
94 sim = self._average_similarity(vector, 1, centroid, 1)
95 if not best or sim > best[0]:
96 best = (sim, i)
97 return best[1]
98
100 """
101 @return: The dendogram representing the current clustering
102 @rtype: Dendogram
103 """
104 return self._dendogram
105
107 return self._num_clusters
108
113
115 return '<GroupAverageAgglomerative Clusterer n=%d>' % self._num_clusters
116
118 """
119 Non-interactive demonstration of the clusterers with simple 2-D data.
120 """
121
122 from nltk import cluster
123
124
125 vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
126
127
128 clusterer = cluster.GroupAverageAgglomerative(4)
129 clusters = clusterer.cluster(vectors, True)
130
131 print 'Clusterer:', clusterer
132 print 'Clustered:', vectors
133 print 'As:', clusters
134 print
135
136
137 clusterer.dendogram().show()
138
139
140 vector = numpy.array([3, 3])
141 print 'classify(%s):' % vector,
142 print clusterer.classify(vector)
143 print
144
145
146 if __name__ == '__main__':
147 demo()
148