1
2
3
4
5
6
7
8
9
import math

import nltk
from nltk import defaultdict

from util import *
15
16
17
18
19
20
21
22
23
def path_similarity(synset1, synset2, verbose=False):
    """
    Path Distance Similarity:
    Score two word senses by the shortest path connecting them in the
    is-a (hypernym/hyponym) taxonomy: 1.0 / (path_length + 1). Comparing
    a sense with itself therefore yields 1. When no connecting path can
    be found (only possible for verbs, which occupy several distinct
    taxonomies), -1 is returned instead.

    @type synset2: L{Synset}
    @param synset2: The L{Synset} that this L{Synset} is being compared to.
    @return: A similarity score, normally between 0 and 1; -1 when no
        connecting path exists; 1 when a L{Synset} is compared with itself.
    """
    hops = synset1.shortest_path_distance(synset2)
    if hops < 0:
        # Negative distance is the "no path" sentinel.
        return -1
    return 1.0 / (hops + 1)
48
def lch_similarity(synset1, synset2, verbose=False):
    """
    Leacock Chodorow Similarity:
    Return a score denoting how similar two word senses are, based on the
    shortest path that connects the senses (as above) and the maximum depth
    of the taxonomy in which the senses occur. The relationship is given as
    -log(p/2d) where p is the shortest path length and d is the taxonomy depth.

    @type synset2: L{Synset}
    @param synset2: The L{Synset} that this L{Synset} is being compared to.
    @return: A score denoting the similarity of the two L{Synset}s,
        normally greater than 0. -1 is returned if no connecting path
        could be found. If a L{Synset} is compared with itself, the
        maximum score is returned, which varies depending on the taxonomy
        depth.
    @raise TypeError: If synset1 is neither a noun nor a verb.
    """
    # Fixed maximum depths of the noun and verb taxonomies.
    taxonomy_depths = {NOUN: 19, VERB: 13}
    # Membership test on the dict itself; .keys() was redundant.
    if synset1.pos not in taxonomy_depths:
        # Py3-compatible raise (was the Python-2-only comma form).
        raise TypeError("Can only calculate similarity for nouns or verbs")
    depth = taxonomy_depths[synset1.pos]

    distance = synset1.shortest_path_distance(synset2)
    if distance >= 0:
        # (distance + 1) keeps the argument of log strictly positive
        # when the two senses are identical (distance == 0).
        return -math.log((distance + 1) / (2.0 * depth))
    else:
        return -1
76
def wup_similarity(synset1, synset2, verbose=False):
    """
    Wu-Palmer Similarity:
    Score two word senses from the depth of their Least Common Subsumer
    (most specific ancestor node) and the depths of the senses
    themselves: 2 * depth(lcs) / (len1 + len2). Note that at this time
    the scores given do _not_ always agree with those given by Pedersen's
    Perl implementation of Wordnet Similarity.

    The LCS does not necessarily feature in the shortest path connecting
    the two senses, as it is by definition the common ancestor deepest in
    the taxonomy, not closest to the two senses. Where multiple
    candidates for the LCS exist, the one whose shortest path to the root
    node is the longest is selected; where the LCS has multiple paths to
    the root, the longer path is used.

    @type synset2: L{Synset}
    @param synset2: The L{Synset} that this L{Synset} is being compared to.
    @return: A float score denoting the similarity of the two L{Synset}s,
        normally greater than zero. If no connecting path between the two
        senses can be found, -1 is returned.
    """
    subsumer = _lcs_by_depth(synset1, synset2, verbose)
    if subsumer is None:
        # The senses share no ancestor at all.
        return -1

    # Depth counted in nodes rather than edges, hence the +1.
    lcs_depth = subsumer.max_depth() + 1
    if subsumer.pos != NOUN:
        # Non-noun subsumers get one extra level — presumably to account
        # for a virtual root above those taxonomies; TODO confirm.
        lcs_depth += 1

    # Root-path lengths of each sense, routed through the subsumer.
    root_len1 = synset1.shortest_path_distance(subsumer) + lcs_depth
    root_len2 = synset2.shortest_path_distance(subsumer) + lcs_depth
    return (2.0 * lcs_depth) / (root_len1 + root_len2)
120
def res_similarity(synset1, synset2, ic, verbose=False):
    """
    Resnik Similarity:
    Score two word senses by the Information Content (IC) of their Least
    Common Subsumer (most specific ancestor node).

    @type synset1: L{Synset}
    @param synset1: The first synset being compared
    @type synset2: L{Synset}
    @param synset2: The second synset being compared
    @type ic: C{dict}
    @param ic: an information content object (as returned by L{load_ic()}).
    @return: A float score denoting the similarity of the two L{Synset}s.
        Synsets whose LCS is the root node of the taxonomy will have a
        score of 0 (e.g. N['dog'][0] and N['table'][0]). If no path
        exists between the two synsets a score of -1 is returned.
    """
    # Only the subsumer's IC matters here; the per-sense values are unused.
    _sense1_ic, _sense2_ic, subsumer_ic = _lcs_ic(synset1, synset2, ic)
    return subsumer_ic
142
def jcn_similarity(synset1, synset2, ic, verbose=False):
    """
    Jiang-Conrath Similarity:
    Score two word senses by the Information Content (IC) of their Least
    Common Subsumer (most specific ancestor node) and of the two input
    Synsets, via the equation 1 / (IC(s1) + IC(s2) - 2 * IC(lcs)).

    @type synset1: L{Synset}
    @param synset1: The first synset being compared
    @type synset2: L{Synset}
    @param synset2: The second synset being compared
    @type ic: C{dict}
    @param ic: an information content object (as returned by L{load_ic()}).
    @return: A float score denoting the similarity of the two L{Synset}s.
        If no path exists between the two synsets a score of -1 is returned.
    """
    # Identical senses would make the IC distance zero; return the
    # infinity sentinel instead of dividing by zero.
    # (`inf` is expected to arrive via `from util import *` — TODO confirm.)
    if synset1 == synset2:
        return inf

    ic1, ic2, lcs_ic = _lcs_ic(synset1, synset2, ic)

    # A zero IC for either sense short-circuits to a zero score.
    if ic1 == 0 or ic2 == 0:
        return 0

    ic_distance = ic1 + ic2 - 2 * lcs_ic
    return 1 / ic_distance
172
def lin_similarity(synset1, synset2, ic, verbose=False):
    """
    Lin Similarity:
    Score two word senses by the Information Content (IC) of their Least
    Common Subsumer (most specific ancestor node) and of the two input
    Synsets, via the equation 2 * IC(lcs) / (IC(s1) + IC(s2)).

    @type synset1: L{Synset}
    @param synset1: The first synset being compared
    @type synset2: L{Synset}
    @param synset2: The second synset being compared
    @type ic: C{dict}
    @param ic: an information content object (as returned by L{load_ic()}).
    @return: A float score denoting the similarity of the two L{Synset}s,
        in the range 0 to 1. If no path exists between the two synsets a
        score of -1 is returned.
    """
    sense1_ic, sense2_ic, subsumer_ic = _lcs_ic(synset1, synset2, ic)
    return (2.0 * subsumer_ic) / (sense1_ic + sense2_ic)
194
195
196
def common_hypernyms(synset1, synset2):
    """
    Find all synsets that are hypernyms of both input synsets.

    @type synset1: L{Synset}
    @param synset1: First input synset.
    @type synset2: L{Synset}
    @param synset2: Second input synset.
    @return: The synsets that are hypernyms of both synset1 and synset2.
    """
    # Each synset counts as an ancestor of itself, so comparing a sense
    # with one of its own hypernyms still yields a common subsumer.
    ancestors1 = [synset1] + list(synset1.closure(HYPERNYM))
    ancestors2 = [synset2] + list(synset2.closure(HYPERNYM))
    shared = []
    for candidate in ancestors1:
        if candidate in ancestors2:
            shared.append(candidate)
    return shared
212
def _lcs_by_depth(synset1, synset2, verbose=False):
    """
    Finds the least common subsumer of two synsets in a Wordnet taxonomy,
    where the least common subsumer is defined as the ancestor node common
    to both input synsets whose shortest path to the root node is the longest.

    @type synset1: L{Synset}
    @param synset1: First input synset.
    @type synset2: L{Synset}
    @param synset2: Second input synset.
    @type verbose: C{bool}
    @param verbose: If True, print tracing output to stdout.
    @return: The ancestor synset common to both input synsets which is also
        the LCS, or None when they share no hypernym.
    """
    subsumer = None
    max_min_path_length = -1

    subsumers = common_hypernyms(synset1, synset2)

    if verbose:
        # print-as-function with a single %-formatted argument keeps the
        # output identical under both Python 2 and Python 3 (the old
        # `print x, y` statement form was Python-2-only).
        print("> Subsumers1: %s" % (subsumers,))

    # Discard any candidate that is an ancestor of another candidate:
    # only the most specific common ancestors can be the LCS.
    eliminated = set()
    for s1 in subsumers:
        for s2 in subsumers:
            if s2 in s1.closure(HYPERNYM):
                eliminated.add(s2)
    if verbose:
        print("> Eliminated: %s" % (eliminated,))

    subsumers = [s for s in subsumers if s not in eliminated]

    if verbose:
        print("> Subsumers2: %s" % (subsumers,))

    # Among the surviving candidates, keep the one whose *shortest*
    # hypernym path to the root is longest, i.e. the deepest candidate.
    for candidate in subsumers:

        paths_to_root = candidate.hypernym_paths()
        min_path_length = -1

        for path in paths_to_root:
            if min_path_length < 0 or len(path) < min_path_length:
                min_path_length = len(path)

        if min_path_length > max_min_path_length:
            max_min_path_length = min_path_length
            subsumer = candidate

    if verbose:
        print("> LCS Subsumer by depth: %s" % (subsumer,))
    return subsumer
268
def _lcs_ic(synset1, synset2, ic, verbose=False):
    """
    Get the information content of the least common subsumer that has
    the highest information content value.

    @type synset1: L{Synset}
    @param synset1: First input synset.
    @type synset2: L{Synset}
    @param synset2: Second input synset.
    @type ic: C{dict}
    @param ic: an information content object (as returned by L{load_ic()}).
    @type verbose: C{bool}
    @param verbose: If True, print tracing output to stdout.
    @return: The information content of the two synsets and their most
        informative subsumer.
    """
    # (Removed an unused `pos = synset1.pos` local.)
    ic1 = information_content(synset1, ic)
    ic2 = information_content(synset2, ic)
    # NOTE(review): max() raises ValueError if the synsets share no
    # hypernym at all (conceivable for verbs) — no caller guards this;
    # confirm whether that case can occur in practice.
    subsumer_ic = max(information_content(s, ic)
                      for s in common_hypernyms(synset1, synset2))

    if verbose:
        # Py2/py3-compatible print with identical output bytes.
        print("> LCS Subsumer by content: %s" % (subsumer_ic,))

    return ic1, ic2, subsumer_ic
292
293
294
298
299
300
301
302
def load_ic(icfile):
    """
    Load an information content file from the wordnet_ic corpus
    and return a dictionary. This dictionary has just two keys,
    NOUN and VERB, whose values are dictionaries that map from
    synsets to information content values.

    @type icfile: L{str}
    @param icfile: The name of the wordnet_ic file (e.g. "ic-brown.dat")
    @return: An information content dictionary
    """
    icfile = nltk.data.find('corpora/wordnet_ic/' + icfile)
    ic = {}
    ic[NOUN] = defaultdict(int)
    ic[VERB] = defaultdict(int)
    # Close the data file deterministically instead of leaking the handle.
    infile = open(icfile)
    try:
        for num, line in enumerate(infile):
            # Skip the first line of the file.
            if num == 0:
                continue
            fields = line.split()
            offset = int(fields[0][:-1])   # strip the trailing POS letter
            value = float(fields[1])
            pos = _get_pos(fields[0])
            if num == 1:
                # First data record's value is also stored under the
                # pseudo-offset 0 — presumably the taxonomy root's count;
                # TODO confirm against the wordnet_ic file format.
                ic[pos][0] = value
            if value != 0:
                ic[pos][offset] = value
    finally:
        infile.close()
    return ic
330
331
332
334 if field[-1] == 'n':
335 return NOUN
336 elif field[-1] == 'v':
337 return VERB
338 else:
339 raise ValueError, "Unidentified part of speech in WordNet Information Content file"
340