Package nltk :: Package wordnet :: Module similarity
[hide private]
[frames] | [no frames]

Source Code for Module nltk.wordnet.similarity

  1  # Natural Language Toolkit: Wordnet Similarity 
  2  # 
  3  # Copyright (C) 2001-2008 NLTK Project 
  4  # Author: Oliver Steele <[email protected]> 
  5  #         David Ormiston Smith <[email protected]>
  6  #         Steven Bird <[email protected]> 
  7  # URL: <http://nltk.org> 
  8  # For license information, see LICENSE.TXT 
  9   
 10  import math 
 11   
 12  from nltk import defaultdict 
 13   
 14  from util import * 
 15   
 16  # Similarity metrics 
 17   
 18  # TODO: Add in the option to manually add a new root node; this will be 
 19  # useful for verb similarity as there exist multiple verb taxonomies. 
 20   
 21  # More information about the metrics is available at 
 22  # http://marimba.d.umn.edu/similarity/measures.html 
 23   
def path_similarity(synset1, synset2, verbose=False):
    """
    Path Distance Similarity:
    Compute a similarity score for two word senses from the length of the
    shortest path linking them in the is-a (hypernym/hyponym) taxonomy.
    Scores fall in the range 0 to 1, where 1 means the senses are
    identical (a sense compared with itself).  When no connecting path
    exists (only possible for verbs, which are split across several
    distinct taxonomies), -1 is returned instead.

    @type synset2: L{Synset}
    @param synset2: The L{Synset} that this L{Synset} is being compared to.

    @return: A score denoting the similarity of the two L{Synset}s,
        normally between 0 and 1.  -1 is returned if no connecting path
        could be found; 1 is returned if a L{Synset} is compared with
        itself.
    """

    hops = synset1.shortest_path_distance(synset2)
    if hops < 0:
        # A negative distance signals that the senses are unconnected.
        return -1
    return 1.0 / (hops + 1)
48
def lch_similarity(synset1, synset2, verbose=False):
    """
    Leacock Chodorow Similarity:
    Return a score denoting how similar two word senses are, based on the
    shortest path that connects the senses (as above) and the maximum depth
    of the taxonomy in which the senses occur. The relationship is given as
    -log(p/2d) where p is the shortest path length and d is the taxonomy depth.

    @type synset2: L{Synset}
    @param synset2: The L{Synset} that this L{Synset} is being compared to.

    @return: A score denoting the similarity of the two L{Synset}s,
        normally greater than 0. -1 is returned if no connecting path
        could be found. If a L{Synset} is compared with itself, the
        maximum score is returned, which varies depending on the taxonomy
        depth.
    @raise TypeError: If synset1 is neither a noun nor a verb synset.
    """

    # Hard-coded maximum depths of the noun and verb is-a taxonomies.
    # NOTE(review): these constants are tied to a particular WordNet
    # release -- confirm they match the database version in use.
    taxonomy_depths = {NOUN: 19, VERB: 13}
    if synset1.pos not in taxonomy_depths:
        raise TypeError("Can only calculate similarity for nouns or verbs")
    depth = taxonomy_depths[synset1.pos]

    distance = synset1.shortest_path_distance(synset2)
    if distance >= 0:
        return -math.log((distance + 1) / (2.0 * depth))
    else:
        # No connecting path between the senses.
        return -1
76
def wup_similarity(synset1, synset2, verbose=False):
    """
    Wu-Palmer Similarity:
    Return a score denoting how similar two word senses are, based on the
    depth of the two senses in the taxonomy and that of their Least Common
    Subsumer (most specific ancestor node). Note that at this time the
    scores given do _not_ always agree with those given by Pedersen's Perl
    implementation of Wordnet Similarity.

    The LCS does not necessarily feature in the shortest path connecting the
    two senses, as it is by definition the common ancestor deepest in the
    taxonomy, not closest to the two senses. Typically, however, it will so
    feature. Where multiple candidates for the LCS exist, that whose
    shortest path to the root node is the longest will be selected. Where
    the LCS has multiple paths to the root, the longer path is used for
    the purposes of the calculation.

    @type synset2: L{Synset}
    @param synset2: The L{Synset} that this L{Synset} is being compared to.
    @return: A float score denoting the similarity of the two L{Synset}s,
        normally greater than zero. If no connecting path between the two
        senses can be found, -1 is returned.
    """

    subsumer = _lcs_by_depth(synset1, synset2, verbose)

    # If no LCS was found return -1 (identity test: `is`, not `==`).
    if subsumer is None:
        return -1

    # Get the longest path from the LCS to the root,
    # including two corrections:
    # - add one because the calculations include both the start and end nodes
    # - add one to non-nouns since they have an imaginary root node
    depth = subsumer.max_depth() + 1
    if subsumer.pos != NOUN:
        depth += 1

    # Get the shortest path from the LCS to each of the synsets it is
    # subsuming.  Add this to the LCS path length to get the path length
    # from each synset to the root.
    len1 = synset1.shortest_path_distance(subsumer) + depth
    len2 = synset2.shortest_path_distance(subsumer) + depth
    return (2.0 * depth) / (len1 + len2)
120
def res_similarity(synset1, synset2, ic, verbose=False):
    """
    Resnik Similarity:
    Score the relatedness of two senses as the Information Content (IC)
    of their Least Common Subsumer (most specific ancestor node).

    @type synset1: L{Synset}
    @param synset1: The first synset being compared
    @type synset2: L{Synset}
    @param synset2: The second synset being compared
    @type ic: C{dict}
    @param ic: an information content object (as returned by L{load_ic()}).
    @return: A float score denoting the similarity of the two L{Synset}s.
        Synsets whose LCS is the root node of the taxonomy will have a
        score of 0 (e.g. N['dog'][0] and N['table'][0]). If no path exists
        between the two synsets a score of -1 is returned.
    """

    # Only the subsumer's IC matters here; the two synsets' own IC
    # values (the first two tuple slots) are discarded.
    return _lcs_ic(synset1, synset2, ic)[2]
142
def jcn_similarity(synset1, synset2, ic, verbose=False):
    """
    Jiang-Conrath Similarity:
    Score the relatedness of two senses using the Information Content (IC)
    of the senses themselves and of their Least Common Subsumer (most
    specific ancestor node), via 1 / (IC(s1) + IC(s2) - 2 * IC(lcs)).

    @type synset1: L{Synset}
    @param synset1: The first synset being compared
    @type synset2: L{Synset}
    @param synset2: The second synset being compared
    @type ic: C{dict}
    @param ic: an information content object (as returned by L{load_ic()}).
    @return: A float score denoting the similarity of the two L{Synset}s.
        If no path exists between the two synsets a score of -1 is returned.
    """

    # Identical senses would zero the denominator, so short-circuit with
    # an infinite score.
    if synset1 == synset2:
        return inf

    ic1, ic2, lcs_ic = _lcs_ic(synset1, synset2, ic)

    # A zero IC means the synset is the root, or was never observed in
    # the corpus (sparse data); treat these as completely dissimilar.
    if ic1 == 0 or ic2 == 0:
        return 0

    ic_difference = ic1 + ic2 - 2 * lcs_ic
    return 1 / ic_difference
172
def lin_similarity(synset1, synset2, ic, verbose=False):
    """
    Lin Similarity:
    Score the relatedness of two senses using the Information Content (IC)
    of the senses themselves and of their Least Common Subsumer (most
    specific ancestor node), via 2 * IC(lcs) / (IC(s1) + IC(s2)).

    @type synset1: L{Synset}
    @param synset1: The first synset being compared
    @type synset2: L{Synset}
    @param synset2: The second synset being compared
    @type ic: C{dict}
    @param ic: an information content object (as returned by L{load_ic()}).
    @return: A float score denoting the similarity of the two L{Synset}s,
        in the range 0 to 1. If no path exists between the two synsets a
        score of -1 is returned.
    """

    sense_ic_1, sense_ic_2, subsumer_ic = _lcs_ic(synset1, synset2, ic)
    return 2.0 * subsumer_ic / (sense_ic_1 + sense_ic_2)
194 195 # Utility functions 196
def common_hypernyms(synset1, synset2):
    """
    Find all synsets that are hypernyms of both input synsets.

    @type synset1: L{Synset}
    @param synset1: First input synset.
    @type synset2: L{Synset}
    @param synset2: Second input synset.
    @return: The synsets that are hypernyms of both synset1 and synset2.
    """

    # Synsets are matched with list membership (equality) rather than
    # set operations, which are unusable here.
    ancestors1 = [synset1] + list(synset1.closure(HYPERNYM))
    ancestors2 = [synset2] + list(synset2.closure(HYPERNYM))
    shared = []
    for candidate in ancestors1:
        if candidate in ancestors2:
            shared.append(candidate)
    return shared
212
213 -def _lcs_by_depth(synset1, synset2, verbose=False):
214 """ 215 Finds the least common subsumer of two synsets in a Wordnet taxonomy, 216 where the least common subsumer is defined as the ancestor node common 217 to both input synsets whose shortest path to the root node is the longest. 218 219 @type synset1: L{Synset} 220 @param synset1: First input synset. 221 @type synset2: L{Synset} 222 @param synset2: Second input synset. 223 @return: The ancestor synset common to both input synsets which is also the LCS. 224 """ 225 subsumer = None 226 max_min_path_length = -1 227 228 subsumers = common_hypernyms(synset1, synset2) 229 230 if verbose: 231 print "> Subsumers1:", subsumers 232 233 # Eliminate those synsets which are ancestors of other synsets in the 234 # set of subsumers. 235 236 eliminated = set() 237 for s1 in subsumers: 238 for s2 in subsumers: 239 if s2 in s1.closure(HYPERNYM): 240 eliminated.add(s2) 241 if verbose: 242 print "> Eliminated:", eliminated 243 244 subsumers = [s for s in subsumers if s not in eliminated] 245 246 if verbose: 247 print "> Subsumers2:", subsumers 248 249 # Calculate the length of the shortest path to the root for each 250 # subsumer. Select the subsumer with the longest of these. 251 252 for candidate in subsumers: 253 254 paths_to_root = candidate.hypernym_paths() 255 min_path_length = -1 256 257 for path in paths_to_root: 258 if min_path_length < 0 or len(path) < min_path_length: 259 min_path_length = len(path) 260 261 if min_path_length > max_min_path_length: 262 max_min_path_length = min_path_length 263 subsumer = candidate 264 265 if verbose: 266 print "> LCS Subsumer by depth:", subsumer 267 return subsumer
268
269 -def _lcs_ic(synset1, synset2, ic, verbose=False):
270 """ 271 Get the information content of the least common subsumer that has 272 the highest information content value. 273 274 @type synset1: L{Synset} 275 @param synset1: First input synset. 276 @type synset2: L{Synset} 277 @param synset2: Second input synset. 278 @type ic: C{dict} 279 @param ic: an information content object (as returned by L{load_ic()}). 280 @return: The information content of the two synsets and their most informative subsumer 281 """ 282 283 pos = synset1.pos 284 ic1 = information_content(synset1, ic) 285 ic2 = information_content(synset2, ic) 286 subsumer_ic = max(information_content(s, ic) for s in common_hypernyms(synset1, synset2)) 287 288 if verbose: 289 print "> LCS Subsumer by content:", subsumer_ic 290 291 return ic1, ic2, subsumer_ic
292 293 # Utility functions 294
def information_content(synset, ic):
    """
    Return the information content of C{synset}: the negative log of the
    synset's relative frequency, -log(freq(synset) / freq(root)), looked
    up in the table for the synset's part of speech.

    @type ic: C{dict}
    @param ic: an information content object (as returned by L{load_ic()}).
    """
    # Index 0 of each per-POS table holds the root frequency count.
    counts = ic[synset.pos]
    return -math.log(counts[synset.offset] / counts[0])
298 299 # this load function would be more efficient if the data was pickled 300 # Note that we can't use NLTK's frequency distributions because 301 # synsets are overlapping (each instance of a synset also counts 302 # as an instance of its hypernyms)
def load_ic(icfile):
    """
    Load an information content file from the wordnet_ic corpus
    and return a dictionary. This dictionary has just two keys,
    NOUN and VERB, whose values are dictionaries that map from
    synsets to information content values.

    @type icfile: L{str}
    @param icfile: The name of the wordnet_ic file (e.g. "ic-brown.dat")
    @return: An information content dictionary
    """
    # The module's top-level imports only bind `defaultdict` from nltk,
    # not the `nltk` name itself, so import the data module locally.
    import nltk.data
    icfile = nltk.data.find('corpora/wordnet_ic/' + icfile)
    ic = {}
    ic[NOUN] = defaultdict(int)
    ic[VERB] = defaultdict(int)
    # Close the file handle explicitly instead of leaking it.
    infile = open(icfile)
    try:
        for num, line in enumerate(infile):
            if num == 0: # skip the header
                continue
            fields = line.split()
            offset = int(fields[0][:-1])
            value = float(fields[1])
            pos = _get_pos(fields[0])
            if num == 1: # store root count
                # NOTE(review): only the first data row is treated as a
                # root count, so a verb-root row later in the file would
                # never populate ic[VERB][0] -- verify against the
                # ic-*.dat file format.
                ic[pos][0] = value
            if value != 0:
                ic[pos][offset] = value
    finally:
        infile.close()
    return ic
330 331 # get the part of speech (NOUN or VERB) from the information content record 332 # (each identifier has a 'n' or 'v' suffix)
333 -def _get_pos(field):
334 if field[-1] == 'n': 335 return NOUN 336 elif field[-1] == 'v': 337 return VERB 338 else: 339 raise ValueError, "Unidentified part of speech in WordNet Information Content file"
340