Source Code for Module nltk.wordnet.brown_ic

# Natural Language Toolkit: Brown Corpus Information Content Module
#
# Copyright (C) 2001-2008 NLTK Project
# Author: Oliver Steele <[email protected]>
#         David Ormiston Smith <[email protected]>
#         Steven Bird <[email protected]>
#         Jordan Boyd-Graber <[email protected]>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT

# This module can be used to build information content files.
# However, NLTK's WordNet Similarity code uses pre-built IC
# files obtained from the WordNet Similarity website [1] and
# distributed as part of the NLTK corpus collection.
# [1] http://wn-similarity.sourceforge.net

import pickle
import sys
from itertools import islice

from nltk.corpus import brown
from nltk.probability import FreqDist
from nltk.stem.wordnet import morphy

from util import *
from dictionary import N, V

# Perform a binary search through the list of all compounds. The
# search necessarily accepts partial matches. The search returns
# the compound type ('nc' for noun compound or 'vbc' for verb
# compound) of the matched compound, or False if no match was
# found. Recording the compound type is necessary so that the
# correct frequency distribution can be updated later.

def substr_binary_search(item, data):
    if not data:
        return None
    low = 0
    high = len(data) - 1
    mid = high / 2

    while data[mid].find(item) < 0:
        if mid >= high or mid <= low:
            return False
        elif data[mid] > item:
            high = mid
            mid -= (high - low) / 2
        elif data[mid] < item:
            low = mid
            mid += (high - low) / 2

    return data[mid].split(':')[1]
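
As a rough illustration (using a tiny, made-up compound list; real lists are built by generate_compound_list below and use the same 'term:type' format), the search accepts partial matches such as 'jigsaw' and reports the compound type, or False when nothing matches:

    >>> compounds = ['egg cup:nc', 'jigsaw puzzle:nc', 'run over:vbc']
    >>> substr_binary_search('jigsaw', compounds)
    'nc'
    >>> substr_binary_search('coffee table', compounds)
    False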

def generate_compound_list(filename=None):

    dictionaries = [N, V]
    compound_types = ['nc', 'vbc']

    if filename:
        outfile = open(filename, 'w')
    else:
        outfile = sys.stdout

    for dict, type in zip(dictionaries, compound_types):
        for term in dict:
            term = term.form
            if ' ' in term:
                outfile.write("%s:%s\n" % (term, type))

def read_word_list(filename):
    word_list = [line.rstrip() for line in open(filename, 'r')]
    word_list.sort()
    return word_list

def get_roots(dictionary):
    roots = []
    for word in dictionary:
        for synset in dictionary[word].synsets():
            hypernyms = set(synset[HYPERNYM]) | set(synset[INSTANCE_HYPERNYM])
            if len(hypernyms) == 0: roots.append(synset)
    return roots

def propagate_frequencies(freq_dist, synset, count):
    parents = set()
    for path in synset.hypernym_paths():
        parents = parents | set(map(lambda x: x.offset, path))
    parents = parents - set([synset.offset])
    for i in parents:
        freq_dist.inc(i, count)
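
As a rough sketch of what propagate_frequencies amounts to (hypothetical node names standing in for real synset offsets and hypernym paths): each synset's count is added to every ancestor on its hypernym paths, so shared ancestors accumulate the counts of everything beneath them:

    counts = {'dog': 3, 'cat': 2}
    ancestors = {'dog': ['animal', 'entity'], 'cat': ['animal', 'entity']}
    propagated = {}
    for offset, count in counts.items():
        for parent in ancestors[offset]:
            propagated[parent] = propagated.get(parent, 0) + count
    # propagated == {'animal': 5, 'entity': 5}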

def brown_information_content(output_filename, compounds_filename,
                              stopwords_filename=None, smoothing=False):

    # A list of all the noun and verb parts of speech as recorded in the
    # Brown corpus. Currently many are ignored because the Wordnet morphy()
    # method, which derives the base form of inflected words, can't handle
    # contractions, genitive forms etc.

    noun_tags = [
        'nn',          # N. sing. common (burden)
        # 'nn$',       # N. sing. common, genitive (company's)
        # 'nn+bez',    # N. sing. common + 'to be' pres. 3rd pers sing. (wife's)
        # 'nn+hvd',    # N. sing. common + 'to have' past (James'd)
        # 'nn+hvz',    # N. sing. common + 'to have' pres. 3rd pers sing. (guy's)
        # 'nn+in',     # N. sing. common + prep. (buncha)
        # 'nn+md',     # N. sing. common + modal aux. (sun'll)
        # 'nn+nn',     # N. sing. common, hyphenated pair (stomach-belly)
        'nns',         # N. plu. common (burdens)
        # 'nns$',      # N. plu. common genitive (companies')
        # 'nns+md',    # N. plu. common + modal aux. (cowboys'll)
        'np',          # N. sing. proper (September)
        # 'np$',       # N. sing. proper genitive (William's)
        # 'np+bez',    # N. sing. proper + 'to be' pres. 3rd pers sing. (Rob's)
        # 'np+hvd',    # N. sing. proper + 'to have' past (James'd)
        # 'np+hvz',    # N. sing. proper + 'to have' pres. 3rd pers sing. (Bill's)
        # 'np+md',     # N. sing. proper + modal aux. (John'll)
        'nps',         # N. plu. proper (Catholics)
        # 'nps$',      # N. plu. proper, genitive (Republicans')
        'nr',          # N. sing. adverbial (today, Saturday, East)
        # 'nr$',       # N. sing. adverbial, genitive (Saturday's)
        # 'nr+md',     # N. sing. adverbial + modal aux. (today'll)
        'nrs',         # N. plu. adverbial (Sundays)
        'nc',          # N. compound (jigsaw puzzle, egg cup)
    ]

    verb_tags = [
        'vb',          # V. base: pres. imp. or inf. (find, act)
        # 'vb+at',     # V. base: pres. or inf. + article (wanna)
        # 'vb+in',     # V. base: pres. imp. or inf. + prep. (lookit)
        # 'vb+jj',     # V. base: pres. imp. or inf. + adj. (die-dead)
        # 'vb+ppo',    # V. pres. + pronoun, personal, acc. (let's)
        # 'vb+rp',     # V. imperative + adverbial particle (g'ahn, c'mon)
        # 'vb+vb',     # V. base: pres. imp. or inf. hyphenated pair (say-speak)
        'vbd',         # V. past (said, produced)
        'vbg',         # V. pres. part./gerund (keeping, attending)
        # 'vbg+to',    # V. pres part. + infinitival to (gonna)
        'vbn',         # V. past part. (conducted, adopted)
        # 'vbn+to',    # V. past part. + infinitival to (gotta)
        'vbz',         # V. pres. 3rd pers. sing. (deserves, seeks)
        'vbc'          # V. compound (band together, run over)
    ]

    outfile = open(output_filename, "wb")

    if compounds_filename:
        compounds = read_word_list(compounds_filename)
    else:
        compounds = []

    if stopwords_filename:
        stopwords = read_word_list(stopwords_filename)
    else:
        stopwords = []

    noun_fd = FreqDist()
    verb_fd = FreqDist()

    count = 0
    increment = 1000

    # If smoothing is True perform Laplacian smoothing i.e. add 1 to each
    # synset's frequency count.

    if smoothing:
        for term in N:
            for sense in N[term]:
                if noun_fd[sense.offset] == 0:
                    noun_fd.inc(sense.offset)
        for term in V:
            for sense in V[term]:
                if verb_fd[sense.offset] == 0:
                    verb_fd.inc(sense.offset)

    sys.stdout.write("Building initial frequency distributions...\n")

    for file in brown.files():
        sys.stdout.write("\t%s/%s\n" % (file, brown.files()[-1]))
        sys.stdout.flush()

        for sentence in brown.tagged_sents(file):

            if len(sentence) == 0:
                continue

            # Greedily search for compound nouns/verbs. The search is naive
            # and doesn't account for inflected words within the compound (so
            # variant forms of the compound will not be identified, e.g. the
            # compound 'abdominal cavities' will not be recognised as the
            # plural of 'abdominal cavity'); this is in keeping with the
            # original Perl implementation. Rectifying this is mildly tricky
            # in that some compound constituents are expected to be inflected,
            # e.g. 'abandoned ship', so it isn't possible to simply uninflect
            # each constituent before searching; rather, a list of variant
            # compounds containing all possible inflected/uninflected
            # constituents would be needed (compounds rarely exceed length
            # four so the quadratic search space wouldn't be too scary).

            new_sentence = []
            (token, tag) = sentence.pop(0)
            token = token.lower()
            tag = tag.lower()
            compound = (token, tag)

            # Pop (word token, PoS tag) tuples from the sentence until all
            # words have been consumed. Glue the word tokens together while
            # they form a substring of a valid compound. When adding a new
            # token makes the compound invalid, append the current compound
            # onto the new sentence queue and assign the new (token, tag)
            # tuple as the current compound base.

            while len(sentence) > 0:
                (token, tag) = sentence.pop(0)
                token = token.lower()
                tag = tag.lower()

                # Add this token to the current compound string, creating a
                # candidate compound token that may or may not exist in Wordnet.
                compound_token = compound[0] + ' ' + token
                compound_tag = substr_binary_search(compound_token, compounds)

                if compound_tag:
                    compound = (compound_token, compound_tag)

                # If appending the new token to the current compound results
                # in an invalid compound, append the current compound to the
                # new sentence queue and reset it, placing the new token as
                # the beginning of a (possible) new compound.

                else:
                    new_sentence.append(compound)
                    compound = (token, tag)

            # The final (possibly compound) token in each sentence needs to be
            # manually appended onto the new sentence list.

            new_sentence.append(compound)

            for (token, tag) in new_sentence:

                # Give the user some feedback to let him or her know the
                # distributions are still being built. The initial stage can
                # take quite some time (half an hour or more).

                count += 1

                # Basic splitting based on the word token's POS. Later this
                # could be further developed using the additional (now
                # commented out) tag types and adding conditional checks to
                # turn e.g. "you'll" into "you" + "will". This would increase
                # the accuracy of the distribution, as currently all such
                # contractions are discarded (because they won't match any
                # entries in the dictionary).

                if tag in noun_tags:
                    pos = "noun"
                    dictionary = N
                    freq_dist = noun_fd
                elif tag in verb_tags:
                    pos = "verb"
                    dictionary = V
                    freq_dist = verb_fd
                else:
                    token = None

                # If the word form is inflected, e.g. plural, retrieve its
                # base or uninflected form.

                if token is not None:
                    if token in dictionary:
                        uninflected_token = token
                    else:
                        uninflected_token = morphy(token, pos)
                else:
                    uninflected_token = None

                # Increment the count for each sense of the word token, there
                # being no practical way to distinguish between word senses
                # in the Brown corpus (SemCor would be another story).

                if uninflected_token:
                    if uninflected_token in dictionary:
                        for synset in dictionary[uninflected_token]:
                            freq_dist.inc(synset.offset)

    # Propagate the frequency counts up the taxonomy structure. Thus the
    # root node (or nodes) will have a frequency equal to the sum of all
    # of their descendent node frequencies (plus a bit extra, if the root
    # node appeared in the source text). The distribution will then adhere
    # to the IR principle that nodes (synsets) that appear less frequently
    # have a higher information content.

    sys.stdout.write(" done.\n")

    sys.stdout.write("Propagating frequencies up the taxonomy trees...\n")
    sys.stdout.flush()

    # Keep track of how many synsets we need to percolate counts up for
    synsets_done = 0
    synsets_total = len(noun_fd) + len(verb_fd)

    # We'll copy the information in noun_fd into a new distribution
    # and percolate up the tree
    temp = FreqDist()
    for noun_offset in noun_fd:
        if synsets_done % increment == 0:
            sys.stdout.write("\tSynset %d/%d\n" % (synsets_done, synsets_total))
            sys.stdout.flush()
        synsets_done += 1

        count = noun_fd[noun_offset]
        propagate_frequencies(temp, N.synset(noun_offset), count)
    noun_fd = temp

    temp = FreqDist()
    for verb_offset in verb_fd:
        if synsets_done % increment == 0:
            sys.stdout.write("\tSynset %d/%d\n" % (synsets_done, synsets_total))
            sys.stdout.flush()
        synsets_done += 1

        count = verb_fd[verb_offset]
        propagate_frequencies(temp, V.synset(verb_offset), count)
    verb_fd = temp

    sys.stdout.write(" done.\n")

    # Output the frequency distributions to a file. Rather than pickle the
    # frequency distributions, and the synsets contained therein, output
    # a dict of synset identifiers and probabilities. This results in a file
    # which is a great deal smaller than the pickled FreqDist object file.

    sys.stdout.write("Converting to synset/sample count dictionaries...")

    noun_dict = {}
    verb_dict = {}

    # The probabilities are not calculated as is normal for a frequency
    # distribution (i.e. sample count / sum of all sample counts). Instead
    # they are (sample count / sample count of root node), because of the
    # propagation step that was performed earlier; this has the desirable
    # consequence that root nodes have a probability of 1 and an
    # Information Content (IC) score of 0.

    root = N['entity'][0]

    for sample in noun_fd:
        root = N.synset(sample).hypernym_paths()[0][0]
        noun_dict[sample] = (noun_fd[sample], noun_fd[root.offset])

    for sample in verb_fd:
        root = V.synset(sample).hypernym_paths()[0][0]
        verb_dict[sample] = (verb_fd[sample], verb_fd[root.offset])

    sys.stdout.write(" done.\n")
    sys.stdout.write("Writing probability hashes to file %s..." % (output_filename))

    pickle.dump(noun_dict, outfile)
    pickle.dump(verb_dict, outfile)

    sys.stdout.write(" done.\n")

    outfile.close()

if __name__ == '__main__':
    brown_information_content('brown_ic.dat', None)
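
For reference, a minimal sketch (assuming the default 'brown_ic.dat' output name used above, and written as a stand-alone reader rather than part of this module) of how the two pickled dictionaries might be loaded and turned into Information Content scores. Each entry maps a synset offset to a (sample count, root count) pair, so IC follows as the negative log of their ratio, giving root synsets an IC of 0:

    import math
    import pickle

    infile = open('brown_ic.dat', 'rb')
    noun_counts = pickle.load(infile)   # {synset offset: (sample count, root count)}
    verb_counts = pickle.load(infile)
    infile.close()

    def information_content(offset, counts):
        # p = sample count / root count, so roots have p == 1 and IC == 0.
        sample_count, root_count = counts[offset]
        return -math.log(float(sample_count) / root_count)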