# Natural Language Toolkit: Brown Corpus Information Content Module
#
# Copyright (C) 2001-2008 NLTK Project
# Author: Oliver Steele <[email protected]>
#         David Ormiston Smith <[email protected]>
#         Steven Bird <[email protected]>
#         Jordan Boyd-Graber <[email protected]>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT

# This module can be used to build information content files.
# However, NLTK's WordNet Similarity code uses pre-built IC
# files obtained from the WordNet Similarity website [1] and
# distributed as part of the NLTK corpus collection.
# [1] http://wn-similarity.sourceforge.net

import pickle
import sys
from itertools import islice

from nltk.corpus import brown
from nltk.probability import FreqDist
from nltk.stem.wordnet import morphy

from util import *
from dictionary import N, V

# Perform a binary search through the list of all compounds. The
# search necessarily accepts partial matches. The search returns
# the compound type ('nc' for noun compound or 'vbc' for verb
# compound) of the matched compound, or False if no match was
# found. Recording the compound type is necessary so that the
# correct frequency distribution can be updated later.

def substr_binary_search(item, data):
    if not data:
        return None
    low = 0
    high = len(data) - 1

    # Standard binary search over the sorted compound list, except that the
    # candidate counts as found when it occurs anywhere inside the entry at
    # the probe position.
    while low <= high:
        mid = (low + high) // 2
        if data[mid].find(item) >= 0:
            return data[mid].split(':')[1]
        elif data[mid] > item:
            high = mid - 1
        else:
            low = mid + 1

    return False
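
def _substr_binary_search_demo():
    # Illustrative sketch (hypothetical helper, not part of the original
    # module): a few example calls against a tiny sorted compound list,
    # showing the expected return values of substr_binary_search().
    compounds = ['egg cup:nc', 'jigsaw puzzle:nc', 'run over:vbc']
    assert substr_binary_search('egg cup', compounds) == 'nc'
    assert substr_binary_search('run over', compounds) == 'vbc'
    assert substr_binary_search('parking lot', compounds) is False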

# Compile the list of WordNet noun and verb compounds (multi-word entries),
# writing one "compound:type" entry per line to the given file or to stdout.

def generate_compound_list(filename=None):
    dictionaries = [N, V]
    compound_types = ['nc', 'vbc']

    if filename:
        outfile = open(filename, 'w')
    else:
        outfile = sys.stdout

    for dict, type in zip(dictionaries, compound_types):
        for term in dict:
            term = term.form
            if ' ' in term:
                outfile.write("%s:%s\n" % (term, type))

def read_word_list(filename):
    word_list = [line.rstrip() for line in open(filename, 'r')]
    word_list.sort()
    return word_list

# Return every synset in the given dictionary that has no hypernyms,
# i.e. the root (or roots) of the taxonomy.

def get_roots(dictionary):
    roots = []
    for word in dictionary:
        for synset in dictionary[word].synsets():
            hypernyms = set(synset[HYPERNYM]) | set(synset[INSTANCE_HYPERNYM])
            if len(hypernyms) == 0:
                roots.append(synset)
    return roots

# Propagate frequency counts up the taxonomy: add this synset's count to
# every hypernym (at any depth) on each of its hypernym paths.

def propagate_frequencies(freq_dist, synset, count):
    parents = set()
    for path in synset.hypernym_paths():
        parents = parents | set(map(lambda x: x.offset, path))
    # Exclude the synset itself; only its ancestors receive the extra count.
    parents = parents - set([synset.offset])
    for i in parents:
        freq_dist.inc(i, count)
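
def _propagation_sketch():
    # A minimal, self-contained sketch (hypothetical toy taxonomy, not part
    # of the original module) of the same percolation idea using plain
    # dicts: each node's count is added to every ancestor on its path to
    # the root, so the root ends up with the sum of all counts.
    parent_of = {'dog': 'canine', 'canine': 'animal', 'animal': None}
    counts = {'dog': 3, 'canine': 1, 'animal': 0}
    propagated = dict(counts)
    for node, count in counts.items():
        ancestor = parent_of[node]
        while ancestor is not None:
            propagated[ancestor] += count
            ancestor = parent_of[ancestor]
    return propagated   # {'dog': 3, 'canine': 4, 'animal': 4}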

def brown_information_content(output_filename, compounds_filename,
                              stopwords_filename=None, smoothing=False):

    # A list of all the noun and verb parts of speech as recorded in the
    # Brown corpus. Currently many are ignored because the WordNet morphy()
    # method, which derives the base form of inflected words, can't handle
    # contractions, genitive forms, etc.

    noun_tags = [
        'nn',           # N. sing. common (burden)
        # 'nn$',        # N. sing. common, genitive (company's)
        # 'nn+bez',     # N. sing. common + 'to be' pres. 3rd pers. sing. (wife's)
        # 'nn+hvd',     # N. sing. common + 'to have' past (James'd)
        # 'nn+hvz',     # N. sing. common + 'to have' pres. 3rd pers. sing. (guy's)
        # 'nn+in',      # N. sing. common + prep. (buncha)
        # 'nn+md',      # N. sing. common + modal aux. (sun'll)
        # 'nn+nn',      # N. sing. common, hyphenated pair (stomach-belly)
        'nns',          # N. plu. common (burdens)
        # 'nns$',       # N. plu. common, genitive (companies')
        # 'nns+md',     # N. plu. common + modal aux. (cowboys'll)
        'np',           # N. sing. proper (September)
        # 'np$',        # N. sing. proper, genitive (William's)
        # 'np+bez',     # N. sing. proper + 'to be' pres. 3rd pers. sing. (Rob's)
        # 'np+hvd',     # N. sing. proper + 'to have' past (James'd)
        # 'np+hvz',     # N. sing. proper + 'to have' pres. 3rd pers. sing. (Bill's)
        # 'np+md',      # N. sing. proper + modal aux. (John'll)
        'nps',          # N. plu. proper (Catholics)
        # 'nps$',       # N. plu. proper, genitive (Republicans')
        'nr',           # N. sing. adverbial (today, Saturday, East)
        # 'nr$',        # N. sing. adverbial, genitive (Saturday's)
        # 'nr+md',      # N. sing. adverbial + modal aux. (today'll)
        'nrs',          # N. plu. adverbial (Sundays)
        'nc',           # N. compound (jigsaw puzzle, egg cup)
    ]

    verb_tags = [
        'vb',           # V. base: pres. imp. or inf. (find, act)
        # 'vb+at',      # V. base: pres. or inf. + article (wanna)
        # 'vb+in',      # V. base: pres. imp. or inf. + prep. (lookit)
        # 'vb+jj',      # V. base: pres. imp. or inf. + adj. (die-dead)
        # 'vb+ppo',     # V. pres. + pronoun, personal, acc. (let's)
        # 'vb+rp',      # V. imperative + adverbial particle (g'ahn, c'mon)
        # 'vb+vb',      # V. base: pres. imp. or inf., hyphenated pair (say-speak)
        'vbd',          # V. past (said, produced)
        'vbg',          # V. pres. part./gerund (keeping, attending)
        # 'vbg+to',     # V. pres. part. + infinitival to (gonna)
        'vbn',          # V. past part. (conducted, adopted)
        # 'vbn+to',     # V. past part. + infinitival to (gotta)
        'vbz',          # V. pres. 3rd pers. sing. (deserves, seeks)
        'vbc',          # V. compound (band together, run over)
    ]
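
    # For instance (an illustrative note, not in the original source), a
    # token tagged 'nns' such as 'burdens' is looked up via morphy(token,
    # "noun") further below, which is expected to return the base form
    # 'burden'; a contracted form like "sun'll" (tag 'nn+md') has no such
    # base form and is simply discarded.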

    outfile = open(output_filename, "wb")

    if compounds_filename:
        compounds = read_word_list(compounds_filename)
    else:
        compounds = []

    if stopwords_filename:
        stopwords = read_word_list(stopwords_filename)
    else:
        stopwords = []

    noun_fd = FreqDist()
    verb_fd = FreqDist()

    count = 0
    increment = 1000

    # If smoothing is True, perform Laplace (add-one) smoothing, i.e. add 1
    # to each synset's frequency count.

    if smoothing:
        for term in N:
            for sense in N[term]:
                if noun_fd[sense.offset] == 0:
                    noun_fd.inc(sense.offset)
        for term in V:
            for sense in V[term]:
                if verb_fd[sense.offset] == 0:
                    verb_fd.inc(sense.offset)

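    # Note (an illustrative remark, not in the original source): seeding
    # every synset with a count of 1 guarantees that no synset ends up with
    # a zero count, and therefore a zero probability and an undefined
    # (infinite) information content, after the counts are propagated.
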
    sys.stdout.write("Building initial frequency distributions...\n")

    for file in brown.files():
        sys.stdout.write("\t%s/%s\n" % (file, brown.files()[-1]))
        sys.stdout.flush()

        for sentence in brown.tagged_sents(file):

            if len(sentence) == 0:
                continue

            # Greedily search for compound nouns/verbs. The search is naive
            # and doesn't account for inflected words within the compound,
            # so variant forms of a compound will not be identified (e.g.
            # 'abdominal cavities' will not be recognised as the plural of
            # 'abdominal cavity'); this is in keeping with the original Perl
            # implementation. Rectifying this is mildly tricky in that some
            # compound constituents are expected to be inflected, e.g.
            # 'abandoned ship', so it isn't possible to simply uninflect each
            # constituent before searching; rather, a list of variant
            # compounds containing all possible inflected/uninflected
            # constituents would be needed (compounds rarely exceed four
            # constituents, so the combinatorial search space would still be
            # small).

            new_sentence = []
            (token, tag) = sentence.pop(0)
            token = token.lower()
            tag = tag.lower()
            compound = (token, tag)

            # Pop (word token, PoS tag) tuples from the sentence until all
            # words have been consumed. Glue the word tokens together while
            # they form a substring of a valid compound. When adding a new
            # token makes the compound invalid, append the current compound
            # onto the new sentence queue and assign the new (token, tag)
            # tuple as the current compound base. (A worked example follows
            # the loop below.)

            while len(sentence) > 0:
                (token, tag) = sentence.pop(0)
                token = token.lower()
                tag = tag.lower()

                # Add this token to the current compound string, creating a
                # candidate compound token that may or may not exist in
                # WordNet.
                compound_token = compound[0] + ' ' + token
                compound_tag = substr_binary_search(compound_token, compounds)

                if compound_tag:
                    compound = (compound_token, compound_tag)

                # If appending the new token to the current compound results
                # in an invalid compound, append the current compound to the
                # new sentence queue and reset it, placing the new token as
                # the beginning of a (possible) new compound.

                else:
                    new_sentence.append(compound)
                    compound = (token, tag)

            # The final (possibly compound) token in each sentence needs to
            # be manually appended onto the new sentence list.

            new_sentence.append(compound)
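
            # Worked example (an illustrative note, not in the original
            # source): given the tagged sentence
            #     [("the", "at"), ("egg", "nn"), ("cup", "nn"), ("broke", "vbd")]
            # and assuming 'egg cup' appears in the compound list as a noun
            # compound while 'the egg' and 'egg cup broke' do not, the loop
            # above produces
            #     [("the", "at"), ("egg cup", "nc"), ("broke", "vbd")]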

            for (token, tag) in new_sentence:

                # Give the user some feedback to let him or her know the
                # distributions are still being built. The initial stage can
                # take quite some time (half an hour or more).

                count += 1

                # Basic splitting based on the word token's POS. Later this
                # could be further developed using the additional (now
                # commented out) tag types and adding conditional checks to
                # turn e.g. "you'll" into "you" + "will". This would increase
                # the accuracy of the distribution, as currently all such
                # contractions are discarded (because they won't match any
                # entries in the dictionary).

                if tag in noun_tags:
                    pos = "noun"
                    dictionary = N
                    freq_dist = noun_fd
                elif tag in verb_tags:
                    pos = "verb"
                    dictionary = V
                    freq_dist = verb_fd
                else:
                    token = None

                # If the word form is inflected, e.g. plural, retrieve its
                # base or uninflected form.

                if token is not None:
                    if token in dictionary:
                        uninflected_token = token
                    else:
                        uninflected_token = morphy(token, pos)
                else:
                    uninflected_token = None

                # Increment the count for each sense of the word token, there
                # being no practical way to distinguish between word senses
                # in the Brown corpus (SemCor would be another story).

                if uninflected_token:
                    if uninflected_token in dictionary:
                        for synset in dictionary[uninflected_token]:
                            freq_dist.inc(synset.offset)
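
                # For example (an illustrative note, not in the original
                # source): a polysemous token such as 'bank' increments the
                # count of every one of its noun synsets by 1, since the
                # corpus gives no indication of which sense was intended.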

    # Propagate the frequency counts up the taxonomy structure. Thus the
    # root node (or nodes) will have a frequency equal to the sum of all
    # of their descendant node frequencies (plus a bit extra, if the root
    # node appeared in the source text). The distribution will then adhere
    # to the IR principle that nodes (synsets) that appear less frequently
    # have a higher information content.

    sys.stdout.write(" done.\n")

    sys.stdout.write("Propagating frequencies up the taxonomy trees...\n")
    sys.stdout.flush()

    # Keep track of how many synsets we need to percolate counts up for.
    synsets_done = 0
    synsets_total = len(noun_fd) + len(verb_fd)

    # Copy the information in noun_fd into a new distribution, carrying over
    # each synset's own count as well as percolating it up the tree.
    temp = FreqDist()
    for noun_offset in noun_fd:
        if synsets_done % increment == 0:
            sys.stdout.write("\tSynset %d/%d\n" % (synsets_done, synsets_total))
            sys.stdout.flush()
        synsets_done += 1

        count = noun_fd[noun_offset]
        # Keep the synset's own count before adding it to its ancestors.
        temp.inc(noun_offset, count)
        propagate_frequencies(temp, N.synset(noun_offset), count)
    noun_fd = temp

    temp = FreqDist()
    for verb_offset in verb_fd:
        if synsets_done % increment == 0:
            sys.stdout.write("\tSynset %d/%d\n" % (synsets_done, synsets_total))
            sys.stdout.flush()
        synsets_done += 1

        count = verb_fd[verb_offset]
        temp.inc(verb_offset, count)
        propagate_frequencies(temp, V.synset(verb_offset), count)
    verb_fd = temp

    sys.stdout.write(" done.\n")

    # Output the frequency distributions to a file. Rather than pickle the
    # frequency distributions, and the synsets contained therein, output
    # a dict of synset identifiers and probabilities. This results in a file
    # which is a great deal smaller than the pickled FreqDist object file.

    sys.stdout.write("Converting to synset/sample count dictionaries...")

    noun_dict = {}
    verb_dict = {}

    # The probabilities are not calculated as is normal for a frequency
    # distribution (i.e. sample count / sum of all sample counts). Instead
    # they are (sample count / sample count of root node), because of the
    # propagation step that was performed earlier; this has the desirable
    # consequence that root nodes have a probability of 1 and an
    # Information Content (IC) score of 0.

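    # Worked example (an illustrative note, not in the original source):
    # with IC(s) = -log(count(s) / count(root)), a root with a propagated
    # count of 1,000,000 has IC = -log(1.0) = 0, while a synset counted
    # 1,000 times has IC = -log(0.001), roughly 6.9 nats.
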
    root = N['entity'][0]

    for sample in noun_fd:
        root = N.synset(sample).hypernym_paths()[0][0]
        noun_dict[sample] = (noun_fd[sample], noun_fd[root.offset])

    for sample in verb_fd:
        root = V.synset(sample).hypernym_paths()[0][0]
        verb_dict[sample] = (verb_fd[sample], verb_fd[root.offset])

    sys.stdout.write(" done.\n")
    sys.stdout.write("Writing probability hashes to file %s..." % (output_filename))

    pickle.dump(noun_dict, outfile)
    pickle.dump(verb_dict, outfile)

    sys.stdout.write(" done.\n")

    outfile.close()

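# A sketch (hypothetical helper, not part of the original module) of how the
# two dictionaries written above could be read back: they were dumped
# consecutively, so they must be loaded in the same order from the same
# file handle.

def _load_ic_file(filename):
    infile = open(filename, 'rb')
    noun_dict = pickle.load(infile)   # synset offset -> (count, root count), nouns
    verb_dict = pickle.load(infile)   # synset offset -> (count, root count), verbs
    infile.close()
    return noun_dict, verb_dict
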
if __name__ == '__main__':
    brown_information_content('brown_ic.dat', None)