1
2
3
4
5
6
7
8 from nltk.corpus.reader.util import *
9 from nltk.corpus.reader.api import *
10 from nltk.tree import Tree
11 from nltk.etree import ElementTree
12 import re, codecs
13
15 """
16 Corpus reader for the propbank corpus, which augments the Penn
17 Treebank with information about the predicate argument structure
18 of every verb instance. The corpus consists of two parts: the
19 predicate-argument annotations themselves, and a set of X{frameset
20 files} which define the argument labels used by the annotations,
21 on a per-verb basis. Each X{frameset file} contains one or more
22 predicates, such as C{'turn'} or C{'turn_on'}, each of which is
23 divided into coarse-grained word senses called X{rolesets}. For
24 each X{roleset}, the frameset file provides descriptions of the
25 argument roles, along with examples.
26 """
def __init__(self, root, propfile, framefiles='',
             verbsfile=None, parse_filename_xform=None,
             parse_corpus=None, encoding=None):
    """
    Set up a reader over a propbank annotation file and its
    frameset files.

    @param root: The root directory for this corpus.
    @param propfile: The name of the file containing the predicate-
        argument annotations (relative to C{root}).
    @param framefiles: A list or regexp specifying the frameset
        files for this corpus.  A string is expanded into a list of
        files under C{root}; any other iterable is copied to a list.
    @param verbsfile: Stored on the reader and registered with the
        base class next to C{propfile}.  NOTE(review): presumably
        the name of a file listing the corpus's verbs -- confirm.
        When left as C{None}, that C{None} is still placed in the
        file list handed to the base class.
    @param parse_filename_xform: A transform that should be applied
        to the filenames in this corpus.  This should be a function
        of one argument (a filename) that returns a string (the new
        filename).
    @param parse_corpus: The corpus containing the parse trees
        corresponding to this corpus.  These parse trees are
        necessary to resolve the tree pointers used by propbank.
    @param encoding: Passed through unchanged to
        C{CorpusReader.__init__}.
    """
    # A string is a regexp: expand it to the matching files.
    if isinstance(framefiles, basestring):
        framefiles = nltk.corpus.reader.find_corpus_files(root, framefiles)
    # Always work on a real list so it can be concatenated below.
    framefiles = list(framefiles)

    # The base class tracks every file that belongs to the corpus.
    corpus_files = [propfile, verbsfile] + framefiles
    CorpusReader.__init__(self, root, corpus_files, encoding)

    # Remember the individual pieces for the accessor methods.
    self._propfile = propfile
    self._framefiles = framefiles
    self._verbsfile = verbsfile
    self._parse_filename_xform = parse_filename_xform
    self._parse_corpus = parse_corpus
58
def raw(self, files=None):
    """
    @param files: The file or files to read.  C{None} means every
        file in C{self._files}; a single string is treated as a
        one-element list.
    @return: the text contents of the given files, as a single string.
    """
    if files is None:
        files = self._files
    elif isinstance(files, basestring):
        files = [files]

    contents = []
    for f in files:
        stream = self.open(f)
        # Close each stream as soon as it has been read, even if the
        # read fails, so that handles are not left for the garbage
        # collector to reclaim.
        try:
            contents.append(stream.read())
        finally:
            stream.close()
    return concat(contents)
66
75
84
86 """
87 @return: the xml description for the given roleset.
88 """
89 lemma = roleset_id.split('.')[0]
90 framefile = 'frames/%s.xml' % lemma
91 if framefile not in self._framefiles:
92 raise ValueError('Frameset file for %s not found' %
93 roleset_id)
94
95
96
97 etree = ElementTree.parse(self.abspath(framefile).open()).getroot()
98 for roleset in etree.findall('predicate/roleset'):
99 if roleset.attrib['id'] == roleset_id:
100 return roleset
101 else:
102 raise ValueError('Roleset %s not found in %s' %
103 (roleset_id, framefile))
104
113
126
127
128
129
130
132
def __init__(self, filename, sentnum, wordnum, tagger, roleset,
             inflection, predicate, arguments, parse_corpus=None):
    """
    Record the fields of a single propbank annotation.  Every
    argument is stored unchanged on the instance, except
    C{arguments}, which is copied into a tuple.
    """
    self.filename = filename
    """The name of the file containing the parse tree for this
    instance's sentence."""

    self.sentnum = sentnum
    """The sentence number of this sentence within L{filename}.
    Indexing starts from zero."""

    self.wordnum = wordnum
    """The word number of this instance's predicate within its
    containing sentence.  Word numbers are indexed starting from
    zero, and include traces and other empty parse elements."""

    self.tagger = tagger
    """An identifier for the tagger who tagged this instance; or
    C{'gold'} if this is an adjudicated instance."""

    self.roleset = roleset
    """The name of the roleset used by this instance's predicate.
    Use L{propbank.roleset() <PropbankCorpusReader.roleset>} to
    look up information about the roleset."""

    self.inflection = inflection
    """A L{PropbankInflection} object describing the inflection of
    this instance's predicate."""

    self.predicate = predicate
    """A L{PropbankTreePointer} indicating the position of this
    instance's predicate within its containing sentence."""

    self.arguments = tuple(arguments)
    """A tuple of pairs (argloc, argid), specifying the location
    and identifier for each of the predicate's arguments in the
    containing sentence.  Argument identifiers are strings such as
    C{'ARG0'} or C{'ARGM-TMP'}.  This tuple does *not* contain
    the predicate."""

    self.parse_corpus = parse_corpus
    """A corpus reader for the parse trees corresponding to the
    instances in this propbank corpus."""
176
178 return ('<PropbankInstance: %s, sent %s, word %s>' %
179 (self.filename, self.sentnum, self.wordnum))
180
182 s = '%s %s %s %s %s %s' % (self.filename, self.sentnum, self.wordnum,
183 self.tagger, self.roleset, self.inflection)
184 items = self.arguments + ((self.predicate, 'rel'),)
185 for (argloc, argid) in sorted(items):
186 s += ' %s-%s' % (argloc, argid)
187 return s
188
193 tree = property(_get_tree, doc="""
194 The parse tree corresponding to this instance, or C{None} if
195 the corresponding tree is not available.""")
196
@staticmethod
def parse(s, parse_filename_xform=None, parse_corpus=None):
    """
    Build a L{PropbankInstance} from one whitespace-separated
    annotation line.

    The first six fields are the filename, sentence number, word
    number, tagger, roleset and inflection.  The remaining fields
    must contain exactly one entry ending in C{'-rel'} (the
    predicate); every other entry is split at its first C{'-'} into
    a tree pointer and an argument identifier.

    @param s: The annotation line to parse.
    @param parse_filename_xform: If not C{None}, a function applied
        to the filename field before it is stored.
    @param parse_corpus: Passed through to the new instance.
    @raise ValueError: If the line has fewer than seven fields, or
        does not contain exactly one C{'-rel'} field.
    """
    fields = s.split()
    if len(fields) < 7:
        raise ValueError('Badly formatted propbank line: %r' % s)

    # Fixed leading fields, then the predicate and argument fields.
    filename, sentnum, wordnum, tagger, roleset, inflection = fields[:6]
    rel_fields = []
    arg_fields = []
    for field in fields[6:]:
        if field.endswith('-rel'):
            rel_fields.append(field)
        else:
            arg_fields.append(field)
    if len(rel_fields) != 1:
        raise ValueError('Badly formatted propbank line: %r' % s)

    # Apply the filename transform, if one was given.
    if parse_filename_xform is not None:
        filename = parse_filename_xform(filename)

    # Sentence and word numbers are stored as integers.
    sentnum = int(sentnum)
    wordnum = int(wordnum)

    # Decode the inflection field.
    inflection = PropbankInflection.parse(inflection)

    # Drop the trailing '-rel' to get the predicate's pointer text.
    predicate = PropbankTreePointer.parse(rel_fields[0][:-len('-rel')])

    # Each remaining field is '<pointer>-<argument id>'.
    arguments = []
    for field in arg_fields:
        argloc, argid = field.split('-', 1)
        arguments.append((PropbankTreePointer.parse(argloc), argid))

    return PropbankInstance(filename, sentnum, wordnum, tagger,
                            roleset, inflection, predicate,
                            arguments, parse_corpus)
235
237 """
238 A pointer used by propbank to identify one or more constituents in
239 a parse tree. C{PropbankPointer} is an abstract base class with
240 three concrete subclasses:
241
242 - L{PropbankTreePointer} is used to point to single constituents.
243 - L{PropbankSplitTreePointer} is used to point to 'split'
244 constituents, which consist of a sequence of two or more
245 C{PropbankTreePointer}s.
246 - L{PropbankChainTreePointer} is used to point to entire trace
247 chains in a tree. It consists of a sequence of pieces, which
248 can be C{PropbankTreePointer}s or C{PropbankSplitTreePointer}s.
249 """
# Refuse direct instantiation of the abstract base class.  The class
# name was previously misspelled here, so the comparison raised a
# NameError instead of reaching the intended AssertionError.
if self.__class__ == PropbankPointer:
    raise AssertionError('PropbankPointer is an abstract base class')
253
256 self.pieces = pieces
257 """A list of the pieces that make up this chain. Elements may
258 be either L{PropbankSplitTreePointer}s or
259 L{PropbankTreePointer}s."""
260
262 return '*'.join('%s' % p for p in self.pieces)
264 return '<PropbankChainTreePointer: %s>' % self
266 if tree is None: raise ValueError('Parse tree not avaialable')
267 return Tree('*CHAIN*', [p.select(tree) for p in self.pieces])
268
271 self.pieces = pieces
272 """A list of the pieces that make up this chain. Elements are
273 all L{PropbankTreePointer}s."""
274
276 return ','.join('%s' % p for p in self.pieces)
278 return '<PropbankSplitTreePointer: %s>' % self
280 if tree is None: raise ValueError('Parse tree not avaialable')
281 return Tree('*SPLIT*', [p.select(tree) for p in self.pieces])
282
284 """
285 wordnum:height*wordnum:height*...
286 wordnum:height,
287
288 """
292
293 @staticmethod
311
313 return '%s:%s' % (self.wordnum, self.height)
314
316 return 'PropbankTreePointer(%d, %d)' % (self.wordnum, self.height)
317
328
330 if tree is None: raise ValueError('Parse tree not avaialable')
331 return tree[self.treepos(tree)]
332
367
369
# One-character codes for the inflection fields.  The groups below
# follow the field order form, tense, aspect, person, voice.  The
# letter 'p' is reused by PARTICIPLE, PAST, PERFECT and PASSIVE, so a
# code's meaning depends on which field it appears in.

# Form codes.
INFINITIVE = 'i'
GERUND = 'g'
PARTICIPLE = 'p'
FINITE = 'v'

# Tense codes.
FUTURE = 'f'
PAST = 'p'
PRESENT = 'n'

# Aspect codes.
PERFECT = 'p'
PROGRESSIVE = 'o'
PERFECT_AND_PROGRESSIVE = 'b'

# Person code.
THIRD_PERSON = '3'

# Voice codes.
ACTIVE = 'a'
PASSIVE = 'p'

# Placeholder for a field that is not specified.
NONE = '-'
389
390
def __init__(self, form='-', tense='-', aspect='-', person='-', voice='-'):
    """
    Store the five inflection fields.  Each one defaults to C{'-'}.

    @param form: The form code.
    @param tense: The tense code.
    @param aspect: The aspect code.
    @param person: The person code.
    @param voice: The voice code.
    """
    (self.form, self.tense, self.aspect,
     self.person, self.voice) = (form, tense, aspect, person, voice)
397
399 return self.form+self.tense+self.aspect+self.person+self.voice
400
402 return '<PropbankInflection: %s>' % self
403
# Pattern for a five-character inflection string: one character each
# for form, tense, aspect, person and voice, where every position also
# accepts '-'.  Only the end is anchored ('$').
# NOTE(review): presumably applied with match(), which anchors the
# start as well -- confirm at the call site.
_VALIDATE = re.compile(r'[igpv\-][fpn\-][pob\-][3\-][ap\-]$')
405
406 @staticmethod
414