Package nltk :: Package corpus :: Package reader :: Module propbank
[hide private]
[frames | no frames]

Source Code for Module nltk.corpus.reader.propbank

  1  # Natural Language Toolkit: PropBank Corpus Reader 
  2  # 
  3  # Copyright (C) 2001-2008 NLTK Project 
  4  # Author: Edward Loper <[email protected]> 
  5  # URL: <http://nltk.org> 
  6  # For license information, see LICENSE.TXT 
  7   
  8  from nltk.corpus.reader.util import * 
  9  from nltk.corpus.reader.api import * 
 10  from nltk.tree import Tree 
 11  from nltk.etree import ElementTree 
 12  import re, codecs 
 13   
class PropbankCorpusReader(CorpusReader):
    """
    Corpus reader for the propbank corpus, which augments the Penn
    Treebank with information about the predicate argument structure
    of every verb instance.  The corpus consists of two parts: the
    predicate-argument annotations themselves, and a set of X{frameset
    files} which define the argument labels used by the annotations,
    on a per-verb basis.  Each X{frameset file} contains one or more
    predicates, such as C{'turn'} or C{'turn_on'}, each of which is
    divided into coarse-grained word senses called X{rolesets}.  For
    each X{roleset}, the frameset file provides descriptions of the
    argument roles, along with examples.
    """
    def __init__(self, root, propfile, framefiles='',
                 verbsfile=None, parse_filename_xform=None,
                 parse_corpus=None, encoding=None):
        """
        @param root: The root directory for this corpus.
        @param propfile: The name of the file containing the predicate-
            argument annotations (relative to C{root}).
        @param framefiles: A list or regexp specifying the frameset
            files for this corpus.
        @param verbsfile: The name of the file read by L{verbs()}, or
            C{None} if the corpus has no such file.  When it is
            C{None} it is left out of the list of corpus files.
        @param parse_filename_xform: A transform that should be applied
            to the filenames in this corpus.  This should be a function
            of one argument (a filename) that returns a string (the new
            filename).
        @param parse_corpus: The corpus containing the parse trees
            corresponding to this corpus.  These parse trees are
            necessary to resolve the tree pointers used by propbank.
        @param encoding: Passed through unchanged to
            C{CorpusReader.__init__}.
        """
        # If framefiles is specified as a regexp, expand it.
        # NOTE(review): `nltk` is not imported by name in this module;
        # presumably one of the star imports provides it -- confirm.
        if isinstance(framefiles, basestring):
            framefiles = nltk.corpus.reader.find_corpus_files(root, framefiles)
        framefiles = list(framefiles)

        # verbsfile is optional.  Registering None as a corpus file would
        # make raw() (which walks every registered file) try to open it.
        named_files = [name for name in (propfile, verbsfile)
                       if name is not None]

        # Initialize the corpus reader.
        CorpusReader.__init__(self, root, named_files + framefiles,
                              encoding)

        # Record our frame files & prop file.
        self._propfile = propfile
        self._framefiles = framefiles
        self._verbsfile = verbsfile
        self._parse_filename_xform = parse_filename_xform
        self._parse_corpus = parse_corpus

    def raw(self, files=None):
        """
        @param files: A single file name, a list of file names, or
            C{None} to use every file registered with this reader.
        @return: the text contents of the given files, as a single string.
        """
        if files is None: files = self._files
        elif isinstance(files, basestring): files = [files]
        return concat([self._read_whole_file(f) for f in files])

    def _read_whole_file(self, f):
        """
        @return: the full contents of corpus file C{f}.  The stream is
            closed before returning, even if reading fails.
        """
        stream = self.open(f)
        try:
            return stream.read()
        finally:
            stream.close()

    def instances(self):
        """
        @return: a corpus view that acts as a list of
        L{PropbankInstance} objects, one for each verb in the corpus.
        """
        return StreamBackedCorpusView(self.abspath(self._propfile),
                                      self._read_instance_block,
                                      encoding=self.encoding(self._propfile))

    def lines(self):
        """
        @return: a corpus view that acts as a list of strings, one for
        each line in the predicate-argument annotation file.
        """
        return StreamBackedCorpusView(self.abspath(self._propfile),
                                      read_line_block,
                                      encoding=self.encoding(self._propfile))

    def roleset(self, roleset_id):
        """
        @param roleset_id: A roleset identifier whose part before the
            first C{'.'} is the lemma naming the frameset file
            (C{frames/<lemma>.xml}).
        @return: the xml description for the given roleset.
        @raise ValueError: if the frameset file is not one of this
            corpus's frame files, or if it contains no roleset with
            the given id.
        """
        lemma = roleset_id.split('.')[0]
        framefile = 'frames/%s.xml' % lemma
        if framefile not in self._framefiles:
            raise ValueError('Frameset file for %s not found' %
                             roleset_id)

        # n.b.: The encoding for XML files is specified by the file
        # itself; so we ignore self._encoding here.
        stream = self.abspath(framefile).open()
        try:
            etree = ElementTree.parse(stream).getroot()
        finally:
            # The tree is fully parsed at this point, so the underlying
            # file is no longer needed.
            stream.close()

        for roleset in etree.findall('predicate/roleset'):
            if roleset.attrib['id'] == roleset_id:
                return roleset
        raise ValueError('Roleset %s not found in %s' %
                         (roleset_id, framefile))

    def verbs(self):
        """
        @return: a corpus view that acts as a list of all verb lemmas
        in this corpus (from the verbs.txt file).
        """
        return StreamBackedCorpusView(self.abspath(self._verbsfile),
                                      read_line_block,
                                      encoding=self.encoding(self._verbsfile))

    def _read_instance_block(self, stream):
        """
        Block reader for L{instances()}: consume up to 100 lines from
        C{stream} and return the L{PropbankInstance}s parsed from the
        non-blank ones.
        """
        block = []

        # Read 100 at a time.
        for i in range(100):
            line = stream.readline().strip()
            if line:
                block.append(PropbankInstance.parse(
                    line, self._parse_filename_xform,
                    self._parse_corpus))

        return block
126 127 ###################################################################### 128 #{ Propbank Instance & related datatypes 129 ###################################################################### 130
class PropbankInstance(object):
    """
    One annotated predicate occurrence from the propbank annotation
    file, together with the locations of its arguments.

    @ivar filename: The name of the file containing the parse tree
        for this instance's sentence.
    @ivar sentnum: The sentence number of this sentence within
        C{filename}.  Indexing starts from zero.
    @ivar wordnum: The word number of this instance's predicate within
        its containing sentence.  Word numbers are indexed starting
        from zero, and include traces and other empty parse elements.
    @ivar tagger: An identifier for the tagger who tagged this
        instance; or C{'gold'} if this is an adjudicated instance.
    @ivar roleset: The name of the roleset used by this instance's
        predicate.  Use L{propbank.roleset()
        <PropbankCorpusReader.roleset>} to look up information about
        the roleset.
    @ivar inflection: A L{PropbankInflection} object describing the
        inflection of this instance's predicate.
    @ivar predicate: A L{PropbankTreePointer} indicating the position
        of this instance's predicate within its containing sentence.
    @ivar arguments: A tuple of C{(argloc, argid)} pairs, specifying
        the location and identifier for each of the predicate's
        arguments in the containing sentence.  Argument identifiers
        are strings such as C{'ARG0'} or C{'ARGM-TMP'}.  The predicate
        itself is *not* included.
    @ivar parse_corpus: A corpus reader for the parse trees
        corresponding to the instances in this propbank corpus.
    """

    def __init__(self, filename, sentnum, wordnum, tagger, roleset,
                 inflection, predicate, arguments, parse_corpus=None):
        # Where the instance lives.
        self.filename, self.sentnum, self.wordnum = filename, sentnum, wordnum
        # Who annotated it, and with which sense/inflection.
        self.tagger, self.roleset, self.inflection = tagger, roleset, inflection
        # Pointers into the parse tree.
        self.predicate = predicate
        self.arguments = tuple(arguments)
        self.parse_corpus = parse_corpus

    def __repr__(self):
        location = (self.filename, self.sentnum, self.wordnum)
        return '<PropbankInstance: %s, sent %s, word %s>' % location

    def __str__(self):
        # Six fixed columns, followed by the predicate ('rel') and the
        # arguments in sorted order.
        header = '%s %s %s %s %s %s' % (self.filename, self.sentnum,
                                        self.wordnum, self.tagger,
                                        self.roleset, self.inflection)
        labelled = sorted(self.arguments + ((self.predicate, 'rel'),))
        columns = [' %s-%s' % (loc, label) for (loc, label) in labelled]
        return header + ''.join(columns)

    def _get_tree(self):
        corpus = self.parse_corpus
        if corpus is None or self.filename not in corpus.files():
            return None
        return corpus.parsed_sents(self.filename)[self.sentnum]
    tree = property(_get_tree, doc="""
        The parse tree corresponding to this instance, or C{None} if
        the corresponding tree is not available.""")

    @staticmethod
    def parse(s, parse_filename_xform=None, parse_corpus=None):
        """
        Build a L{PropbankInstance} from one whitespace-separated line
        of the annotation file.

        @param s: The annotation line.
        @param parse_filename_xform: Optional one-argument function
            applied to the filename column.
        @param parse_corpus: Stored on the new instance as
            C{parse_corpus}.
        @raise ValueError: if the line has fewer than seven columns, or
            does not contain exactly one C{-rel} column.
        """
        fields = s.split()
        if len(fields) < 7:
            raise ValueError('Badly formatted propbank line: %r' % s)

        # The first six columns are fixed; the remainder are pointers.
        filename, sentnum, wordnum, tagger, roleset, inflection = fields[:6]
        rel_fields = []
        arg_fields = []
        for field in fields[6:]:
            if field.endswith('-rel'):
                rel_fields.append(field)
            else:
                arg_fields.append(field)
        if len(rel_fields) != 1:
            raise ValueError('Badly formatted propbank line: %r' % s)

        # Rewrite the filename if a transform was supplied.
        if parse_filename_xform is not None:
            filename = parse_filename_xform(filename)

        # Sentence & word numbers are integers.
        sentnum = int(sentnum)
        wordnum = int(wordnum)

        inflection = PropbankInflection.parse(inflection)

        # Drop the trailing '-rel' to get the predicate's pointer.
        predicate = PropbankTreePointer.parse(rel_fields[0][:-4])

        # Each argument column is '<pointer>-<label>'.
        arguments = []
        for field in arg_fields:
            location, label = field.split('-', 1)
            arguments.append((PropbankTreePointer.parse(location), label))

        return PropbankInstance(filename, sentnum, wordnum, tagger,
                                roleset, inflection, predicate,
                                arguments, parse_corpus)
235
class PropbankPointer(object):
    """
    A pointer used by propbank to identify one or more constituents in
    a parse tree.  C{PropbankPointer} is an abstract base class with
    three concrete subclasses:

      - L{PropbankTreePointer} is used to point to single constituents.
      - L{PropbankSplitTreePointer} is used to point to 'split'
        constituents, which consist of a sequence of two or more
        C{PropbankTreePointer}s.
      - L{PropbankChainTreePointer} is used to point to entire trace
        chains in a tree.  It consists of a sequence of pieces, which
        can be C{PropbankTreePointer}s or C{PropbankSplitTreePointer}s.
    """
    def __init__(self):
        """
        @raise AssertionError: if C{PropbankPointer} itself (rather
            than a subclass) is being instantiated.
        """
        # Compare against the correctly spelled class name; a misspelled
        # name here would raise NameError instead of the intended error.
        if self.__class__ == PropbankPointer:
            raise AssertionError('PropbankPointer is an abstract base class')
253
class PropbankChainTreePointer(PropbankPointer):
    """
    A pointer to a whole trace chain, written as its pieces joined
    with C{'*'}.

    @ivar pieces: A list of the pieces that make up this chain.
        Elements may be either L{PropbankSplitTreePointer}s or
        L{PropbankTreePointer}s.
    """
    def __init__(self, pieces):
        self.pieces = pieces

    def __str__(self):
        rendered = ['%s' % piece for piece in self.pieces]
        return '*'.join(rendered)

    def __repr__(self):
        return '<PropbankChainTreePointer: %s>' % self

    def select(self, tree):
        """
        @return: a C{'*CHAIN*'} tree whose children are the subtrees
            selected from C{tree} by each piece, in order.
        @raise ValueError: if C{tree} is C{None}.
        """
        if tree is None:
            raise ValueError('Parse tree not avaialable')
        selected = [piece.select(tree) for piece in self.pieces]
        return Tree('*CHAIN*', selected)
268
class PropbankSplitTreePointer(PropbankPointer):
    """
    A pointer to a 'split' constituent, written as its pieces joined
    with C{','}.

    @ivar pieces: A list of the pieces that make up this split
        constituent.  Elements are all L{PropbankTreePointer}s.
    """
    def __init__(self, pieces):
        self.pieces = pieces

    def __str__(self):
        rendered = ['%s' % piece for piece in self.pieces]
        return ','.join(rendered)

    def __repr__(self):
        return '<PropbankSplitTreePointer: %s>' % self

    def select(self, tree):
        """
        @return: a C{'*SPLIT*'} tree whose children are the subtrees
            selected from C{tree} by each piece, in order.
        @raise ValueError: if C{tree} is C{None}.
        """
        if tree is None:
            raise ValueError('Parse tree not avaialable')
        selected = [piece.select(tree) for piece in self.pieces]
        return Tree('*SPLIT*', selected)
282
class PropbankTreePointer(PropbankPointer):
    """
    A pointer to a single constituent, written C{wordnum:height}.

    The string forms accepted by L{parse} are::

        wordnum:height                        (single constituent)
        wordnum:height,wordnum:height,...     (split constituent)
        wordnum:height*wordnum:height*...     (trace chain)

    @ivar wordnum: Zero-based index of a leaf of the parse tree.
    @ivar height: How many levels above that leaf's parent the target
        constituent sits; C{0} selects the leaf's parent.
    """
    def __init__(self, wordnum, height):
        self.wordnum = wordnum
        self.height = height

    @staticmethod
    def parse(s):
        """
        Parse a pointer string.  Chains (C{'*'}) are split first, then
        split arguments (C{','}), and finally plain
        C{wordnum:height} pointers.

        @raise ValueError: if a plain pointer does not consist of
            exactly two C{':'}-separated fields.
        """
        # Chains: xx*yy*zz
        links = s.split('*')
        if len(links) >= 2:
            return PropbankChainTreePointer(
                [PropbankTreePointer.parse(link) for link in links])

        # Split arguments: xx,yy,zz
        parts = s.split(',')
        if len(parts) >= 2:
            return PropbankSplitTreePointer(
                [PropbankTreePointer.parse(part) for part in parts])

        # Plain pointer: wordnum:height
        fields = s.split(':')
        if len(fields) != 2: raise ValueError('bad propbank pointer %r' % s)
        word_field, height_field = fields
        return PropbankTreePointer(int(word_field), int(height_field))

    def __str__(self):
        return '%s:%s' % (self.wordnum, self.height)

    def __repr__(self):
        return 'PropbankTreePointer(%d, %d)' % (self.wordnum, self.height)

    def __cmp__(self, other):
        """
        Order pointers by word number, and for equal word numbers put
        the greater height first.  A chain or split pointer is
        compared via its first piece; any other object is compared by
        identity.
        """
        composite = (PropbankChainTreePointer, PropbankSplitTreePointer)
        while isinstance(other, composite):
            other = other.pieces[0]

        if isinstance(other, PropbankTreePointer):
            mine = (self.wordnum, -self.height)
            theirs = (other.wordnum, -other.height)
            return cmp(mine, theirs)

        return cmp(id(self), id(other))

    def select(self, tree):
        """
        @return: the subtree of C{tree} that this pointer refers to.
        @raise ValueError: if C{tree} is C{None}.
        """
        if tree is None: raise ValueError('Parse tree not avaialable')
        position = self.treepos(tree)
        return tree[position]

    def treepos(self, tree):
        """
        Convert this pointer to a standard 'tree position' pointer,
        given that it points to the given tree.

        @raise ValueError: if C{tree} is C{None}.
        """
        if tree is None: raise ValueError('Parse tree not avaialable')

        # Depth-first walk.  `nodes` is the path of nodes from the root
        # to the node being visited; `path` holds the child index taken
        # at each Tree on that path.
        nodes = [tree]
        path = []
        words_seen = 0

        while True:
            current = nodes[-1]

            # Leaf: either the word we want, or one more to count.
            if not isinstance(current, Tree):
                if words_seen == self.wordnum:
                    # Drop the leaf's own index plus `height` more levels.
                    return tuple(path[:len(path) - self.height - 1])
                words_seen += 1
                nodes.pop()
                continue

            # Tree node: move on to its first or next child.
            if len(path) < len(nodes):
                path.append(0)
            else:
                path[-1] += 1

            child_index = path[-1]
            if child_index < len(current):
                nodes.append(current[child_index])
            else:
                # No children left here: go back up one level.
                nodes.pop()
                path.pop()
367
class PropbankInflection(object):
    """
    The five-character inflection code attached to a propbank
    instance: form, tense, aspect, person and voice, in that order.
    The class constants give the code letter used in each slot;
    C{'-'} (L{NONE}) marks a slot as unspecified.
    """
    #{ Inflection Form
    INFINITIVE = 'i'
    GERUND = 'g'
    PARTICIPLE = 'p'
    FINITE = 'v'
    #{ Inflection Tense
    FUTURE = 'f'
    PAST = 'p'
    PRESENT = 'n'
    #{ Inflection Aspect
    PERFECT = 'p'
    PROGRESSIVE = 'o'
    PERFECT_AND_PROGRESSIVE = 'b'
    #{ Inflection Person
    THIRD_PERSON = '3'
    #{ Inflection Voice
    ACTIVE = 'a'
    PASSIVE = 'p'
    #{ Inflection
    NONE = '-'
    #}

    def __init__(self, form='-', tense='-', aspect='-', person='-', voice='-'):
        (self.form, self.tense, self.aspect,
         self.person, self.voice) = (form, tense, aspect, person, voice)

    def __str__(self):
        # Concatenate the five slots in their canonical order.
        code = self.form
        for slot in (self.tense, self.aspect, self.person, self.voice):
            code = code + slot
        return code

    def __repr__(self):
        return '<PropbankInflection: %s>' % self

    # One character class per slot, in the order form/tense/aspect/
    # person/voice.
    _VALIDATE = re.compile(r'[igpv\-][fpn\-][pob\-][3\-][ap\-]$')

    @staticmethod
    def parse(s):
        """
        Build a L{PropbankInflection} from a five-character code.

        @raise TypeError: if C{s} is not a string.
        @raise ValueError: if C{s} is not exactly five characters long
            or contains a letter that is not valid for its slot.
        """
        if not isinstance(s, basestring):
            raise TypeError('expected a string')
        well_formed = len(s) == 5 and PropbankInflection._VALIDATE.match(s)
        if not well_formed:
            raise ValueError('Bad propbank inflection string %r' % s)
        return PropbankInflection(*s)
414