Package nltk :: Package corpus :: Package reader :: Module ppattach
[hide private]
[frames] | [no frames]

Source Code for Module nltk.corpus.reader.ppattach

  1  # Natural Language Toolkit: PP Attachment Corpus Reader 
  2  # 
  3  # Copyright (C) 2001-2008 NLTK Project 
  4  # Author: Steven Bird <[email protected]> 
  5  #         Edward Loper <[email protected]> 
  6  # URL: <http://nltk.org> 
  7  # For license information, see LICENSE.TXT 
  8   
  9  """ 
 10  Read lines from the Prepositional Phrase Attachment Corpus. 
 11   
 12  The PP Attachment Corpus contains several files having the format: 
 13   
 14  sentence_id verb noun1 preposition noun2 attachment 
 15   
 16  For example: 
 17   
 18  42960 gives authority to administration V 
 19  46742 gives inventors of microchip N 
 20   
 21  The PP attachment is to the verb phrase (V) or noun phrase (N), i.e.: 
 22   
 23  (VP gives (NP authority) (PP to administration)) 
 24  (VP gives (NP inventors (PP of microchip))) 
 25   
 26  The corpus contains the following files: 
 27   
 28  training:   training set 
 29  devset:     development test set, used for algorithm development. 
 30  test:       test set, used to report results 
 31  bitstrings: word classes derived from Mutual Information Clustering for the Wall Street Journal. 
 32   
 33  Ratnaparkhi, Adwait (1994). A Maximum Entropy Model for Prepositional 
 34  Phrase Attachment.  Proceedings of the ARPA Human Language Technology 
 35  Conference.  [http://www.cis.upenn.edu/~adwait/papers/hlt94.ps] 
 36   
 37  The PP Attachment Corpus is distributed with NLTK with the permission 
 38  of the author. 
 39  """        
 40   
 41  from util import * 
 42  from api import * 
 43  from nltk import tokenize 
 44  import codecs 
 45  from nltk.internals import deprecated 
 46   
class PPAttachment:
    """
    One record of the PP Attachment Corpus.

    A record carries the six fields of a corpus line, in this order:

    sentence_id verb noun1 preposition noun2 attachment

    The values are stored exactly as they are passed in; no conversion
    or validation is performed.
    """
    def __init__(self, sent, verb, noun1, prep, noun2, attachment):
        """
        Store every argument on the instance under the attribute of
        the same name.
        """
        # Sentence identifier and the verb.
        self.sent, self.verb = sent, verb
        # The first noun, the preposition, and the second noun.
        self.noun1, self.prep, self.noun2 = noun1, prep, noun2
        # The attachment label (the module docstring describes it as
        # V for verb phrase or N for noun phrase).
        self.attachment = attachment

    def __repr__(self):
        """
        Return a constructor-style string listing all six fields, each
        rendered with C{%r}.
        """
        template = ('PPAttachment(sent=%r, verb=%r, noun1=%r, prep=%r, '
                    'noun2=%r, attachment=%r)')
        values = (self.sent, self.verb, self.noun1, self.prep,
                  self.noun2, self.attachment)
        return template % values
61
class PPAttachmentCorpusReader(CorpusReader):
    """
    Corpus reader for the PP Attachment Corpus.  Every line of a corpus
    file is one record made of whitespace-separated fields:

    sentence_id verb noun1 preposition noun2 attachment
    """
    def attachments(self, files):
        """
        Return the records of the given file(s) as L{PPAttachment}
        objects.

        @param files: the file or files to read; passed unchanged to
            C{self.abspaths()}.
        @return: the C{concat()} of one C{StreamBackedCorpusView} per
            file, each reading blocks with L{_read_obj_block}.
        """
        return concat([StreamBackedCorpusView(filename, self._read_obj_block,
                                              encoding=enc)
                       for (filename, enc) in self.abspaths(files, True)])

    def tuples(self, files):
        """
        Return the records of the given file(s) as plain tuples of
        field strings.

        @param files: the file or files to read; passed unchanged to
            C{self.abspaths()}.
        @return: the C{concat()} of one C{StreamBackedCorpusView} per
            file, each reading blocks with L{_read_tuple_block}.
        """
        return concat([StreamBackedCorpusView(filename, self._read_tuple_block,
                                              encoding=enc)
                       for (filename, enc) in self.abspaths(files, True)])

    def raw(self, files=None):
        """
        Return the unparsed contents of the given file(s).

        @param files: a single file name, a sequence of file names, or
            C{None} to read every file listed in C{self._files}.
        @return: the C{concat()} of the full contents of each file, in
            the order the files were given.
        """
        if files is None: files = self._files
        elif isinstance(files, basestring): files = [files]
        contents = []
        for f in files:
            stream = self.open(f)
            # Close every stream we open, even if read() raises, rather
            # than leaving it to the garbage collector.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def _read_tuple_block(self, stream):
        """
        Block reader: consume one line from C{stream}.

        @return: a one-element list holding the tuple of the line's
            whitespace-separated fields, or an empty list when
            C{readline()} returns an empty string.
        """
        line = stream.readline()
        if line:
            return [tuple(line.split())]
        else:
            return []

    def _read_obj_block(self, stream):
        """
        Block reader: consume one line from C{stream}.

        @return: a one-element list holding a L{PPAttachment} built
            from the line's whitespace-separated fields, or an empty
            list when C{readline()} returns an empty string.
        """
        line = stream.readline()
        if line:
            # The fields are passed positionally, so the line must have
            # exactly the six fields PPAttachment.__init__ expects.
            return [PPAttachment(*line.split())]
        else:
            return []

    #{ Deprecated since 0.8
    @deprecated("Use .tuples() or .raw() or .attachments() instead.")
    def read(self, items, format='tuple'):
        """
        Deprecated entry point that dispatches on C{format}.

        @param items: the file or files to read; forwarded to
            L{tuples} or L{raw}.
        @param format: C{'tuple'} to call L{tuples}, C{'raw'} to call
            L{raw}.
        @raise ValueError: if C{format} is any other value.
        """
        if format == 'tuple': return self.tuples(items)
        if format == 'raw': return self.raw(items)
        raise ValueError('bad format %r' % format)
    #}