Package nltk :: Package corpus :: Package reader :: Module ieer
[hide private]
[frames | no frames]

Source Code for Module nltk.corpus.reader.ieer

  1  # Natural Language Toolkit: IEER Corpus Reader 
  2  # 
  3  # Copyright (C) 2001-2008 NLTK Project 
  4  # Author: Steven Bird <[email protected]> 
  5  #         Edward Loper <[email protected]> 
  6  # URL: <http://nltk.org> 
  7  # For license information, see LICENSE.TXT 
  8   
  9  """ 
 10  Corpus reader for the Information Extraction and Entity Recognition Corpus. 
 11   
 12  NIST 1999 Information Extraction: Entity Recognition Evaluation 
 13  http://www.itl.nist.gov/iad/894.01/tests/ie-er/er_99/er_99.htm 
 14   
 15  This corpus contains the NEWSWIRE development test data for the 
 16  NIST 1999 IE-ER Evaluation.  The files were taken from the 
 17  subdirectory: /ie_er_99/english/devtest/newswire/*.ref.nwt 
 18  and filenames were shortened. 
 19   
 20  The corpus contains the following files: APW_19980314, APW_19980424, 
 21  APW_19980429, NYT_19980315, NYT_19980403, and NYT_19980407. 
 22  """ 
 23   
 24  from nltk.corpus.reader.api import * 
 25  from nltk.corpus.reader.util import * 
 26  from nltk import chunk 
 27  import codecs 
 28  from nltk.internals import deprecated 
 29   
 30  #: A dictionary whose keys are the names of documents in this corpus; 
 31  #: and whose values are descriptions of those documents' contents. 
 32  titles = { 
 33      'APW_19980314': 'Associated Press Weekly, 14 March 1998', 
 34      'APW_19980424': 'Associated Press Weekly, 24 April 1998', 
 35      'APW_19980429': 'Associated Press Weekly, 29 April 1998', 
 36      'NYT_19980315': 'New York Times, 15 March 1998', 
 37      'NYT_19980403': 'New York Times, 3 April 1998', 
 38      'NYT_19980407': 'New York Times, 7 April 1998', 
 39      } 
 40   
 41  #: A list of all documents in this corpus. 
 42  documents = sorted(titles) 
 43   
class IEERDocument:
    """
    A single document from the IEER corpus, together with the metadata
    fields found in its header.

    The document body (C{text}) is expected to be a tree-like object
    providing a C{leaves()} method; C{__repr__} relies on it when no
    headline is available.
    """
    def __init__(self, text, docno=None, doctype=None,
                 date_time=None, headline=''):
        """
        @param text: The parsed body of the document.
        @param docno: The document's identifier, or C{None} if unknown.
        @param doctype: The document's type field, or C{None}.
        @param date_time: The document's date/time field, or C{None}.
        @param headline: The document's headline: either a tree-like
            object with a C{leaves()} method, or a plain string.  An
            empty value means "no headline".
        """
        self.text = text
        self.docno = docno
        self.doctype = doctype
        self.date_time = date_time
        self.headline = headline

    def __repr__(self):
        """
        Return a short description of the document: its headline if it
        has one, otherwise the first twelve words of its text (ignoring
        tokens that start with C{'<'}) followed by C{'...'}.
        """
        if self.headline:
            if hasattr(self.headline, 'leaves'):
                headline = ' '.join(self.headline.leaves())
            else:
                # The constructor's default headline is a string, so a
                # string headline must not crash repr(); use it as given.
                headline = self.headline
        else:
            words = [w for w in self.text.leaves() if w[:1] != '<']
            headline = ' '.join(words[:12]) + '...'
        if self.docno is not None:
            return '<IEERDocument %s: %r>' % (self.docno, headline)
        return '<IEERDocument: %r>' % (headline,)
62
class IEERCorpusReader(CorpusReader):
    """
    Corpus reader for the IEER corpus.  Each file holds a sequence of
    documents delimited by C{<DOC>} and C{</DOC>} lines.  Documents can
    be accessed as raw file contents (L{raw}), as one string per
    document (L{docs}), or as L{IEERDocument} objects (L{parsed_docs}).
    """
    def raw(self, files=None):
        """
        @return: the concatenated contents of the given files.
        @param files: A file name, a list of file names, or C{None}
            to use every file in the corpus.
        """
        if files is None: files = self._files
        elif isinstance(files, basestring): files = [files]
        contents = []
        for f in files:
            stream = self.open(f)
            # Close each stream explicitly instead of leaving it to the
            # garbage collector.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def docs(self, files=None):
        """
        @return: the given files as a sequence of document strings,
            one per C{<DOC>}...C{</DOC>} block.
        """
        return concat([StreamBackedCorpusView(filename, self._read_block,
                                              encoding=enc)
                       for (filename, enc) in self.abspaths(files, True)])

    def parsed_docs(self, files=None):
        """
        @return: the given files as a sequence of L{IEERDocument}
            objects.  Documents without a document number are omitted.
        """
        return concat([StreamBackedCorpusView(filename,
                                              self._read_parsed_block,
                                              encoding=enc)
                       for (filename, enc) in self.abspaths(files, True)])

    def _read_parsed_block(self, stream):
        """
        Read one document from C{stream}, parse it, and return it in a
        list; the list is empty if the document has no document number.
        """
        # TODO: figure out why empty documents are being returned
        # Parse each document once, then filter on the parsed result.
        parsed = [self._parse(doc) for doc in self._read_block(stream)]
        return [doc for doc in parsed if doc.docno is not None]

    def _parse(self, doc):
        """
        Convert a document string into an L{IEERDocument}.
        C{chunk.ieerstr2tree} returns either a dictionary of document
        fields or a bare value; both cases are handled here.
        """
        val = chunk.ieerstr2tree(doc, top_node="DOCUMENT")
        if isinstance(val, dict):
            return IEERDocument(**val)
        else:
            return IEERDocument(val)

    def _read_block(self, stream):
        """
        Read the next document from C{stream} and return it as a
        one-element list containing the document's text.
        """
        out = []
        # Skip any preamble.
        while True:
            line = stream.readline()
            if not line: break
            if line.strip() == '<DOC>': break
        # Keep the line that ended the preamble: the '<DOC>' marker, or
        # the empty string if the end of the stream was reached.
        out.append(line)
        # Read the document
        while True:
            line = stream.readline()
            if not line: break
            out.append(line)
            if line.strip() == '</DOC>': break
        # Return the document.  Lines keep their own terminators, so the
        # join adds an extra newline between them; kept as-is so that
        # downstream parsing sees the same text as before.
        return ['\n'.join(out)]

    #{ Deprecated since 0.8
    @deprecated("Use .parsed_docs() or .raw() or .docs() instead.")
    def read(self, items, format='parsed'):
        """
        Deprecated dispatcher over L{parsed_docs}, L{raw} and L{docs}.

        @raise ValueError: if C{format} is not one of C{'parsed'},
            C{'raw'} or C{'docs'}.
        """
        if format == 'parsed': return self.parsed_docs(items)
        if format == 'raw': return self.raw(items)
        if format == 'docs': return self.docs(items)
        raise ValueError('bad format %r' % format)
    @deprecated("Use .parsed_docs() instead.")
    def parsed(self, items):
        """Deprecated alias for L{parsed_docs}."""
        return self.parsed_docs(items)
    #}