| Home | Trees | Indices | Help |
|
|---|
|
|
1 # Natural Language Toolkit: IEER Corpus Reader
2 #
3 # Copyright (C) 2001-2008 NLTK Project
4 # Author: Steven Bird <[email protected]>
5 # Edward Loper <[email protected]>
6 # URL: <http://nltk.org>
7 # For license information, see LICENSE.TXT
8
9 """
10 Corpus reader for the Information Extraction and Entity Recognition Corpus.
11
12 NIST 1999 Information Extraction: Entity Recognition Evaluation
13 http://www.itl.nist.gov/iad/894.01/tests/ie-er/er_99/er_99.htm
14
15 This corpus contains the NEWSWIRE development test data for the
16 NIST 1999 IE-ER Evaluation. The files were taken from the
17 subdirectory: /ie_er_99/english/devtest/newswire/*.ref.nwt
18 and filenames were shortened.
19
20 The corpus contains the following files: APW_19980314, APW_19980424,
21 APW_19980429, NYT_19980315, NYT_19980403, and NYT_19980407.
22 """
23
24 from nltk.corpus.reader.api import *
25 from nltk.corpus.reader.util import *
26 from nltk import chunk
27 import codecs
28 from nltk.internals import deprecated
29
30 #: A dictionary that maps the name of each document in this corpus
31 #: to a short, human-readable description of that document's contents.
32 titles = {
33 'APW_19980314': 'Associated Press Weekly, 14 March 1998',
34 'APW_19980424': 'Associated Press Weekly, 24 April 1998',
35 'APW_19980429': 'Associated Press Weekly, 29 April 1998',
36 'NYT_19980315': 'New York Times, 15 March 1998',
37 'NYT_19980403': 'New York Times, 3 April 1998',
38 'NYT_19980407': 'New York Times, 7 April 1998',
39 }
40
41 #: A sorted list of the names of all documents in this corpus (the keys of ``titles``).
42 documents = sorted(titles)
43
47 self.text = text
48 self.docno = docno
49 self.doctype = doctype
50 self.date_time = date_time
51 self.headline = headline
53 if self.headline:
54 headline = ' '.join(self.headline.leaves())
55 else:
56 headline = ' '.join([w for w in self.text.leaves()
57 if w[:1] != '<'][:12])+'...'
58 if self.docno is not None:
59 return '<IEERDocument %s: %r>' % (self.docno, headline)
60 else:
61 return '<IEERDocument: %r>' % headline
62
64 """
65 """
67 if files is None: files = self._files
68 elif isinstance(files, basestring): files = [files]
69 return concat([self.open(f).read() for f in files])
70
72 return concat([StreamBackedCorpusView(filename, self._read_block,
73 encoding=enc)
74 for (filename, enc) in self.abspaths(files, True)])
75
77 return concat([StreamBackedCorpusView(filename,
78 self._read_parsed_block,
79 encoding=enc)
80 for (filename, enc) in self.abspaths(files, True)])
81
83 # TODO: figure out why empty documents are being returned
84 return [self._parse(doc) for doc in self._read_block(stream)
85 if self._parse(doc).docno is not None]
86
88 val = chunk.ieerstr2tree(doc, top_node="DOCUMENT")
89 if isinstance(val, dict):
90 return IEERDocument(**val)
91 else:
92 return IEERDocument(val)
93
95 out = []
96 # Skip any preamble.
97 while True:
98 line = stream.readline()
99 if not line: break
100 if line.strip() == '<DOC>': break
101 out.append(line)
102 # Read the document
103 while True:
104 line = stream.readline()
105 if not line: break
106 out.append(line)
107 if line.strip() == '</DOC>': break
108 # Return the document
109 return ['\n'.join(out)]
110
111 #{ Deprecated since 0.8
112 @deprecated("Use .parsed_docs() or .raw() or .docs() instead.")
114 if format == 'parsed': return self.parsed_docs(items)
115 if format == 'raw': return self.raw(items)
116 if format == 'docs': return self.docs(items)
117 raise ValueError('bad format %r' % format)
118 @deprecated("Use .parsed_docs() instead.")
121 #}
122
| Home | Trees | Indices | Help |
|
|---|
| Generated by Epydoc 3.0beta1 on Wed Aug 27 15:09:09 2008 | http://epydoc.sourceforge.net |