1
2
3
4
5
6
7
8
9 """
10 Corpus reader for the Information Extraction and Entity Recognition Corpus.
11
12 NIST 1999 Information Extraction: Entity Recognition Evaluation
13 http://www.itl.nist.gov/iad/894.01/tests/ie-er/er_99/er_99.htm
14
15 This corpus contains the NEWSWIRE development test data for the
16 NIST 1999 IE-ER Evaluation. The files were taken from the
17 subdirectory: /ie_er_99/english/devtest/newswire/*.ref.nwt
18 and filenames were shortened.
19
20 The corpus contains the following files: APW_19980314, APW_19980424,
21 APW_19980429, NYT_19980315, NYT_19980403, and NYT_19980407.
22 """
23
24 from nltk.corpus.reader.api import *
25 from nltk.corpus.reader.util import *
26 from nltk import chunk
27 import codecs
28 from nltk.internals import deprecated
29
30
31
# Human-readable description of each corpus file, keyed by the (shortened)
# corpus file name.
titles = dict(
    APW_19980314='Associated Press Weekly, 14 March 1998',
    APW_19980424='Associated Press Weekly, 24 April 1998',
    APW_19980429='Associated Press Weekly, 29 April 1998',
    NYT_19980315='New York Times, 15 March 1998',
    NYT_19980403='New York Times, 3 April 1998',
    NYT_19980407='New York Times, 7 April 1998',
)


# The corpus file names (the keys of ``titles``) in ascending order.
documents = sorted(titles)
43
def __init__(self, text, docno=None, doctype=None,
             date_time=None, headline=''):
    """
    Record the pieces of a single document on the instance.

    Each argument is stored unchanged under an attribute of the same
    name; no validation or conversion is performed here.

    :param text: The body of the document.
    :param docno: The document number, or None if unknown.
    :param doctype: The document type, or None if unknown.
    :param date_time: The document's date/time stamp, or None if unknown.
    :param headline: The document's headline; defaults to the empty
        string.
    """
    # NOTE(review): other code in this file calls ``.leaves()`` on
    # ``text`` and on a non-empty ``headline``, so both are presumably
    # tree objects rather than plain strings -- confirm against callers.
    self.text, self.docno, self.doctype = text, docno, doctype
    self.date_time, self.headline = date_time, headline
53 if self.headline:
54 headline = ' '.join(self.headline.leaves())
55 else:
56 headline = ' '.join([w for w in self.text.leaves()
57 if w[:1] != '<'][:12])+'...'
58 if self.docno is not None:
59 return '<IEERDocument %s: %r>' % (self.docno, headline)
60 else:
61 return '<IEERDocument: %r>' % headline
62
64 """
65 """
66 - def raw(self, files=None):
70
71 - def docs(self, files=None):
75
81
86
93
110
111
112 @deprecated("Use .parsed_docs() or .raw() or .docs() instead.")
113 - def read(self, items, format='parsed'):
118 @deprecated("Use .parsed_docs() instead.")
121
122