1
2
3
4
5
6
7
8
"""
NLTK corpus readers.  The modules in this package provide functions
that can be used to read corpus files in a variety of formats.  These
functions can be used to read both the corpus files that are
distributed in the NLTK corpus package, and corpus files that are part
of external corpora.

Corpus Reader Functions
=======================
Each corpus module defines one or more X{corpus reader functions},
which can be used to read documents from that corpus.  These functions
take an argument, C{item}, which is used to indicate which document
should be read from the corpus:

  - If C{item} is one of the unique identifiers listed in the corpus
    module's C{items} variable, then the corresponding document will
    be loaded from the NLTK corpus package.

  - If C{item} is a filename, then that file will be read.

Additionally, corpus reader functions can be given lists of item
names; in which case, they will return a concatenation of the
corresponding documents.

Corpus reader functions are named based on the type of information
they return.  Some common examples, and their return types, are:

  - I{corpus}.words(): list of str
  - I{corpus}.sents(): list of (list of str)
  - I{corpus}.paras(): list of (list of (list of str))
  - I{corpus}.tagged_words(): list of (str,str) tuple
  - I{corpus}.tagged_sents(): list of (list of (str,str))
  - I{corpus}.tagged_paras(): list of (list of (list of (str,str)))
  - I{corpus}.chunked_sents(): list of (Tree w/ (str,str) leaves)
  - I{corpus}.parsed_sents(): list of (Tree with str leaves)
  - I{corpus}.parsed_paras(): list of (list of (Tree with str leaves))
  - I{corpus}.xml(): A single xml ElementTree
  - I{corpus}.raw(): unprocessed corpus contents

For example, to read a list of the words in the Brown Corpus, use
C{nltk.corpus.brown.words()}:

    >>> from nltk.corpus import brown
    >>> print brown.words()
    ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

Corpus Metadata
===============
Metadata about the NLTK corpora, and their individual documents, is
stored using U{Open Language Archives Community (OLAC)
<http://www.language-archives.org/>} metadata records.  These records
can be accessed using C{nltk.corpus.I{corpus}.olac()}.
"""
62
63 import re
64
65 from nltk.tokenize import RegexpTokenizer
66 from nltk.tag import simplify_brown_tag, simplify_wsj_tag,\
67 simplify_alpino_tag, simplify_indian_tag,\
68 simplify_tag
69
70 from util import LazyCorpusLoader
71 from reader import *
72 import chat80
73
# Lazily-loaded corpus objects.  Each LazyCorpusLoader defers locating
# and opening the corpus data until the first attribute access, so
# importing this module stays cheap.  The positional arguments are the
# corpus name, the reader class, and (usually) a regex or explicit list
# selecting the corpus's data files.
abc = LazyCorpusLoader(
    'abc', PlaintextCorpusReader, r'(?!\.).*\.txt')
alpino = LazyCorpusLoader(
    'alpino', AlpinoCorpusReader, tag_mapping_function=simplify_alpino_tag)
brown = LazyCorpusLoader(
    'brown', CategorizedTaggedCorpusReader, r'c[a-z]\d\d',
    cat_pattern=r'c([a-z])\d\d', tag_mapping_function=simplify_brown_tag)
cess_cat = LazyCorpusLoader(
    'cess_cat', BracketParseCorpusReader, r'(?!\.).*\.tbf',
    tag_mapping_function=simplify_tag)
cess_esp = LazyCorpusLoader(
    'cess_esp', BracketParseCorpusReader, r'(?!\.).*\.tbf',
    tag_mapping_function=simplify_tag)
cmudict = LazyCorpusLoader(
    'cmudict', CMUDictCorpusReader, ['cmudict'])
conll2000 = LazyCorpusLoader(
    'conll2000', ConllChunkCorpusReader,
    ['train.txt', 'test.txt'], ('NP','VP','PP'))
conll2002 = LazyCorpusLoader(
    # Raw string for the file-id regex, consistent with every other
    # pattern in this module ('\.' in a plain string only works by
    # accident of Python's unknown-escape handling).
    'conll2002', ConllChunkCorpusReader, r'.*\.(test|train).*',
    ('LOC', 'PER', 'ORG', 'MISC'))
floresta = LazyCorpusLoader(
    'floresta', BracketParseCorpusReader, r'(?!\.).*\.ptb', '#',
    tag_mapping_function=simplify_tag)
genesis = LazyCorpusLoader(
    'genesis', PlaintextCorpusReader, r'(?!\.).*\.txt')
gutenberg = LazyCorpusLoader(
    'gutenberg', PlaintextCorpusReader, r'(?!\.).*\.txt')
ieer = LazyCorpusLoader(
    'ieer', IEERCorpusReader, r'(?!README|\.).*')
inaugural = LazyCorpusLoader(
    'inaugural', PlaintextCorpusReader, r'(?!\.).*\.txt')
106
# More lazily-loaded corpus objects (see the comment block above the
# first group for the calling convention).
indian = LazyCorpusLoader(
    'indian', IndianCorpusReader, r'(?!\.).*\.pos',
    tag_mapping_function=simplify_indian_tag)
mac_morpho = LazyCorpusLoader(
    'mac_morpho', MacMorphoCorpusReader, r'(?!\.).*\.txt',
    tag_mapping_function=simplify_tag)
movie_reviews = LazyCorpusLoader(
    'movie_reviews', CategorizedPlaintextCorpusReader,
    r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*')
names = LazyCorpusLoader(
    'names', WordListCorpusReader, r'(?!\.).*\.txt')
nps_chat = LazyCorpusLoader(
    'nps_chat', NPSChatCorpusReader, r'(?!README|\.).*',
    tag_mapping_function=simplify_wsj_tag)
ppattach = LazyCorpusLoader(
    'ppattach', PPAttachmentCorpusReader, ['training', 'test', 'devset'])
qc = LazyCorpusLoader(
    'qc', StringCategoryCorpusReader, ['train.txt', 'test.txt'])
reuters = LazyCorpusLoader(
    # Raw string for the file-id regex, consistent with the rest of
    # this module.
    'reuters', CategorizedPlaintextCorpusReader, r'(training|test).*',
    cat_file='cats.txt')
rte = LazyCorpusLoader(
    'rte', RTECorpusReader, r'(?!\.).*\.xml')
senseval = LazyCorpusLoader(
    'senseval', SensevalCorpusReader, r'(?!\.).*\.pos')
shakespeare = LazyCorpusLoader(
    'shakespeare', XMLCorpusReader, r'(?!\.).*\.xml')
sinica_treebank = LazyCorpusLoader(
    'sinica_treebank', SinicaTreebankCorpusReader, ['parsed'],
    tag_mapping_function=simplify_tag)
state_union = LazyCorpusLoader(
    'state_union', PlaintextCorpusReader, r'(?!\.).*\.txt')
stopwords = LazyCorpusLoader(
    'stopwords', WordListCorpusReader, r'(?!README|\.).*')
timit = LazyCorpusLoader(
    'timit', TimitCorpusReader)
toolbox = LazyCorpusLoader(
    'toolbox', ToolboxCorpusReader, r'(?!.*(README|\.)).*\.(dic|txt)')
treebank = LazyCorpusLoader(
    'treebank/combined', BracketParseCorpusReader, r'wsj_.*\.mrg',
    tag_mapping_function=simplify_wsj_tag)
hebrew_treebank = LazyCorpusLoader(
    'hebrew_treebank', BracketParseCorpusReader, r'.*\.txt')
propbank = LazyCorpusLoader(
    'propbank', PropbankCorpusReader,
    # Raw string for the frame-file regex, consistent with the rest of
    # this module.
    'prop.txt', r'frames/.*\.xml', 'verbs.txt',
    # Strip the leading 'wsj/NN/' directory component from treebank
    # file ids before using them as propbank filenames.
    lambda filename: re.sub(r'^wsj/\d\d/', '', filename),
    treebank)
treebank_chunk = LazyCorpusLoader(
    'treebank/tagged', ChunkedCorpusReader, r'wsj_.*\.pos',
    # Split sentences after a tagged period ('/.'), but not when the
    # lookahead shows we are inside a '[...]' chunk bracket pair.
    sent_tokenizer=RegexpTokenizer(r'(?<=/\.)\s*(?![^\[]*\])', gaps=True),
    para_block_reader=tagged_treebank_para_block_reader)
treebank_raw = LazyCorpusLoader(
    'treebank/raw', PlaintextCorpusReader, r'wsj_.*')
udhr = LazyCorpusLoader(
    'udhr', PlaintextCorpusReader, r'(?!README|\.).*',
    # Per-file character encodings, as (filename-regex, codec) pairs --
    # presumably the first matching pattern decides how a file is
    # decoded (confirm against the reader's encoding handling).  The
    # bare codec names ('hebrew', 'arabic', 'cyrillic', 'greek') are
    # standard Python aliases for the corresponding ISO-8859 codecs.
    encoding=[('.*-UTF8$', 'utf-8'), ('.*-Latin1$', 'latin-1'),
              ('.*-Hebrew$', 'hebrew'), ('.*-Arabic$', 'arabic'),
              ('.*-Cyrillic$', 'cyrillic'), ('.*-SJIS$', 'SJIS'),
              ('.*-GB2312$', 'GB2312'), ('.*-Latin2$', 'ISO-8859-2'),
              ('.*-Greek$', 'greek'),
              # NOTE(review): '-UFT8' looks like a typo for '-UTF8',
              # but presumably matches corpus files whose names carry
              # that misspelling -- confirm before removing.
              ('.*-UFT8$', 'utf-8'),
              ('Hungarian_Magyar-Unicode', 'utf-16-le')]
    )
# Remaining lazily-loaded corpus objects; each defers reading its data
# until first access.
verbnet = LazyCorpusLoader('verbnet', VerbnetCorpusReader, r'(?!\.).*\.xml')
webtext = LazyCorpusLoader('webtext', PlaintextCorpusReader,
                           r'(?!README|\.).*')
words = LazyCorpusLoader('words', WordListCorpusReader, r'(?!README|\.).*')
ycoe = LazyCorpusLoader('ycoe', YCOECorpusReader)
182
183
210
211
if __name__ == '__main__':
    # No script behavior: this module's API is the corpus objects
    # defined above, accessed via `import nltk.corpus`.
    pass
215