"""
A reader for corpora that contain chunked (and optionally tagged)
documents.
"""

from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
from nltk.tree import Tree
from nltk.tokenize import *
from nltk import chunk
import os.path, codecs

class ChunkedCorpusReader(CorpusReader):
    """
    Reader for chunked (and optionally tagged) corpora. Paragraphs
    are split using a block reader. They are then tokenized into
    sentences using a sentence tokenizer. Finally, these sentences
    are parsed into chunk trees using a string-to-chunktree conversion
    function. Each of these steps can be performed using a default
    function or a custom function. By default, paragraphs are split
    on blank lines; sentences are listed one per line; and sentences
    are parsed into chunk trees using L{chunk.tagstr2tree}.
    """
    def __init__(self, root, files,
                 str2chunktree=chunk.tagstr2tree,
                 sent_tokenizer=RegexpTokenizer('\n', gaps=True),
                 para_block_reader=read_blankline_block,
                 encoding=None):
        """
        @param root: The root directory for this corpus.
        @param files: A list or regexp specifying the files in this corpus.
        """
        CorpusReader.__init__(self, root, files, encoding)

        self._cv_args = (str2chunktree, sent_tokenizer, para_block_reader)
        """Arguments for corpus views generated by this corpus: a tuple
        (str2chunktree, sent_tokenizer, para_block_reader)"""
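
    # A usage sketch (the corpus path and file pattern here are hypothetical;
    # the defaults assume blank-line-separated paragraphs, one sentence per
    # line, and chunk.tagstr2tree-style markup such as "[ the/DT dog/NN ]"):
    #
    #     reader = ChunkedCorpusReader('/path/to/corpus', r'.*\.pos')
    #     trees = reader.chunked_sents()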

    def raw(self, files=None):
        """
        @return: the given file or files as a single string.
        @rtype: C{str}
        """
        if files is None: files = self._files
        elif isinstance(files, basestring): files = [files]
        return concat([self.open(f).read() for f in files])
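
    # E.g. (illustrative): reader.raw('file1.pos') would return that file's
    # text as one untokenized string, tags and chunk brackets included.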

    def words(self, files=None):
        """
        @return: the given file or files as a list of words
            and punctuation symbols.
        @rtype: C{list} of C{str}
        """
        return concat([ChunkedCorpusView(f, enc, 0, 0, 0, 0, *self._cv_args)
                       for (f, enc) in self.abspaths(files, True)])
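
    # Illustrative shape (not real corpus data):
    #     reader.words() -> ['Confidence', 'in', 'the', 'pound', ...]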

    def sents(self, files=None):
        """
        @return: the given file or files as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        @rtype: C{list} of (C{list} of C{str})
        """
        return concat([ChunkedCorpusView(f, enc, 0, 1, 0, 0, *self._cv_args)
                       for (f, enc) in self.abspaths(files, True)])
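
    # Illustrative shape: one token list per sentence, e.g.
    #     reader.sents() -> [['Confidence', 'in', 'the', ...], ...]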

    def paras(self, files=None):
        """
        @return: the given file or files as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        @rtype: C{list} of (C{list} of (C{list} of C{str}))
        """
        return concat([ChunkedCorpusView(f, enc, 0, 1, 1, 0, *self._cv_args)
                       for (f, enc) in self.abspaths(files, True)])
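
    # Illustrative shape: one list of sentences per paragraph, e.g.
    #     reader.paras() -> [[['Confidence', 'in', ...], [...]], ...]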

    def tagged_words(self, files=None):
        """
        @return: the given file or files as a list of tagged
            words and punctuation symbols, encoded as tuples
            C{(word,tag)}.
        @rtype: C{list} of C{(str,str)}
        """
        return concat([ChunkedCorpusView(f, enc, 1, 0, 0, 0, *self._cv_args)
                       for (f, enc) in self.abspaths(files, True)])
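
    # Illustrative shape: a flat list of (word, tag) pairs, e.g.
    #     reader.tagged_words() -> [('Confidence', 'NN'), ('in', 'IN'), ...]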

    def tagged_sents(self, files=None):
        """
        @return: the given file or files as a list of
            sentences, each encoded as a list of C{(word,tag)} tuples.
        @rtype: C{list} of (C{list} of C{(str,str)})
        """
        return concat([ChunkedCorpusView(f, enc, 1, 1, 0, 0, *self._cv_args)
                       for (f, enc) in self.abspaths(files, True)])
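
    # Illustrative shape: one list of (word, tag) pairs per sentence, e.g.
    #     reader.tagged_sents() -> [[('Confidence', 'NN'), ('in', 'IN'), ...], ...]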

    def tagged_paras(self, files=None):
        """
        @return: the given file or files as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of C{(word,tag)} tuples.
        @rtype: C{list} of (C{list} of (C{list} of C{(str,str)}))
        """
        return concat([ChunkedCorpusView(f, enc, 1, 1, 1, 0, *self._cv_args)
                       for (f, enc) in self.abspaths(files, True)])
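
    # Illustrative shape: paragraphs of sentences of (word, tag) pairs, e.g.
    #     reader.tagged_paras() -> [[[('Confidence', 'NN'), ...], ...], ...]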

    def chunked_words(self, files=None):
        """
        @return: the given file or files as a list of tagged
            words and chunks. Words are encoded as C{(word, tag)}
            tuples (if the corpus has tags) or word strings (if the
            corpus has no tags). Chunks are encoded as depth-one
            trees over C{(word,tag)} tuples or word strings.
        @rtype: C{list} of (C{(str,str)} and L{Tree})
        """
        return concat([ChunkedCorpusView(f, enc, 1, 0, 0, 1, *self._cv_args)
                       for (f, enc) in self.abspaths(files, True)])
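
    # Illustrative shape: chunks appear as depth-one Trees among the tokens,
    # e.g. reader.chunked_words() ->
    #     [Tree('NP', [('Confidence', 'NN')]), ('in', 'IN'), ...]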

    def chunked_sents(self, files=None):
        """
        @return: the given file or files as a list of
            sentences, each encoded as a shallow C{Tree}. The leaves
            of these trees are encoded as C{(word, tag)} tuples (if
            the corpus has tags) or word strings (if the corpus has no
            tags).
        @rtype: C{list} of L{Tree}
        """
        return concat([ChunkedCorpusView(f, enc, 1, 1, 0, 1, *self._cv_args)
                       for (f, enc) in self.abspaths(files, True)])
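
    # Illustrative shape: one shallow tree per sentence (node labels depend
    # on the str2chunktree function), e.g. reader.chunked_sents()[0] ->
    #     Tree('S', [Tree('NP', [('Confidence', 'NN')]), ('in', 'IN'), ...])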

    def chunked_paras(self, files=None):
        """
        @return: the given file or files as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as shallow C{Tree}s. The leaves of these
            trees are encoded as C{(word, tag)} tuples (if the corpus
            has tags) or word strings (if the corpus has no tags).
        @rtype: C{list} of (C{list} of L{Tree})
        """
        return concat([ChunkedCorpusView(f, enc, 1, 1, 1, 1, *self._cv_args)
                       for (f, enc) in self.abspaths(files, True)])
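
    # Illustrative shape: one list of sentence trees per paragraph, e.g.
    #     reader.chunked_paras() -> [[Tree('S', [...]), Tree('S', [...])], ...]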

class ChunkedCorpusView(StreamBackedCorpusView):
    def __init__(self, filename, encoding, tagged, group_by_sent,
                 group_by_para, chunked, str2chunktree, sent_tokenizer,
                 para_block_reader):
        StreamBackedCorpusView.__init__(self, filename, encoding=encoding)
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        self._chunked = chunked
        self._str2chunktree = str2chunktree
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader

    def read_block(self, stream):
        block = []
        for para_str in self._para_block_reader(stream):
            para = []
            for sent_str in self._sent_tokenizer.tokenize(para_str):
                sent = self._str2chunktree(sent_str)

                # If requested, strip the tags off the leaves.
                if not self._tagged:
                    sent = self._untag(sent)

                # If requested, throw away the chunk structure.
                if not self._chunked:
                    sent = sent.leaves()

                # Add the sentence to the current paragraph.
                if self._group_by_sent:
                    para.append(sent)
                else:
                    para.extend(sent)

            # Add the paragraph to the block.
            if self._group_by_para:
                block.append(para)
            else:
                block.extend(para)

        return block
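
    # How the flags map to ChunkedCorpusReader's methods: `tagged` keeps
    # (word, tag) tuples rather than bare words, `chunked` keeps the chunk
    # trees rather than flattening them to their leaves, and `group_by_sent` /
    # `group_by_para` wrap tokens into per-sentence and per-paragraph lists.
    # For example, words() builds views with (0, 0, 0, 0) and chunked_paras()
    # with (1, 1, 1, 1).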
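
    # Minimal sketch of the _untag helper that read_block() calls above,
    # assuming the chunk trees produced by str2chunktree carry (word, tag)
    # tuples as leaves: strip the tags in place and return the tree.
    def _untag(self, tree):
        for i, child in enumerate(tree):
            if isinstance(child, Tree):
                self._untag(child)
            elif isinstance(child, tuple):
                tree[i] = child[0]
            else:
                raise ValueError('expected child to be Tree or tuple')
        return tree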