1
2
3
4
5
6
7
8
9 """
10 A reader for corpora whose documents contain part-of-speech-tagged words.
11 """
12
13 from api import *
14 from util import *
15 from nltk.tag import str2tuple
16 from nltk.tokenize import *
17 import os
18 from nltk.internals import deprecated
19
21 """
22 Reader for simple part-of-speech tagged corpora. Paragraphs are
23 assumed to be split using blank lines. Sentences and words can be
24 tokenized using the default tokenizers, or by custom tokenizers
25 specified as parameters to the constructor. Words are parsed
26 using L{nltk.tag.str2tuple}. By default, C{'/'} is used as the
27 separator. I.e., words should have the form::
28
29 word1/tag1 word2/tag2 word3/tag3 ...
30
31 But custom separators may be specified as parameters to the
32 constructor. Part of speech tags are case-normalized to upper
33 case.
34 """
41 """
42 Construct a new Tagged Corpus reader for a set of documents
43 located at the given root directory. Example usage:
44
45 >>> root = '/...path to corpus.../'
46 >>> reader = TaggedCorpusReader(root, '.*', '.txt')
47
48 @param root: The root directory for this corpus.
49 @param files: A list or regexp specifying the files in this corpus.
50 """
51 CorpusReader.__init__(self, root, files, encoding)
52 self._sep = sep
53 self._word_tokenizer = word_tokenizer
54 self._sent_tokenizer = sent_tokenizer
55 self._para_block_reader = para_block_reader
56 self._tag_mapping_function = tag_mapping_function
57
def raw(self, files=None):
    """
    @return: the given file or files as a single string.
    @rtype: C{str}
    """
    # Normalize the argument: a single file id becomes a one-element
    # list, and no argument means "every file in the corpus".
    if isinstance(files, basestring):
        files = [files]
    elif files is None:
        files = self._files
    contents = []
    for fileid in files:
        contents.append(self.open(fileid).read())
    return concat(contents)
66
def words(self, files=None):
    """
    @return: the given file or files as a list of words
        and punctuation symbols.
    @rtype: C{list} of C{str}
    """
    # One flat, untagged view per file; tags are parsed then discarded.
    views = []
    for fileid, enc in self.abspaths(files, True):
        views.append(TaggedCorpusView(fileid, enc,
                                      tagged=False,
                                      group_by_sent=False,
                                      group_by_para=False,
                                      sep=self._sep,
                                      word_tokenizer=self._word_tokenizer,
                                      sent_tokenizer=self._sent_tokenizer,
                                      para_block_reader=self._para_block_reader,
                                      tag_mapping_function=None))
    return concat(views)
80
def sents(self, files=None):
    """
    @return: the given file or files as a list of
        sentences or utterances, each encoded as a list of word
        strings.
    @rtype: C{list} of (C{list} of C{str})
    """
    # Untagged views, grouped at the sentence level only.
    views = []
    for fileid, enc in self.abspaths(files, True):
        views.append(TaggedCorpusView(fileid, enc,
                                      tagged=False,
                                      group_by_sent=True,
                                      group_by_para=False,
                                      sep=self._sep,
                                      word_tokenizer=self._word_tokenizer,
                                      sent_tokenizer=self._sent_tokenizer,
                                      para_block_reader=self._para_block_reader,
                                      tag_mapping_function=None))
    return concat(views)
95
def paras(self, files=None):
    """
    @return: the given file or files as a list of
        paragraphs, each encoded as a list of sentences, which are
        in turn encoded as lists of word strings.
    @rtype: C{list} of (C{list} of (C{list} of C{str}))
    """
    # Untagged views, grouped by both sentence and paragraph.
    views = []
    for fileid, enc in self.abspaths(files, True):
        views.append(TaggedCorpusView(fileid, enc,
                                      tagged=False,
                                      group_by_sent=True,
                                      group_by_para=True,
                                      sep=self._sep,
                                      word_tokenizer=self._word_tokenizer,
                                      sent_tokenizer=self._sent_tokenizer,
                                      para_block_reader=self._para_block_reader,
                                      tag_mapping_function=None))
    return concat(views)
110
112 """
113 @return: the given file or files as a list of tagged
114 words and punctuation symbols, encoded as tuples
115 C{(word,tag)}.
116 @rtype: C{list} of C{(str,str)}
117 """
118 if simplify_tags:
119 tag_mapping_function = self._tag_mapping_function
120 else:
121 tag_mapping_function = None
122 return concat([TaggedCorpusView(filename, enc,
123 True, False, False,
124 self._sep, self._word_tokenizer,
125 self._sent_tokenizer,
126 self._para_block_reader,
127 tag_mapping_function)
128 for (filename, enc) in self.abspaths(files, True)])
129
131 """
132 @return: the given file or files as a list of
133 sentences, each encoded as a list of C{(word,tag)} tuples.
134
135 @rtype: C{list} of (C{list} of C{(str,str)})
136 """
137 if simplify_tags:
138 tag_mapping_function = self._tag_mapping_function
139 else:
140 tag_mapping_function = None
141 return concat([TaggedCorpusView(filename, enc,
142 True, True, False,
143 self._sep, self._word_tokenizer,
144 self._sent_tokenizer,
145 self._para_block_reader,
146 tag_mapping_function)
147 for (filename, enc) in self.abspaths(files, True)])
148
150 """
151 @return: the given file or files as a list of
152 paragraphs, each encoded as a list of sentences, which are
153 in turn encoded as lists of C{(word,tag)} tuples.
154 @rtype: C{list} of (C{list} of (C{list} of C{(str,str)}))
155 """
156 if simplify_tags:
157 tag_mapping_function = self._tag_mapping_function
158 else:
159 tag_mapping_function = None
160 return concat([TaggedCorpusView(filename, enc,
161 True, True, True,
162 self._sep, self._word_tokenizer,
163 self._sent_tokenizer,
164 self._para_block_reader,
165 tag_mapping_function)
166 for (filename, enc) in self.abspaths(files, True)])
167
168
169 @deprecated("Use .raw() or .words() or .sents() or .paras() or "
170 ".tagged_words() or .tagged_sents() or .tagged_paras() "
171 "instead.")
172 - def read(self, items=None, format='tagged', gs=True, gp=False):
@deprecated("Use .words() or .sents() or .paras() instead.")
def tokenized(self, items=None, gs=True, gp=False):
    # Legacy dispatcher: gs/gp select the grouping level of the
    # returned untagged text.
    if not gs:
        if gp:
            # gs=False with gp=True was dropped from the API.
            return 'Operation no longer supported.'
        return self.words()
    if gp:
        return self.paras()
    return self.sents()
@deprecated("Use .tagged_words() or .tagged_sents() or "
            ".tagged_paras() instead.")
def tagged(self, items=None, gs=True, gp=False):
    # Legacy dispatcher: gs/gp select the grouping level of the
    # returned tagged text.
    if not gs:
        if gp:
            # gs=False with gp=True was dropped from the API.
            return 'Operation no longer supported.'
        return self.tagged_words()
    if gp:
        return self.tagged_paras()
    return self.tagged_sents()
189
190
193 """
194 A reader for part-of-speech tagged corpora whose documents are
195 divided into categories based on their file identifiers.
196 """
198 """
199 Initialize the corpus reader. Categorization arguments
200 (C{cat_pattern}, C{cat_map}, and C{cat_file}) are passed to
201 the L{CategorizedCorpusReader constructor
202 <CategorizedCorpusReader.__init__>}. The remaining arguments
203 are passed to the L{TaggedCorpusReader constructor
204 <TaggedCorpusReader.__init__>}.
205 """
206 CategorizedCorpusReader.__init__(self, kwargs)
207 TaggedCorpusReader.__init__(self, *args, **kwargs)
208
216 - def raw(self, files=None, categories=None):
219 - def words(self, files=None, categories=None):
222 - def sents(self, files=None, categories=None):
225 - def paras(self, files=None, categories=None):
228 - def tagged_words(self, files=None, categories=None, simplify_tags=False):
231 - def tagged_sents(self, files=None, categories=None, simplify_tags=False):
234 - def tagged_paras(self, files=None, categories=None, simplify_tags=False):
237
239 """
240 A specialized corpus view for tagged documents. It can be
241 customized via flags to divide the tagged corpus documents up by
242 sentence or paragraph, and to include or omit part of speech tags.
243 C{TaggedCorpusView} objects are typically created by
244 L{TaggedCorpusReader} (not directly by nltk users).
245 """
def __init__(self, corpus_file, encoding, tagged, group_by_sent,
             group_by_para, sep, word_tokenizer, sent_tokenizer,
             para_block_reader, tag_mapping_function=None):
    """
    Create a view of the tagged document in C{corpus_file}.

    @param tagged: If true, words are C{(word, tag)} tuples;
        otherwise bare word strings.
    @param group_by_sent: If true, group words into sentence lists.
    @param group_by_para: If true, group sentences into paragraph lists.
    @param sep: The separator between a word and its tag.
    @param tag_mapping_function: Optional function applied to each tag
        (e.g. to simplify the tagset); C{None} leaves tags unchanged.
    """
    self._tagged = tagged
    self._group_by_sent = group_by_sent
    self._group_by_para = group_by_para
    self._sep = sep
    self._word_tokenizer = word_tokenizer
    self._sent_tokenizer = sent_tokenizer
    self._para_block_reader = para_block_reader
    self._tag_mapping_function = tag_mapping_function
    # Let the base class open the stream and set up lazy block reading.
    StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)
258
260 """Reads one paragraph at a time."""
261 block = []
262 for para_str in self._para_block_reader(stream):
263 para = []
264 for sent_str in self._sent_tokenizer.tokenize(para_str):
265 sent = [str2tuple(s, self._sep) for s in
266 self._word_tokenizer.tokenize(sent_str)]
267 if self._tag_mapping_function:
268 sent = [(w, self._tag_mapping_function(t)) for (w,t) in sent]
269 if not self._tagged:
270 sent = [w for (w,t) in sent]
271 if self._group_by_sent:
272 para.append(sent)
273 else:
274 para.extend(sent)
275 if self._group_by_para:
276 block.append(para)
277 else:
278 block.extend(para)
279 return block
280
281
283 """
284 A corpus reader for the MAC_MORPHO corpus. Each line contains a
285 single tagged word, using '_' as a separator. Sentence boundaries
286 are based on the end-sentence tag ('_.'). Paragraph information
287 is not included in the corpus, so each paragraph returned by
288 L{self.paras()} and L{self.tagged_paras()} contains a single
289 sentence.
290 """
291 - def __init__(self, root, files, encoding=None, tag_mapping_function=None):
299
302