Package nltk :: Package corpus :: Package reader :: Module tagged
[hide private]
[frames] | [no frames]

Source Code for Module nltk.corpus.reader.tagged

  1  # Natural Language Toolkit: Tagged Corpus Reader 
  2  # 
  3  # Copyright (C) 2001-2008 NLTK Project 
  4  # Author: Edward Loper <[email protected]> 
  5  #         Steven Bird <[email protected]> 
  6  # URL: <http://nltk.org> 
  7  # For license information, see LICENSE.TXT 
  8   
  9  """ 
 10  A reader for corpora whose documents contain part-of-speech-tagged words. 
 11  """        
 12   
 13  from api import * 
 14  from util import * 
 15  from nltk.tag import str2tuple 
 16  from nltk.tokenize import * 
 17  import os 
 18  from nltk.internals import deprecated 
 19   
class TaggedCorpusReader(CorpusReader):
    """
    Reader for simple part-of-speech tagged corpora.  Paragraphs are
    assumed to be split using blank lines.  Sentences and words can be
    tokenized using the default tokenizers, or by custom tokenizers
    specified as parameters to the constructor.  Words are parsed
    using L{nltk.tag.str2tuple}.  By default, C{'/'} is used as the
    separator.  I.e., words should have the form::

       word1/tag1 word2/tag2 word3/tag3 ...

    But custom separators may be specified as parameters to the
    constructor.  Part of speech tags are case-normalized to upper
    case.
    """
    def __init__(self, root, files,
                 sep='/', word_tokenizer=WhitespaceTokenizer(),
                 sent_tokenizer=RegexpTokenizer('\n', gaps=True),
                 para_block_reader=read_blankline_block,
                 encoding=None,
                 tag_mapping_function=None):
        """
        Construct a new Tagged Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = TaggedCorpusReader(root, '.*', '.txt')

        @param root: The root directory for this corpus.
        @param files: A list or regexp specifying the files in this corpus.
        @param sep: The separator between a word and its tag
            (default C{'/'}).
        @param word_tokenizer: Tokenizer used to split sentences into
            words.
        @param sent_tokenizer: Tokenizer used to split paragraphs into
            sentences.
        @param para_block_reader: Block reader used to split the input
            stream into paragraphs.
        @param encoding: The file encoding, or None for no decoding.
        @param tag_mapping_function: Function used to simplify tags when
            C{simplify_tags=True} is passed to the C{tagged_*} methods.
        """
        CorpusReader.__init__(self, root, files, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
        self._tag_mapping_function = tag_mapping_function

    def _views(self, files, tagged, group_by_sent, group_by_para,
               tag_mapping_function):
        """
        Return a concatenated C{TaggedCorpusView} over the given files.
        The boolean flags control the nesting and tagging of the items
        yielded by the view.  (Shared implementation for all the
        reader methods below.)
        """
        return concat([TaggedCorpusView(filename, enc,
                                        tagged, group_by_sent, group_by_para,
                                        self._sep, self._word_tokenizer,
                                        self._sent_tokenizer,
                                        self._para_block_reader,
                                        tag_mapping_function)
                       for (filename, enc) in self.abspaths(files, True)])

    def _tag_mapper(self, simplify_tags):
        """Return the tag-simplifying function, or None if disabled."""
        if simplify_tags:
            return self._tag_mapping_function
        else:
            return None

    def raw(self, files=None):
        """
        @return: the given file or files as a single string.
        @rtype: C{str}
        """
        if files is None: files = self._files
        elif isinstance(files, basestring): files = [files]
        return concat([self.open(f).read() for f in files])

    def words(self, files=None):
        """
        @return: the given file or files as a list of words
            and punctuation symbols.
        @rtype: C{list} of C{str}
        """
        return self._views(files, False, False, False, None)

    def sents(self, files=None):
        """
        @return: the given file or files as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        @rtype: C{list} of (C{list} of C{str})
        """
        return self._views(files, False, True, False, None)

    def paras(self, files=None):
        """
        @return: the given file or files as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        @rtype: C{list} of (C{list} of (C{list} of C{str}))
        """
        return self._views(files, False, True, True, None)

    def tagged_words(self, files=None, simplify_tags=False):
        """
        @return: the given file or files as a list of tagged
            words and punctuation symbols, encoded as tuples
            C{(word,tag)}.
        @rtype: C{list} of C{(str,str)}
        """
        return self._views(files, True, False, False,
                           self._tag_mapper(simplify_tags))

    def tagged_sents(self, files=None, simplify_tags=False):
        """
        @return: the given file or files as a list of
            sentences, each encoded as a list of C{(word,tag)} tuples.

        @rtype: C{list} of (C{list} of C{(str,str)})
        """
        return self._views(files, True, True, False,
                           self._tag_mapper(simplify_tags))

    def tagged_paras(self, files=None, simplify_tags=False):
        """
        @return: the given file or files as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of C{(word,tag)} tuples.
        @rtype: C{list} of (C{list} of (C{list} of C{(str,str)}))
        """
        return self._views(files, True, True, True,
                           self._tag_mapper(simplify_tags))

    #{ Deprecated since 0.8
    @deprecated("Use .raw() or .words() or .sents() or .paras() or "
                ".tagged_words() or .tagged_sents() or .tagged_paras() "
                "instead.")
    def read(self, items=None, format='tagged', gs=True, gp=False):
        if format == 'tagged': return self.tagged(items, gs, gp)
        if format == 'tokenized': return self.tokenized(items, gs, gp)
        raise ValueError('bad format %r' % format)
    @deprecated("Use .words() or .sents() or .paras() instead.")
    def tokenized(self, items=None, gs=True, gp=False):
        if gs and gp: return self.paras()
        elif gs and not gp: return self.sents()
        elif not gs and not gp: return self.words()
        else: return 'Operation no longer supported.'
    @deprecated("Use .tagged_words() or .tagged_sents() or "
                ".tagged_paras() instead.")
    def tagged(self, items=None, gs=True, gp=False):
        if gs and gp: return self.tagged_paras()
        elif gs and not gp: return self.tagged_sents()
        elif not gs and not gp: return self.tagged_words()
        else: return 'Operation no longer supported.'
    #}

class CategorizedTaggedCorpusReader(CategorizedCorpusReader,
                                    TaggedCorpusReader):
    """
    A reader for part-of-speech tagged corpora whose documents are
    divided into categories based on their file identifiers.
    """
    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (C{cat_pattern}, C{cat_map}, and C{cat_file}) are passed to
        the L{CategorizedCorpusReader constructor
        <CategorizedCorpusReader.__init__>}.  The remaining arguments
        are passed to the L{TaggedCorpusReader constructor
        <TaggedCorpusReader.__init__>}.
        """
        # CategorizedCorpusReader consumes its own kwargs before the
        # remainder is handed on to TaggedCorpusReader.
        CategorizedCorpusReader.__init__(self, kwargs)
        TaggedCorpusReader.__init__(self, *args, **kwargs)

    def _resolve(self, files, categories):
        """Map a (files, categories) pair to a concrete file list."""
        # At most one of the two selectors may be supplied.
        if files is not None and categories is not None:
            raise ValueError('Specify files or categories, not both')
        if categories is None:
            return files
        return self.files(categories)
    def raw(self, files=None, categories=None):
        """The raw text of the selected files or categories."""
        files = self._resolve(files, categories)
        return TaggedCorpusReader.raw(self, files)
    def words(self, files=None, categories=None):
        """The words of the selected files or categories."""
        files = self._resolve(files, categories)
        return TaggedCorpusReader.words(self, files)
    def sents(self, files=None, categories=None):
        """The sentences of the selected files or categories."""
        files = self._resolve(files, categories)
        return TaggedCorpusReader.sents(self, files)
    def paras(self, files=None, categories=None):
        """The paragraphs of the selected files or categories."""
        files = self._resolve(files, categories)
        return TaggedCorpusReader.paras(self, files)
    def tagged_words(self, files=None, categories=None, simplify_tags=False):
        """The tagged words of the selected files or categories."""
        files = self._resolve(files, categories)
        return TaggedCorpusReader.tagged_words(self, files, simplify_tags)
    def tagged_sents(self, files=None, categories=None, simplify_tags=False):
        """The tagged sentences of the selected files or categories."""
        files = self._resolve(files, categories)
        return TaggedCorpusReader.tagged_sents(self, files, simplify_tags)
    def tagged_paras(self, files=None, categories=None, simplify_tags=False):
        """The tagged paragraphs of the selected files or categories."""
        files = self._resolve(files, categories)
        return TaggedCorpusReader.tagged_paras(self, files, simplify_tags)

237
class TaggedCorpusView(StreamBackedCorpusView):
    """
    A specialized corpus view for tagged documents.  It can be
    customized via flags to divide the tagged corpus documents up by
    sentence or paragraph, and to include or omit part of speech tags.
    C{TaggedCorpusView} objects are typically created by
    L{TaggedCorpusReader} (not directly by nltk users).
    """
    def __init__(self, corpus_file, encoding, tagged, group_by_sent,
                 group_by_para, sep, word_tokenizer, sent_tokenizer,
                 para_block_reader, tag_mapping_function=None):
        # Record the configuration first; the base-class constructor
        # is invoked last, as in the original implementation.
        self._sep = sep
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
        self._tag_mapping_function = tag_mapping_function
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """Reads one paragraph at a time."""
        block = []
        for para_str in self._para_block_reader(stream):
            para = []
            for sent_str in self._sent_tokenizer.tokenize(para_str):
                tokens = self._word_tokenizer.tokenize(sent_str)
                sent = [str2tuple(tok, self._sep) for tok in tokens]
                if self._tag_mapping_function:
                    sent = [(word, self._tag_mapping_function(tag))
                            for (word, tag) in sent]
                if not self._tagged:
                    sent = [word for (word, tag) in sent]
                # Keep sentence structure, or flatten into the paragraph.
                (para.append if self._group_by_sent else para.extend)(sent)
            # Keep paragraph structure, or flatten into the block.
            (block.append if self._group_by_para else block.extend)(para)
        return block

280 281 # needs to implement simplified tags
class MacMorphoCorpusReader(TaggedCorpusReader):
    """
    A corpus reader for the MAC_MORPHO corpus.  Each line contains a
    single tagged word, using '_' as a separator.  Sentence boundaries
    are based on the end-sentence tag ('_.').  Paragraph information
    is not included in the corpus, so each paragraph returned by
    L{self.paras()} and L{self.tagged_paras()} contains a single
    sentence.
    """
    def __init__(self, root, files, encoding=None, tag_mapping_function=None):
        """Delegate to TaggedCorpusReader with MAC_MORPHO's conventions."""
        # One tagged word per line, '_' between word and tag.
        word_tok = LineTokenizer()
        sent_tok = RegexpTokenizer('.*\n')
        TaggedCorpusReader.__init__(
            self, root, files,
            sep='_',
            word_tokenizer=word_tok,
            sent_tokenizer=sent_tok,
            para_block_reader=self._read_block,
            encoding=encoding,
            tag_mapping_function=tag_mapping_function)

    def _read_block(self, stream):
        """Return one sentence per block, delimited by the '_.' tag."""
        return read_regexp_block(stream, r'.*', r'.*_\.')

302