Package nltk :: Package corpus :: Package reader :: Module plaintext
[hide private]
[frames] | [no frames]

Source Code for Module nltk.corpus.reader.plaintext

  1  # Natural Language Toolkit: Plaintext Corpus Reader 
  2  # 
  3  # Copyright (C) 2001-2008 NLTK Project 
  4  # Author: Steven Bird <[email protected]> 
  5  #         Edward Loper <[email protected]> 
  6  # URL: <http://nltk.org> 
  7  # For license information, see LICENSE.TXT 
  8   
  9  """ 
 10  A reader for corpora that consist of plaintext documents. 
 11  """ 
 12   
 13  import codecs 
 14  import nltk.data 
 15  from nltk.tokenize import * 
 16  from nltk.corpus.reader.util import * 
 17  from nltk.corpus.reader.api import * 
 18  from nltk.internals import deprecated 
 19   
class PlaintextCorpusReader(CorpusReader):
    """
    Reader for corpora that consist of plaintext documents.  Paragraphs
    are assumed to be split using blank lines.  Sentences and words can
    be tokenized using the default tokenizers, or by custom tokenizers
    specified as parameters to the constructor.

    This corpus reader can be customized (e.g., to skip preface
    sections of specific document formats) by creating a subclass and
    overriding the L{CorpusView} class variable.
    """

    CorpusView = StreamBackedCorpusView
    """The corpus view class used by this reader.  Subclasses of
    L{PlaintextCorpusReader} may specify alternative corpus view
    classes (e.g., to skip the preface sections of documents.)"""

    def __init__(self, root, files,
                 word_tokenizer=WordPunctTokenizer(),
                 sent_tokenizer=nltk.data.LazyLoader(
                     'tokenizers/punkt/english.pickle'),
                 para_block_reader=read_blankline_block,
                 encoding=None):
        """
        Construct a new plaintext corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = PlaintextCorpusReader(root, '.*\.txt')

        @param root: The root directory for this corpus.
        @param files: A list or regexp specifying the files in this corpus.
        @param word_tokenizer: Tokenizer for breaking sentences or
            paragraphs into words.
        @param sent_tokenizer: Tokenizer for breaking paragraphs
            into sentences.
        @param para_block_reader: The block reader used to divide the
            corpus into paragraph blocks.
        @param encoding: The encoding of the corpus files; or None to
            use the default encoding.
        """
        CorpusReader.__init__(self, root, files, encoding)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader

    def raw(self, files=None):
        """
        @return: the given file or files as a single string.
        @rtype: C{str}
        """
        if files is None: files = self._files
        elif isinstance(files, basestring): files = [files]
        return concat([self.open(f).read() for f in files])

    def words(self, files=None):
        """
        @return: the given file or files as a list of words
            and punctuation symbols.
        @rtype: C{list} of C{str}
        """
        return concat([self.CorpusView(filename, self._read_word_block,
                                       encoding=enc)
                       for (filename, enc) in self.abspaths(files, True)])

    def sents(self, files=None):
        """
        @return: the given file or files as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        @rtype: C{list} of (C{list} of C{str})
        """
        if self._sent_tokenizer is None:
            raise ValueError('No sentence tokenizer for this corpus')
        return concat([self.CorpusView(filename, self._read_sent_block,
                                       encoding=enc)
                       for (filename, enc) in self.abspaths(files, True)])

    def paras(self, files=None):
        """
        @return: the given file or files as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        @rtype: C{list} of (C{list} of (C{list} of C{str}))
        """
        if self._sent_tokenizer is None:
            raise ValueError('No sentence tokenizer for this corpus')
        return concat([self.CorpusView(filename, self._read_para_block,
                                       encoding=enc)
                       for (filename, enc) in self.abspaths(files, True)])

    def _read_word_block(self, stream):
        # Read 20 lines at a time to keep each corpus-view block small.
        words = []
        for i in range(20):
            words.extend(self._word_tokenizer.tokenize(stream.readline()))
        return words

    def _read_sent_block(self, stream):
        # One block = one paragraph's worth of tokenized sentences.
        sents = []
        for para in self._para_block_reader(stream):
            sents.extend([self._word_tokenizer.tokenize(sent)
                          for sent in self._sent_tokenizer.tokenize(para)])
        return sents

    def _read_para_block(self, stream):
        # One block = one paragraph, kept nested (list of sentences).
        paras = []
        for para in self._para_block_reader(stream):
            paras.append([self._word_tokenizer.tokenize(sent)
                          for sent in self._sent_tokenizer.tokenize(para)])
        return paras

    #{ Deprecated since 0.8
    @deprecated("Use .raw() or .words() instead.")
    def read(self, items=None, format='tokenized'):
        if format == 'raw': return self.raw(items)
        if format == 'tokenized': return self.words(items)
        raise ValueError('bad format %r' % format)
    @deprecated("Use .words() instead.")
    def tokenized(self, items=None):
        return self.words(items)
    #}
class CategorizedPlaintextCorpusReader(CategorizedCorpusReader,
                                       PlaintextCorpusReader):
    """
    A reader for plaintext corpora whose documents are divided into
    categories based on their file identifiers.
    """
    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (C{cat_pattern}, C{cat_map}, and C{cat_file}) are passed to
        the L{CategorizedCorpusReader constructor
        <CategorizedCorpusReader.__init__>}.  The remaining arguments
        are passed to the L{PlaintextCorpusReader constructor
        <PlaintextCorpusReader.__init__>}.
        """
        # The categorization base class consumes its keyword arguments
        # from `kwargs` first; the rest configure the plaintext reader.
        CategorizedCorpusReader.__init__(self, kwargs)
        PlaintextCorpusReader.__init__(self, *args, **kwargs)

    def _resolve(self, files, categories):
        # Map a (files, categories) pair to a file list; at most one of
        # the two may be specified.
        if files is not None and categories is not None:
            raise ValueError('Specify files or categories, not both')
        if categories is None:
            return files
        return self.files(categories)

    def raw(self, files=None, categories=None):
        resolved = self._resolve(files, categories)
        return PlaintextCorpusReader.raw(self, resolved)

    def words(self, files=None, categories=None):
        resolved = self._resolve(files, categories)
        return PlaintextCorpusReader.words(self, resolved)

    def sents(self, files=None, categories=None):
        resolved = self._resolve(files, categories)
        return PlaintextCorpusReader.sents(self, resolved)

    def paras(self, files=None, categories=None):
        resolved = self._resolve(files, categories)
        return PlaintextCorpusReader.paras(self, resolved)
177