9 """
10 A reader for corpora that consist of plaintext documents.
11 """
12
13 import codecs
14 import nltk.data
15 from nltk.tokenize import *
16 from nltk.corpus.reader.util import *
17 from nltk.corpus.reader.api import *
18 from nltk.internals import deprecated
19
class PlaintextCorpusReader(CorpusReader):
    """
    Reader for corpora that consist of plaintext documents.  Paragraphs
    are assumed to be split using blank lines.  Sentences and words can
    be tokenized using the default tokenizers, or by custom tokenizers
    specified as parameters to the constructor.

    This corpus reader can be customized (e.g., to skip preface
    sections of specific document formats) by creating a subclass and
    overriding the L{CorpusView} class variable.
    """

    CorpusView = StreamBackedCorpusView
    """The corpus view class used by this reader.  Subclasses of
    L{PlaintextCorpusReader} may specify alternative corpus view
    classes (e.g., to skip the preface sections of documents.)"""
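
    # Illustrative sketch only (the class names below are hypothetical,
    # not part of this module): a subclass can swap in its own view
    # class, e.g. to skip preface material:
    #
    #     class PrefaceSkippingView(StreamBackedCorpusView):
    #         pass  # would customize block reading to skip the preface
    #
    #     class PrefaceSkippingReader(PlaintextCorpusReader):
    #         CorpusView = PrefaceSkippingView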

    def __init__(self, root, files,
                 word_tokenizer=WordPunctTokenizer(),
                 sent_tokenizer=nltk.data.LazyLoader(
                     'tokenizers/punkt/english.pickle'),
                 para_block_reader=read_blankline_block,
                 encoding=None):
        """
        Construct a new plaintext corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = PlaintextCorpusReader(root, '.*\.txt')

        @param root: The root directory for this corpus.
        @param files: A list or regexp specifying the files in this corpus.
        @param word_tokenizer: Tokenizer for breaking sentences or
            paragraphs into words.
        @param sent_tokenizer: Tokenizer for breaking paragraphs
            into sentences.
        @param para_block_reader: The block reader used to divide the
            corpus into paragraph blocks.
        @param encoding: The encoding of the corpus files; if C{None},
            the files are read as byte strings.
        """
        CorpusReader.__init__(self, root, files, encoding)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
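
    # Illustrative usage only (the corpus path is hypothetical, and
    # RegexpTokenizer is just one possible choice of word tokenizer):
    #
    #     >>> from nltk.tokenize import RegexpTokenizer
    #     >>> reader = PlaintextCorpusReader('/path/to/corpus', r'.*\.txt',
    #     ...     word_tokenizer=RegexpTokenizer(r'\w+'))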

    def raw(self, files=None):
        """
        @return: the given file or files as a single string.
        @rtype: C{str}
        """
        if files is None: files = self._files
        elif isinstance(files, basestring): files = [files]
        return concat([self.open(f).read() for f in files])

    def words(self, files=None):
        """
        @return: the given file or files as a list of words
            and punctuation symbols.
        @rtype: C{list} of C{str}
        """
        return concat([self.CorpusView(filename, self._read_word_block,
                                       encoding=enc)
                       for (filename, enc) in self.abspaths(files, True)])
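
    # Illustrative only (the file id and tokens are hypothetical):
    # words() returns a flat list of token strings, e.g.
    #
    #     >>> reader.words('doc1.txt')[:4]
    #     ['A', 'flat', 'token', 'list']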

    def sents(self, files=None):
        """
        @return: the given file or files as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        @rtype: C{list} of (C{list} of C{str})
        """
        if self._sent_tokenizer is None:
            raise ValueError('No sentence tokenizer for this corpus')
        return concat([self.CorpusView(filename, self._read_sent_block,
                                       encoding=enc)
                       for (filename, enc) in self.abspaths(files, True)])
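
    # Illustrative only: sents() adds one level of nesting, giving one
    # list of word strings per sentence, so reader.sents()[0] is the
    # first sentence as a list of tokens.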

    def paras(self, files=None):
        """
        @return: the given file or files as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        @rtype: C{list} of (C{list} of (C{list} of C{str}))
        """
        if self._sent_tokenizer is None:
            raise ValueError('No sentence tokenizer for this corpus')
        return concat([self.CorpusView(filename, self._read_para_block,
                                       encoding=enc)
                       for (filename, enc) in self.abspaths(files, True)])
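
    # Illustrative only (the file id is hypothetical): paras() nests one
    # level deeper than sents(), paragraph -> sentence -> word string:
    #
    #     >>> first_para = reader.paras('doc1.txt')[0]
    #     >>> first_sent = first_para[0]     # a list of word strings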
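
    # The _read_*_block() helpers below implement the block-reader
    # protocol expected by StreamBackedCorpusView: each call consumes
    # one block from the stream and returns the tokens it contains.
    # _read_word_block() reads up to twenty lines per call; the sentence
    # and paragraph readers typically consume one paragraph block per
    # call (blank-line delimited, with the default para_block_reader).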
    def _read_word_block(self, stream):
        words = []
        for i in range(20):
            words.extend(self._word_tokenizer.tokenize(stream.readline()))
        return words

    def _read_sent_block(self, stream):
        sents = []
        for para in self._para_block_reader(stream):
            sents.extend([self._word_tokenizer.tokenize(sent)
                          for sent in self._sent_tokenizer.tokenize(para)])
        return sents

    def _read_para_block(self, stream):
        paras = []
        for para in self._para_block_reader(stream):
            paras.append([self._word_tokenizer.tokenize(sent)
                          for sent in self._sent_tokenizer.tokenize(para)])
        return paras

    @deprecated("Use .raw() or .words() instead.")
    def read(self, items=None, format='tokenized'):
        if format == 'raw': return self.raw(items)
        if format == 'tokenized': return self.words(items)
        raise ValueError('bad format %r' % format)

    @deprecated("Use .words() instead.")
    def tokenized(self, items=None):
        return self.words(items)


class CategorizedPlaintextCorpusReader(CategorizedCorpusReader,
                                       PlaintextCorpusReader):
    """
    A reader for plaintext corpora whose documents are divided into
    categories based on their file identifiers.
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (C{cat_pattern}, C{cat_map}, and C{cat_file}) are passed to
        the L{CategorizedCorpusReader constructor
        <CategorizedCorpusReader.__init__>}.  The remaining arguments
        are passed to the L{PlaintextCorpusReader constructor
        <PlaintextCorpusReader.__init__>}.
        """
        CategorizedCorpusReader.__init__(self, kwargs)
        PlaintextCorpusReader.__init__(self, *args, **kwargs)
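
    # Illustrative only (the corpus path, file pattern, category
    # pattern, and category name are hypothetical):
    #
    #     >>> reader = CategorizedPlaintextCorpusReader(
    #     ...     '/path/to/corpus', r'.*\.txt', cat_pattern=r'(\w+)/.*')
    #     >>> reader.words(categories='news')[:4]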

    def _resolve(self, files, categories):
        if files is not None and categories is not None:
            raise ValueError('Specify files or categories, not both')
        if categories is not None:
            return self.files(categories)
        else:
            return files
    def raw(self, files=None, categories=None):
        return PlaintextCorpusReader.raw(
            self, self._resolve(files, categories))
    def words(self, files=None, categories=None):
        return PlaintextCorpusReader.words(
            self, self._resolve(files, categories))
    def sents(self, files=None, categories=None):
        return PlaintextCorpusReader.sents(
            self, self._resolve(files, categories))
    def paras(self, files=None, categories=None):
        return PlaintextCorpusReader.paras(
            self, self._resolve(files, categories))