Package nltk :: Package corpus :: Package reader :: Module chunked
[hide private]
[frames] | [no frames]

Source Code for Module nltk.corpus.reader.chunked

  1  # Natural Language Toolkit: Chunked Corpus Reader 
  2  # 
  3  # Copyright (C) 2001-2008 NLTK Project 
  4  # Author: Steven Bird <[email protected]> 
  5  #         Edward Loper <[email protected]> 
  6  # URL: <http://nltk.org> 
  7  # For license information, see LICENSE.TXT 
  8   
  9  """ 
 10  A reader for corpora that contain chunked (and optionally tagged) 
 11  documents. 
 12  """ 
 13   
 14  from nltk.corpus.reader.util import * 
 15  from nltk.corpus.reader.api import * 
 16  from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader 
 17  from nltk.tree import Tree 
 18  from nltk.tokenize import * 
 19  from nltk import chunk 
 20  import os.path, codecs 
 21   
class ChunkedCorpusReader(CorpusReader):
    """
    Reader for chunked (and optionally tagged) corpora.  Paragraphs
    are split using a block reader.  They are then tokenized into
    sentences using a sentence tokenizer.  Finally, these sentences
    are parsed into chunk trees using a string-to-chunktree conversion
    function.  Each of these steps can be performed using a default
    function or a custom function.  By default, paragraphs are split
    on blank lines; sentences are listed one per line; and sentences
    are parsed into chunk trees using L{chunk.tagstr2tree}.
    """
    def __init__(self, root, files, extension='',
                 str2chunktree=chunk.tagstr2tree,
                 sent_tokenizer=RegexpTokenizer('\n', gaps=True),
                 para_block_reader=read_blankline_block,
                 encoding=None):
        """
        @param root: The root directory for this corpus.
        @param files: A list or regexp specifying the files in this corpus.
        @param extension: Unused; retained for backward compatibility
            with older callers.
        @param str2chunktree: Function mapping one sentence string to
            a chunk L{Tree}.
        @param sent_tokenizer: Tokenizer used to split a paragraph
            block into sentence strings.
        @param para_block_reader: Function used to split the stream
            into paragraph blocks.
        @param encoding: The default file encoding, or None.
        """
        CorpusReader.__init__(self, root, files, encoding)

        self._cv_args = (str2chunktree, sent_tokenizer, para_block_reader)
        """Arguments for corpus views generated by this corpus: a tuple
        (str2chunktree, sent_tokenizer, para_block_tokenizer)"""

    def _views(self, files, tagged, group_by_sent, group_by_para, chunked):
        """
        Build one L{ChunkedCorpusView} per requested file, configured
        with the given flags, and concatenate them.  All the public
        accessor methods below delegate here; they differ only in the
        four flag values.
        """
        return concat([ChunkedCorpusView(f, enc, tagged, group_by_sent,
                                         group_by_para, chunked,
                                         *self._cv_args)
                       for (f, enc) in self.abspaths(files, True)])

    def raw(self, files=None):
        """
        @return: the given file or files as a single string.
        @rtype: C{str}
        """
        if files is None: files = self._files
        elif isinstance(files, basestring): files = [files]
        return concat([self.open(f).read() for f in files])

    def words(self, files=None):
        """
        @return: the given file or files as a list of words
            and punctuation symbols.
        @rtype: C{list} of C{str}
        """
        return self._views(files, 0, 0, 0, 0)

    def sents(self, files=None):
        """
        @return: the given file or files as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        @rtype: C{list} of (C{list} of C{str})
        """
        return self._views(files, 0, 1, 0, 0)

    def paras(self, files=None):
        """
        @return: the given file or files as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        @rtype: C{list} of (C{list} of (C{list} of C{str}))
        """
        return self._views(files, 0, 1, 1, 0)

    def tagged_words(self, files=None):
        """
        @return: the given file or files as a list of tagged
            words and punctuation symbols, encoded as tuples
            C{(word,tag)}.
        @rtype: C{list} of C{(str,str)}
        """
        return self._views(files, 1, 0, 0, 0)

    def tagged_sents(self, files=None):
        """
        @return: the given file or files as a list of
            sentences, each encoded as a list of C{(word,tag)} tuples.
        @rtype: C{list} of (C{list} of C{(str,str)})
        """
        return self._views(files, 1, 1, 0, 0)

    def tagged_paras(self, files=None):
        """
        @return: the given file or files as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of C{(word,tag)} tuples.
        @rtype: C{list} of (C{list} of (C{list} of C{(str,str)}))
        """
        return self._views(files, 1, 1, 1, 0)

    def chunked_words(self, files=None):
        """
        @return: the given file or files as a list of tagged
            words and chunks.  Words are encoded as C{(word, tag)}
            tuples (if the corpus has tags) or word strings (if the
            corpus has no tags).  Chunks are encoded as depth-one
            trees over C{(word,tag)} tuples or word strings.
        @rtype: C{list} of (C{(str,str)} and L{Tree})
        """
        return self._views(files, 1, 0, 0, 1)

    def chunked_sents(self, files=None):
        """
        @return: the given file or file as a list of
            sentences, each encoded as a shallow C{Tree}.  The leaves
            of these trees are encoded as C{(word, tag)} tuples (if
            the corpus has tags) or word strings (if the corpus has no
            tags).
        @rtype: C{list} of L{Tree}
        """
        return self._views(files, 1, 1, 0, 1)

    def chunked_paras(self, files=None):
        """
        @return: the given file or files as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as a shallow C{Tree}.  The leaves of these
            trees are encoded as C{(word, tag)} tuples (if the corpus
            has tags) or word strings (if the corpus has no tags).
        @rtype: C{list} of (C{list} of L{Tree})
        """
        return self._views(files, 1, 1, 1, 1)

    def _read_block(self, stream):
        # Default block reader: one blank-line-separated block parsed
        # as tagged chunk strings.
        return [chunk.tagstr2tree(t) for t in
                read_blankline_block(stream)]
155
class ChunkedCorpusView(StreamBackedCorpusView):
    """
    A stream-backed corpus view for chunked corpora.  Four integer
    flags control how much structure is kept when a block is read:
    C{tagged} keeps (word,tag) tuples rather than bare words;
    C{chunked} keeps chunk C{Tree}s rather than flat token lists;
    C{group_by_sent} and C{group_by_para} keep sentence and paragraph
    grouping rather than flattening.
    """
    def __init__(self, filename, encoding, tagged, group_by_sent,
                 group_by_para, chunked, str2chunktree, sent_tokenizer,
                 para_block_reader):
        StreamBackedCorpusView.__init__(self, filename, encoding=encoding)
        # Output-shaping flags.
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        self._chunked = chunked
        # Parsing pipeline: paragraph blocks -> sentence strings -> trees.
        self._str2chunktree = str2chunktree
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader

    def read_block(self, stream):
        """
        Read one paragraph block from C{stream}, parse each of its
        sentences into a chunk tree, and strip tags/chunks/grouping
        according to this view's flags.
        """
        block = []
        for para_str in self._para_block_reader(stream):
            para = []
            sent_strs = self._sent_tokenizer.tokenize(para_str)
            for sent_str in sent_strs:
                parsed = self._str2chunktree(sent_str)

                # Discard tag information unless it was requested.
                if not self._tagged:
                    parsed = self._untag(parsed)

                # Discard chunk structure unless it was requested,
                # leaving a flat list of tokens.
                if not self._chunked:
                    parsed = parsed.leaves()

                # Either keep the sentence as a unit, or splice its
                # tokens directly into the paragraph.
                if self._group_by_sent:
                    para.append(parsed)
                else:
                    para.extend(parsed)

            # Either keep the paragraph as a unit, or splice its
            # contents directly into the block.
            if self._group_by_para:
                block.append(para)
            else:
                block.extend(para)

        return block

    def _untag(self, tree):
        """
        Destructively replace every (word,tag) leaf of C{tree} with
        just the word, recursing into subtrees; return C{tree}.
        """
        for idx in range(len(tree)):
            node = tree[idx]
            if isinstance(node, Tree):
                self._untag(node)
            elif isinstance(node, tuple):
                tree[idx] = node[0]
            else:
                raise ValueError('expected child to be Tree or tuple')
        return tree
208