Package nltk :: Package corpus :: Package reader :: Module chunked
[hide private]
[frames] | [no frames]

Source Code for Module nltk.corpus.reader.chunked

  1  # Natural Language Toolkit: Chunked Corpus Reader 
  2  # 
  3  # Copyright (C) 2001-2008 NLTK Project 
  4  # Author: Steven Bird <[email protected]> 
  5  #         Edward Loper <[email protected]> 
  6  # URL: <http://nltk.org> 
  7  # For license information, see LICENSE.TXT 
  8   
  9  """ 
 10  A reader for corpora that contain chunked (and optionally tagged) 
 11  documents. 
 12  """ 
 13   
 14  from nltk.corpus.reader.util import * 
 15  from nltk.corpus.reader.api import * 
 16  from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader 
 17  from nltk.tree import Tree 
 18  from nltk.tokenize import * 
 19  from nltk import chunk 
 20  import os.path, codecs 
 21   
class ChunkedCorpusReader(CorpusReader):
    """
    Reader for chunked (and optionally tagged) corpora.  Paragraphs
    are split using a block reader.  They are then tokenized into
    sentences using a sentence tokenizer.  Finally, these sentences
    are parsed into chunk trees using a string-to-chunktree conversion
    function.  Each of these steps can be performed using a default
    function or a custom function.  By default, paragraphs are split
    on blank lines; sentences are listed one per line; and sentences
    are parsed into chunk trees using L{chunk.tagstr2tree}.
    """
    def __init__(self, root, files, extension='',
                 str2chunktree=chunk.tagstr2tree,
                 sent_tokenizer=RegexpTokenizer('\n', gaps=True),
                 para_block_reader=read_blankline_block,
                 encoding=None):
        """
        @param root: The root directory for this corpus.
        @param files: A list or regexp specifying the files in this corpus.
        @param extension: Unused; retained for backward compatibility
            with older callers.
        @param str2chunktree: Function mapping one sentence string to
            a chunk L{Tree}.
        @param sent_tokenizer: Tokenizer used to split a paragraph
            block into sentence strings.
        @param para_block_reader: Function used to split the stream
            into paragraph blocks.
        @param encoding: The default file encoding, or None.
        """
        CorpusReader.__init__(self, root, files, encoding)

        self._cv_args = (str2chunktree, sent_tokenizer, para_block_reader)
        """Arguments for corpus views generated by this corpus: a tuple
        (str2chunktree, sent_tokenizer, para_block_tokenizer)"""

    def _views(self, files, tagged, group_by_sent, group_by_para, chunked):
        """
        Build one L{ChunkedCorpusView} per requested file, configured
        with the given flags, and concatenate them.  All the public
        accessor methods below delegate here; they differ only in the
        four flag values.
        """
        return concat([ChunkedCorpusView(f, enc, tagged, group_by_sent,
                                         group_by_para, chunked,
                                         *self._cv_args)
                       for (f, enc) in self.abspaths(files, True)])

    def raw(self, files=None):
        """
        @return: the given file or files as a single string.
        @rtype: C{str}
        """
        if files is None: files = self._files
        elif isinstance(files, basestring): files = [files]
        return concat([self.open(f).read() for f in files])

    def words(self, files=None):
        """
        @return: the given file or files as a list of words
            and punctuation symbols.
        @rtype: C{list} of C{str}
        """
        return self._views(files, 0, 0, 0, 0)

    def sents(self, files=None):
        """
        @return: the given file or files as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        @rtype: C{list} of (C{list} of C{str})
        """
        return self._views(files, 0, 1, 0, 0)

    def paras(self, files=None):
        """
        @return: the given file or files as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        @rtype: C{list} of (C{list} of (C{list} of C{str}))
        """
        return self._views(files, 0, 1, 1, 0)

    def tagged_words(self, files=None):
        """
        @return: the given file or files as a list of tagged
            words and punctuation symbols, encoded as tuples
            C{(word,tag)}.
        @rtype: C{list} of C{(str,str)}
        """
        return self._views(files, 1, 0, 0, 0)

    def tagged_sents(self, files=None):
        """
        @return: the given file or files as a list of
            sentences, each encoded as a list of C{(word,tag)} tuples.
        @rtype: C{list} of (C{list} of C{(str,str)})
        """
        return self._views(files, 1, 1, 0, 0)

    def tagged_paras(self, files=None):
        """
        @return: the given file or files as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of C{(word,tag)} tuples.
        @rtype: C{list} of (C{list} of (C{list} of C{(str,str)}))
        """
        return self._views(files, 1, 1, 1, 0)

    def chunked_words(self, files=None):
        """
        @return: the given file or files as a list of tagged
            words and chunks.  Words are encoded as C{(word, tag)}
            tuples (if the corpus has tags) or word strings (if the
            corpus has no tags).  Chunks are encoded as depth-one
            trees over C{(word,tag)} tuples or word strings.
        @rtype: C{list} of (C{(str,str)} and L{Tree})
        """
        return self._views(files, 1, 0, 0, 1)

    def chunked_sents(self, files=None):
        """
        @return: the given file or file as a list of
            sentences, each encoded as a shallow C{Tree}.  The leaves
            of these trees are encoded as C{(word, tag)} tuples (if
            the corpus has tags) or word strings (if the corpus has no
            tags).
        @rtype: C{list} of L{Tree}
        """
        return self._views(files, 1, 1, 0, 1)

    def chunked_paras(self, files=None):
        """
        @return: the given file or files as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as a shallow C{Tree}.  The leaves of these
            trees are encoded as C{(word, tag)} tuples (if the corpus
            has tags) or word strings (if the corpus has no tags).
        @rtype: C{list} of (C{list} of L{Tree})
        """
        return self._views(files, 1, 1, 1, 1)

    def _read_block(self, stream):
        # Default block reader: one blank-line-separated block parsed
        # as tagged chunk strings.
        return [chunk.tagstr2tree(t) for t in
                read_blankline_block(stream)]
155
class ChunkedCorpusView(StreamBackedCorpusView):
    """
    A stream-backed corpus view for chunked corpora.  Four integer
    flags control how much structure is kept when a block is read:
    C{tagged} keeps (word,tag) tuples rather than bare words;
    C{chunked} keeps chunk C{Tree}s rather than flat token lists;
    C{group_by_sent} and C{group_by_para} keep sentence and paragraph
    grouping rather than flattening.
    """
    def __init__(self, filename, encoding, tagged, group_by_sent,
                 group_by_para, chunked, str2chunktree, sent_tokenizer,
                 para_block_reader):
        StreamBackedCorpusView.__init__(self, filename, encoding=encoding)
        # Output-shaping flags.
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        self._chunked = chunked
        # Parsing pipeline: paragraph blocks -> sentence strings -> trees.
        self._str2chunktree = str2chunktree
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader

    def read_block(self, stream):
        """
        Read one paragraph block from C{stream}, parse each of its
        sentences into a chunk tree, and strip tags/chunks/grouping
        according to this view's flags.
        """
        block = []
        for para_str in self._para_block_reader(stream):
            para = []
            sent_strs = self._sent_tokenizer.tokenize(para_str)
            for sent_str in sent_strs:
                parsed = self._str2chunktree(sent_str)

                # Discard tag information unless it was requested.
                if not self._tagged:
                    parsed = self._untag(parsed)

                # Discard chunk structure unless it was requested,
                # leaving a flat list of tokens.
                if not self._chunked:
                    parsed = parsed.leaves()

                # Either keep the sentence as a unit, or splice its
                # tokens directly into the paragraph.
                if self._group_by_sent:
                    para.append(parsed)
                else:
                    para.extend(parsed)

            # Either keep the paragraph as a unit, or splice its
            # contents directly into the block.
            if self._group_by_para:
                block.append(para)
            else:
                block.extend(para)

        return block

    def _untag(self, tree):
        """
        Destructively replace every (word,tag) leaf of C{tree} with
        just the word, recursing into subtrees; return C{tree}.
        """
        for idx in range(len(tree)):
            node = tree[idx]
            if isinstance(node, Tree):
                self._untag(node)
            elif isinstance(node, tuple):
                tree[idx] = node[0]
            else:
                raise ValueError('expected child to be Tree or tuple')
        return tree
208