# Code coverage notes for nltk.corpus.reader.chunked
# Untested functions:
# - ChunkedCorpusReader: __init__(), _read_block(), chunked_paras(), chunked_sents(), chunked_words(), paras(), raw(), sents(), tagged_paras(), tagged_sents(), tagged_words(), words()
# - ChunkedCorpusView: __init__(), _untag(), read_block()
"""
A reader for corpora that contain chunked (and optionally tagged)
documents.
"""
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
from nltk.tree import Tree
from nltk.tokenize import *
from nltk import chunk
import os.path, codecs
class ChunkedCorpusReader(CorpusReader):
    """
    Reader for chunked (and optionally tagged) corpora.  Paragraphs
    are split using a block reader.  They are then tokenized into
    sentences using a sentence tokenizer.  Finally, these sentences
    are parsed into chunk trees using a string-to-chunktree conversion
    function.  Each of these steps can be performed using a default
    function or a custom function.  By default, paragraphs are split
    on blank lines; sentences are listed one per line; and sentences
    are parsed into chunk trees using L{chunk.tagstr2tree}.
    """
    def __init__(self, root, files, extension='',
                 str2chunktree=chunk.tagstr2tree,
                 sent_tokenizer=RegexpTokenizer('\n', gaps=True),
                 para_block_reader=read_blankline_block,
                 encoding=None):
        """
        @param root: The root directory for this corpus.
        @param files: A list or regexp specifying the files in this corpus.
        @param str2chunktree: Function mapping one sentence string to a
            chunk C{Tree}.
        @param sent_tokenizer: Tokenizer used to split a paragraph block
            into sentence strings.
        @param para_block_reader: Function reading one paragraph block
            from a stream.
        @param encoding: The file encoding, or None for no decoding.
        """
        CorpusReader.__init__(self, root, files, encoding)
        # Arguments for corpus views generated by this corpus: a tuple
        # (str2chunktree, sent_tokenizer, para_block_reader), splatted
        # into every ChunkedCorpusView constructed by _views().
        self._cv_args = (str2chunktree, sent_tokenizer, para_block_reader)

    def _views(self, files, tagged, group_by_sent, group_by_para, chunked):
        """
        Helper shared by all accessor methods: build one
        L{ChunkedCorpusView} per requested file, configured with the
        given output-format flags, and concatenate the views.
        """
        return concat([ChunkedCorpusView(f, enc, tagged, group_by_sent,
                                         group_by_para, chunked,
                                         *self._cv_args)
                       for (f, enc) in self.abspaths(files, True)])

    def raw(self, files=None):
        """
        @return: the given file or files as a single string.
        @rtype: C{str}
        """
        if files is None: files = self._files
        elif isinstance(files, basestring): files = [files]
        return concat([self.open(f).read() for f in files])

    def words(self, files=None):
        """
        @return: the given file or files as a list of words
            and punctuation symbols.
        @rtype: C{list} of C{str}
        """
        return self._views(files, 0, 0, 0, 0)

    def sents(self, files=None):
        """
        @return: the given file or files as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        @rtype: C{list} of (C{list} of C{str})
        """
        return self._views(files, 0, 1, 0, 0)

    def paras(self, files=None):
        """
        @return: the given file or files as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        @rtype: C{list} of (C{list} of (C{list} of C{str}))
        """
        return self._views(files, 0, 1, 1, 0)

    def tagged_words(self, files=None):
        """
        @return: the given file or files as a list of tagged
            words and punctuation symbols, encoded as tuples
            C{(word,tag)}.
        @rtype: C{list} of C{(str,str)}
        """
        return self._views(files, 1, 0, 0, 0)

    def tagged_sents(self, files=None):
        """
        @return: the given file or files as a list of
            sentences, each encoded as a list of C{(word,tag)} tuples.
        @rtype: C{list} of (C{list} of C{(str,str)})
        """
        return self._views(files, 1, 1, 0, 0)

    def tagged_paras(self, files=None):
        """
        @return: the given file or files as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of C{(word,tag)} tuples.
        @rtype: C{list} of (C{list} of (C{list} of C{(str,str)}))
        """
        return self._views(files, 1, 1, 1, 0)

    def chunked_words(self, files=None):
        """
        @return: the given file or files as a list of tagged
            words and chunks.  Words are encoded as C{(word, tag)}
            tuples (if the corpus has tags) or word strings (if the
            corpus has no tags).  Chunks are encoded as depth-one
            trees over C{(word,tag)} tuples or word strings.
        @rtype: C{list} of (C{(str,str)} and L{Tree})
        """
        return self._views(files, 1, 0, 0, 1)

    def chunked_sents(self, files=None):
        """
        @return: the given file or file as a list of
            sentences, each encoded as a shallow C{Tree}.  The leaves
            of these trees are encoded as C{(word, tag)} tuples (if
            the corpus has tags) or word strings (if the corpus has no
            tags).
        @rtype: C{list} of L{Tree}
        """
        return self._views(files, 1, 1, 0, 1)

    def chunked_paras(self, files=None):
        """
        @return: the given file or files as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as a shallow C{Tree}.  The leaves of these
            trees are encoded as C{(word, tag)} tuples (if the corpus
            has tags) or word strings (if the corpus has no tags).
        @rtype: C{list} of (C{list} of L{Tree})
        """
        return self._views(files, 1, 1, 1, 1)

    def _read_block(self, stream):
        # Fallback block reader: parse each blank-line-separated block
        # with the default tagged-string-to-tree converter.
        return [chunk.tagstr2tree(t) for t in
                read_blankline_block(stream)]
class ChunkedCorpusView(StreamBackedCorpusView):
    """
    A stream-backed corpus view over a chunked (and optionally tagged)
    corpus file.  Four boolean flags select how much structure each
    block keeps: POS tags, sentence grouping, paragraph grouping, and
    chunk trees.
    """
    def __init__(self, filename, encoding, tagged, group_by_sent,
                 group_by_para, chunked, str2chunktree, sent_tokenizer,
                 para_block_reader):
        StreamBackedCorpusView.__init__(self, filename, encoding=encoding)
        # Output-format flags.
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        self._chunked = chunked
        # Callables used to carve the raw text into structure.
        self._str2chunktree = str2chunktree
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader

    def read_block(self, stream):
        """
        Read one paragraph block from C{stream} and return its tokens,
        formatted according to this view's flags.
        """
        block = []
        # Either nest each paragraph/sentence as its own list, or
        # splice its elements directly into the enclosing list.
        add_para = block.append if self._group_by_para else block.extend
        for para_str in self._para_block_reader(stream):
            para = []
            add_sent = para.append if self._group_by_sent else para.extend
            for sent_str in self._sent_tokenizer.tokenize(para_str):
                tree = self._str2chunktree(sent_str)
                # Strip POS tags unless the caller asked for them.
                if not self._tagged:
                    tree = self._untag(tree)
                # Flatten to a plain token list unless chunk structure
                # was requested.
                add_sent(tree if self._chunked else tree.leaves())
            add_para(para)
        return block

    def _untag(self, tree):
        """
        Destructively replace every C{(word, tag)} leaf of C{tree} with
        just the word, recursing into subtrees; return C{tree}.
        """
        for index in range(len(tree)):
            child = tree[index]
            if isinstance(child, tuple):
                tree[index] = child[0]
            elif isinstance(child, Tree):
                self._untag(child)
            else:
                raise ValueError('expected child to be Tree or tuple')
        return tree