Code Coverage for nltk.corpus.reader.chunked

Untested Functions

# Natural Language Toolkit: Chunked Corpus Reader
#
# Copyright (C) 2001-2008 NLTK Project
# Author: Steven Bird <[email protected]>
#         Edward Loper <[email protected]>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT

"""
A reader for corpora that contain chunked (and optionally tagged)
documents.
"""

from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
from nltk.tree import Tree
from nltk.tokenize import *
from nltk import chunk
import os.path, codecs

class ChunkedCorpusReader(CorpusReader):
    """
    Reader for chunked (and optionally tagged) corpora.  Paragraphs
    are split using a block reader.  They are then tokenized into
    sentences using a sentence tokenizer.  Finally, these sentences
    are parsed into chunk trees using a string-to-chunktree conversion
    function.  Each of these steps can be performed using a default
    function or a custom function.  By default, paragraphs are split
    on blank lines; sentences are listed one per line; and sentences
    are parsed into chunk trees using L{chunk.tagstr2tree}.
    """
    def __init__(self, root, files, extension='',
                 str2chunktree=chunk.tagstr2tree,
                 sent_tokenizer=RegexpTokenizer('\n', gaps=True),
                 para_block_reader=read_blankline_block,
                 encoding=None):
        """
        @param root: The root directory for this corpus.
        @param files: A list or regexp specifying the files in this corpus.
        @param extension: Accepted but not used by this constructor.
        @param str2chunktree: Function that converts a sentence string
            into a chunk tree (default: L{chunk.tagstr2tree}).
        @param sent_tokenizer: Tokenizer whose C{tokenize} method splits
            a paragraph string into sentence strings (default: split
            on newlines, i.e. one sentence per line).
        @param para_block_reader: Function that reads a list of
            paragraph strings from a stream (default:
            L{read_blankline_block}).
        @param encoding: Passed through to C{CorpusReader.__init__}.
        """
        CorpusReader.__init__(self, root, files, encoding)

        self._cv_args = (str2chunktree, sent_tokenizer, para_block_reader)
        """Arguments for corpus views generated by this corpus: a tuple
        (str2chunktree, sent_tokenizer, para_block_reader)"""

    def raw(self, files=None):
        """
        @return: the given file or files as a single string.
        @rtype: C{str}
        """
        if files is None: files = self._files
        elif isinstance(files, basestring): files = [files]
        contents = []
        for f in files:
            stream = self.open(f)
            # Close each stream as soon as it has been read instead of
            # leaving the open handle for the garbage collector.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def _views(self, files, tagged, group_by_sent, group_by_para, chunked):
        """
        Shared implementation of the word/sentence/paragraph accessors.

        @param files: The file or files to read, as accepted by
            C{self.abspaths}.
        @param tagged: If true, keep C{(word,tag)} tuples; otherwise
            keep only the words.
        @param group_by_sent: If true, group the output by sentence.
        @param group_by_para: If true, group the output by paragraph.
        @param chunked: If true, keep the chunk structure; otherwise
            keep only the leaves.
        @return: the concatenation of one L{ChunkedCorpusView} per
            file, each built with the given flags and with this
            reader's C{_cv_args}.
        """
        return concat([ChunkedCorpusView(f, enc, tagged, group_by_sent,
                                         group_by_para, chunked,
                                         *self._cv_args)
                       for (f, enc) in self.abspaths(files, True)])

    def words(self, files=None):
        """
        @return: the given file or files as a list of words
            and punctuation symbols.
        @rtype: C{list} of C{str}
        """
        return self._views(files, 0, 0, 0, 0)

    def sents(self, files=None):
        """
        @return: the given file or files as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        @rtype: C{list} of (C{list} of C{str})
        """
        return self._views(files, 0, 1, 0, 0)

    def paras(self, files=None):
        """
        @return: the given file or files as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        @rtype: C{list} of (C{list} of (C{list} of C{str}))
        """
        return self._views(files, 0, 1, 1, 0)

    def tagged_words(self, files=None):
        """
        @return: the given file or files as a list of tagged
            words and punctuation symbols, encoded as tuples
            C{(word,tag)}.
        @rtype: C{list} of C{(str,str)}
        """
        return self._views(files, 1, 0, 0, 0)

    def tagged_sents(self, files=None):
        """
        @return: the given file or files as a list of
            sentences, each encoded as a list of C{(word,tag)} tuples.

        @rtype: C{list} of (C{list} of C{(str,str)})
        """
        return self._views(files, 1, 1, 0, 0)

    def tagged_paras(self, files=None):
        """
        @return: the given file or files as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of C{(word,tag)} tuples.
        @rtype: C{list} of (C{list} of (C{list} of C{(str,str)}))
        """
        return self._views(files, 1, 1, 1, 0)

    def chunked_words(self, files=None):
        """
        @return: the given file or files as a list of tagged
            words and chunks.  Words are encoded as C{(word, tag)}
            tuples (if the corpus has tags) or word strings (if the
            corpus has no tags).  Chunks are encoded as depth-one
            trees over C{(word,tag)} tuples or word strings.
        @rtype: C{list} of (C{(str,str)} and L{Tree})
        """
        return self._views(files, 1, 0, 0, 1)

    def chunked_sents(self, files=None):
        """
        @return: the given file or files as a list of
            sentences, each encoded as a shallow C{Tree}.  The leaves
            of these trees are encoded as C{(word, tag)} tuples (if
            the corpus has tags) or word strings (if the corpus has no
            tags).
        @rtype: C{list} of L{Tree}
        """
        return self._views(files, 1, 1, 0, 1)

    def chunked_paras(self, files=None):
        """
        @return: the given file or files as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as a shallow C{Tree}.  The leaves of these
            trees are encoded as C{(word, tag)} tuples (if the corpus
            has tags) or word strings (if the corpus has no tags).
        @rtype: C{list} of (C{list} of L{Tree})
        """
        return self._views(files, 1, 1, 1, 1)

    def _read_block(self, stream):
        """
        Read one block from C{stream} with L{read_blankline_block} and
        parse each string in it with L{chunk.tagstr2tree}.

        Note that this always uses those two functions, not the
        C{str2chunktree} / C{para_block_reader} values given to the
        constructor.  Nothing in this class calls it.

        @return: the parsed strings of the block.
        @rtype: C{list}
        """
        return [chunk.tagstr2tree(t) for t in
                read_blankline_block(stream)]

class ChunkedCorpusView(StreamBackedCorpusView):
    """
    A stream-backed view of a single chunked corpus file.  Four flags
    select the shape of the items it yields: whether tags are kept,
    whether chunk structure is kept, and whether items are grouped by
    sentence and/or by paragraph.
    """
    def __init__(self, filename, encoding, tagged, group_by_sent,
                 group_by_para, chunked, str2chunktree, sent_tokenizer,
                 para_block_reader):
        """
        @param filename: The file to read; passed to
            C{StreamBackedCorpusView.__init__}.
        @param encoding: The file's encoding; passed to
            C{StreamBackedCorpusView.__init__}.
        @param tagged: If false, tags are stripped from every sentence.
        @param group_by_sent: If true, each sentence is kept as one
            item; otherwise its contents are spliced into the paragraph.
        @param group_by_para: If true, each paragraph is kept as one
            item; otherwise its contents are spliced into the block.
        @param chunked: If false, each sentence is flattened to its
            leaves.
        @param str2chunktree: Function that parses a sentence string.
        @param sent_tokenizer: Object whose C{tokenize} method splits a
            paragraph string into sentence strings.
        @param para_block_reader: Function that reads paragraph strings
            from a stream.
        """
        StreamBackedCorpusView.__init__(self, filename, encoding=encoding)
        # Parsing pipeline.
        self._para_block_reader = para_block_reader
        self._sent_tokenizer = sent_tokenizer
        self._str2chunktree = str2chunktree
        # Output-shape flags.
        self._tagged = tagged
        self._chunked = chunked
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para

    def read_block(self, stream):
        """
        Read the next batch of paragraphs from C{stream} and convert
        them according to this view's flags.

        @return: a list of paragraphs (if grouping by paragraph), or
            the paragraphs' contents spliced together otherwise.
        @rtype: C{list}
        """
        items = []
        # Pick the accumulation strategy once: keep paragraphs whole,
        # or splice their contents directly into the result.
        if self._group_by_para:
            add_para = items.append
        else:
            add_para = items.extend
        for para_text in self._para_block_reader(stream):
            add_para(self._read_para(para_text))
        return items

    def _read_para(self, para_text):
        """
        @return: the converted sentences of one paragraph string, kept
            as separate items (if grouping by sentence) or spliced
            together otherwise.
        @rtype: C{list}
        """
        contents = []
        if self._group_by_sent:
            add_sent = contents.append
        else:
            add_sent = contents.extend
        for sent_text in self._sent_tokenizer.tokenize(para_text):
            add_sent(self._read_sent(sent_text))
        return contents

    def _read_sent(self, sent_text):
        """
        @return: one sentence string parsed with C{str2chunktree}, with
            tags removed unless this view is tagged, and flattened to
            its leaves unless this view is chunked.
        """
        parsed = self._str2chunktree(sent_text)
        # Tags are stripped first, so that the leaves taken below are
        # already plain words.
        if not self._tagged:
            parsed = self._untag(parsed)
        if not self._chunked:
            parsed = parsed.leaves()
        return parsed

    def _untag(self, tree):
        """
        Replace, in place, every tuple child of C{tree} (at any depth)
        with its first element.

        @return: the same C{tree} object, modified.
        @raise ValueError: if a child is neither a L{Tree} nor a tuple.
        """
        for index, node in enumerate(tree):
            if isinstance(node, Tree):
                # Subtrees are modified in place by the recursive call.
                self._untag(node)
                continue
            if not isinstance(node, tuple):
                raise ValueError('expected child to be Tree or tuple')
            tree[index] = node[0]
        return tree