# Code coverage for nltk.corpus.reader.bracket_parse
# Untested functions: (list not preserved in this extract)
# Partially tested functions: (list not preserved in this extract)
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
from nltk.tree import bracket_parse, Tree
import sys
"""
Corpus reader for corpora that consist of parenthesis-delineated parse trees.
"""
# Matches a bracketed "(tag word)" pair that contains no nested brackets
# and captures both pieces: group 1 is the tag, group 2 is the word.
TAGWORD = re.compile(
    r'\(([^\s()]+) ([^\s()]+)\)'
)

# Same shape as TAGWORD, but only the word is captured.
WORD = re.compile(
    r'\([^\s()]+ ([^\s()]+)\)'
)

# Matches text that opens with two parentheses separated by nothing but
# optional whitespace, i.e. a parse wrapped in an extra, unlabeled pair
# of brackets.
EMPTY_BRACKETS = re.compile(
    r'\s*\(\s*\('
)
class BracketParseCorpusReader(SyntaxCorpusReader):
    """
    Reader for corpora that consist of parenthesis-delineated parse
    trees.
    """
    def __init__(self, root, files, comment_char=None,
                 detect_blocks='unindented_paren', encoding=None,
                 tag_mapping_function=None):
        """
        @param root: The root directory for this corpus.
        @param files: A list or regexp specifying the files in this corpus.
        @param comment_char: The character which can appear at the start of
            a line to indicate that the rest of the line is a comment.
        @param detect_blocks: The method that is used to find blocks
            in the corpus; can be 'unindented_paren' (every unindented
            parenthesis starts a new parse), 'sexpr' (brackets are
            matched) or 'blankline' (blocks are read with
            C{read_blankline_block}).
        @param encoding: Passed through unchanged to
            C{CorpusReader.__init__}.
        @param tag_mapping_function: Callable applied to every tag by
            C{_tag} when it is called with C{simplify_tags=True}.  If
            this is left as C{None}, such a call fails because C{None}
            is not callable.
        """
        CorpusReader.__init__(self, root, files, encoding)
        self._comment_char = comment_char
        self._detect_blocks = detect_blocks
        self._tag_mapping_function = tag_mapping_function

    def _read_block(self, stream):
        """
        Read one block from C{stream} using the strategy selected by the
        C{detect_blocks} constructor argument.

        @param stream: The stream handed on to the block-reading helper.
        @return: Whatever the selected C{read_*_block} helper returns;
            for 'unindented_paren' this is a list of strings with
            comment lines blanked out.
        @raise AssertionError: If C{detect_blocks} is not one of the
            supported values.
        """
        if self._detect_blocks == 'sexpr':
            return read_sexpr_block(stream, comment_char=self._comment_char)
        elif self._detect_blocks == 'blankline':
            return read_blankline_block(stream)
        elif self._detect_blocks == 'unindented_paren':
            # A new token begins at every line that starts with '('.
            toks = read_regexp_block(stream, start_re=r'^\(')
            # Blank out every line that starts with the comment character.
            if self._comment_char:
                comment_re = '(?m)^%s.*' % re.escape(self._comment_char)
                toks = [re.sub(comment_re, '', tok) for tok in toks]
            return toks
        else:
            # Raised explicitly instead of via ``assert`` so that the
            # check is not stripped when Python runs with -O.
            raise AssertionError('bad block type')

    def _normalize(self, t):
        """
        Rewrite the raw bracketed string C{t} into a regular form before
        it is parsed or searched.

        @param t: One block of bracketed text.
        @return: The rewritten string.
        """
        # If the parse is wrapped in an extra, unlabeled pair of
        # brackets, drop the first and last character of the stripped
        # text.
        if EMPTY_BRACKETS.match(t):
            t = t.strip()[1:-1]
        # Single-character leaves such as "(,)" become "(, ,)".
        t = re.sub(r"\((.)\)", r"(\1 \1)", t)
        # Three-field leaves "(a b c)" lose their third field: "(a b)".
        t = re.sub(r"\(([^\s()]+) ([^\s()]+) [^\s()]+\)", r"(\1 \2)", t)
        return t

    def _parse(self, t):
        """
        Parse the bracketed string C{t}, attempting to recover if the
        parser rejects it.

        On a C{ValueError} from C{bracket_parse} a notice is written to
        C{sys.stderr}.  If the error reports mismatched parentheses, one
        to four closing parentheses are appended in turn and the parse is
        retried.  If nothing succeeds, a flat C{Tree} labelled 'S' is
        built from the (word, tag) pairs found in C{t}.

        @param t: One block of bracketed text.
        @return: The result of C{bracket_parse}, or the flat fallback
            tree.
        """
        try:
            return bracket_parse(self._normalize(t))
        except ValueError as e:
            sys.stderr.write("Bad tree detected; trying to recover...\n")
            # Only an unbalanced-paren error can be fixed by appending
            # closing parens.
            if e.args == ('mismatched parens',):
                for n in range(1, 5):
                    try:
                        v = bracket_parse(self._normalize(t + ')' * n))
                        sys.stderr.write(" Recovered by adding %d close "
                                         "paren(s)\n" % n)
                        return v
                    except ValueError:
                        pass
            # Last resort: give up on structure and keep only the leaves.
            sys.stderr.write(" Recovered by returning a flat parse.\n")
            return Tree('S', self._tag(t))

    def _tag(self, t, simplify_tags=False):
        """
        Extract the (word, tag) pairs from the bracketed string C{t}.

        @param t: One block of bracketed text.
        @param simplify_tags: If true, every tag is passed through the
            C{tag_mapping_function} given to the constructor.
        @return: A list of (word, tag) tuples in order of appearance.
        """
        # TAGWORD captures (tag, word); swap the order on the way out.
        # Distinct loop names keep the parameter ``t`` from being shadowed.
        tagged_sent = [(word, tag) for (tag, word)
                       in TAGWORD.findall(self._normalize(t))]
        if simplify_tags:
            tagged_sent = [(word, self._tag_mapping_function(tag))
                           for (word, tag) in tagged_sent]
        return tagged_sent

    def _word(self, t):
        """
        Extract the words from the bracketed string C{t}.

        @param t: One block of bracketed text.
        @return: A list of the word strings matched by C{WORD}.
        """
        return WORD.findall(self._normalize(t))
class AlpinoCorpusReader(BracketParseCorpusReader):
    """
    Reader for the Alpino Dutch Treebank.
    """
    def __init__(self, root, encoding=None, tag_mapping_function=None):
        """
        @param root: The root directory for this corpus.
        @param encoding: Passed through to
            C{BracketParseCorpusReader.__init__}.
        @param tag_mapping_function: Passed through to
            C{BracketParseCorpusReader.__init__}.
        """
        # The file pattern is a raw string so that the regexp escape
        # ``\.`` is not treated as an (invalid) string escape sequence.
        BracketParseCorpusReader.__init__(
            self, root, r'alpino\.xml',
            detect_blocks='blankline',
            encoding=encoding,
            tag_mapping_function=tag_mapping_function)

    def _normalize(self, t):
        """
        Convert one block of Alpino XML into a bracketed string.

        @param t: One block of text from the corpus file.
        @return: The bracketed form of the block, or the empty string if
            the block does not begin with C{<alpino_ds}.
        """
        # Anything that is not an <alpino_ds ...> block is discarded.
        if t[:10] != "<alpino_ds":
            return ""
        # A <node> element with a cat attribute opens a bracket labelled
        # with that category: "(cat".
        t = re.sub(r' <node .*? cat="(\w+)".*>', r"(\1", t)
        # A self-closing <node> with pos and word attributes becomes a
        # complete leaf: "(pos word)".
        t = re.sub(r' <node .*? pos="(\w+)".*? word="([^"]+)".*/>', r"(\1 \2)", t)
        # Each closing </node> tag closes a bracket.
        t = re.sub(r" </node>", r")", t)
        # Remove the <sentence> element and the <alpino_ds> wrapper tags.
        t = re.sub(r"<sentence>.*</sentence>", r"", t)
        t = re.sub(r"</?alpino_ds.*>", r"", t)
        return t