Package nltk :: Package corpus :: Package reader :: Module bracket_parse
[hide private]
[frames | no frames]

Source Code for Module nltk.corpus.reader.bracket_parse

  1  # Natural Language Toolkit: Penn Treebank Reader 
  2  # 
  3  # Copyright (C) 2001-2008 NLTK Project 
  4  # Author: Steven Bird <[email protected]> 
  5  #         Edward Loper <[email protected]> 
  6  # URL: <http://nltk.org> 
  7  # For license information, see LICENSE.TXT 
  8   
  9  from nltk.corpus.reader.util import * 
 10  from nltk.corpus.reader.api import * 
 11  from nltk.tree import bracket_parse, Tree 
 12  import sys 
 13   
 14  """ 
 15  Corpus reader for corpora that consist of parenthesis-delineated parse trees. 
 16  """ 
 17   
 # Module-level patterns used to pull leaves out of a bracketed parse.
 # We use [^\s()]+ instead of \S+? to avoid matching ().

 # A "(tag word)" leaf; captures the tag (group 1) and the word (group 2).
 TAGWORD = re.compile(r'\(([^\s()]+) ([^\s()]+)\)')

 # A "(tag word)" leaf; captures only the word.
 WORD = re.compile(r'\([^\s()]+ ([^\s()]+)\)')

 # Two opening parens at the start of the text, separated only by optional
 # whitespace: the outer pair is an empty wrapper around the real parse.
 EMPTY_BRACKETS = re.compile(r'\s*\(\s*\(')
 22   
class BracketParseCorpusReader(SyntaxCorpusReader):
    """
    Reader for corpora that consist of parenthesis-delineated parse
    trees.
    """
    def __init__(self, root, files, comment_char=None,
                 detect_blocks='unindented_paren', encoding=None,
                 tag_mapping_function=None):
        """
        @param root: The root directory for this corpus.
        @param files: A list or regexp specifying the files in this corpus.
        @param comment_char: The character which can appear at the start of
            a line to indicate that the rest of the line is a comment.
        @param detect_blocks: The method that is used to find blocks
            in the corpus; can be 'unindented_paren' (every unindented
            parenthesis starts a new parse), 'sexpr' (brackets are
            matched) or 'blankline' (blocks are read with
            C{read_blankline_block}).
        @param encoding: Passed through unchanged to
            C{CorpusReader.__init__}.
        @param tag_mapping_function: Callable applied to every tag by
            C{_tag} when it is called with C{simplify_tags=True}.
        """
        CorpusReader.__init__(self, root, files, encoding)
        self._comment_char = comment_char
        self._detect_blocks = detect_blocks
        self._tag_mapping_function = tag_mapping_function

    def _read_block(self, stream):
        """
        Read one block of raw parse strings from C{stream}, using the
        strategy selected by C{detect_blocks} in the constructor.

        @return: Whatever the selected block-reading helper returns; for
            'unindented_paren' this is the list of tokens with comment
            lines blanked out.
        @raise AssertionError: If C{detect_blocks} is not one of the
            supported values.
        """
        if self._detect_blocks == 'sexpr':
            return read_sexpr_block(stream, comment_char=self._comment_char)
        elif self._detect_blocks == 'blankline':
            return read_blankline_block(stream)
        elif self._detect_blocks == 'unindented_paren':
            # Tokens start with unindented left parens.
            toks = read_regexp_block(stream, start_re=r'^\(')
            # Strip any comments out of the tokens.
            if self._comment_char:
                # Build the pattern once rather than once per token.
                comment_re = re.compile(
                    '(?m)^%s.*' % re.escape(self._comment_char))
                toks = [comment_re.sub('', tok) for tok in toks]
            return toks
        else:
            # Raised explicitly (rather than via ``assert 0``) so that the
            # check is not stripped when Python runs with -O.
            raise AssertionError('bad block type')

    def _normalize(self, t):
        """
        Rewrite the raw parse string C{t} into the form that is handed to
        the tree parser and to the leaf-extracting regexps.

        @return: The normalized parse string.
        """
        # If there's an empty set of brackets surrounding the actual
        # parse, then strip them off.
        if EMPTY_BRACKETS.match(t):
            t = t.strip()[1:-1]
        # Replace leaves of the form (!), (,), with (! !), (, ,)
        t = re.sub(r"\((.)\)", r"(\1 \1)", t)
        # Replace leaves of the form (tag word root) with (tag word)
        t = re.sub(r"\(([^\s()]+) ([^\s()]+) [^\s()]+\)", r"(\1 \2)", t)
        return t

    def _parse(self, t):
        """
        Parse the raw parse string C{t} into a tree.

        If the parser raises C{ValueError}, a message is written to
        stderr and recovery is attempted: for a 'mismatched parens' error
        one to four closing parens are appended in turn; if that does not
        help (or the error was something else) a flat C{Tree} labelled
        'S' over the tagged words is returned.
        """
        try:
            return bracket_parse(self._normalize(t))
        except ValueError:
            # sys.exc_info() is used instead of ``except ValueError, e``
            # so that the module parses on every Python version.
            e = sys.exc_info()[1]
            sys.stderr.write("Bad tree detected; trying to recover...\n")
            # Try to recover, if we can:
            if e.args == ('mismatched parens',):
                for n in range(1, 5):
                    try:
                        v = bracket_parse(self._normalize(t+')'*n))
                        sys.stderr.write(" Recovered by adding %d close "
                                         "paren(s)\n" % n)
                        return v
                    except ValueError:
                        # Still unbalanced; try one more close paren.
                        pass
            # Try something else:
            sys.stderr.write(" Recovered by returning a flat parse.\n")
            return Tree('S', self._tag(t))

    def _tag(self, t, simplify_tags=False):
        """
        Extract the C{(word, tag)} pairs from the raw parse string C{t}.

        @param simplify_tags: If true, every tag is passed through the
            C{tag_mapping_function} given to the constructor (which must
            therefore be callable).
        @return: A list of C{(word, tag)} tuples.
        """
        # TAGWORD yields (tag, word); flip each pair.  The loop variables
        # are deliberately not called ``t`` so the parameter is not shadowed.
        tagged_sent = [(word, tag) for (tag, word)
                       in TAGWORD.findall(self._normalize(t))]
        if simplify_tags:
            tagged_sent = [(word, self._tag_mapping_function(tag))
                           for (word, tag) in tagged_sent]
        return tagged_sent

    def _word(self, t):
        """
        Extract the words (leaves without their tags) from the raw parse
        string C{t}.

        @return: A list of word strings.
        """
        return WORD.findall(self._normalize(t))
102
class AlpinoCorpusReader(BracketParseCorpusReader):
    """
    Reader for the Alpino Dutch Treebank.
    """
    def __init__(self, root, encoding=None, tag_mapping_function=None):
        """
        @param root: The root directory for this corpus.
        @param encoding: Passed through to the base-class constructor.
        @param tag_mapping_function: Passed through to the base-class
            constructor.
        """
        # Raw string: ``\.`` is a regexp escape, not a string escape.
        BracketParseCorpusReader.__init__(
            self, root, r'alpino\.xml',
            detect_blocks='blankline',
            encoding=encoding,
            tag_mapping_function=tag_mapping_function)

    def _normalize(self, t):
        """
        Convert one Alpino XML block C{t} into bracketed (sexpr) notation.

        @return: The converted string, or the empty string when C{t} does
            not begin with C{<alpino_ds}.
        """
        if t[:10] != "<alpino_ds":
            return ""
        # convert XML to sexpr notation
        # Opening tag of a node with a ``cat`` attribute -> "(cat"
        t = re.sub(r' <node .*? cat="(\w+)".*>', r"(\1", t)
        # Self-closing node with ``pos`` and ``word`` -> "(pos word)"
        t = re.sub(r' <node .*? pos="(\w+)".*? word="([^"]+)".*/>', r"(\1 \2)", t)
        # Closing node tag -> ")"
        t = re.sub(r" </node>", r")", t)
        # Drop the <sentence> element and the <alpino_ds> wrapper tags.
        t = re.sub(r"<sentence>.*</sentence>", r"", t)
        t = re.sub(r"</?alpino_ds.*>", r"", t)
        return t
123