nltk.corpus.reader.bracket

"""
Reader for corpora that consist of parenthesis-delineated parse
trees.
"""

def __init__(self, root, files, comment_char=None,
             detect_blocks='unindented_paren', encoding=None,
             tag_mapping_function=None):
    """
    Set up a reader over a corpus of bracketed parse trees.

    @param root: The root directory for this corpus.
    @param files: A list or regexp specifying the files in this corpus.
    @param comment_char: The character which can appear at the start of
        a line to indicate that the rest of the line is a comment.
    @param detect_blocks: The method that is used to find blocks
        in the corpus; C{_read_block} recognises 'unindented_paren'
        (every unindented parenthesis starts a new parse), 'sexpr'
        (brackets are matched) and 'blankline'.
    @param encoding: Passed through unchanged to
        C{CorpusReader.__init__}.
    @param tag_mapping_function: Callable stored on the reader;
        C{_tag} applies it to each tag when tag simplification is
        requested.
    """
    # Let the base reader record the corpus location first.
    CorpusReader.__init__(self, root, files, encoding)
    # Reader-specific settings consulted by the private helpers.
    self._tag_mapping_function = tag_mapping_function
    self._detect_blocks = detect_blocks
    self._comment_char = comment_char

45

def _read_block(self, stream):
    """
    Read the next block from C{stream} using the strategy named by
    C{self._detect_blocks}.

    @param stream: The stream handed on to the block-reading helper.
    @return: Whatever the selected helper returns.  For
        'unindented_paren' with a comment character configured, a list
        in which every line starting with that character has been
        blanked out of each token.
    @raise AssertionError: If C{self._detect_blocks} is not one of
        'sexpr', 'blankline' or 'unindented_paren'.
    """
    if self._detect_blocks == 'sexpr':
        return read_sexpr_block(stream, comment_char=self._comment_char)
    elif self._detect_blocks == 'blankline':
        return read_blankline_block(stream)
    elif self._detect_blocks == 'unindented_paren':
        # Tokens start with unindented left parens.
        toks = read_regexp_block(stream, start_re=r'^\(')
        # Strip any comments out of the tokens.
        if self._comment_char:
            # Build the pattern once rather than once per token.
            comment_re = re.compile(
                '(?m)^%s.*' % re.escape(self._comment_char))
            toks = [comment_re.sub('', tok) for tok in toks]
        return toks
    else:
        # Raised explicitly (instead of `assert 0`) so the check is not
        # compiled away when Python runs with -O.
        raise AssertionError('bad block type')

62

def _normalize(self, t):
    """
    Tidy a raw bracketed tree string before it is parsed.

    @param t: The raw tree string.
    @return: C{t} with an empty outer bracket pair removed (when
        C{EMPTY_BRACKETS} matches), single-character leaves doubled,
        and the third field of three-field leaves dropped.
    """
    # An empty set of brackets wrapped around the real parse is peeled
    # off: trim surrounding whitespace, then drop the first and last
    # characters.
    if EMPTY_BRACKETS.match(t):
        trimmed = t.strip()
        t = trimmed[1:-1]
    # Leaves of the form (!), (,) become (! !), (, ,).
    doubled = re.sub(r"\((.)\)", r"(\1 \1)", t)
    # Leaves of the form (tag word root) become (tag word).
    return re.sub(r"\(([^\s()]+) ([^\s()]+) [^\s()]+\)", r"(\1 \2)",
                  doubled)

73

def _parse(self, t):
    """
    Parse one bracketed tree string, recovering from malformed input
    where possible.

    The string is normalized and handed to C{bracket_parse}.  If that
    raises C{ValueError}, a notice is written to stderr and recovery is
    attempted: for a 'mismatched parens' error, one to four closing
    parens are appended in turn and the parse retried; if that does not
    succeed (or the error was something else), a flat tree labelled
    'S' is built from the tagged tokens returned by C{self._tag}.

    @param t: The raw tree string.
    @return: The result of C{bracket_parse}, or the flat fallback
        C{Tree}.
    """
    try:
        return bracket_parse(self._normalize(t))
    except ValueError as e:
        # `as` form: valid on Python 2.6+ and required on Python 3.
        sys.stderr.write("Bad tree detected; trying to recover...\n")
        # Try to recover, if we can:
        if e.args == ('mismatched parens',):
            for n in range(1, 5):
                try:
                    v = bracket_parse(self._normalize(t+')'*n))
                    sys.stderr.write(" Recovered by adding %d close "
                                     "paren(s)\n" % n)
                    return v
                except ValueError:
                    # Still unbalanced; try one more paren.
                    pass
        # Try something else:
        sys.stderr.write(" Recovered by returning a flat parse.\n")
        return Tree('S', self._tag(t))

92

def _tag(self, t, simplify_tags=False):
    """
    Extract (word, tag) pairs from a tree string.

    @param t: The raw tree string; it is normalized before matching.
    @param simplify_tags: When true, each tag is replaced by the result
        of calling C{self._tag_mapping_function} on it, so that
        attribute must be callable in this case.
    @return: A list of (word, tag) tuples.
    """
    matches = TAGWORD.findall(self._normalize(t))
    # TAGWORD yields (tag, word); callers expect (word, tag).
    pairs = [(word, tag) for (tag, word) in matches]
    if not simplify_tags:
        return pairs
    return [(word, self._tag_mapping_function(tag))
            for (word, tag) in pairs]

99

def _word(self, t):
    """
    Return every match of C{WORD} in the normalized form of C{t}.

    @param t: The raw tree string.
    @return: The list produced by C{WORD.findall}.
    """
    normalized = self._normalize(t)
    return WORD.findall(normalized)

"""
Reader for the Alpino Dutch Treebank.
"""

def __init__(self, root, encoding=None, tag_mapping_function=None):
    """
    Set up a reader for the Alpino treebank file under C{root}.

    @param root: The root directory for this corpus.
    @param encoding: Passed through to the base reader.
    @param tag_mapping_function: Passed through to the base reader.
    """
    # Raw string: same pattern text as before, but `\.` is no longer an
    # invalid escape sequence in a normal string literal.
    BracketParseCorpusReader.__init__(self, root, r'alpino\.xml',
                                      detect_blocks='blankline',
                                      encoding=encoding,
                                      tag_mapping_function=tag_mapping_function)

112

def _normalize(self, t):
    """
    Rewrite one Alpino XML block as a bracketed (sexpr) string.

    @param t: The raw block text.
    @return: The empty string if C{t} does not begin with
        "<alpino_ds"; otherwise C{t} after the substitutions below have
        been applied in order.
    """
    opening = "<alpino_ds"
    if t[:len(opening)] != opening:
        return ""
    # convert XML to sexpr notation; order matters, so keep it as a
    # sequence of (pattern, replacement) steps.
    steps = (
        # node carrying a cat attribute -> "(cat"
        (r' <node .*? cat="(\w+)".*>', r"(\1"),
        # self-closing node with pos and word -> "(pos word)"
        (r' <node .*? pos="(\w+)".*? word="([^"]+)".*/>', r"(\1 \2)"),
        # closing node tag -> ")"
        (r" </node>", r")"),
        # drop the sentence element
        (r"<sentence>.*</sentence>", r""),
        # drop the alpino_ds open/close tags
        (r"</?alpino_ds.*>", r""),
    )
    for pattern, replacement in steps:
        t = re.sub(pattern, replacement, t)
    return t

Source Code for Module nltk.corpus.reader.bracket_parse