1
2
3
4
5
6
7
8
9 from nltk.corpus.reader.util import *
10 from nltk.corpus.reader.api import *
11 from nltk.tree import bracket_parse, Tree
12 import sys
13
14 """
15 Corpus reader for corpora that consist of parenthesis-delineated parse trees.
16 """
17
18
# Matches a leaf of the form "(TAG word)" and captures both the tag and
# the word.  Neither part may contain whitespace or parentheses.
TAGWORD = re.compile(r'\(([^\s()]+) ([^\s()]+)\)')

# Same leaf shape as TAGWORD, but only the word is captured.
WORD = re.compile(r'\([^\s()]+ ([^\s()]+)\)')

# Matches text that begins (after optional whitespace) with two opening
# parentheses separated only by optional whitespace, i.e. an outer
# bracket that carries no label of its own.
EMPTY_BRACKETS = re.compile(r'\s*\(\s*\(')
22
24 """
25 Reader for corpora that consist of parenthesis-delineated parse
26 trees.
27 """
def __init__(self, root, files, comment_char=None,
             detect_blocks='unindented_paren', encoding=None,
             tag_mapping_function=None):
    """
    Set up a reader over a collection of bracketed parse-tree files.

    @param root: The root directory for this corpus.
    @param files: A list or regexp specifying the files in this corpus.
    @param comment_char: The character which can appear at the start
        of a line to indicate that the rest of the line is a comment.
        Defaults to C{None}.
    @param detect_blocks: The method that is used to find blocks in
        the corpus; can be 'unindented_paren' (every unindented
        parenthesis starts a new parse) or 'sexpr' (brackets are
        matched).  The block-reading code in this file additionally
        recognises 'blankline'.
    @param encoding: Handed unchanged to C{CorpusReader.__init__}.
    @param tag_mapping_function: Stored on the reader as
        C{_tag_mapping_function}; it is called on each tag when
        simplified tags are requested.
    """
    # The base class receives the location and encoding of the corpus.
    CorpusReader.__init__(self, root, files, encoding)

    # Everything else is reader-specific configuration, kept verbatim
    # for later use by the block-reading and tagging code.
    self._tag_mapping_function = tag_mapping_function
    self._detect_blocks = detect_blocks
    self._comment_char = comment_char
45
47 if self._detect_blocks == 'sexpr':
48 return read_sexpr_block(stream, comment_char=self._comment_char)
49 elif self._detect_blocks == 'blankline':
50 return read_blankline_block(stream)
51 elif self._detect_blocks == 'unindented_paren':
52
53 toks = read_regexp_block(stream, start_re=r'^\(')
54
55 if self._comment_char:
56 toks = [re.sub('(?m)^%s.*'%re.escape(self._comment_char),
57 '', tok)
58 for tok in toks]
59 return toks
60 else:
61 assert 0, 'bad block type'
62
64
65
66 if EMPTY_BRACKETS.match(t):
67 t = t.strip()[1:-1]
68
69 t = re.sub(r"\((.)\)", r"(\1 \1)", t)
70
71 t = re.sub(r"\(([^\s()]+) ([^\s()]+) [^\s()]+\)", r"(\1 \2)", t)
72 return t
73
75 try:
76 return bracket_parse(self._normalize(t))
77 except ValueError, e:
78 sys.stderr.write("Bad tree detected; trying to recover...\n")
79
80 if e.args == ('mismatched parens',):
81 for n in range(1, 5):
82 try:
83 v = bracket_parse(self._normalize(t+')'*n))
84 sys.stderr.write(" Recovered by adding %d close "
85 "paren(s)\n" % n)
86 return v
87 except ValueError: pass
88
89 sys.stderr.write(" Recovered by returning a flat parse.\n")
90
91 return Tree('S', self._tag(t))
92
def _tag(self, t, simplify_tags=False):
    """
    Pull the (word, tag) pairs out of a bracketed parse string.

    @param t: The bracketed parse as a string.  It is passed through
        C{self._normalize} before the leaves are matched.
    @param simplify_tags: If true, every tag is replaced by the result
        of calling C{self._tag_mapping_function} on it.
    @return: A list of C{(word, tag)} tuples, in the order in which the
        leaves are matched.
    """
    # TAGWORD captures (tag, word); callers want (word, tag).
    pairs = []
    for tag, word in TAGWORD.findall(self._normalize(t)):
        pairs.append((word, tag))

    if not simplify_tags:
        return pairs

    return [(word, self._tag_mapping_function(tag))
            for (word, tag) in pairs]
99
102
104 """
105 Reader for the Alpino Dutch Treebank.
106 """
107 - def __init__(self, root, encoding=None, tag_mapping_function=None):
112
114 if t[:10] != "<alpino_ds":
115 return ""
116
117 t = re.sub(r' <node .*? cat="(\w+)".*>', r"(\1", t)
118 t = re.sub(r' <node .*? pos="(\w+)".*? word="([^"]+)".*/>', r"(\1 \2)", t)
119 t = re.sub(r" </node>", r")", t)
120 t = re.sub(r"<sentence>.*</sentence>", r"", t)
121 t = re.sub(r"</?alpino_ds.*>", r"", t)
122 return t
123