1
2
3
4
5
6
7
8
9 import string
10
11 from nltk import tokenize, cfg
12 from nltk.tree import Tree, ImmutableTree
13
14 from api import *
15
16
17
18
20 """
21 A simple top-down CFG parser that parses texts by recursively
22 expanding the fringe of a C{Tree}, and matching it against a
23 text.
24
25 C{RecursiveDescentParser} uses a list of tree locations called a
26 X{frontier} to remember which subtrees have not yet been expanded
27 and which leaves have not yet been matched against the text. Each
28 tree location consists of a list of child indices specifying the
29 path from the root of the tree to a subtree or a leaf; see the
30 reference documentation for C{Tree} for more information
31 about tree locations.
32
33 When the parser begins parsing a text, it constructs a tree
34 containing only the start symbol, and a frontier containing the
35 location of the tree's root node. It then extends the tree to
36 cover the text, using the following recursive procedure:
37
38 - If the frontier is empty, and the text is covered by the tree,
39 then return the tree as a possible parse.
40 - If the frontier is empty, and the text is not covered by the
41 tree, then return no parses.
42 - If the first element of the frontier is a subtree, then
43 use CFG productions to X{expand} it. For each applicable
44 production, add the expanded subtree's children to the
45 frontier, and recursively find all parses that can be
46 generated by the new tree and frontier.
47 - If the first element of the frontier is a token, then X{match}
48 it against the next token from the text. Remove the token
49 from the frontier, and recursively find all parses that can be
50 generated by the new tree and frontier.
51
52 @see: C{nltk.cfg}
53 """
55 """
56 Create a new C{RecursiveDescentParser}, that uses C{grammar}
57 to parse texts.
58
59 @type grammar: C{Grammar}
60 @param grammar: The grammar used to parse texts.
61 @type trace: C{int}
62 @param trace: The level of tracing that should be used when
63 parsing a text. C{0} will generate no tracing output;
64 and higher numbers will produce more verbose tracing
65 output.
66 """
67 self._grammar = grammar
68 self._trace = trace
69
72
90
91 - def _parse(self, remaining_text, tree, frontier):
92 """
93 Recursively expand and match each elements of C{tree}
94 specified by C{frontier}, to cover C{remaining_text}. Return
95 a list of all parses found.
96
97 @return: A list of all parses that can be generated by
98 matching and expanding the elements of C{tree}
99 specified by C{frontier}.
100 @rtype: C{list} of C{Tree}
101 @type tree: C{Tree}
102 @param tree: A partial structure for the text that is
103 currently being parsed. The elements of C{tree}
104 that are specified by C{frontier} have not yet been
105 expanded or matched.
106 @type remaining_text: C{list} of C{String}s
107 @param remaining_text: The portion of the text that is not yet
108 covered by C{tree}.
109 @type frontier: C{list} of C{tuple} of C{int}
110 @param frontier: A list of the locations within C{tree} of
111 all subtrees that have not yet been expanded, and all
112 leaves that have not yet been matched. This list sorted
113 in left-to-right order of location within the tree.
114 """
115
116
117
118 if len(remaining_text) == 0 and len(frontier) == 0:
119 if self._trace:
120 self._trace_succeed(tree, frontier)
121 return [tree]
122
123
124 elif len(frontier) == 0:
125 if self._trace:
126 self._trace_backtrack(tree, frontier)
127 return []
128
129
130 elif isinstance(tree[frontier[0]], Tree):
131 return self._expand(remaining_text, tree, frontier)
132
133
134 else:
135 return self._match(remaining_text, tree, frontier)
136
137 - def _match(self, rtext, tree, frontier):
138 """
139 @rtype: C{list} of C{Tree}
140 @return: a list of all parses that can be generated by
141 matching the first element of C{frontier} against the
142 first token in C{rtext}. In particular, if the first
143 element of C{frontier} has the same type as the first
144 token in C{rtext}, then substitute the token into
145 C{tree}; and return all parses that can be generated by
146 matching and expanding the remaining elements of
147 C{frontier}. If the first element of C{frontier} does not
148 have the same type as the first token in C{rtext}, then
149 return empty list.
150
151 @type tree: C{Tree}
152 @param tree: A partial structure for the text that is
153 currently being parsed. The elements of C{tree}
154 that are specified by C{frontier} have not yet been
155 expanded or matched.
156 @type rtext: C{list} of C{String}s
157 @param rtext: The portion of the text that is not yet
158 covered by C{tree}.
159 @type frontier: C{list} of C{tuple} of C{int}
160 @param frontier: A list of the locations within C{tree} of
161 all subtrees that have not yet been expanded, and all
162 leaves that have not yet been matched.
163 """
164
165 tree_leaf = tree[frontier[0]]
166 if (len(rtext) > 0 and tree_leaf == rtext[0]):
167
168
169 newtree = tree.copy(deep=True)
170 newtree[frontier[0]] = rtext[0]
171 if self._trace:
172 self._trace_match(newtree, frontier[1:], rtext[0])
173 return self._parse(rtext[1:], newtree, frontier[1:])
174 else:
175
176 if self._trace:
177 self._trace_backtrack(tree, frontier, rtext[:1])
178 return []
179
180 - def _expand(self, remaining_text, tree, frontier, production=None):
181 """
182 @rtype: C{list} of C{Tree}
183 @return: A list of all parses that can be generated by
184 expanding the first element of C{frontier} with
185 C{production}. In particular, if the first element of
186 C{frontier} is a subtree whose node type is equal to
187 C{production}'s left hand side, then add a child to that
188 subtree for each element of C{production}'s right hand
189 side; and return all parses that can be generated by
190 matching and expanding the remaining elements of
191 C{frontier}. If the first element of C{frontier} is not a
192 subtree whose node type is equal to C{production}'s left
193 hand side, then return an empty list. If C{production} is
194 not specified, then return a list of all parses that can
195 be generated by expanding the first element of C{frontier}
196 with I{any} CFG production.
197
198 @type tree: C{Tree}
199 @param tree: A partial structure for the text that is
200 currently being parsed. The elements of C{tree}
201 that are specified by C{frontier} have not yet been
202 expanded or matched.
203 @type remaining_text: C{list} of C{String}s
204 @param remaining_text: The portion of the text that is not yet
205 covered by C{tree}.
206 @type frontier: C{list} of C{tuple} of C{int}
207 @param frontier: A list of the locations within C{tree} of
208 all subtrees that have not yet been expanded, and all
209 leaves that have not yet been matched.
210 """
211
212 if production is None: productions = self._grammar.productions()
213 else: productions = [production]
214
215 parses = []
216 for production in productions:
217 lhs = production.lhs().symbol()
218 if lhs == tree[frontier[0]].node:
219 subtree = self._production_to_tree(production)
220 if frontier[0] == ():
221 newtree = subtree
222 else:
223 newtree = tree.copy(deep=True)
224 newtree[frontier[0]] = subtree
225 new_frontier = [frontier[0]+(i,) for i in
226 range(len(production.rhs()))]
227 if self._trace:
228 self._trace_expand(newtree, new_frontier, production)
229 parses += self._parse(remaining_text, newtree,
230 new_frontier + frontier[1:])
231 return parses
232
234 """
235 @rtype: C{Tree}
236 @return: The C{Tree} that is licensed by C{production}.
237 In particular, given the production::
238
239 C{[M{lhs} -> M{elt[1]} ... M{elt[n]}]}
240
241 Return a tree token that has a node C{M{lhs}.symbol}, and
242 C{M{n}} children. For each nonterminal element
243 C{M{elt[i]}} in the production, the tree token has a
244 childless subtree with node value C{M{elt[i]}.symbol}; and
245 for each terminal element C{M{elt[j]}}, the tree token has
246 a leaf token with type C{M{elt[j]}}.
247
248 @param production: The CFG production that licenses the tree
249 token that should be returned.
250 @type production: C{Production}
251 """
252 children = []
253 for elt in production.rhs():
254 if isinstance(elt, cfg.Nonterminal):
255 children.append(Tree(elt.symbol(), []))
256 else:
257
258 children.append(elt)
259 return Tree(production.lhs().symbol(), children)
260
261 - def trace(self, trace=2):
262 """
263 Set the level of tracing output that should be generated when
264 parsing a text.
265
266 @type trace: C{int}
267 @param trace: The trace level. A trace level of C{0} will
268 generate no tracing output; and higher trace levels will
269 produce more verbose tracing output.
270 @rtype: C{None}
271 """
272 self._trace = trace
273
275 """
276 Print trace output displaying the fringe of C{tree}. The
277 fringe of C{tree} consists of all of its leaves and all of
278 its childless subtrees.
279
280 @rtype: C{None}
281 """
282
283 if treeloc == (): print "*",
284 if isinstance(tree, Tree):
285 if len(tree) == 0: print `cfg.Nonterminal(tree.node)`,
286 for i in range(len(tree)):
287 if treeloc is not None and i == treeloc[0]:
288 self._trace_fringe(tree[i], treeloc[1:])
289 else:
290 self._trace_fringe(tree[i])
291 else:
292 print `tree`,
293
295 """
296 Print trace output displaying the parser's current state.
297
298 @param operation: A character identifying the operation that
299 generated the current state.
300 @rtype: C{None}
301 """
302 if self._trace == 2: print ' %c [' % operation,
303 else: print ' [',
304 if len(frontier) > 0: self._trace_fringe(tree, frontier[0])
305 else: self._trace_fringe(tree)
306 print ']'
307
312
314 if self._trace > 2: print 'Expand: %s' % production
315 if self._trace > 1: self._trace_tree(tree, frontier, 'E')
316
320
322 if self._trace > 2: print 'GOOD PARSE:'
323 if self._trace == 1: print 'Found a parse:\n%s' % tree
324 if self._trace > 1: self._trace_tree(tree, frontier, '+')
325
327 if self._trace > 2:
328 if toks: print 'Backtrack: %r match failed' % toks[0]
329 else: print 'Backtrack'
330
331
332
333
335 """
336 A C{RecursiveDescentParser} that allows you to step through the
337 parsing process, performing a single operation at a time.
338
339 The C{initialize} method is used to start parsing a text.
340 C{expand} expands the first element on the frontier using a single
341 CFG production, and C{match} matches the first element on the
342 frontier against the next text token. C{backtrack} undoes the most
343 recent expand or match operation. C{step} performs a single
344 expand, match, or backtrack operation. C{parses} returns the set
345 of parses that have been found by the parser.
346
347 @ivar _history: A list of C{(rtext, tree, frontier)} tripples,
348 containing the previous states of the parser. This history is
349 used to implement the C{backtrack} operation.
350 @ivar _tried_e: A record of all productions that have been tried
351 for a given tree. This record is used by C{expand} to perform
352 the next untried production.
353 @ivar _tried_m: A record of what tokens have been matched for a
354 given tree. This record is used by C{step} to decide whether
355 or not to match a token.
356 @see: C{nltk.cfg}
357 """
359 self._grammar = grammar
360 self._trace = trace
361 self._rtext = None
362 self._tree = None
363 self._frontier = [()]
364 self._tried_e = {}
365 self._tried_m = {}
366 self._history = []
367 self._parses = []
368
369
370
376
383
385 """
386 Start parsing a given text. This sets the parser's tree to
387 the start symbol, its frontier to the root node, and its
388 remaining text to C{token['SUBTOKENS']}.
389 """
390
391 self._rtext = tokens
392 start = self._grammar.start().symbol()
393 self._tree = Tree(start, [])
394 self._frontier = [()]
395 self._tried_e = {}
396 self._tried_m = {}
397 self._history = []
398 self._parses = []
399 if self._trace:
400 self._trace_start(self._tree, self._frontier, self._rtext)
401
402 - def remaining_text(self):
403 """
404 @return: The portion of the text that is not yet covered by the
405 tree.
406 @rtype: C{list} of C{String}
407 """
408 return self._rtext
409
411 """
412 @return: A list of the tree locations of all subtrees that
413 have not yet been expanded, and all leaves that have not
414 yet been matched.
415 @rtype: C{list} of C{tuple} of C{int}
416 """
417 return self._frontier
418
420 """
421 @return: A partial structure for the text that is
422 currently being parsed. The elements specified by the
423 frontier have not yet been expanded or matched.
424 @rtype: C{Tree}
425 """
426 return self._tree
427
429 """
430 Perform a single parsing operation. If an untried match is
431 possible, then perform the match, and return the matched
432 token. If an untried expansion is possible, then perform the
433 expansion, and return the production that it is based on. If
434 backtracking is possible, then backtrack, and return 1.
435 Otherwise, return 0.
436
437 @return: 0 if no operation was performed; a token if a match
438 was performed; a production if an expansion was performed;
439 and 1 if a backtrack operation was performed.
440 @rtype: C{Production} or C{String} or C{boolean}
441 """
442
443 if self.untried_match():
444 token = self.match()
445 if token is not None: return token
446
447
448 production = self.expand()
449 if production is not None: return production
450
451
452 if self.backtrack():
453 self._trace_backtrack(self._tree, self._frontier)
454 return 1
455
456
457 return None
458
459 - def expand(self, production=None):
460 """
461 Expand the first element of the frontier. In particular, if
462 the first element of the frontier is a subtree whose node type
463 is equal to C{production}'s left hand side, then add a child
464 to that subtree for each element of C{production}'s right hand
465 side. If C{production} is not specified, then use the first
466 untried expandable production. If all expandable productions
467 have been tried, do nothing.
468
469 @return: The production used to expand the frontier, if an
470 expansion was performed. If no expansion was performed,
471 return C{None}.
472 @rtype: C{Production} or C{None}
473 """
474
475
476 if len(self._frontier) == 0:
477 return None
478 if not isinstance(self._tree[self._frontier[0]], Tree):
479 return None
480
481
482 if production is None:
483 productions = self.untried_expandable_productions()
484 else: productions = [production]
485
486 parses = []
487 for prod in productions:
488
489 self._tried_e.setdefault(self._freeze(self._tree), []).append(prod)
490
491
492 if self._expand(self._rtext, self._tree, self._frontier, prod):
493 return prod
494
495
496 return None
497
499 """
500 Match the first element of the frontier. In particular, if
501 the first element of the frontier has the same type as the
502 next text token, then substitute the text token into the tree.
503
504 @return: The token matched, if a match operation was
505 performed. If no match was performed, return C{None}
506 @rtype: C{String} or C{None}
507 """
508
509
510 tok = self._rtext[0]
511 self._tried_m.setdefault(self._freeze(self._tree), []).append(tok)
512
513
514 if len(self._frontier) == 0:
515 return None
516 if isinstance(self._tree[self._frontier[0]], Tree):
517 return None
518
519 if self._match(self._rtext, self._tree, self._frontier):
520
521 return self._history[-1][0][0]
522 else:
523 return None
524
526 """
527 Return the parser to its state before the most recent
528 match or expand operation. Calling C{undo} repeatedly return
529 the parser to successively earlier states. If no match or
530 expand operations have been performed, C{undo} will make no
531 changes.
532
533 @return: true if an operation was successfully undone.
534 @rtype: C{boolean}
535 """
536 if len(self._history) == 0: return 0
537 (self._rtext, self._tree, self._frontier) = self._history.pop()
538 return 1
539
541 """
542 @return: A list of all the productions for which expansions
543 are available for the current parser state.
544 @rtype: C{list} of C{Production}
545 """
546
547 if len(self._frontier) == 0: return []
548 frontier_child = self._tree[self._frontier[0]]
549 if (len(self._frontier) == 0 or
550 not isinstance(frontier_child, Tree)):
551 return []
552
553 return [p for p in self._grammar.productions()
554 if p.lhs().symbol() == frontier_child.node]
555
557 """
558 @return: A list of all the untried productions for which
559 expansions are available for the current parser state.
560 @rtype: C{list} of C{Production}
561 """
562
563 tried_expansions = self._tried_e.get(self._freeze(self._tree), [])
564 return [p for p in self.expandable_productions()
565 if p not in tried_expansions]
566
568 """
569 @return: Whether the first element of the frontier is a token
570 that has not yet been matched.
571 @rtype: C{boolean}
572 """
573
574 if len(self._rtext) == 0: return 0
575 tried_matches = self._tried_m.get(self._freeze(self._tree), [])
576 return (self._rtext[0] not in tried_matches)
577
579 """
580 @return: Whether the parser's current state represents a
581 complete parse.
582 @rtype: C{boolean}
583 """
584 return (len(self._frontier) == 0 and len(self._rtext) == 0)
585
586 - def _parse(self, remaining_text, tree, frontier):
587 """
588 A stub version of C{_parse} that sets the parsers current
589 state to the given arguments. In C{RecursiveDescentParser},
590 the C{_parse} method is used to recursively continue parsing a
591 text. C{SteppingRecursiveDescentParser} overrides it to
592 capture these recursive calls. It records the parser's old
593 state in the history (to allow for backtracking), and updates
594 the parser's new state using the given arguments. Finally, it
595 returns C{[1]}, which is used by C{match} and C{expand} to
596 detect whether their operations were successful.
597
598 @return: C{[1]}
599 @rtype: C{list} of C{int}
600 """
601 self._history.append( (self._rtext, self._tree, self._frontier) )
602 self._rtext = remaining_text
603 self._tree = tree
604 self._frontier = frontier
605
606
607 if (len(frontier) == 0 and len(remaining_text) == 0):
608 self._parses.append(tree)
609 self._trace_succeed(self._tree, self._frontier)
610
611 return [1]
612
614 """
615 @return: A list of the parses that have been found by this
616 parser so far.
617 @rtype: C{list} of C{Tree}
618 """
619 return self._parses
620
622 """
623 Change the grammar used to parse texts.
624
625 @param grammar: The new grammar.
626 @type grammar: C{CFG}
627 """
628 self._grammar = grammar
629
630
631
632
633
635 """
636 A demonstration of the recursive descent parser.
637 """
638
639 from nltk import parse, cfg
640
641 grammar = cfg.parse_cfg("""
642 S -> NP VP
643 NP -> Det N | Det N PP
644 VP -> V NP | V NP PP
645 PP -> P NP
646 NP -> 'I'
647 N -> 'man' | 'park' | 'telescope' | 'dog'
648 Det -> 'the' | 'a'
649 P -> 'in' | 'with'
650 V -> 'saw'
651 """)
652
653 for prod in grammar.productions():
654 print prod
655
656 sent = 'I saw a man in the park'.split()
657 parser = parse.RecursiveDescentParser(grammar, trace=2)
658 for p in parser.nbest_parse(sent):
659 print p
660
661 if __name__ == '__main__':
662 demo()
663