nltk.parse.rd

1 # Natural Language Toolkit: Recursive Descent Parser 2 # 3 # Copyright (C) 2001-2008 NLTK Project 4 # Author: Edward Loper <[email protected]> 5 # Steven Bird <[email protected]> 6 # URL: <http://nltk.org> 7 # For license information, see LICENSE.TXT 8 9 import string 10 11 from nltk import tokenize, cfg 12 from nltk.tree import Tree, ImmutableTree 13 14 from api import * 15 16 ##////////////////////////////////////////////////////// 17 ## Recursive Descent Parser 18 ##//////////////////////////////////////////////////////

19 -class RecursiveDescentParser(ParserI):

20 """ 21 A simple top-down CFG parser that parses texts by recursively 22 expanding the fringe of a C{Tree}, and matching it against a 23 text. 24 25 C{RecursiveDescentParser} uses a list of tree locations called a 26 X{frontier} to remember which subtrees have not yet been expanded 27 and which leaves have not yet been matched against the text. Each 28 tree location consists of a list of child indices specifying the 29 path from the root of the tree to a subtree or a leaf; see the 30 reference documentation for C{Tree} for more information 31 about tree locations. 32 33 When the parser begins parsing a text, it constructs a tree 34 containing only the start symbol, and a frontier containing the 35 location of the tree's root node. It then extends the tree to 36 cover the text, using the following recursive procedure: 37 38 - If the frontier is empty, and the text is covered by the tree, 39 then return the tree as a possible parse. 40 - If the frontier is empty, and the text is not covered by the 41 tree, then return no parses. 42 - If the first element of the frontier is a subtree, then 43 use CFG productions to X{expand} it. For each applicable 44 production, add the expanded subtree's children to the 45 frontier, and recursively find all parses that can be 46 generated by the new tree and frontier. 47 - If the first element of the frontier is a token, then X{match} 48 it against the next token from the text. Remove the token 49 from the frontier, and recursively find all parses that can be 50 generated by the new tree and frontier. 51 52 @see: C{nltk.cfg} 53 """

54 - def __init__(self, grammar, trace=0):

55 """ 56 Create a new C{RecursiveDescentParser}, that uses C{grammar} 57 to parse texts. 58 59 @type grammar: C{Grammar} 60 @param grammar: The grammar used to parse texts. 61 @type trace: C{int} 62 @param trace: The level of tracing that should be used when 63 parsing a text. C{0} will generate no tracing output; 64 and higher numbers will produce more verbose tracing 65 output. 66 """ 67 self._grammar = grammar 68 self._trace = trace

69

70 - def grammar(self):

71 return self._grammar

72

73 - def nbest_parse(self, tokens, n=None):

74 # Inherit docs from ParserI 75 76 tokens = list(tokens) 77 self._grammar.check_coverage(tokens) 78 79 # Start a recursive descent parse, with an initial tree 80 # containing just the start symbol. 81 start = self._grammar.start().symbol() 82 initial_tree = Tree(start, []) 83 frontier = [()] 84 if self._trace: 85 self._trace_start(initial_tree, frontier, tokens) 86 parses = self._parse(tokens, initial_tree, frontier) 87 88 # Return the parses. 89 return parses[:n]

90

91 - def _parse(self, remaining_text, tree, frontier):

92 """ 93 Recursively expand and match each elements of C{tree} 94 specified by C{frontier}, to cover C{remaining_text}. Return 95 a list of all parses found. 96 97 @return: A list of all parses that can be generated by 98 matching and expanding the elements of C{tree} 99 specified by C{frontier}. 100 @rtype: C{list} of C{Tree} 101 @type tree: C{Tree} 102 @param tree: A partial structure for the text that is 103 currently being parsed. The elements of C{tree} 104 that are specified by C{frontier} have not yet been 105 expanded or matched. 106 @type remaining_text: C{list} of C{String}s 107 @param remaining_text: The portion of the text that is not yet 108 covered by C{tree}. 109 @type frontier: C{list} of C{tuple} of C{int} 110 @param frontier: A list of the locations within C{tree} of 111 all subtrees that have not yet been expanded, and all 112 leaves that have not yet been matched. This list sorted 113 in left-to-right order of location within the tree. 114 """ 115 116 # If the tree covers the text, and there's nothing left to 117 # expand, then we've found a complete parse; return it. 118 if len(remaining_text) == 0 and len(frontier) == 0: 119 if self._trace: 120 self._trace_succeed(tree, frontier) 121 return [tree] 122 123 # If there's still text, but nothing left to expand, we failed. 124 elif len(frontier) == 0: 125 if self._trace: 126 self._trace_backtrack(tree, frontier) 127 return [] 128 129 # If the next element on the frontier is a tree, expand it. 130 elif isinstance(tree[frontier[0]], Tree): 131 return self._expand(remaining_text, tree, frontier) 132 133 # If the next element on the frontier is a token, match it. 134 else: 135 return self._match(remaining_text, tree, frontier)

136

137 - def _match(self, rtext, tree, frontier):

138 """ 139 @rtype: C{list} of C{Tree} 140 @return: a list of all parses that can be generated by 141 matching the first element of C{frontier} against the 142 first token in C{rtext}. In particular, if the first 143 element of C{frontier} has the same type as the first 144 token in C{rtext}, then substitute the token into 145 C{tree}; and return all parses that can be generated by 146 matching and expanding the remaining elements of 147 C{frontier}. If the first element of C{frontier} does not 148 have the same type as the first token in C{rtext}, then 149 return empty list. 150 151 @type tree: C{Tree} 152 @param tree: A partial structure for the text that is 153 currently being parsed. The elements of C{tree} 154 that are specified by C{frontier} have not yet been 155 expanded or matched. 156 @type rtext: C{list} of C{String}s 157 @param rtext: The portion of the text that is not yet 158 covered by C{tree}. 159 @type frontier: C{list} of C{tuple} of C{int} 160 @param frontier: A list of the locations within C{tree} of 161 all subtrees that have not yet been expanded, and all 162 leaves that have not yet been matched. 163 """ 164 165 tree_leaf = tree[frontier[0]] 166 if (len(rtext) > 0 and tree_leaf == rtext[0]): 167 # If it's a terminal that matches rtext[0], then substitute 168 # in the token, and continue parsing. 169 newtree = tree.copy(deep=True) 170 newtree[frontier[0]] = rtext[0] 171 if self._trace: 172 self._trace_match(newtree, frontier[1:], rtext[0]) 173 return self._parse(rtext[1:], newtree, frontier[1:]) 174 else: 175 # If it's a non-matching terminal, fail. 176 if self._trace: 177 self._trace_backtrack(tree, frontier, rtext[:1]) 178 return []

179

180 - def _expand(self, remaining_text, tree, frontier, production=None):

181 """ 182 @rtype: C{list} of C{Tree} 183 @return: A list of all parses that can be generated by 184 expanding the first element of C{frontier} with 185 C{production}. In particular, if the first element of 186 C{frontier} is a subtree whose node type is equal to 187 C{production}'s left hand side, then add a child to that 188 subtree for each element of C{production}'s right hand 189 side; and return all parses that can be generated by 190 matching and expanding the remaining elements of 191 C{frontier}. If the first element of C{frontier} is not a 192 subtree whose node type is equal to C{production}'s left 193 hand side, then return an empty list. If C{production} is 194 not specified, then return a list of all parses that can 195 be generated by expanding the first element of C{frontier} 196 with I{any} CFG production. 197 198 @type tree: C{Tree} 199 @param tree: A partial structure for the text that is 200 currently being parsed. The elements of C{tree} 201 that are specified by C{frontier} have not yet been 202 expanded or matched. 203 @type remaining_text: C{list} of C{String}s 204 @param remaining_text: The portion of the text that is not yet 205 covered by C{tree}. 206 @type frontier: C{list} of C{tuple} of C{int} 207 @param frontier: A list of the locations within C{tree} of 208 all subtrees that have not yet been expanded, and all 209 leaves that have not yet been matched. 210 """ 211 212 if production is None: productions = self._grammar.productions() 213 else: productions = [production] 214 215 parses = [] 216 for production in productions: 217 lhs = production.lhs().symbol() 218 if lhs == tree[frontier[0]].node: 219 subtree = self._production_to_tree(production) 220 if frontier[0] == (): 221 newtree = subtree 222 else: 223 newtree = tree.copy(deep=True) 224 newtree[frontier[0]] = subtree 225 new_frontier = [frontier[0]+(i,) for i in 226 range(len(production.rhs()))] 227 if self._trace: 228 self._trace_expand(newtree, new_frontier, production) 229 parses += self._parse(remaining_text, newtree, 230 new_frontier + frontier[1:]) 231 return parses

232

233 - def _production_to_tree(self, production):

234 """ 235 @rtype: C{Tree} 236 @return: The C{Tree} that is licensed by C{production}. 237 In particular, given the production:: 238 239 C{[M{lhs} -> M{elt[1]} ... M{elt[n]}]} 240 241 Return a tree token that has a node C{M{lhs}.symbol}, and 242 C{M{n}} children. For each nonterminal element 243 C{M{elt[i]}} in the production, the tree token has a 244 childless subtree with node value C{M{elt[i]}.symbol}; and 245 for each terminal element C{M{elt[j]}}, the tree token has 246 a leaf token with type C{M{elt[j]}}. 247 248 @param production: The CFG production that licenses the tree 249 token that should be returned. 250 @type production: C{Production} 251 """ 252 children = [] 253 for elt in production.rhs(): 254 if isinstance(elt, cfg.Nonterminal): 255 children.append(Tree(elt.symbol(), [])) 256 else: 257 # This will be matched. 258 children.append(elt) 259 return Tree(production.lhs().symbol(), children)

260

261 - def trace(self, trace=2):

262 """ 263 Set the level of tracing output that should be generated when 264 parsing a text. 265 266 @type trace: C{int} 267 @param trace: The trace level. A trace level of C{0} will 268 generate no tracing output; and higher trace levels will 269 produce more verbose tracing output. 270 @rtype: C{None} 271 """ 272 self._trace = trace

273

274 - def _trace_fringe(self, tree, treeloc=None):

275 """ 276 Print trace output displaying the fringe of C{tree}. The 277 fringe of C{tree} consists of all of its leaves and all of 278 its childless subtrees. 279 280 @rtype: C{None} 281 """ 282 283 if treeloc == (): print "*", 284 if isinstance(tree, Tree): 285 if len(tree) == 0: print `cfg.Nonterminal(tree.node)`, 286 for i in range(len(tree)): 287 if treeloc is not None and i == treeloc[0]: 288 self._trace_fringe(tree[i], treeloc[1:]) 289 else: 290 self._trace_fringe(tree[i]) 291 else: 292 print `tree`,

293

294 - def _trace_tree(self, tree, frontier, operation):

295 """ 296 Print trace output displaying the parser's current state. 297 298 @param operation: A character identifying the operation that 299 generated the current state. 300 @rtype: C{None} 301 """ 302 if self._trace == 2: print ' %c [' % operation, 303 else: print ' [', 304 if len(frontier) > 0: self._trace_fringe(tree, frontier[0]) 305 else: self._trace_fringe(tree) 306 print ']'

307

308 - def _trace_start(self, tree, frontier, text):

309 print 'Parsing %r' % string.join(text) 310 if self._trace > 2: print 'Start:' 311 if self._trace > 1: self._trace_tree(tree, frontier, ' ')

312

313 - def _trace_expand(self, tree, frontier, production):

314 if self._trace > 2: print 'Expand: %s' % production 315 if self._trace > 1: self._trace_tree(tree, frontier, 'E')

316

317 - def _trace_match(self, tree, frontier, tok):

318 if self._trace > 2: print 'Match: %r' % tok 319 if self._trace > 1: self._trace_tree(tree, frontier, 'M')

320

321 - def _trace_succeed(self, tree, frontier):

322 if self._trace > 2: print 'GOOD PARSE:' 323 if self._trace == 1: print 'Found a parse:\n%s' % tree 324 if self._trace > 1: self._trace_tree(tree, frontier, '+')

325

326 - def _trace_backtrack(self, tree, frontier, toks=None):

327 if self._trace > 2: 328 if toks: print 'Backtrack: %r match failed' % toks[0] 329 else: print 'Backtrack'

330 331 ##////////////////////////////////////////////////////// 332 ## Stepping Recursive Descent Parser 333 ##//////////////////////////////////////////////////////

334 -class SteppingRecursiveDescentParser(RecursiveDescentParser):

335 """ 336 A C{RecursiveDescentParser} that allows you to step through the 337 parsing process, performing a single operation at a time. 338 339 The C{initialize} method is used to start parsing a text. 340 C{expand} expands the first element on the frontier using a single 341 CFG production, and C{match} matches the first element on the 342 frontier against the next text token. C{backtrack} undoes the most 343 recent expand or match operation. C{step} performs a single 344 expand, match, or backtrack operation. C{parses} returns the set 345 of parses that have been found by the parser. 346 347 @ivar _history: A list of C{(rtext, tree, frontier)} tripples, 348 containing the previous states of the parser. This history is 349 used to implement the C{backtrack} operation. 350 @ivar _tried_e: A record of all productions that have been tried 351 for a given tree. This record is used by C{expand} to perform 352 the next untried production. 353 @ivar _tried_m: A record of what tokens have been matched for a 354 given tree. This record is used by C{step} to decide whether 355 or not to match a token. 356 @see: C{nltk.cfg} 357 """

358 - def __init__(self, grammar, trace=0):

359 self._grammar = grammar 360 self._trace = trace 361 self._rtext = None 362 self._tree = None 363 self._frontier = [()] 364 self._tried_e = {} 365 self._tried_m = {} 366 self._history = [] 367 self._parses = []

368 369 # [XX] TEMPORARY HACK WARNING! This should be replaced with 370 # something nicer when we get the chance.

371 - def _freeze(self, tree):

372 c = tree.copy() 373 # for pos in c.treepositions('leaves'): 374 # c[pos] = c[pos].freeze() 375 return ImmutableTree.convert(c)

376

377 - def nbest_parse(self, tokens, n=None):

378 tokens = list(tokens) 379 self.initialize(tokens) 380 while self.step() is not None: pass 381 382 return self.parses()[:n]

383

384 - def initialize(self, tokens):

385 """ 386 Start parsing a given text. This sets the parser's tree to 387 the start symbol, its frontier to the root node, and its 388 remaining text to C{token['SUBTOKENS']}. 389 """ 390 391 self._rtext = tokens 392 start = self._grammar.start().symbol() 393 self._tree = Tree(start, []) 394 self._frontier = [()] 395 self._tried_e = {} 396 self._tried_m = {} 397 self._history = [] 398 self._parses = [] 399 if self._trace: 400 self._trace_start(self._tree, self._frontier, self._rtext)

401

402 - def remaining_text(self):

403 """ 404 @return: The portion of the text that is not yet covered by the 405 tree. 406 @rtype: C{list} of C{String} 407 """ 408 return self._rtext

409

410 - def frontier(self):

411 """ 412 @return: A list of the tree locations of all subtrees that 413 have not yet been expanded, and all leaves that have not 414 yet been matched. 415 @rtype: C{list} of C{tuple} of C{int} 416 """ 417 return self._frontier

418

419 - def tree(self):

420 """ 421 @return: A partial structure for the text that is 422 currently being parsed. The elements specified by the 423 frontier have not yet been expanded or matched. 424 @rtype: C{Tree} 425 """ 426 return self._tree

427

428 - def step(self):

429 """ 430 Perform a single parsing operation. If an untried match is 431 possible, then perform the match, and return the matched 432 token. If an untried expansion is possible, then perform the 433 expansion, and return the production that it is based on. If 434 backtracking is possible, then backtrack, and return 1. 435 Otherwise, return 0. 436 437 @return: 0 if no operation was performed; a token if a match 438 was performed; a production if an expansion was performed; 439 and 1 if a backtrack operation was performed. 440 @rtype: C{Production} or C{String} or C{boolean} 441 """ 442 # Try matching (if we haven't already) 443 if self.untried_match(): 444 token = self.match() 445 if token is not None: return token 446 447 # Try expanding. 448 production = self.expand() 449 if production is not None: return production 450 451 # Try backtracking 452 if self.backtrack(): 453 self._trace_backtrack(self._tree, self._frontier) 454 return 1 455 456 # Nothing left to do. 457 return None

458

459 - def expand(self, production=None):

460 """ 461 Expand the first element of the frontier. In particular, if 462 the first element of the frontier is a subtree whose node type 463 is equal to C{production}'s left hand side, then add a child 464 to that subtree for each element of C{production}'s right hand 465 side. If C{production} is not specified, then use the first 466 untried expandable production. If all expandable productions 467 have been tried, do nothing. 468 469 @return: The production used to expand the frontier, if an 470 expansion was performed. If no expansion was performed, 471 return C{None}. 472 @rtype: C{Production} or C{None} 473 """ 474 475 # Make sure we *can* expand. 476 if len(self._frontier) == 0: 477 return None 478 if not isinstance(self._tree[self._frontier[0]], Tree): 479 return None 480 481 # If they didn't specify a production, check all untried ones. 482 if production is None: 483 productions = self.untried_expandable_productions() 484 else: productions = [production] 485 486 parses = [] 487 for prod in productions: 488 # Record that we've tried this production now. 489 self._tried_e.setdefault(self._freeze(self._tree), []).append(prod) 490 491 # Try expanding. 492 if self._expand(self._rtext, self._tree, self._frontier, prod): 493 return prod 494 495 # We didn't expand anything. 496 return None

497

498 - def match(self):

499 """ 500 Match the first element of the frontier. In particular, if 501 the first element of the frontier has the same type as the 502 next text token, then substitute the text token into the tree. 503 504 @return: The token matched, if a match operation was 505 performed. If no match was performed, return C{None} 506 @rtype: C{String} or C{None} 507 """ 508 509 # Record that we've tried matching this token. 510 tok = self._rtext[0] 511 self._tried_m.setdefault(self._freeze(self._tree), []).append(tok) 512 513 # Make sure we *can* match. 514 if len(self._frontier) == 0: 515 return None 516 if isinstance(self._tree[self._frontier[0]], Tree): 517 return None 518 519 if self._match(self._rtext, self._tree, self._frontier): 520 # Return the token we just matched. 521 return self._history[-1][0][0] 522 else: 523 return None

524

525 - def backtrack(self):

526 """ 527 Return the parser to its state before the most recent 528 match or expand operation. Calling C{undo} repeatedly return 529 the parser to successively earlier states. If no match or 530 expand operations have been performed, C{undo} will make no 531 changes. 532 533 @return: true if an operation was successfully undone. 534 @rtype: C{boolean} 535 """ 536 if len(self._history) == 0: return 0 537 (self._rtext, self._tree, self._frontier) = self._history.pop() 538 return 1

539

540 - def expandable_productions(self):

541 """ 542 @return: A list of all the productions for which expansions 543 are available for the current parser state. 544 @rtype: C{list} of C{Production} 545 """ 546 # Make sure we *can* expand. 547 if len(self._frontier) == 0: return [] 548 frontier_child = self._tree[self._frontier[0]] 549 if (len(self._frontier) == 0 or 550 not isinstance(frontier_child, Tree)): 551 return [] 552 553 return [p for p in self._grammar.productions() 554 if p.lhs().symbol() == frontier_child.node]

555

556 - def untried_expandable_productions(self):

557 """ 558 @return: A list of all the untried productions for which 559 expansions are available for the current parser state. 560 @rtype: C{list} of C{Production} 561 """ 562 563 tried_expansions = self._tried_e.get(self._freeze(self._tree), []) 564 return [p for p in self.expandable_productions() 565 if p not in tried_expansions]

566

567 - def untried_match(self):

568 """ 569 @return: Whether the first element of the frontier is a token 570 that has not yet been matched. 571 @rtype: C{boolean} 572 """ 573 574 if len(self._rtext) == 0: return 0 575 tried_matches = self._tried_m.get(self._freeze(self._tree), []) 576 return (self._rtext[0] not in tried_matches)

577

578 - def currently_complete(self):

579 """ 580 @return: Whether the parser's current state represents a 581 complete parse. 582 @rtype: C{boolean} 583 """ 584 return (len(self._frontier) == 0 and len(self._rtext) == 0)

585

586 - def _parse(self, remaining_text, tree, frontier):

587 """ 588 A stub version of C{_parse} that sets the parsers current 589 state to the given arguments. In C{RecursiveDescentParser}, 590 the C{_parse} method is used to recursively continue parsing a 591 text. C{SteppingRecursiveDescentParser} overrides it to 592 capture these recursive calls. It records the parser's old 593 state in the history (to allow for backtracking), and updates 594 the parser's new state using the given arguments. Finally, it 595 returns C{[1]}, which is used by C{match} and C{expand} to 596 detect whether their operations were successful. 597 598 @return: C{[1]} 599 @rtype: C{list} of C{int} 600 """ 601 self._history.append( (self._rtext, self._tree, self._frontier) ) 602 self._rtext = remaining_text 603 self._tree = tree 604 self._frontier = frontier 605 606 # Is it a good parse? If so, record it. 607 if (len(frontier) == 0 and len(remaining_text) == 0): 608 self._parses.append(tree) 609 self._trace_succeed(self._tree, self._frontier) 610 611 return [1]

612

613 - def parses(self):

614 """ 615 @return: A list of the parses that have been found by this 616 parser so far. 617 @rtype: C{list} of C{Tree} 618 """ 619 return self._parses

620

621 - def set_grammar(self, grammar):

622 """ 623 Change the grammar used to parse texts. 624 625 @param grammar: The new grammar. 626 @type grammar: C{CFG} 627 """ 628 self._grammar = grammar

629 630 ##////////////////////////////////////////////////////// 631 ## Demonstration Code 632 ##////////////////////////////////////////////////////// 633

634 -def demo():

635 """ 636 A demonstration of the recursive descent parser. 637 """ 638 639 from nltk import parse, cfg 640 641 grammar = cfg.parse_cfg(""" 642 S -> NP VP 643 NP -> Det N | Det N PP 644 VP -> V NP | V NP PP 645 PP -> P NP 646 NP -> 'I' 647 N -> 'man' | 'park' | 'telescope' | 'dog' 648 Det -> 'the' | 'a' 649 P -> 'in' | 'with' 650 V -> 'saw' 651 """) 652 653 for prod in grammar.productions(): 654 print prod 655 656 sent = 'I saw a man in the park'.split() 657 parser = parse.RecursiveDescentParser(grammar, trace=2) 658 for p in parser.nbest_parse(sent): 659 print p

660 661 if __name__ == '__main__': 662 demo() 663

Source Code for Module nltk.parse.rd