nltk.parse.chart

1 # Natural Language Toolkit: A Chart Parser 2 # 3 # Copyright (C) 2001-2008 NLTK Project 4 # Author: Edward Loper <[email protected]> 5 # Steven Bird <[email protected]> 6 # Jean Mark Gawron <[email protected]> 7 # URL: <http://nltk.org> 8 # For license information, see LICENSE.TXT 9 # 10 # $Id: chart.py 6387 2008-08-06 00:02:14Z stevenbird $ 11 12 from nltk import Tree, cfg, defaultdict 13 14 from api import * 15 16 """ 17 Data classes and parser implementations for \"chart parsers\", which 18 use dynamic programming to efficiently parse a text. A X{chart 19 parser} derives parse trees for a text by iteratively adding \"edges\" 20 to a \"chart.\" Each X{edge} represents a hypothesis about the tree 21 structure for a subsequence of the text. The X{chart} is a 22 \"blackboard\" for composing and combining these hypotheses. 23 24 When a chart parser begins parsing a text, it creates a new (empty) 25 chart, spanning the text. It then incrementally adds new edges to the 26 chart. A set of X{chart rules} specifies the conditions under which 27 new edges should be added to the chart. Once the chart reaches a 28 stage where none of the chart rules adds any new edges, parsing is 29 complete. 30 31 Charts are encoded with the L{Chart} class, and edges are encoded with 32 the L{TreeEdge} and L{LeafEdge} classes. The chart parser module 33 defines three chart parsers: 34 35 - C{ChartParser} is a simple and flexible chart parser. Given a 36 set of chart rules, it will apply those rules to the chart until 37 no more edges are added. 38 39 - C{SteppingChartParser} is a subclass of C{ChartParser} that can 40 be used to step through the parsing process. 41 """ 42 43 import re 44 45 ######################################################################## 46 ## Edges 47 ######################################################################## 48

49 -class EdgeI(object):

50 """ 51 A hypothesis about the structure of part of a sentence. 52 Each edge records the fact that a structure is (partially) 53 consistent with the sentence. An edge contains: 54 55 - A X{span}, indicating what part of the sentence is 56 consistent with the hypothesized structure. 57 58 - A X{left-hand side}, specifying what kind of structure is 59 hypothesized. 60 61 - A X{right-hand side}, specifying the contents of the 62 hypothesized structure. 63 64 - A X{dot position}, indicating how much of the hypothesized 65 structure is consistent with the sentence. 66 67 Every edge is either X{complete} or X{incomplete}: 68 69 - An edge is X{complete} if its structure is fully consistent 70 with the sentence. 71 72 - An edge is X{incomplete} if its structure is partially 73 consistent with the sentence. For every incomplete edge, the 74 span specifies a possible prefix for the edge's structure. 75 76 There are two kinds of edge: 77 78 - C{TreeEdges<TreeEdge>} record which trees have been found to 79 be (partially) consistent with the text. 80 81 - C{LeafEdges<leafEdge>} record the tokens occur in the text. 82 83 The C{EdgeI} interface provides a common interface to both types 84 of edge, allowing chart parsers to treat them in a uniform manner. 85 """

86 - def __init__(self):

87 if self.__class__ == EdgeI: 88 raise TypeError('Edge is an abstract interface')

89 90 #//////////////////////////////////////////////////////////// 91 # Span 92 #//////////////////////////////////////////////////////////// 93

94 - def span(self):

95 """ 96 @return: A tuple C{(s,e)}, where C{subtokens[s:e]} is the 97 portion of the sentence that is consistent with this 98 edge's structure. 99 @rtype: C{(int, int)} 100 """ 101 raise AssertionError('EdgeI is an abstract interface')

102

103 - def start(self):

104 """ 105 @return: The start index of this edge's span. 106 @rtype: C{int} 107 """ 108 raise AssertionError('EdgeI is an abstract interface')

109

110 - def end(self):

111 """ 112 @return: The end index of this edge's span. 113 @rtype: C{int} 114 """ 115 raise AssertionError('EdgeI is an abstract interface')

116

117 - def length(self):

118 """ 119 @return: The length of this edge's span. 120 @rtype: C{int} 121 """ 122 raise AssertionError('EdgeI is an abstract interface')

123 124 #//////////////////////////////////////////////////////////// 125 # Left Hand Side 126 #//////////////////////////////////////////////////////////// 127

128 - def lhs(self):

129 """ 130 @return: This edge's left-hand side, which specifies what kind 131 of structure is hypothesized by this edge. 132 @see: L{TreeEdge} and L{LeafEdge} for a description of 133 the left-hand side values for each edge type. 134 """ 135 raise AssertionError('EdgeI is an abstract interface')

136 137 #//////////////////////////////////////////////////////////// 138 # Right Hand Side 139 #//////////////////////////////////////////////////////////// 140

141 - def rhs(self):

142 """ 143 @return: This edge's right-hand side, which specifies 144 the content of the structure hypothesized by this 145 edge. 146 @see: L{TreeEdge} and L{LeafEdge} for a description of 147 the right-hand side values for each edge type. 148 """ 149 raise AssertionError('EdgeI is an abstract interface')

150

151 - def dot(self):

152 """ 153 @return: This edge's dot position, which indicates how much of 154 the hypothesized structure is consistent with the 155 sentence. In particular, C{self.rhs[:dot]} is consistent 156 with C{subtoks[self.start():self.end()]}. 157 @rtype: C{int} 158 """ 159 raise AssertionError('EdgeI is an abstract interface')

160

161 - def next(self):

162 """ 163 @return: The element of this edge's right-hand side that 164 immediately follows its dot. 165 @rtype: C{Nonterminal} or X{terminal} or C{None} 166 """ 167 raise AssertionError('EdgeI is an abstract interface')

168

169 - def is_complete(self):

170 """ 171 @return: True if this edge's structure is fully consistent 172 with the text. 173 @rtype: C{boolean} 174 """ 175 raise AssertionError('EdgeI is an abstract interface')

176

177 - def is_incomplete(self):

178 """ 179 @return: True if this edge's structure is partially consistent 180 with the text. 181 @rtype: C{boolean} 182 """ 183 raise AssertionError('EdgeI is an abstract interface')

184 185 #//////////////////////////////////////////////////////////// 186 # Comparisons 187 #////////////////////////////////////////////////////////////

188 - def __cmp__(self, other):

189 raise AssertionError('EdgeI is an abstract interface')

190

191 - def __hash__(self, other):

192 raise AssertionError('EdgeI is an abstract interface')

193

194 -class TreeEdge(EdgeI):

195 """ 196 An edge that records the fact that a tree is (partially) 197 consistent with the sentence. A tree edge consists of: 198 199 - A X{span}, indicating what part of the sentence is 200 consistent with the hypothesized tree. 201 202 - A X{left-hand side}, specifying the hypothesized tree's node 203 value. 204 205 - A X{right-hand side}, specifying the hypothesized tree's 206 children. Each element of the right-hand side is either a 207 terminal, specifying a token with that terminal as its leaf 208 value; or a nonterminal, specifying a subtree with that 209 nonterminal's symbol as its node value. 210 211 - A X{dot position}, indicating which children are consistent 212 with part of the sentence. In particular, if C{dot} is the 213 dot position, C{rhs} is the right-hand size, C{(start,end)} 214 is the span, and C{sentence} is the list of subtokens in the 215 sentence, then C{subtokens[start:end]} can be spanned by the 216 children specified by C{rhs[:dot]}. 217 218 For more information about edges, see the L{EdgeI} interface. 219 """

220 - def __init__(self, span, lhs, rhs, dot=0):

221 """ 222 Construct a new C{TreeEdge}. 223 224 @type span: C{(int, int)} 225 @param span: A tuple C{(s,e)}, where C{subtokens[s:e]} is the 226 portion of the sentence that is consistent with the new 227 edge's structure. 228 @type lhs: L{Nonterminal} 229 @param lhs: The new edge's left-hand side, specifying the 230 hypothesized tree's node value. 231 @type rhs: C{list} of (L{Nonterminal} and C{string}) 232 @param rhs: The new edge's right-hand side, specifying the 233 hypothesized tree's children. 234 @type dot: C{int} 235 @param dot: The position of the new edge's dot. This position 236 specifies what prefix of the production's right hand side 237 is consistent with the text. In particular, if 238 C{sentence} is the list of subtokens in the sentence, then 239 C{subtokens[span[0]:span[1]]} can be spanned by the 240 children specified by C{rhs[:dot]}. 241 """ 242 self._lhs = lhs 243 self._rhs = tuple(rhs) 244 self._span = span 245 self._dot = dot

246 247 # [staticmethod]

248 - def from_production(production, index):

249 """ 250 @return: A new C{TreeEdge} formed from the given production. 251 The new edge's left-hand side and right-hand side will 252 be taken from C{production}; its span will be C{(index, 253 index)}; and its dot position will be C{0}. 254 @rtype: L{TreeEdge} 255 """ 256 return TreeEdge(span=(index, index), lhs=production.lhs(), 257 rhs=production.rhs(), dot=0)

258 from_production = staticmethod(from_production) 259 260 # Accessors

261 - def lhs(self): return self._lhs

262 - def span(self): return self._span

263 - def start(self): return self._span[0]

264 - def end(self): return self._span[1]

265 - def length(self): return self._span[1] - self._span[0]

266 - def rhs(self): return self._rhs

267 - def dot(self): return self._dot

268 - def is_complete(self): return self._dot == len(self._rhs)

269 - def is_incomplete(self): return self._dot != len(self._rhs)

270 - def next(self):

271 if self._dot >= len(self._rhs): return None 272 else: return self._rhs[self._dot]

273 274 # Comparisons & hashing

275 - def __cmp__(self, other):

276 if self.__class__ != other.__class__: return -1 277 return cmp((self._span, self.lhs(), self.rhs(), self._dot), 278 (other._span, other.lhs(), other.rhs(), other._dot))

279 - def __hash__(self):

280 return hash((self.lhs(), self.rhs(), self._span, self._dot))

281 282 # String representation

283 - def __str__(self):

284 str = '[%s:%s] ' % (self._span[0], self._span[1]) 285 if isinstance(self._lhs.symbol(), basestring): 286 str += '%-2s ->' % (self._lhs,) 287 else: 288 str += '%-2r ->' % (self._lhs,) 289 290 for i in range(len(self._rhs)): 291 if i == self._dot: str += ' *' 292 if (isinstance(self._rhs[i], cfg.Nonterminal) and 293 isinstance(self._rhs[i].symbol(), basestring)): 294 str += ' %s' % (self._rhs[i],) 295 else: 296 str += ' %r' % (self._rhs[i],) 297 if len(self._rhs) == self._dot: str += ' *' 298 return str

299

300 - def __repr__(self):

301 return '[Edge: %s]' % self

302

303 -class LeafEdge(EdgeI):

304 """ 305 An edge that records the fact that a leaf value is consistent with 306 a word in the sentence. A leaf edge consists of: 307 308 - An X{index}, indicating the position of the word. 309 - A X{leaf}, specifying the word's content. 310 311 A leaf edge's left-hand side is its leaf value, and its right hand 312 side is C{()}. Its span is C{[index, index+1]}, and its dot 313 position is C{0}. 314 """

315 - def __init__(self, leaf, index):

316 """ 317 Construct a new C{LeafEdge}. 318 319 @param leaf: The new edge's leaf value, specifying the word 320 that is recorded by this edge. 321 @param index: The new edge's index, specifying the position of 322 the word that is recorded by this edge. 323 """ 324 self._leaf = leaf 325 self._index = index

326 327 # Accessors

328 - def lhs(self): return self._leaf

329 - def span(self): return (self._index, self._index+1)

330 - def start(self): return self._index

331 - def end(self): return self._index+1

332 - def length(self): return 1

333 - def rhs(self): return ()

334 - def dot(self): return 0

335 - def is_complete(self): return True

336 - def is_incomplete(self): return False

337 - def next(self): return None

338 339 # Comparisons & hashing

340 - def __cmp__(self, other):

341 if not isinstance(other, LeafEdge): return -1 342 return cmp((self._index, self._leaf), (other._index, other._leaf))

343 - def __hash__(self):

344 return hash((self._index, self._leaf))

345 346 # String representations

347 - def __str__(self): return '[%s:%s] %r' % (self._index, self._index+1, self._leaf)

348 - def __repr__(self):

349 return '[Edge: %s]' % (self)

350 351 ######################################################################## 352 ## Chart 353 ######################################################################## 354

355 -class Chart(object):

356 """ 357 A blackboard for hypotheses about the syntactic constituents of a 358 sentence. A chart contains a set of edges, and each edge encodes 359 a single hypothesis about the structure of some portion of the 360 sentence. 361 362 The L{select} method can be used to select a specific collection 363 of edges. For example C{chart.select(is_complete=True, start=0)} 364 yields all complete edges whose start indices are 0. To ensure 365 the efficiency of these selection operations, C{Chart} dynamically 366 creates and maintains an index for each set of attributes that 367 have been selected on. 368 369 In order to reconstruct the trees that are represented by an edge, 370 the chart associates each edge with a set of child pointer lists. 371 A X{child pointer list} is a list of the edges that license an 372 edge's right-hand side. 373 374 @ivar _tokens: The sentence that the chart covers. 375 @ivar _num_leaves: The number of tokens. 376 @ivar _edges: A list of the edges in the chart 377 @ivar _edge_to_cpls: A dictionary mapping each edge to a set 378 of child pointer lists that are associated with that edge. 379 @ivar _indexes: A dictionary mapping tuples of edge attributes 380 to indices, where each index maps the corresponding edge 381 attribute values to lists of edges. 382 """

383 - def __init__(self, tokens):

384 """ 385 Construct a new empty chart. 386 387 @type tokens: L{list} 388 @param tokens: The sentence that this chart will be used to parse. 389 """ 390 # Record the sentence token and the sentence length. 391 self._tokens = list(tokens) 392 self._num_leaves = len(self._tokens) 393 394 # A list of edges contained in this chart. 395 self._edges = [] 396 397 # The set of child pointer lists associated with each edge. 398 self._edge_to_cpls = {} 399 400 # Indexes mapping attribute values to lists of edges (used by 401 # select()). 402 self._indexes = {}

403 404 #//////////////////////////////////////////////////////////// 405 # Sentence Access 406 #//////////////////////////////////////////////////////////// 407

408 - def num_leaves(self):

409 """ 410 @return: The number of words in this chart's sentence. 411 @rtype: C{int} 412 """ 413 return self._num_leaves

414

415 - def leaf(self, index):

416 """ 417 @return: The leaf value of the word at the given index. 418 @rtype: C{string} 419 """ 420 return self._tokens[index]

421

422 - def leaves(self):

423 """ 424 @return: A list of the leaf values of each word in the 425 chart's sentence. 426 @rtype: C{list} of C{string} 427 """ 428 return self._tokens

429 430 #//////////////////////////////////////////////////////////// 431 # Edge access 432 #//////////////////////////////////////////////////////////// 433

434 - def edges(self):

435 """ 436 @return: A list of all edges in this chart. New edges 437 that are added to the chart after the call to edges() 438 will I{not} be contained in this list. 439 @rtype: C{list} of L{EdgeI} 440 @see: L{iteredges}, L{select} 441 """ 442 return self._edges[:]

443

444 - def iteredges(self):

445 """ 446 @return: An iterator over the edges in this chart. Any 447 new edges that are added to the chart before the iterator 448 is exahusted will also be generated. 449 @rtype: C{iter} of L{EdgeI} 450 @see: L{edges}, L{select} 451 """ 452 return iter(self._edges)

453 454 # Iterating over the chart yields its edges. 455 __iter__ = iteredges 456

457 - def num_edges(self):

458 """ 459 @return: The number of edges contained in this chart. 460 @rtype: C{int} 461 """ 462 return len(self._edge_to_cpls)

463

464 - def select(self, **restrictions):

465 """ 466 @return: An iterator over the edges in this chart. Any 467 new edges that are added to the chart before the iterator 468 is exahusted will also be generated. C{restrictions} 469 can be used to restrict the set of edges that will be 470 generated. 471 @rtype: C{iter} of L{EdgeI} 472 473 @kwarg span: Only generate edges C{e} where C{e.span()==span} 474 @kwarg start: Only generate edges C{e} where C{e.start()==start} 475 @kwarg end: Only generate edges C{e} where C{e.end()==end} 476 @kwarg length: Only generate edges C{e} where C{e.length()==length} 477 @kwarg lhs: Only generate edges C{e} where C{e.lhs()==lhs} 478 @kwarg rhs: Only generate edges C{e} where C{e.rhs()==rhs} 479 @kwarg next: Only generate edges C{e} where C{e.next()==next} 480 @kwarg dot: Only generate edges C{e} where C{e.dot()==dot} 481 @kwarg is_complete: Only generate edges C{e} where 482 C{e.is_complete()==is_complete} 483 @kwarg is_incomplete: Only generate edges C{e} where 484 C{e.is_incomplete()==is_incomplete} 485 """ 486 # If there are no restrictions, then return all edges. 487 if restrictions=={}: return iter(self._edges) 488 489 # Find the index corresponding to the given restrictions. 490 restr_keys = restrictions.keys() 491 restr_keys.sort() 492 restr_keys = tuple(restr_keys) 493 494 # If it doesn't exist, then create it. 495 if restr_keys not in self._indexes: 496 self._add_index(restr_keys) 497 498 vals = [restrictions[k] for k in restr_keys] 499 return iter(self._indexes[restr_keys].get(tuple(vals), []))

500

501 - def _add_index(self, restr_keys):

502 """ 503 A helper function for L{select}, which creates a new index for 504 a given set of attributes (aka restriction keys). 505 """ 506 # Make sure it's a valid index. 507 for k in restr_keys: 508 if not hasattr(EdgeI, k): 509 raise ValueError, 'Bad restriction: %s' % k 510 511 # Create the index. 512 self._indexes[restr_keys] = {} 513 514 # Add all existing edges to the index. 515 for edge in self._edges: 516 vals = [getattr(edge, k)() for k in restr_keys] 517 index = self._indexes[restr_keys] 518 index.setdefault(tuple(vals),[]).append(edge)

519 520 #//////////////////////////////////////////////////////////// 521 # Edge Insertion 522 #//////////////////////////////////////////////////////////// 523

524 - def insert(self, edge, child_pointer_list):

525 """ 526 Add a new edge to the chart. 527 528 @type edge: L{EdgeI} 529 @param edge: The new edge 530 @type child_pointer_list: C{tuple} of L{EdgeI} 531 @param child_pointer_list: A list of the edges that were used to 532 form this edge. This list is used to reconstruct the trees 533 (or partial trees) that are associated with C{edge}. 534 @rtype: C{bool} 535 @return: True if this operation modified the chart. In 536 particular, return true iff the chart did not already 537 contain C{edge}, or if it did not already associate 538 C{child_pointer_list} with C{edge}. 539 """ 540 # Is it a new edge? 541 if edge not in self._edge_to_cpls: 542 # Add it to the list of edges. 543 self._edges.append(edge) 544 545 # Register with indexes 546 for (restr_keys, index) in self._indexes.items(): 547 vals = [getattr(edge, k)() for k in restr_keys] 548 index = self._indexes[restr_keys] 549 index.setdefault(tuple(vals),[]).append(edge) 550 551 # Get the set of child pointer lists for this edge. 552 cpls = self._edge_to_cpls.setdefault(edge,{}) 553 child_pointer_list = tuple(child_pointer_list) 554 555 if child_pointer_list in cpls: 556 # We've already got this CPL; return false. 557 return False 558 else: 559 # It's a new CPL; register it, and return true. 560 cpls[child_pointer_list] = True 561 return True

562 563 #//////////////////////////////////////////////////////////// 564 # Tree extraction & child pointer lists 565 #//////////////////////////////////////////////////////////// 566

567 - def parses(self, root, tree_class=Tree):

568 """ 569 @return: A list of the complete tree structures that span 570 the entire chart, and whose root node is C{root}. 571 """ 572 trees = [] 573 for edge in self.select(span=(0,self._num_leaves), lhs=root): 574 trees += self.trees(edge, tree_class=tree_class, complete=True) 575 return trees

576

577 - def trees(self, edge, tree_class=Tree, complete=False):

578 """ 579 @return: A list of the tree structures that are associated 580 with C{edge}. 581 582 If C{edge} is incomplete, then the unexpanded children will be 583 encoded as childless subtrees, whose node value is the 584 corresponding terminal or nonterminal. 585 586 @rtype: C{list} of L{Tree} 587 @note: If two trees share a common subtree, then the same 588 C{Tree} may be used to encode that subtree in 589 both trees. If you need to eliminate this subtree 590 sharing, then create a deep copy of each tree. 591 """ 592 return self._trees(edge, complete, memo={}, tree_class=tree_class)

593

594 - def _trees(self, edge, complete, memo, tree_class):

595 """ 596 A helper function for L{trees}. 597 @param memo: A dictionary used to record the trees that we've 598 generated for each edge, so that when we see an edge more 599 than once, we can reuse the same trees. 600 """ 601 # If we've seen this edge before, then reuse our old answer. 602 if edge in memo: 603 return memo[edge] 604 605 trees = [] 606 607 # when we're reading trees off the chart, don't use incomplete edges 608 if complete and edge.is_incomplete(): 609 return trees 610 611 # Until we're done computing the trees for edge, set 612 # memo[edge] to be empty. This has the effect of filtering 613 # out any cyclic trees (i.e., trees that contain themselves as 614 # descendants), because if we reach this edge via a cycle, 615 # then it will appear that the edge doesn't generate any 616 # trees. 617 memo[edge] = [] 618 619 # Leaf edges. 620 if isinstance(edge, LeafEdge): 621 leaf = self._tokens[edge.start()] 622 memo[edge] = leaf 623 return [leaf] 624 625 # Each child pointer list can be used to form trees. 626 for cpl in self.child_pointer_lists(edge): 627 # Get the set of child choices for each child pointer. 628 # child_choices[i] is the set of choices for the tree's 629 # ith child. 630 child_choices = [self._trees(cp, complete, memo, tree_class) 631 for cp in cpl] 632 633 # Kludge to ensure child_choices is a doubly-nested list 634 if len(child_choices) > 0 and type(child_choices[0]) == type(""): 635 child_choices = [child_choices] 636 637 # For each combination of children, add a tree. 638 for children in self._choose_children(child_choices): 639 lhs = edge.lhs().symbol() 640 trees.append(tree_class(lhs, children)) 641 642 # If the edge is incomplete, then extend it with "partial trees": 643 if edge.is_incomplete(): 644 unexpanded = [tree_class(elt,[]) 645 for elt in edge.rhs()[edge.dot():]] 646 for tree in trees: 647 tree.extend(unexpanded) 648 649 # Update the memoization dictionary. 650 memo[edge] = trees 651 652 # Return the list of trees. 653 return trees

654

655 - def _choose_children(self, child_choices):

656 """ 657 A helper function for L{_trees} that finds the possible sets 658 of subtrees for a new tree. 659 660 @param child_choices: A list that specifies the options for 661 each child. In particular, C{child_choices[i]} is a list of 662 tokens and subtrees that can be used as the C{i}th child. 663 """ 664 children_lists = [[]] 665 for child_choice in child_choices: 666 children_lists = [child_list+[child] 667 for child in child_choice 668 for child_list in children_lists] 669 return children_lists

670

671 - def child_pointer_lists(self, edge):

672 """ 673 @rtype: C{list} of C{list} of C{EdgeI} 674 @return: The set of child pointer lists for the given edge. 675 Each child pointer list is a list of edges that have 676 been used to form this edge. 677 """ 678 # Make a copy, in case they modify it. 679 return self._edge_to_cpls.get(edge, {}).keys()

680 681 #//////////////////////////////////////////////////////////// 682 # Display 683 #////////////////////////////////////////////////////////////

684 - def pp_edge(self, edge, width=None):

685 """ 686 @return: A pretty-printed string representation of a given edge 687 in this chart. 688 @rtype: C{string} 689 @param width: The number of characters allotted to each 690 index in the sentence. 691 """ 692 if width is None: width = 50/(self.num_leaves()+1) 693 (start, end) = (edge.start(), edge.end()) 694 695 str = '|' + ('.'+' '*(width-1))*start 696 697 # Zero-width edges are "#" if complete, ">" if incomplete 698 if start == end: 699 if edge.is_complete(): str += '#' 700 else: str += '>' 701 702 # Spanning complete edges are "[===]"; Other edges are 703 # "[---]" if complete, "[--->" if incomplete 704 elif edge.is_complete() and edge.span() == (0,self._num_leaves): 705 str += '['+('='*width)*(end-start-1) + '='*(width-1)+']' 706 elif edge.is_complete(): 707 str += '['+('-'*width)*(end-start-1) + '-'*(width-1)+']' 708 else: 709 str += '['+('-'*width)*(end-start-1) + '-'*(width-1)+'>' 710 711 str += (' '*(width-1)+'.')*(self._num_leaves-end) 712 return str + '| %s' % edge

713

714 - def pp_leaves(self, width=None):

715 """ 716 @return: A pretty-printed string representation of this 717 chart's leaves. This string can be used as a header 718 for calls to L{pp_edge}. 719 """ 720 if width is None: width = 50/(self.num_leaves()+1) 721 722 if self._tokens is not None and width>1: 723 header = '|.' 724 for tok in self._tokens: 725 header += tok[:width-1].center(width-1)+'.' 726 header += '|' 727 else: 728 header = '' 729 730 return header

731

732 - def pp(self, width=None):

733 """ 734 @return: A pretty-printed string representation of this chart. 735 @rtype: C{string} 736 @param width: The number of characters allotted to each 737 index in the sentence. 738 """ 739 if width is None: width = 50/(self.num_leaves()+1) 740 # sort edges: primary key=length, secondary key=start index. 741 # (and filter out the token edges) 742 edges = [(e.length(), e.start(), e) for e in self] 743 edges.sort() 744 edges = [e for (_,_,e) in edges] 745 746 return (self.pp_leaves(width) + '\n' + 747 '\n'.join(self.pp_edge(edge, width) for edge in edges))

748 749 #//////////////////////////////////////////////////////////// 750 # Display: Dot (AT&T Graphviz) 751 #//////////////////////////////////////////////////////////// 752

753 - def dot_digraph(self):

754 # Header 755 s = 'digraph nltk_chart {\n' 756 #s += ' size="5,5";\n' 757 s += ' rankdir=LR;\n' 758 s += ' node [height=0.1,width=0.1];\n' 759 s += ' node [style=filled, color="lightgray"];\n' 760 761 # Set up the nodes 762 for y in range(self.num_edges(), -1, -1): 763 if y == 0: 764 s += ' node [style=filled, color="black"];\n' 765 for x in range(self.num_leaves()+1): 766 if y == 0 or (x <= self._edges[y-1].start() or 767 x >= self._edges[y-1].end()): 768 s += ' %04d.%04d [label=""];\n' % (x,y) 769 770 # Add a spacer 771 s += ' x [style=invis]; x->0000.0000 [style=invis];\n' 772 773 # Declare ranks. 774 for x in range(self.num_leaves()+1): 775 s += ' {rank=same;' 776 for y in range(self.num_edges()+1): 777 if y == 0 or (x <= self._edges[y-1].start() or 778 x >= self._edges[y-1].end()): 779 s += ' %04d.%04d' % (x,y) 780 s += '}\n' 781 782 # Add the leaves 783 s += ' edge [style=invis, weight=100];\n' 784 s += ' node [shape=plaintext]\n' 785 s += ' 0000.0000' 786 for x in range(self.num_leaves()): 787 s += '->%s->%04d.0000' % (self.leaf(x), x+1) 788 s += ';\n\n' 789 790 # Add the edges 791 s += ' edge [style=solid, weight=1];\n' 792 for y, edge in enumerate(self): 793 for x in range(edge.start()): 794 s += (' %04d.%04d -> %04d.%04d [style="invis"];\n' % 795 (x, y+1, x+1, y+1)) 796 s += (' %04d.%04d -> %04d.%04d [label="%s"];\n' % 797 (edge.start(), y+1, edge.end(), y+1, edge)) 798 for x in range(edge.end(), self.num_leaves()): 799 s += (' %04d.%04d -> %04d.%04d [style="invis"];\n' % 800 (x, y+1, x+1, y+1)) 801 s += '}\n' 802 return s

803 804 ######################################################################## 805 ## Chart Rules 806 ######################################################################## 807

808 -class ChartRuleI(object):

809 """ 810 A rule that specifies what new edges are licensed by any given set 811 of existing edges. Each chart rule expects a fixed number of 812 edges, as indicated by the class variable L{NUM_EDGES}. In 813 particular: 814 815 - A chart rule with C{NUM_EDGES=0} specifies what new edges are 816 licensed, regardless of existing edges. 817 818 - A chart rule with C{NUM_EDGES=1} specifies what new edges are 819 licensed by a single existing edge. 820 821 - A chart rule with C{NUM_EDGES=2} specifies what new edges are 822 licensed by a pair of existing edges. 823 824 @type NUM_EDGES: C{int} 825 @cvar NUM_EDGES: The number of existing edges that this rule uses 826 to license new edges. Typically, this number ranges from zero 827 to two. 828 """

829 - def apply(self, chart, grammar, *edges):

830 """ 831 Add the edges licensed by this rule and the given edges to the 832 chart. 833 834 @type edges: C{list} of L{EdgeI} 835 @param edges: A set of existing edges. The number of edges 836 that should be passed to C{apply} is specified by the 837 L{NUM_EDGES} class variable. 838 @rtype: C{list} of L{EdgeI} 839 @return: A list of the edges that were added. 840 """ 841 raise AssertionError, 'ChartRuleI is an abstract interface'

842

843 - def apply_iter(self, chart, grammar, *edges):

844 """ 845 @return: A generator that will add edges licensed by this rule 846 and the given edges to the chart, one at a time. Each 847 time the generator is resumed, it will either add a new 848 edge and yield that edge; or return. 849 @rtype: C{iter} of L{EdgeI} 850 851 @type edges: C{list} of L{EdgeI} 852 @param edges: A set of existing edges. The number of edges 853 that should be passed to C{apply} is specified by the 854 L{NUM_EDGES} class variable. 855 """ 856 raise AssertionError, 'ChartRuleI is an abstract interface'

857

858 - def apply_everywhere(self, chart, grammar):

859 """ 860 Add all the edges licensed by this rule and the edges in the 861 chart to the chart. 862 863 @rtype: C{list} of L{EdgeI} 864 @return: A list of the edges that were added. 865 """ 866 raise AssertionError, 'ChartRuleI is an abstract interface'

867

868 - def apply_everywhere_iter(self, chart, grammar):

869 """ 870 @return: A generator that will add all edges licensed by 871 this rule, given the edges that are currently in the 872 chart, one at a time. Each time the generator is resumed, 873 it will either add a new edge and yield that edge; or 874 return. 875 @rtype: C{iter} of L{EdgeI} 876 """ 877 raise AssertionError, 'ChartRuleI is an abstract interface'

878

879 -class AbstractChartRule(object):

880 """ 881 An abstract base class for chart rules. C{AbstractChartRule} 882 provides: 883 - A default implementation for C{apply}, based on C{apply_iter}. 884 - A default implementation for C{apply_everywhere_iter}, 885 based on C{apply_iter}. 886 - A default implementation for C{apply_everywhere}, based on 887 C{apply_everywhere_iter}. Currently, this implementation 888 assumes that C{NUM_EDGES}<=3. 889 - A default implementation for C{__str__}, which returns a 890 name basd on the rule's class name. 891 """ 892 893 # Subclasses must define apply_iter.

894 - def apply_iter(self, chart, grammar, *edges):

895 raise AssertionError, 'AbstractChartRule is an abstract class'

896 897 # Default: loop through the given number of edges, and call 898 # self.apply() for each set of edges.

899 - def apply_everywhere_iter(self, chart, grammar):

900 if self.NUM_EDGES == 0: 901 for new_edge in self.apply_iter(chart, grammar): 902 yield new_edge 903 904 elif self.NUM_EDGES == 1: 905 for e1 in chart: 906 for new_edge in self.apply_iter(chart, grammar, e1): 907 yield new_edge 908 909 elif self.NUM_EDGES == 2: 910 for e1 in chart: 911 for e2 in chart: 912 for new_edge in self.apply_iter(chart, grammar, e1, e2): 913 yield new_edge 914 915 elif self.NUM_EDGES == 3: 916 for e1 in chart: 917 for e2 in chart: 918 for e3 in chart: 919 for new_edge in self.apply_iter(chart,grammar,e1,e2,e3): 920 yield new_edge 921 922 else: 923 raise AssertionError, 'NUM_EDGES>3 is not currently supported'

924 925 # Default: delegate to apply_iter.

926 - def apply(self, chart, grammar, *edges):

927 return list(self.apply_iter(chart, grammar, *edges))

928 929 # Default: delegate to apply_everywhere_iter.

930 - def apply_everywhere(self, chart, grammar):

931 return list(self.apply_everywhere_iter(chart, grammar))

932 933 # Default: return a name based on the class name.

934 - def __str__(self):

935 # Add spaces between InitialCapsWords. 936 return re.sub('([a-z])([A-Z])', r'\1 \2', self.__class__.__name__)

937 938 #//////////////////////////////////////////////////////////// 939 # Fundamental Rule 940 #////////////////////////////////////////////////////////////

941 -class FundamentalRule(AbstractChartRule):

942 """ 943 A rule that joins two adjacent edges to form a single combined 944 edge. In particular, this rule specifies that any pair of edges: 945 946 - [AS{->}S{alpha}*BS{beta}][i:j] 947 - [BS{->}S{gamma}*][j:k] 948 licenses the edge: 949 - [AS{->}S{alpha}B*S{beta}][i:j] 950 """ 951 NUM_EDGES = 2

952 - def apply_iter(self, chart, grammar, left_edge, right_edge):

953 # Make sure the rule is applicable. 954 if not (left_edge.end() == right_edge.start() and 955 left_edge.next() == right_edge.lhs() and 956 left_edge.is_incomplete() and right_edge.is_complete()): 957 return 958 959 # Construct the new edge. 960 new_edge = TreeEdge(span=(left_edge.start(), right_edge.end()), 961 lhs=left_edge.lhs(), rhs=left_edge.rhs(), 962 dot=left_edge.dot()+1) 963 964 # Add it to the chart, with appropriate child pointers. 965 changed_chart = False 966 for cpl1 in chart.child_pointer_lists(left_edge): 967 if chart.insert(new_edge, cpl1+(right_edge,)): 968 changed_chart = True 969 970 # If we changed the chart, then generate the edge. 971 if changed_chart: yield new_edge

972

973 -class SingleEdgeFundamentalRule(AbstractChartRule):

974 """ 975 A rule that joins a given edge with adjacent edges in the chart, 976 to form combined edges. In particular, this rule specifies that 977 either of the edges: 978 - [AS{->}S{alpha}*BS{beta}][i:j] 979 - [BS{->}S{gamma}*][j:k] 980 licenses the edge: 981 - [AS{->}S{alpha}B*S{beta}][i:j] 982 if the other edge is already in the chart. 983 @note: This is basically L{FundamentalRule}, with one edge is left 984 unspecified. 985 """ 986 NUM_EDGES = 1 987 988 _fundamental_rule = FundamentalRule() 989

990 - def apply_iter(self, chart, grammar, edge1):

991 fr = self._fundamental_rule 992 if edge1.is_incomplete(): 993 # edge1 = left_edge; edge2 = right_edge 994 for edge2 in chart.select(start=edge1.end(), is_complete=True, 995 lhs=edge1.next()): 996 for new_edge in fr.apply_iter(chart, grammar, edge1, edge2): 997 yield new_edge 998 else: 999 # edge2 = left_edge; edge1 = right_edge 1000 for edge2 in chart.select(end=edge1.start(), is_complete=False, 1001 next=edge1.lhs()): 1002 for new_edge in fr.apply_iter(chart, grammar, edge2, edge1): 1003 yield new_edge

1004

1005 - def __str__(self): return 'Fundamental Rule'

1006

1007 -class BottomUpInitRule(AbstractChartRule):

1008 """ 1009 A rule licensing any edges corresponding to terminals in the 1010 text. In particular, this rule licenses the leaf edge: 1011 - [wS{->}*][i:i+1] 1012 for C{w} is a word in the text, where C{i} is C{w}'s index. 1013 """ 1014 NUM_EDGES = 0

1015 - def apply_iter(self, chart, grammar):

1016 for index in range(chart.num_leaves()): 1017 new_edge = LeafEdge(chart.leaf(index), index) 1018 if chart.insert(new_edge, ()): 1019 yield new_edge

1020 1021 #//////////////////////////////////////////////////////////// 1022 # Top-Down Parsing 1023 #//////////////////////////////////////////////////////////// 1024 1025

1026 -class TopDownInitRule(AbstractChartRule):

1027 """ 1028 A rule licensing edges corresponding to the grammar productions for 1029 the grammar's start symbol. In particular, this rule specifies that: 1030 - [SS{->}*S{alpha}][0:i] 1031 is licensed for each grammar production C{SS{->}S{alpha}}, where 1032 C{S} is the grammar's start symbol. 1033 """ 1034 NUM_EDGES = 0

1035 - def apply_iter(self, chart, grammar):

1036 for prod in grammar.productions(lhs=grammar.start()): 1037 new_edge = TreeEdge.from_production(prod, 0) 1038 if chart.insert(new_edge, ()): 1039 yield new_edge

1040

1041 -class TopDownExpandRule(AbstractChartRule):

1042 """ 1043 A rule licensing edges corresponding to the grammar productions 1044 for the nonterminal following an incomplete edge's dot. In 1045 particular, this rule specifies that: 1046 - [AS{->}S{alpha}*BS{beta}][i:j] 1047 licenses the edge: 1048 - [BS{->}*S{gamma}][j:j] 1049 for each grammar production C{BS{->}S{gamma}}. 1050 """ 1051 NUM_EDGES = 1

1052 - def apply_iter(self, chart, grammar, edge):

1053 if edge.is_complete(): return 1054 for prod in grammar.productions(lhs=edge.next()): 1055 new_edge = TreeEdge.from_production(prod, edge.end()) 1056 if chart.insert(new_edge, ()): 1057 yield new_edge

1058

1059 -class TopDownMatchRule(AbstractChartRule):

1060 """ 1061 A rule licensing an edge corresponding to a terminal following an 1062 incomplete edge's dot. In particular, this rule specifies that: 1063 - [AS{->}S{alpha}*w{beta}][i:j] 1064 licenses the leaf edge: 1065 - [wS{->}*][j:j+1] 1066 if the C{j}th word in the text is C{w}. 1067 """ 1068 NUM_EDGES = 1

1069 - def apply_iter(self, chart, grammar, edge):

1070 if edge.is_complete() or edge.end() >= chart.num_leaves(): return 1071 index = edge.end() 1072 leaf = chart.leaf(index) 1073 if edge.next() == leaf: 1074 new_edge = LeafEdge(leaf, index) 1075 if chart.insert(new_edge, ()): 1076 yield new_edge

1077 1078 # Add a cache, to prevent recalculating.

1079 -class CachedTopDownInitRule(TopDownInitRule):

1080 """ 1081 A cached version of L{TopDownInitRule}. After the first time this 1082 rule is applied, it will not generate any more edges. 1083 1084 If C{chart} or C{grammar} are changed, then the cache is flushed. 1085 """

1086 - def __init__(self):

1087 AbstractChartRule.__init__(self) 1088 self._done = (None, None)

1089

1090 - def apply_iter(self, chart, grammar):

1091 # If we've already applied this rule, and the chart & grammar 1092 # have not changed, then just return (no new edges to add). 1093 if self._done[0] is chart and self._done[1] is grammar: return 1094 1095 # Add all the edges indicated by the top down init rule. 1096 for e in TopDownInitRule.apply_iter(self, chart, grammar): 1097 yield e 1098 1099 # Record the fact that we've applied this rule. 1100 self._done = (chart, grammar)

1101

1102 - def __str__(self): return 'Top Down Init Rule'

1103

1104 -class CachedTopDownExpandRule(TopDownExpandRule):

1105 """ 1106 A cached version of L{TopDownExpandRule}. After the first time 1107 this rule is applied to an edge with a given C{end} and C{next}, 1108 it will not generate any more edges for edges with that C{end} and 1109 C{next}. 1110 1111 If C{chart} or C{grammar} are changed, then the cache is flushed. 1112 """

1113 - def __init__(self):

1114 AbstractChartRule.__init__(self) 1115 self._done = {}

1116

1117 - def apply_iter(self, chart, grammar, edge):

1118 # If we've already applied this rule to an edge with the same 1119 # next & end, and the chart & grammar have not changed, then 1120 # just return (no new edges to add). 1121 done = self._done.get((edge.next(), edge.end()), (None,None)) 1122 if done[0] is chart and done[1] is grammar: return 1123 1124 # Add all the edges indicated by the top down expand rule. 1125 for e in TopDownExpandRule.apply_iter(self, chart, grammar, edge): 1126 yield e 1127 1128 # Record the fact that we've applied this rule. 1129 self._done[edge.next(), edge.end()] = (chart, grammar)

1130

1131 - def __str__(self): return 'Top Down Expand Rule'

1132 1133 #//////////////////////////////////////////////////////////// 1134 # Bottom-Up Parsing 1135 #//////////////////////////////////////////////////////////// 1136

1137 -class BottomUpInitRule(AbstractChartRule):

1138 """ 1139 A rule licensing any edges corresponding to terminals in the 1140 text. In particular, this rule licenses the leaf edge: 1141 - [wS{->}*][i:i+1] 1142 for C{w} is a word in the text, where C{i} is C{w}'s index. 1143 """ 1144 NUM_EDGES = 0

1145 - def apply_iter(self, chart, grammar):

1146 for index in range(chart.num_leaves()): 1147 new_edge = LeafEdge(chart.leaf(index), index) 1148 if chart.insert(new_edge, ()): 1149 yield new_edge

1150

1151 -class BottomUpPredictRule(AbstractChartRule):

1152 """ 1153 A rule licensing any edge corresponding to a production whose 1154 right-hand side begins with a complete edge's left-hand side. In 1155 particular, this rule specifies that: 1156 - [AS{->}S{alpha}*] 1157 licenses the edge: 1158 - [BS{->}*AS{beta}] 1159 for each grammar production C{BS{->}AS{beta}} 1160 """ 1161 NUM_EDGES = 1

1162 - def apply_iter(self, chart, grammar, edge):

1163 if edge.is_incomplete(): return 1164 for prod in grammar.productions(rhs=edge.lhs()): 1165 new_edge = TreeEdge.from_production(prod, edge.start()) 1166 if chart.insert(new_edge, ()): 1167 yield new_edge

1168 1169 #//////////////////////////////////////////////////////////// 1170 # Earley Parsing 1171 #//////////////////////////////////////////////////////////// 1172

1173 -class CompleterRule(AbstractChartRule):

1174 """ 1175 A rule that joins a given complete edge with adjacent incomplete 1176 edges in the chart, to form combined edges. In particular, this 1177 rule specifies that: 1178 - [BS{->}S{gamma}*][j:k] 1179 licenses the edge: 1180 - [AS{->}S{alpha}B*S{beta}][i:j] 1181 given that the chart contains: 1182 - [AS{->}S{alpha}*BS{beta}][i:j] 1183 @note: This is basically L{FundamentalRule}, with the left edge 1184 left unspecified. 1185 """ 1186 NUM_EDGES = 1 1187 1188 _fundamental_rule = FundamentalRule() 1189

1190 - def apply_iter(self, chart, grammar, right_edge):

1191 if right_edge.is_incomplete(): return 1192 fr = self._fundamental_rule 1193 for left_edge in chart.select(end=right_edge.start(), 1194 is_complete=False, 1195 next=right_edge.lhs()): 1196 for e in fr.apply_iter(chart, grammar, left_edge, right_edge): 1197 yield e

1198

1199 - def __str__(self): return 'Completer Rule'

1200

1201 -class ScannerRule(AbstractChartRule):

1202 """ 1203 A rule licensing a leaf edge corresponding to a part-of-speech 1204 terminal following an incomplete edge's dot. In particular, this 1205 rule specifies that: 1206 - [AS{->}S{alpha}*PS{beta}][i:j] 1207 licenses the edges: 1208 - [PS{->}w*][j:j+1] 1209 - [wS{->}*][j:j+1] 1210 if the C{j}th word in the text is C{w}; and C{P} is a valid part 1211 of speech for C{w}. 1212 """ 1213 NUM_EDGES = 1

1214 - def __init__(self, word_to_pos_lexicon):

1215 self._word_to_pos = word_to_pos_lexicon

1216

1217 - def apply_iter(self, chart, gramar, edge):

1218 if edge.is_complete() or edge.end()>=chart.num_leaves(): return 1219 index = edge.end() 1220 leaf = chart.leaf(index) 1221 if edge.next() in self._word_to_pos.get(leaf, []): 1222 new_leaf_edge = LeafEdge(leaf, index) 1223 if chart.insert(new_leaf_edge, ()): 1224 yield new_leaf_edge 1225 new_pos_edge = TreeEdge((index,index+1), edge.next(), 1226 [leaf], 1) 1227 if chart.insert(new_pos_edge, (new_leaf_edge,)): 1228 yield new_pos_edge

1229 1230 # Very similar to TopDownExpandRule, but be sure not to predict lexical edges. 1231 # (The ScannerRule takes care of those.)

1232 -class PredictorRule(CachedTopDownExpandRule):

1233 - def apply_iter(self, chart, grammar, edge):

1234 # If we've already applied this rule to an edge with the same 1235 # next & end, and the chart & grammar have not changed, then 1236 # just return (no new edges to add). 1237 done = self._done.get((edge.next(), edge.end()), (None,None)) 1238 if done[0] is chart and done[1] is grammar: return 1239 1240 # Add all the edges indicated by the top down expand rule. 1241 if edge.is_complete(): return 1242 for prod in grammar.productions(lhs=edge.next()): 1243 if len(prod.rhs()) == 1 and isinstance(prod.rhs()[0], str): continue 1244 new_edge = TreeEdge.from_production(prod, edge.end()) 1245 if chart.insert(new_edge, ()): 1246 yield new_edge 1247 1248 # Record the fact that we've applied this rule. 1249 self._done[edge.next(), edge.end()] = (chart, grammar)

1250

1251 - def __str__(self): return 'Predictor Rule'

1252

1253 -def makeEarleyStrategy(grammar):

1254 """Given a grammar with both grammatical and lexical productions, 1255 produce an Earley strategy that uses that lexicon.""" 1256 lexicon = cfg.earley_lexicon(grammar.productions()) 1257 strategy = [TopDownInitRule(), PredictorRule(), ScannerRule(lexicon), CompleterRule()] 1258 return strategy

1259 1260 1261 ######################################################################## 1262 ## Generic Chart Parser 1263 ######################################################################## 1264 1265 TD_STRATEGY = [CachedTopDownInitRule(), CachedTopDownExpandRule(), 1266 TopDownMatchRule(), SingleEdgeFundamentalRule()] 1267 BU_STRATEGY = [BottomUpInitRule(), BottomUpPredictRule(), 1268 SingleEdgeFundamentalRule()] 1269

1270 -class ChartParser(ParserI):

1271 """ 1272 A generic chart parser. A X{strategy}, or list of 1273 L{ChartRules<ChartRuleI>}, is used to decide what edges to add to 1274 the chart. In particular, C{ChartParser} uses the following 1275 algorithm to parse texts: 1276 1277 - Until no new edges are added: 1278 - For each I{rule} in I{strategy}: 1279 - Apply I{rule} to any applicable edges in the chart. 1280 - Return any complete parses in the chart 1281 """

1282 - def __init__(self, grammar, strategy=TD_STRATEGY, trace=0):

1283 """ 1284 Create a new chart parser, that uses C{grammar} to parse 1285 texts. 1286 1287 @type grammar: L{cfg.Grammar} 1288 @param grammar: The grammar used to parse texts. 1289 @type strategy: C{list} of L{ChartRuleI} 1290 @param strategy: A list of rules that should be used to decide 1291 what edges to add to the chart (top-down strategy by default). 1292 @type trace: C{int} 1293 @param trace: The level of tracing that should be used when 1294 parsing a text. C{0} will generate no tracing output; 1295 and higher numbers will produce more verbose tracing 1296 output. 1297 """ 1298 self._grammar = grammar 1299 self._strategy = strategy 1300 self._trace = trace

1301

1302 - def grammar(self):

1303 return self._grammar

1304

1305 - def nbest_parse(self, tokens, n=None, tree_class=Tree):

1306 tokens = list(tokens) 1307 self._grammar.check_coverage(tokens) 1308 chart = Chart(list(tokens)) 1309 grammar = self._grammar 1310 1311 # Width, for printing trace edges. 1312 w = 50/(chart.num_leaves()+1) 1313 if self._trace > 0: print chart.pp_leaves(w) 1314 1315 edges_added = 1 1316 while edges_added > 0: 1317 edges_added = 0 1318 for rule in self._strategy: 1319 edges_added_by_rule = 0 1320 for e in rule.apply_everywhere(chart, grammar): 1321 if self._trace > 0 and edges_added_by_rule == 0: 1322 print '%s:' % rule 1323 edges_added_by_rule += 1 1324 if self._trace > 1: print chart.pp_edge(e,w) 1325 if self._trace == 1 and edges_added_by_rule > 0: 1326 print ' - Added %d edges' % edges_added_by_rule 1327 edges_added += edges_added_by_rule 1328 1329 # Return a list of complete parses. 1330 return chart.parses(grammar.start(), tree_class=tree_class)[:n]

1331 1332 ######################################################################## 1333 ## Stepping Chart Parser 1334 ######################################################################## 1335

1336 -class SteppingChartParser(ChartParser):

1337 """ 1338 A C{ChartParser} that allows you to step through the parsing 1339 process, adding a single edge at a time. It also allows you to 1340 change the parser's strategy or grammar midway through parsing a 1341 text. 1342 1343 The C{initialize} method is used to start parsing a text. C{step} 1344 adds a single edge to the chart. C{set_strategy} changes the 1345 strategy used by the chart parser. C{parses} returns the set of 1346 parses that has been found by the chart parser. 1347 1348 @ivar _restart: Records whether the parser's strategy, grammar, 1349 or chart has been changed. If so, then L{step} must restart 1350 the parsing algorithm. 1351 """

1352 - def __init__(self, grammar, strategy=None, trace=0):

1353 self._chart = None 1354 self._current_chartrule = None 1355 self._restart = False 1356 ChartParser.__init__(self, grammar, strategy, trace)

1357 1358 #//////////////////////////////////////////////////////////// 1359 # Initialization 1360 #//////////////////////////////////////////////////////////// 1361

1362 - def initialize(self, tokens):

1363 "Begin parsing the given tokens." 1364 self._chart = Chart(list(tokens)) 1365 self._restart = True

1366 1367 #//////////////////////////////////////////////////////////// 1368 # Stepping 1369 #//////////////////////////////////////////////////////////// 1370

1371 - def step(self):

1372 """ 1373 @return: A generator that adds edges to the chart, one at a 1374 time. Each time the generator is resumed, it adds a single 1375 edge and yields that edge. If no more edges can be added, 1376 then it yields C{None}. 1377 1378 If the parser's strategy, grammar, or chart is changed, then 1379 the generator will continue adding edges using the new 1380 strategy, grammar, or chart. 1381 1382 Note that this generator never terminates, since the grammar 1383 or strategy might be changed to values that would add new 1384 edges. Instead, it yields C{None} when no more edges can be 1385 added with the current strategy and grammar. 1386 """ 1387 if self._chart is None: 1388 raise ValueError, 'Parser must be initialized first' 1389 while 1: 1390 self._restart = False 1391 w = 50/(self._chart.num_leaves()+1) 1392 1393 for e in self._parse(): 1394 if self._trace > 1: print self._current_chartrule 1395 if self._trace > 0: print self._chart.pp_edge(e,w) 1396 yield e 1397 if self._restart: break 1398 else: 1399 yield None # No more edges.

1400

1401 - def _parse(self):

1402 """ 1403 A generator that implements the actual parsing algorithm. 1404 L{step} iterates through this generator, and restarts it 1405 whenever the parser's strategy, grammar, or chart is modified. 1406 """ 1407 chart = self._chart 1408 grammar = self._grammar 1409 edges_added = 1 1410 while edges_added > 0: 1411 edges_added = 0 1412 for rule in self._strategy: 1413 self._current_chartrule = rule 1414 for e in rule.apply_everywhere_iter(chart, grammar): 1415 edges_added += 1 1416 yield e

1417 1418 #//////////////////////////////////////////////////////////// 1419 # Accessors 1420 #//////////////////////////////////////////////////////////// 1421

1422 - def strategy(self):

1423 "@return: The strategy used by this parser." 1424 return self._strategy

1425

1426 - def grammar(self):

1427 "@return: The grammar used by this parser." 1428 return self._grammar

1429

1430 - def chart(self):

1431 "@return: The chart that is used by this parser." 1432 return self._chart

1433

1434 - def current_chartrule(self):

1435 "@return: The chart rule used to generate the most recent edge." 1436 return self._current_chartrule

1437

1438 - def parses(self, tree_class=Tree):

1439 "@return: The parse trees currently contained in the chart." 1440 return self._chart.parses(self._grammar.start(), tree_class)

1441 1442 #//////////////////////////////////////////////////////////// 1443 # Parser modification 1444 #//////////////////////////////////////////////////////////// 1445

1446 - def set_strategy(self, strategy):

1447 """ 1448 Change the startegy that the parser uses to decide which edges 1449 to add to the chart. 1450 @type strategy: C{list} of L{ChartRuleI} 1451 @param strategy: A list of rules that should be used to decide 1452 what edges to add to the chart. 1453 """ 1454 if strategy == self._strategy: return 1455 self._strategy = strategy[:] # Make a copy. 1456 self._restart = True

1457

1458 - def set_grammar(self, grammar):

1459 "Change the grammar used by the parser." 1460 if grammar is self._grammar: return 1461 self._grammar = grammar 1462 self._restart = True

1463

1464 - def set_chart(self, chart):

1465 "Load a given chart into the chart parser." 1466 if chart is self._chart: return 1467 self._chart = chart 1468 self._restart = True

1469 1470 #//////////////////////////////////////////////////////////// 1471 # Standard parser methods 1472 #//////////////////////////////////////////////////////////// 1473

1474 - def nbest_parse(self, tokens, n=None, tree_class=Tree):

1475 tokens = list(tokens) 1476 self._grammar.check_coverage(tokens) 1477 1478 # Initialize ourselves. 1479 self.initialize(tokens) 1480 1481 # Step until no more edges are generated. 1482 for e in self.step(): 1483 if e is None: break 1484 1485 # Return a list of complete parses. 1486 return self.parses(tree_class=tree_class)[:n]

1487 1488 ######################################################################## 1489 ## Demo Code 1490 ######################################################################## 1491

1492 -def demo():

1493 """ 1494 A demonstration of the chart parsers. 1495 """ 1496 import sys, time 1497 1498 # Define some nonterminals 1499 S, VP, NP, PP = cfg.nonterminals('S, VP, NP, PP') 1500 V, N, P, Name, Det = cfg.nonterminals('V, N, P, Name, Det') 1501 1502 # Define some grammatical productions. 1503 grammatical_productions = [ 1504 cfg.Production(S, [NP, VP]), cfg.Production(PP, [P, NP]), 1505 cfg.Production(NP, [Det, N]), cfg.Production(NP, [NP, PP]), 1506 cfg.Production(VP, [VP, PP]), cfg.Production(VP, [V, NP]), 1507 cfg.Production(VP, [V]),] 1508 1509 # Define some lexical productions. 1510 lexical_productions = [ 1511 cfg.Production(NP, ['John']), cfg.Production(NP, ['I']), 1512 cfg.Production(Det, ['the']), cfg.Production(Det, ['my']), 1513 cfg.Production(Det, ['a']), 1514 cfg.Production(N, ['dog']), cfg.Production(N, ['cookie']), 1515 cfg.Production(V, ['ate']), cfg.Production(V, ['saw']), 1516 cfg.Production(P, ['with']), cfg.Production(P, ['under']), 1517 ] 1518 1519 # The grammar for ChartParser and SteppingChartParser: 1520 grammar = cfg.Grammar(S, grammatical_productions+lexical_productions) 1521 1522 # Tokenize a sample sentence. 1523 sent = 'I saw John with a dog with my cookie' 1524 print "Sentence:\n", sent 1525 tokens = sent.split() 1526 1527 print tokens 1528 1529 # Ask the user which parser to test 1530 print ' 1: Top-down chart parser' 1531 print ' 2: Bottom-up chart parser' 1532 print ' 3: Earley parser' 1533 print ' 4: Stepping chart parser (alternating top-down & bottom-up)' 1534 print ' 5: All parsers' 1535 print '\nWhich parser (1-5)? ', 1536 choice = sys.stdin.readline().strip() 1537 print 1538 if choice not in '12345': 1539 print 'Bad parser number' 1540 return 1541 1542 # Keep track of how long each parser takes. 1543 times = {} 1544 1545 # Run the top-down parser, if requested. 1546 if choice in ('1', '5'): 1547 cp = ChartParser(grammar, TD_STRATEGY, trace=2) 1548 t = time.time() 1549 parses = cp.nbest_parse(tokens) 1550 times['top down'] = time.time()-t 1551 assert len(parses)==5, 'Not all parses found' 1552 for tree in parses: print tree 1553 1554 # Run the bottom-up parser, if requested. 1555 if choice in ('2', '5'): 1556 cp = ChartParser(grammar, BU_STRATEGY, trace=2) 1557 t = time.time() 1558 parses = cp.nbest_parse(tokens) 1559 times['bottom up'] = time.time()-t 1560 assert len(parses)==5, 'Not all parses found' 1561 for tree in parses: print tree 1562 1563 # Run the earley, if requested. 1564 if choice in ('3', '5'): 1565 cp = ChartParser(grammar, makeEarleyStrategy(grammar), trace=2) 1566 t = time.time() 1567 parses = cp.nbest_parse(tokens) 1568 times['Earley'] = time.time()-t 1569 assert len(parses)==5, 'Not all parses found' 1570 for tree in parses: print tree 1571 1572 # Run the stepping parser, if requested. 1573 if choice in ('4', '5'): 1574 t = time.time() 1575 cp = SteppingChartParser(grammar, trace=1) 1576 cp.initialize(tokens) 1577 for i in range(5): 1578 print '*** SWITCH TO TOP DOWN' 1579 cp.set_strategy(TD_STRATEGY) 1580 for j, e in enumerate(cp.step()): 1581 if j>20 or e is None: break 1582 print '*** SWITCH TO BOTTOM UP' 1583 cp.set_strategy(BU_STRATEGY) 1584 for j, e in enumerate(cp.step()): 1585 if j>20 or e is None: break 1586 times['stepping'] = time.time()-t 1587 assert len(cp.parses())==5, 'Not all parses found' 1588 for parse in cp.parses(): print parse 1589 1590 # Print the times of all parsers: 1591 if not times: return 1592 maxlen = max(len(key) for key in times.keys()) 1593 format = '%' + `maxlen` + 's parser: %6.3fsec' 1594 times_items = times.items() 1595 times_items.sort(lambda a,b:cmp(a[1], b[1])) 1596 for (parser, t) in times_items: 1597 print format % (parser, t)

1598 1599 if __name__ == '__main__': demo() 1600

Source Code for Module nltk.parse.chart