1
2
3
4
5
6
7
8
9
10
11
12 from nltk import Tree, cfg, defaultdict
13
14 from api import *
15
16 """
17 Data classes and parser implementations for \"chart parsers\", which
18 use dynamic programming to efficiently parse a text. A X{chart
19 parser} derives parse trees for a text by iteratively adding \"edges\"
20 to a \"chart.\" Each X{edge} represents a hypothesis about the tree
21 structure for a subsequence of the text. The X{chart} is a
22 \"blackboard\" for composing and combining these hypotheses.
23
24 When a chart parser begins parsing a text, it creates a new (empty)
25 chart, spanning the text. It then incrementally adds new edges to the
26 chart. A set of X{chart rules} specifies the conditions under which
27 new edges should be added to the chart. Once the chart reaches a
28 stage where none of the chart rules adds any new edges, parsing is
29 complete.
30
31 Charts are encoded with the L{Chart} class, and edges are encoded with
32 the L{TreeEdge} and L{LeafEdge} classes. The chart parser module
33 defines three chart parsers:
34
35 - C{ChartParser} is a simple and flexible chart parser. Given a
36 set of chart rules, it will apply those rules to the chart until
37 no more edges are added.
38
39 - C{SteppingChartParser} is a subclass of C{ChartParser} that can
40 be used to step through the parsing process.
41 """
42
43 import re
44
45
46
47
48
50 """
51 A hypothesis about the structure of part of a sentence.
52 Each edge records the fact that a structure is (partially)
53 consistent with the sentence. An edge contains:
54
55 - A X{span}, indicating what part of the sentence is
56 consistent with the hypothesized structure.
57
58 - A X{left-hand side}, specifying what kind of structure is
59 hypothesized.
60
61 - A X{right-hand side}, specifying the contents of the
62 hypothesized structure.
63
64 - A X{dot position}, indicating how much of the hypothesized
65 structure is consistent with the sentence.
66
67 Every edge is either X{complete} or X{incomplete}:
68
69 - An edge is X{complete} if its structure is fully consistent
70 with the sentence.
71
72 - An edge is X{incomplete} if its structure is partially
73 consistent with the sentence. For every incomplete edge, the
74 span specifies a possible prefix for the edge's structure.
75
76 There are two kinds of edge:
77
78 - C{TreeEdges<TreeEdge>} record which trees have been found to
79 be (partially) consistent with the text.
80
81 - C{LeafEdges<leafEdge>} record the tokens occur in the text.
82
83 The C{EdgeI} interface provides a common interface to both types
84 of edge, allowing chart parsers to treat them in a uniform manner.
85 """
87 if self.__class__ == EdgeI:
88 raise TypeError('Edge is an abstract interface')
89
90
91
92
93
95 """
96 @return: A tuple C{(s,e)}, where C{subtokens[s:e]} is the
97 portion of the sentence that is consistent with this
98 edge's structure.
99 @rtype: C{(int, int)}
100 """
101 raise AssertionError('EdgeI is an abstract interface')
102
104 """
105 @return: The start index of this edge's span.
106 @rtype: C{int}
107 """
108 raise AssertionError('EdgeI is an abstract interface')
109
111 """
112 @return: The end index of this edge's span.
113 @rtype: C{int}
114 """
115 raise AssertionError('EdgeI is an abstract interface')
116
118 """
119 @return: The length of this edge's span.
120 @rtype: C{int}
121 """
122 raise AssertionError('EdgeI is an abstract interface')
123
124
125
126
127
129 """
130 @return: This edge's left-hand side, which specifies what kind
131 of structure is hypothesized by this edge.
132 @see: L{TreeEdge} and L{LeafEdge} for a description of
133 the left-hand side values for each edge type.
134 """
135 raise AssertionError('EdgeI is an abstract interface')
136
137
138
139
140
142 """
143 @return: This edge's right-hand side, which specifies
144 the content of the structure hypothesized by this
145 edge.
146 @see: L{TreeEdge} and L{LeafEdge} for a description of
147 the right-hand side values for each edge type.
148 """
149 raise AssertionError('EdgeI is an abstract interface')
150
152 """
153 @return: This edge's dot position, which indicates how much of
154 the hypothesized structure is consistent with the
155 sentence. In particular, C{self.rhs[:dot]} is consistent
156 with C{subtoks[self.start():self.end()]}.
157 @rtype: C{int}
158 """
159 raise AssertionError('EdgeI is an abstract interface')
160
162 """
163 @return: The element of this edge's right-hand side that
164 immediately follows its dot.
165 @rtype: C{Nonterminal} or X{terminal} or C{None}
166 """
167 raise AssertionError('EdgeI is an abstract interface')
168
170 """
171 @return: True if this edge's structure is fully consistent
172 with the text.
173 @rtype: C{boolean}
174 """
175 raise AssertionError('EdgeI is an abstract interface')
176
178 """
179 @return: True if this edge's structure is partially consistent
180 with the text.
181 @rtype: C{boolean}
182 """
183 raise AssertionError('EdgeI is an abstract interface')
184
185
186
187
189 raise AssertionError('EdgeI is an abstract interface')
190
192 raise AssertionError('EdgeI is an abstract interface')
193
195 """
196 An edge that records the fact that a tree is (partially)
197 consistent with the sentence. A tree edge consists of:
198
199 - A X{span}, indicating what part of the sentence is
200 consistent with the hypothesized tree.
201
202 - A X{left-hand side}, specifying the hypothesized tree's node
203 value.
204
205 - A X{right-hand side}, specifying the hypothesized tree's
206 children. Each element of the right-hand side is either a
207 terminal, specifying a token with that terminal as its leaf
208 value; or a nonterminal, specifying a subtree with that
209 nonterminal's symbol as its node value.
210
211 - A X{dot position}, indicating which children are consistent
212 with part of the sentence. In particular, if C{dot} is the
213 dot position, C{rhs} is the right-hand size, C{(start,end)}
214 is the span, and C{sentence} is the list of subtokens in the
215 sentence, then C{subtokens[start:end]} can be spanned by the
216 children specified by C{rhs[:dot]}.
217
218 For more information about edges, see the L{EdgeI} interface.
219 """
220 - def __init__(self, span, lhs, rhs, dot=0):
221 """
222 Construct a new C{TreeEdge}.
223
224 @type span: C{(int, int)}
225 @param span: A tuple C{(s,e)}, where C{subtokens[s:e]} is the
226 portion of the sentence that is consistent with the new
227 edge's structure.
228 @type lhs: L{Nonterminal}
229 @param lhs: The new edge's left-hand side, specifying the
230 hypothesized tree's node value.
231 @type rhs: C{list} of (L{Nonterminal} and C{string})
232 @param rhs: The new edge's right-hand side, specifying the
233 hypothesized tree's children.
234 @type dot: C{int}
235 @param dot: The position of the new edge's dot. This position
236 specifies what prefix of the production's right hand side
237 is consistent with the text. In particular, if
238 C{sentence} is the list of subtokens in the sentence, then
239 C{subtokens[span[0]:span[1]]} can be spanned by the
240 children specified by C{rhs[:dot]}.
241 """
242 self._lhs = lhs
243 self._rhs = tuple(rhs)
244 self._span = span
245 self._dot = dot
246
247
249 """
250 @return: A new C{TreeEdge} formed from the given production.
251 The new edge's left-hand side and right-hand side will
252 be taken from C{production}; its span will be C{(index,
253 index)}; and its dot position will be C{0}.
254 @rtype: L{TreeEdge}
255 """
256 return TreeEdge(span=(index, index), lhs=production.lhs(),
257 rhs=production.rhs(), dot=0)
258 from_production = staticmethod(from_production)
259
260
261 - def lhs(self): return self._lhs
262 - def span(self): return self._span
263 - def start(self): return self._span[0]
264 - def end(self): return self._span[1]
265 - def length(self): return self._span[1] - self._span[0]
266 - def rhs(self): return self._rhs
267 - def dot(self): return self._dot
268 - def is_complete(self): return self._dot == len(self._rhs)
271 if self._dot >= len(self._rhs): return None
272 else: return self._rhs[self._dot]
273
274
276 if self.__class__ != other.__class__: return -1
277 return cmp((self._span, self.lhs(), self.rhs(), self._dot),
278 (other._span, other.lhs(), other.rhs(), other._dot))
280 return hash((self.lhs(), self.rhs(), self._span, self._dot))
281
282
284 str = '[%s:%s] ' % (self._span[0], self._span[1])
285 if isinstance(self._lhs.symbol(), basestring):
286 str += '%-2s ->' % (self._lhs,)
287 else:
288 str += '%-2r ->' % (self._lhs,)
289
290 for i in range(len(self._rhs)):
291 if i == self._dot: str += ' *'
292 if (isinstance(self._rhs[i], cfg.Nonterminal) and
293 isinstance(self._rhs[i].symbol(), basestring)):
294 str += ' %s' % (self._rhs[i],)
295 else:
296 str += ' %r' % (self._rhs[i],)
297 if len(self._rhs) == self._dot: str += ' *'
298 return str
299
301 return '[Edge: %s]' % self
302
304 """
305 An edge that records the fact that a leaf value is consistent with
306 a word in the sentence. A leaf edge consists of:
307
308 - An X{index}, indicating the position of the word.
309 - A X{leaf}, specifying the word's content.
310
311 A leaf edge's left-hand side is its leaf value, and its right hand
312 side is C{()}. Its span is C{[index, index+1]}, and its dot
313 position is C{0}.
314 """
316 """
317 Construct a new C{LeafEdge}.
318
319 @param leaf: The new edge's leaf value, specifying the word
320 that is recorded by this edge.
321 @param index: The new edge's index, specifying the position of
322 the word that is recorded by this edge.
323 """
324 self._leaf = leaf
325 self._index = index
326
327
328 - def lhs(self): return self._leaf
333 - def rhs(self): return ()
334 - def dot(self): return 0
337 - def next(self): return None
338
339
341 if not isinstance(other, LeafEdge): return -1
342 return cmp((self._index, self._leaf), (other._index, other._leaf))
344 return hash((self._index, self._leaf))
345
346
349 return '[Edge: %s]' % (self)
350
351
352
353
354
356 """
357 A blackboard for hypotheses about the syntactic constituents of a
358 sentence. A chart contains a set of edges, and each edge encodes
359 a single hypothesis about the structure of some portion of the
360 sentence.
361
362 The L{select} method can be used to select a specific collection
363 of edges. For example C{chart.select(is_complete=True, start=0)}
364 yields all complete edges whose start indices are 0. To ensure
365 the efficiency of these selection operations, C{Chart} dynamically
366 creates and maintains an index for each set of attributes that
367 have been selected on.
368
369 In order to reconstruct the trees that are represented by an edge,
370 the chart associates each edge with a set of child pointer lists.
371 A X{child pointer list} is a list of the edges that license an
372 edge's right-hand side.
373
374 @ivar _tokens: The sentence that the chart covers.
375 @ivar _num_leaves: The number of tokens.
376 @ivar _edges: A list of the edges in the chart
377 @ivar _edge_to_cpls: A dictionary mapping each edge to a set
378 of child pointer lists that are associated with that edge.
379 @ivar _indexes: A dictionary mapping tuples of edge attributes
380 to indices, where each index maps the corresponding edge
381 attribute values to lists of edges.
382 """
384 """
385 Construct a new empty chart.
386
387 @type tokens: L{list}
388 @param tokens: The sentence that this chart will be used to parse.
389 """
390
391 self._tokens = list(tokens)
392 self._num_leaves = len(self._tokens)
393
394
395 self._edges = []
396
397
398 self._edge_to_cpls = {}
399
400
401
402 self._indexes = {}
403
404
405
406
407
409 """
410 @return: The number of words in this chart's sentence.
411 @rtype: C{int}
412 """
413 return self._num_leaves
414
415 - def leaf(self, index):
416 """
417 @return: The leaf value of the word at the given index.
418 @rtype: C{string}
419 """
420 return self._tokens[index]
421
423 """
424 @return: A list of the leaf values of each word in the
425 chart's sentence.
426 @rtype: C{list} of C{string}
427 """
428 return self._tokens
429
430
431
432
433
435 """
436 @return: A list of all edges in this chart. New edges
437 that are added to the chart after the call to edges()
438 will I{not} be contained in this list.
439 @rtype: C{list} of L{EdgeI}
440 @see: L{iteredges}, L{select}
441 """
442 return self._edges[:]
443
445 """
446 @return: An iterator over the edges in this chart. Any
447 new edges that are added to the chart before the iterator
448 is exahusted will also be generated.
449 @rtype: C{iter} of L{EdgeI}
450 @see: L{edges}, L{select}
451 """
452 return iter(self._edges)
453
454
455 __iter__ = iteredges
456
458 """
459 @return: The number of edges contained in this chart.
460 @rtype: C{int}
461 """
462 return len(self._edge_to_cpls)
463
464 - def select(self, **restrictions):
465 """
466 @return: An iterator over the edges in this chart. Any
467 new edges that are added to the chart before the iterator
468 is exahusted will also be generated. C{restrictions}
469 can be used to restrict the set of edges that will be
470 generated.
471 @rtype: C{iter} of L{EdgeI}
472
473 @kwarg span: Only generate edges C{e} where C{e.span()==span}
474 @kwarg start: Only generate edges C{e} where C{e.start()==start}
475 @kwarg end: Only generate edges C{e} where C{e.end()==end}
476 @kwarg length: Only generate edges C{e} where C{e.length()==length}
477 @kwarg lhs: Only generate edges C{e} where C{e.lhs()==lhs}
478 @kwarg rhs: Only generate edges C{e} where C{e.rhs()==rhs}
479 @kwarg next: Only generate edges C{e} where C{e.next()==next}
480 @kwarg dot: Only generate edges C{e} where C{e.dot()==dot}
481 @kwarg is_complete: Only generate edges C{e} where
482 C{e.is_complete()==is_complete}
483 @kwarg is_incomplete: Only generate edges C{e} where
484 C{e.is_incomplete()==is_incomplete}
485 """
486
487 if restrictions=={}: return iter(self._edges)
488
489
490 restr_keys = restrictions.keys()
491 restr_keys.sort()
492 restr_keys = tuple(restr_keys)
493
494
495 if restr_keys not in self._indexes:
496 self._add_index(restr_keys)
497
498 vals = [restrictions[k] for k in restr_keys]
499 return iter(self._indexes[restr_keys].get(tuple(vals), []))
500
502 """
503 A helper function for L{select}, which creates a new index for
504 a given set of attributes (aka restriction keys).
505 """
506
507 for k in restr_keys:
508 if not hasattr(EdgeI, k):
509 raise ValueError, 'Bad restriction: %s' % k
510
511
512 self._indexes[restr_keys] = {}
513
514
515 for edge in self._edges:
516 vals = [getattr(edge, k)() for k in restr_keys]
517 index = self._indexes[restr_keys]
518 index.setdefault(tuple(vals),[]).append(edge)
519
520
521
522
523
524 - def insert(self, edge, child_pointer_list):
525 """
526 Add a new edge to the chart.
527
528 @type edge: L{EdgeI}
529 @param edge: The new edge
530 @type child_pointer_list: C{tuple} of L{EdgeI}
531 @param child_pointer_list: A list of the edges that were used to
532 form this edge. This list is used to reconstruct the trees
533 (or partial trees) that are associated with C{edge}.
534 @rtype: C{bool}
535 @return: True if this operation modified the chart. In
536 particular, return true iff the chart did not already
537 contain C{edge}, or if it did not already associate
538 C{child_pointer_list} with C{edge}.
539 """
540
541 if edge not in self._edge_to_cpls:
542
543 self._edges.append(edge)
544
545
546 for (restr_keys, index) in self._indexes.items():
547 vals = [getattr(edge, k)() for k in restr_keys]
548 index = self._indexes[restr_keys]
549 index.setdefault(tuple(vals),[]).append(edge)
550
551
552 cpls = self._edge_to_cpls.setdefault(edge,{})
553 child_pointer_list = tuple(child_pointer_list)
554
555 if child_pointer_list in cpls:
556
557 return False
558 else:
559
560 cpls[child_pointer_list] = True
561 return True
562
563
564
565
566
568 """
569 @return: A list of the complete tree structures that span
570 the entire chart, and whose root node is C{root}.
571 """
572 trees = []
573 for edge in self.select(span=(0,self._num_leaves), lhs=root):
574 trees += self.trees(edge, tree_class=tree_class, complete=True)
575 return trees
576
577 - def trees(self, edge, tree_class=Tree, complete=False):
578 """
579 @return: A list of the tree structures that are associated
580 with C{edge}.
581
582 If C{edge} is incomplete, then the unexpanded children will be
583 encoded as childless subtrees, whose node value is the
584 corresponding terminal or nonterminal.
585
586 @rtype: C{list} of L{Tree}
587 @note: If two trees share a common subtree, then the same
588 C{Tree} may be used to encode that subtree in
589 both trees. If you need to eliminate this subtree
590 sharing, then create a deep copy of each tree.
591 """
592 return self._trees(edge, complete, memo={}, tree_class=tree_class)
593
594 - def _trees(self, edge, complete, memo, tree_class):
595 """
596 A helper function for L{trees}.
597 @param memo: A dictionary used to record the trees that we've
598 generated for each edge, so that when we see an edge more
599 than once, we can reuse the same trees.
600 """
601
602 if edge in memo:
603 return memo[edge]
604
605 trees = []
606
607
608 if complete and edge.is_incomplete():
609 return trees
610
611
612
613
614
615
616
617 memo[edge] = []
618
619
620 if isinstance(edge, LeafEdge):
621 leaf = self._tokens[edge.start()]
622 memo[edge] = leaf
623 return [leaf]
624
625
626 for cpl in self.child_pointer_lists(edge):
627
628
629
630 child_choices = [self._trees(cp, complete, memo, tree_class)
631 for cp in cpl]
632
633
634 if len(child_choices) > 0 and type(child_choices[0]) == type(""):
635 child_choices = [child_choices]
636
637
638 for children in self._choose_children(child_choices):
639 lhs = edge.lhs().symbol()
640 trees.append(tree_class(lhs, children))
641
642
643 if edge.is_incomplete():
644 unexpanded = [tree_class(elt,[])
645 for elt in edge.rhs()[edge.dot():]]
646 for tree in trees:
647 tree.extend(unexpanded)
648
649
650 memo[edge] = trees
651
652
653 return trees
654
656 """
657 A helper function for L{_trees} that finds the possible sets
658 of subtrees for a new tree.
659
660 @param child_choices: A list that specifies the options for
661 each child. In particular, C{child_choices[i]} is a list of
662 tokens and subtrees that can be used as the C{i}th child.
663 """
664 children_lists = [[]]
665 for child_choice in child_choices:
666 children_lists = [child_list+[child]
667 for child in child_choice
668 for child_list in children_lists]
669 return children_lists
670
672 """
673 @rtype: C{list} of C{list} of C{EdgeI}
674 @return: The set of child pointer lists for the given edge.
675 Each child pointer list is a list of edges that have
676 been used to form this edge.
677 """
678
679 return self._edge_to_cpls.get(edge, {}).keys()
680
681
682
683
684 - def pp_edge(self, edge, width=None):
713
715 """
716 @return: A pretty-printed string representation of this
717 chart's leaves. This string can be used as a header
718 for calls to L{pp_edge}.
719 """
720 if width is None: width = 50/(self.num_leaves()+1)
721
722 if self._tokens is not None and width>1:
723 header = '|.'
724 for tok in self._tokens:
725 header += tok[:width-1].center(width-1)+'.'
726 header += '|'
727 else:
728 header = ''
729
730 return header
731
732 - def pp(self, width=None):
748
749
750
751
752
754
755 s = 'digraph nltk_chart {\n'
756
757 s += ' rankdir=LR;\n'
758 s += ' node [height=0.1,width=0.1];\n'
759 s += ' node [style=filled, color="lightgray"];\n'
760
761
762 for y in range(self.num_edges(), -1, -1):
763 if y == 0:
764 s += ' node [style=filled, color="black"];\n'
765 for x in range(self.num_leaves()+1):
766 if y == 0 or (x <= self._edges[y-1].start() or
767 x >= self._edges[y-1].end()):
768 s += ' %04d.%04d [label=""];\n' % (x,y)
769
770
771 s += ' x [style=invis]; x->0000.0000 [style=invis];\n'
772
773
774 for x in range(self.num_leaves()+1):
775 s += ' {rank=same;'
776 for y in range(self.num_edges()+1):
777 if y == 0 or (x <= self._edges[y-1].start() or
778 x >= self._edges[y-1].end()):
779 s += ' %04d.%04d' % (x,y)
780 s += '}\n'
781
782
783 s += ' edge [style=invis, weight=100];\n'
784 s += ' node [shape=plaintext]\n'
785 s += ' 0000.0000'
786 for x in range(self.num_leaves()):
787 s += '->%s->%04d.0000' % (self.leaf(x), x+1)
788 s += ';\n\n'
789
790
791 s += ' edge [style=solid, weight=1];\n'
792 for y, edge in enumerate(self):
793 for x in range(edge.start()):
794 s += (' %04d.%04d -> %04d.%04d [style="invis"];\n' %
795 (x, y+1, x+1, y+1))
796 s += (' %04d.%04d -> %04d.%04d [label="%s"];\n' %
797 (edge.start(), y+1, edge.end(), y+1, edge))
798 for x in range(edge.end(), self.num_leaves()):
799 s += (' %04d.%04d -> %04d.%04d [style="invis"];\n' %
800 (x, y+1, x+1, y+1))
801 s += '}\n'
802 return s
803
804
805
806
807
809 """
810 A rule that specifies what new edges are licensed by any given set
811 of existing edges. Each chart rule expects a fixed number of
812 edges, as indicated by the class variable L{NUM_EDGES}. In
813 particular:
814
815 - A chart rule with C{NUM_EDGES=0} specifies what new edges are
816 licensed, regardless of existing edges.
817
818 - A chart rule with C{NUM_EDGES=1} specifies what new edges are
819 licensed by a single existing edge.
820
821 - A chart rule with C{NUM_EDGES=2} specifies what new edges are
822 licensed by a pair of existing edges.
823
824 @type NUM_EDGES: C{int}
825 @cvar NUM_EDGES: The number of existing edges that this rule uses
826 to license new edges. Typically, this number ranges from zero
827 to two.
828 """
829 - def apply(self, chart, grammar, *edges):
830 """
831 Add the edges licensed by this rule and the given edges to the
832 chart.
833
834 @type edges: C{list} of L{EdgeI}
835 @param edges: A set of existing edges. The number of edges
836 that should be passed to C{apply} is specified by the
837 L{NUM_EDGES} class variable.
838 @rtype: C{list} of L{EdgeI}
839 @return: A list of the edges that were added.
840 """
841 raise AssertionError, 'ChartRuleI is an abstract interface'
842
844 """
845 @return: A generator that will add edges licensed by this rule
846 and the given edges to the chart, one at a time. Each
847 time the generator is resumed, it will either add a new
848 edge and yield that edge; or return.
849 @rtype: C{iter} of L{EdgeI}
850
851 @type edges: C{list} of L{EdgeI}
852 @param edges: A set of existing edges. The number of edges
853 that should be passed to C{apply} is specified by the
854 L{NUM_EDGES} class variable.
855 """
856 raise AssertionError, 'ChartRuleI is an abstract interface'
857
859 """
860 Add all the edges licensed by this rule and the edges in the
861 chart to the chart.
862
863 @rtype: C{list} of L{EdgeI}
864 @return: A list of the edges that were added.
865 """
866 raise AssertionError, 'ChartRuleI is an abstract interface'
867
869 """
870 @return: A generator that will add all edges licensed by
871 this rule, given the edges that are currently in the
872 chart, one at a time. Each time the generator is resumed,
873 it will either add a new edge and yield that edge; or
874 return.
875 @rtype: C{iter} of L{EdgeI}
876 """
877 raise AssertionError, 'ChartRuleI is an abstract interface'
878
880 """
881 An abstract base class for chart rules. C{AbstractChartRule}
882 provides:
883 - A default implementation for C{apply}, based on C{apply_iter}.
884 - A default implementation for C{apply_everywhere_iter},
885 based on C{apply_iter}.
886 - A default implementation for C{apply_everywhere}, based on
887 C{apply_everywhere_iter}. Currently, this implementation
888 assumes that C{NUM_EDGES}<=3.
889 - A default implementation for C{__str__}, which returns a
890 name basd on the rule's class name.
891 """
892
893
895 raise AssertionError, 'AbstractChartRule is an abstract class'
896
897
898
900 if self.NUM_EDGES == 0:
901 for new_edge in self.apply_iter(chart, grammar):
902 yield new_edge
903
904 elif self.NUM_EDGES == 1:
905 for e1 in chart:
906 for new_edge in self.apply_iter(chart, grammar, e1):
907 yield new_edge
908
909 elif self.NUM_EDGES == 2:
910 for e1 in chart:
911 for e2 in chart:
912 for new_edge in self.apply_iter(chart, grammar, e1, e2):
913 yield new_edge
914
915 elif self.NUM_EDGES == 3:
916 for e1 in chart:
917 for e2 in chart:
918 for e3 in chart:
919 for new_edge in self.apply_iter(chart,grammar,e1,e2,e3):
920 yield new_edge
921
922 else:
923 raise AssertionError, 'NUM_EDGES>3 is not currently supported'
924
925
926 - def apply(self, chart, grammar, *edges):
928
929
932
933
935
936 return re.sub('([a-z])([A-Z])', r'\1 \2', self.__class__.__name__)
937
938
939
940
942 """
943 A rule that joins two adjacent edges to form a single combined
944 edge. In particular, this rule specifies that any pair of edges:
945
946 - [AS{->}S{alpha}*BS{beta}][i:j]
947 - [BS{->}S{gamma}*][j:k]
948 licenses the edge:
949 - [AS{->}S{alpha}B*S{beta}][i:j]
950 """
951 NUM_EDGES = 2
952 - def apply_iter(self, chart, grammar, left_edge, right_edge):
972
974 """
975 A rule that joins a given edge with adjacent edges in the chart,
976 to form combined edges. In particular, this rule specifies that
977 either of the edges:
978 - [AS{->}S{alpha}*BS{beta}][i:j]
979 - [BS{->}S{gamma}*][j:k]
980 licenses the edge:
981 - [AS{->}S{alpha}B*S{beta}][i:j]
982 if the other edge is already in the chart.
983 @note: This is basically L{FundamentalRule}, with one edge is left
984 unspecified.
985 """
986 NUM_EDGES = 1
987
988 _fundamental_rule = FundamentalRule()
989
991 fr = self._fundamental_rule
992 if edge1.is_incomplete():
993
994 for edge2 in chart.select(start=edge1.end(), is_complete=True,
995 lhs=edge1.next()):
996 for new_edge in fr.apply_iter(chart, grammar, edge1, edge2):
997 yield new_edge
998 else:
999
1000 for edge2 in chart.select(end=edge1.start(), is_complete=False,
1001 next=edge1.lhs()):
1002 for new_edge in fr.apply_iter(chart, grammar, edge2, edge1):
1003 yield new_edge
1004
1005 - def __str__(self): return 'Fundamental Rule'
1006
1008 """
1009 A rule licensing any edges corresponding to terminals in the
1010 text. In particular, this rule licenses the leaf edge:
1011 - [wS{->}*][i:i+1]
1012 for C{w} is a word in the text, where C{i} is C{w}'s index.
1013 """
1014 NUM_EDGES = 0
1020
1021
1022
1023
1024
1025
1027 """
1028 A rule licensing edges corresponding to the grammar productions for
1029 the grammar's start symbol. In particular, this rule specifies that:
1030 - [SS{->}*S{alpha}][0:i]
1031 is licensed for each grammar production C{SS{->}S{alpha}}, where
1032 C{S} is the grammar's start symbol.
1033 """
1034 NUM_EDGES = 0
1040
1042 """
1043 A rule licensing edges corresponding to the grammar productions
1044 for the nonterminal following an incomplete edge's dot. In
1045 particular, this rule specifies that:
1046 - [AS{->}S{alpha}*BS{beta}][i:j]
1047 licenses the edge:
1048 - [BS{->}*S{gamma}][j:j]
1049 for each grammar production C{BS{->}S{gamma}}.
1050 """
1051 NUM_EDGES = 1
1058
1060 """
1061 A rule licensing an edge corresponding to a terminal following an
1062 incomplete edge's dot. In particular, this rule specifies that:
1063 - [AS{->}S{alpha}*w{beta}][i:j]
1064 licenses the leaf edge:
1065 - [wS{->}*][j:j+1]
1066 if the C{j}th word in the text is C{w}.
1067 """
1068 NUM_EDGES = 1
1077
1078
1080 """
1081 A cached version of L{TopDownInitRule}. After the first time this
1082 rule is applied, it will not generate any more edges.
1083
1084 If C{chart} or C{grammar} are changed, then the cache is flushed.
1085 """
1089
1101
1102 - def __str__(self): return 'Top Down Init Rule'
1103
1105 """
1106 A cached version of L{TopDownExpandRule}. After the first time
1107 this rule is applied to an edge with a given C{end} and C{next},
1108 it will not generate any more edges for edges with that C{end} and
1109 C{next}.
1110
1111 If C{chart} or C{grammar} are changed, then the cache is flushed.
1112 """
1116
1130
1131 - def __str__(self): return 'Top Down Expand Rule'
1132
1133
1134
1135
1136
1138 """
1139 A rule licensing any edges corresponding to terminals in the
1140 text. In particular, this rule licenses the leaf edge:
1141 - [wS{->}*][i:i+1]
1142 for C{w} is a word in the text, where C{i} is C{w}'s index.
1143 """
1144 NUM_EDGES = 0
1150
1152 """
1153 A rule licensing any edge corresponding to a production whose
1154 right-hand side begins with a complete edge's left-hand side. In
1155 particular, this rule specifies that:
1156 - [AS{->}S{alpha}*]
1157 licenses the edge:
1158 - [BS{->}*AS{beta}]
1159 for each grammar production C{BS{->}AS{beta}}
1160 """
1161 NUM_EDGES = 1
1168
1169
1170
1171
1172
1174 """
1175 A rule that joins a given complete edge with adjacent incomplete
1176 edges in the chart, to form combined edges. In particular, this
1177 rule specifies that:
1178 - [BS{->}S{gamma}*][j:k]
1179 licenses the edge:
1180 - [AS{->}S{alpha}B*S{beta}][i:j]
1181 given that the chart contains:
1182 - [AS{->}S{alpha}*BS{beta}][i:j]
1183 @note: This is basically L{FundamentalRule}, with the left edge
1184 left unspecified.
1185 """
1186 NUM_EDGES = 1
1187
1188 _fundamental_rule = FundamentalRule()
1189
1190 - def apply_iter(self, chart, grammar, right_edge):
1198
1199 - def __str__(self): return 'Completer Rule'
1200
1202 """
1203 A rule licensing a leaf edge corresponding to a part-of-speech
1204 terminal following an incomplete edge's dot. In particular, this
1205 rule specifies that:
1206 - [AS{->}S{alpha}*PS{beta}][i:j]
1207 licenses the edges:
1208 - [PS{->}w*][j:j+1]
1209 - [wS{->}*][j:j+1]
1210 if the C{j}th word in the text is C{w}; and C{P} is a valid part
1211 of speech for C{w}.
1212 """
1213 NUM_EDGES = 1
1214 - def __init__(self, word_to_pos_lexicon):
1215 self._word_to_pos = word_to_pos_lexicon
1216
1229
1230
1231
1250
1251 - def __str__(self): return 'Predictor Rule'
1252
1259
1260
1261
1262
1263
1264
1265 TD_STRATEGY = [CachedTopDownInitRule(), CachedTopDownExpandRule(),
1266 TopDownMatchRule(), SingleEdgeFundamentalRule()]
1267 BU_STRATEGY = [BottomUpInitRule(), BottomUpPredictRule(),
1268 SingleEdgeFundamentalRule()]
1269
1271 """
1272 A generic chart parser. A X{strategy}, or list of
1273 L{ChartRules<ChartRuleI>}, is used to decide what edges to add to
1274 the chart. In particular, C{ChartParser} uses the following
1275 algorithm to parse texts:
1276
1277 - Until no new edges are added:
1278 - For each I{rule} in I{strategy}:
1279 - Apply I{rule} to any applicable edges in the chart.
1280 - Return any complete parses in the chart
1281 """
1283 """
1284 Create a new chart parser, that uses C{grammar} to parse
1285 texts.
1286
1287 @type grammar: L{cfg.Grammar}
1288 @param grammar: The grammar used to parse texts.
1289 @type strategy: C{list} of L{ChartRuleI}
1290 @param strategy: A list of rules that should be used to decide
1291 what edges to add to the chart (top-down strategy by default).
1292 @type trace: C{int}
1293 @param trace: The level of tracing that should be used when
1294 parsing a text. C{0} will generate no tracing output;
1295 and higher numbers will produce more verbose tracing
1296 output.
1297 """
1298 self._grammar = grammar
1299 self._strategy = strategy
1300 self._trace = trace
1301
1303 return self._grammar
1304
1306 tokens = list(tokens)
1307 self._grammar.check_coverage(tokens)
1308 chart = Chart(list(tokens))
1309 grammar = self._grammar
1310
1311
1312 w = 50/(chart.num_leaves()+1)
1313 if self._trace > 0: print chart.pp_leaves(w)
1314
1315 edges_added = 1
1316 while edges_added > 0:
1317 edges_added = 0
1318 for rule in self._strategy:
1319 edges_added_by_rule = 0
1320 for e in rule.apply_everywhere(chart, grammar):
1321 if self._trace > 0 and edges_added_by_rule == 0:
1322 print '%s:' % rule
1323 edges_added_by_rule += 1
1324 if self._trace > 1: print chart.pp_edge(e,w)
1325 if self._trace == 1 and edges_added_by_rule > 0:
1326 print ' - Added %d edges' % edges_added_by_rule
1327 edges_added += edges_added_by_rule
1328
1329
1330 return chart.parses(grammar.start(), tree_class=tree_class)[:n]
1331
1332
1333
1334
1335
1337 """
1338 A C{ChartParser} that allows you to step through the parsing
1339 process, adding a single edge at a time. It also allows you to
1340 change the parser's strategy or grammar midway through parsing a
1341 text.
1342
1343 The C{initialize} method is used to start parsing a text. C{step}
1344 adds a single edge to the chart. C{set_strategy} changes the
1345 strategy used by the chart parser. C{parses} returns the set of
1346 parses that has been found by the chart parser.
1347
1348 @ivar _restart: Records whether the parser's strategy, grammar,
1349 or chart has been changed. If so, then L{step} must restart
1350 the parsing algorithm.
1351 """
1352 - def __init__(self, grammar, strategy=None, trace=0):
1357
1358
1359
1360
1361
1363 "Begin parsing the given tokens."
1364 self._chart = Chart(list(tokens))
1365 self._restart = True
1366
1367
1368
1369
1370
1372 """
1373 @return: A generator that adds edges to the chart, one at a
1374 time. Each time the generator is resumed, it adds a single
1375 edge and yields that edge. If no more edges can be added,
1376 then it yields C{None}.
1377
1378 If the parser's strategy, grammar, or chart is changed, then
1379 the generator will continue adding edges using the new
1380 strategy, grammar, or chart.
1381
1382 Note that this generator never terminates, since the grammar
1383 or strategy might be changed to values that would add new
1384 edges. Instead, it yields C{None} when no more edges can be
1385 added with the current strategy and grammar.
1386 """
1387 if self._chart is None:
1388 raise ValueError, 'Parser must be initialized first'
1389 while 1:
1390 self._restart = False
1391 w = 50/(self._chart.num_leaves()+1)
1392
1393 for e in self._parse():
1394 if self._trace > 1: print self._current_chartrule
1395 if self._trace > 0: print self._chart.pp_edge(e,w)
1396 yield e
1397 if self._restart: break
1398 else:
1399 yield None
1400
1402 """
1403 A generator that implements the actual parsing algorithm.
1404 L{step} iterates through this generator, and restarts it
1405 whenever the parser's strategy, grammar, or chart is modified.
1406 """
1407 chart = self._chart
1408 grammar = self._grammar
1409 edges_added = 1
1410 while edges_added > 0:
1411 edges_added = 0
1412 for rule in self._strategy:
1413 self._current_chartrule = rule
1414 for e in rule.apply_everywhere_iter(chart, grammar):
1415 edges_added += 1
1416 yield e
1417
1418
1419
1420
1421
1423 "@return: The strategy used by this parser."
1424 return self._strategy
1425
1427 "@return: The grammar used by this parser."
1428 return self._grammar
1429
1431 "@return: The chart that is used by this parser."
1432 return self._chart
1433
1435 "@return: The chart rule used to generate the most recent edge."
1436 return self._current_chartrule
1437
1439 "@return: The parse trees currently contained in the chart."
1440 return self._chart.parses(self._grammar.start(), tree_class)
1441
1442
1443
1444
1445
1447 """
1448 Change the startegy that the parser uses to decide which edges
1449 to add to the chart.
1450 @type strategy: C{list} of L{ChartRuleI}
1451 @param strategy: A list of rules that should be used to decide
1452 what edges to add to the chart.
1453 """
1454 if strategy == self._strategy: return
1455 self._strategy = strategy[:]
1456 self._restart = True
1457
1459 "Change the grammar used by the parser."
1460 if grammar is self._grammar: return
1461 self._grammar = grammar
1462 self._restart = True
1463
1465 "Load a given chart into the chart parser."
1466 if chart is self._chart: return
1467 self._chart = chart
1468 self._restart = True
1469
1470
1471
1472
1473
1487
1488
1489
1490
1491
1493 """
1494 A demonstration of the chart parsers.
1495 """
1496 import sys, time
1497
1498
1499 S, VP, NP, PP = cfg.nonterminals('S, VP, NP, PP')
1500 V, N, P, Name, Det = cfg.nonterminals('V, N, P, Name, Det')
1501
1502
1503 grammatical_productions = [
1504 cfg.Production(S, [NP, VP]), cfg.Production(PP, [P, NP]),
1505 cfg.Production(NP, [Det, N]), cfg.Production(NP, [NP, PP]),
1506 cfg.Production(VP, [VP, PP]), cfg.Production(VP, [V, NP]),
1507 cfg.Production(VP, [V]),]
1508
1509
1510 lexical_productions = [
1511 cfg.Production(NP, ['John']), cfg.Production(NP, ['I']),
1512 cfg.Production(Det, ['the']), cfg.Production(Det, ['my']),
1513 cfg.Production(Det, ['a']),
1514 cfg.Production(N, ['dog']), cfg.Production(N, ['cookie']),
1515 cfg.Production(V, ['ate']), cfg.Production(V, ['saw']),
1516 cfg.Production(P, ['with']), cfg.Production(P, ['under']),
1517 ]
1518
1519
1520 grammar = cfg.Grammar(S, grammatical_productions+lexical_productions)
1521
1522
1523 sent = 'I saw John with a dog with my cookie'
1524 print "Sentence:\n", sent
1525 tokens = sent.split()
1526
1527 print tokens
1528
1529
1530 print ' 1: Top-down chart parser'
1531 print ' 2: Bottom-up chart parser'
1532 print ' 3: Earley parser'
1533 print ' 4: Stepping chart parser (alternating top-down & bottom-up)'
1534 print ' 5: All parsers'
1535 print '\nWhich parser (1-5)? ',
1536 choice = sys.stdin.readline().strip()
1537 print
1538 if choice not in '12345':
1539 print 'Bad parser number'
1540 return
1541
1542
1543 times = {}
1544
1545
1546 if choice in ('1', '5'):
1547 cp = ChartParser(grammar, TD_STRATEGY, trace=2)
1548 t = time.time()
1549 parses = cp.nbest_parse(tokens)
1550 times['top down'] = time.time()-t
1551 assert len(parses)==5, 'Not all parses found'
1552 for tree in parses: print tree
1553
1554
1555 if choice in ('2', '5'):
1556 cp = ChartParser(grammar, BU_STRATEGY, trace=2)
1557 t = time.time()
1558 parses = cp.nbest_parse(tokens)
1559 times['bottom up'] = time.time()-t
1560 assert len(parses)==5, 'Not all parses found'
1561 for tree in parses: print tree
1562
1563
1564 if choice in ('3', '5'):
1565 cp = ChartParser(grammar, makeEarleyStrategy(grammar), trace=2)
1566 t = time.time()
1567 parses = cp.nbest_parse(tokens)
1568 times['Earley'] = time.time()-t
1569 assert len(parses)==5, 'Not all parses found'
1570 for tree in parses: print tree
1571
1572
1573 if choice in ('4', '5'):
1574 t = time.time()
1575 cp = SteppingChartParser(grammar, trace=1)
1576 cp.initialize(tokens)
1577 for i in range(5):
1578 print '*** SWITCH TO TOP DOWN'
1579 cp.set_strategy(TD_STRATEGY)
1580 for j, e in enumerate(cp.step()):
1581 if j>20 or e is None: break
1582 print '*** SWITCH TO BOTTOM UP'
1583 cp.set_strategy(BU_STRATEGY)
1584 for j, e in enumerate(cp.step()):
1585 if j>20 or e is None: break
1586 times['stepping'] = time.time()-t
1587 assert len(cp.parses())==5, 'Not all parses found'
1588 for parse in cp.parses(): print parse
1589
1590
1591 if not times: return
1592 maxlen = max(len(key) for key in times.keys())
1593 format = '%' + `maxlen` + 's parser: %6.3fsec'
1594 times_items = times.items()
1595 times_items.sort(lambda a,b:cmp(a[1], b[1]))
1596 for (parser, t) in times_items:
1597 print format % (parser, t)
1598
1599 if __name__ == '__main__': demo()
1600