1
2
3
4
5
6
7
8
9
10 """
11 Class for representing hierarchical language structures, such as
12 syntax trees and morphological trees.
13 """
14
15 import re
16 import string
17
18 import nltk.cfg
19 from probability import ProbabilisticMixIn
20 from util import slice_bounds
21
22
23
24
25
27 """
28 A hierarchical structure.
29
30 Each C{Tree} represents a single hierarchical grouping of
31 leaves and subtrees. For example, each constituent in a syntax
32 tree is represented by a single C{Tree}.
33
34 A tree's children are encoded as a C{list} of leaves and subtrees,
35 where a X{leaf} is a basic (non-tree) value; and a X{subtree} is a
36 nested C{Tree}.
37
38 Any other properties that a C{Tree} defines are known as
39 X{node properties}, and are used to add information about
40 individual hierarchical groupings. For example, syntax trees use a
41 NODE property to label syntactic constituents with phrase tags,
42 such as \"NP\" and\"VP\".
43
44 Several C{Tree} methods use X{tree positions} to specify
45 children or descendants of a tree. Tree positions are defined as
46 follows:
47
48 - The tree position M{i} specifies a C{Tree}'s M{i}th child.
49 - The tree position C{()} specifies the C{Tree} itself.
50 - If C{M{p}} is the tree position of descendant M{d}, then
51 C{M{p}+(M{i})} specifies the C{M{i}}th child of M{d}.
52
53 I.e., every tree position is either a single index C{M{i}},
54 specifying C{self[M{i}]}; or a sequence C{(M{i1}, M{i2}, ...,
55 M{iN})}, specifying
56 C{self[M{i1}][M{i2}]...[M{iN}]}.
57 """
58 - def __new__(cls, node_or_str=None, children=None):
59 if node_or_str is None:
60 return list.__new__(cls)
61 if children is None:
62 if not isinstance(node_or_str, basestring):
63 raise TypeError("%s: Expected a node value and child list "
64 "or a single string" % cls.__name__)
65 return cls.parse(node_or_str)
66 else:
67 if (isinstance(children, basestring) or
68 not hasattr(children, '__iter__')):
69 raise TypeError("%s() argument 2 should be a list, not a "
70 "string" % cls.__name__)
71 return list.__new__(cls, node_or_str, children)
72
73 - def __init__(self, node_or_str, children=None):
74 """
75 Construct a new tree. This constructor can be called in one
76 of two ways:
77
78 - C{Tree(node, children)} constructs a new tree with the
79 specified node value and list of children.
80
81 - C{Tree(s)} constructs a new tree by parsing the string
82 C{s}. It is equivalent to calling the class method
83 C{Tree.parse(s)}.
84 """
85
86
87
88
89
90
91 if children is None: return
92
93 list.__init__(self, children)
94 self.node = node_or_str
95
96
97
98
99
101 if not isinstance(other, Tree): return False
102 return self.node == other.node and list.__eq__(self, other)
104 return not (self == other)
106 if not isinstance(other, Tree): return False
107 return self.node < other.node or list.__lt__(self, other)
109 if not isinstance(other, Tree): return False
110 return self.node <= other.node or list.__le__(self, other)
112 if not isinstance(other, Tree): return True
113 return self.node > other.node or list.__gt__(self, other)
115 if not isinstance(other, Tree): return False
116 return self.node >= other.node or list.__ge__(self, other)
117
118
119
120
121
123 raise TypeError('Tree does not support multiplication')
125 raise TypeError('Tree does not support multiplication')
127 raise TypeError('Tree does not support addition')
129 raise TypeError('Tree does not support addition')
130
131
132
133
134
145
147 if isinstance(index, (int, slice)):
148 return list.__setitem__(self, index, value)
149 else:
150 if len(index) == 0:
151 raise IndexError('The tree position () may not be '
152 'assigned to.')
153 elif len(index) == 1:
154 self[index[0]] = value
155 else:
156 self[index[0]][index[1:]] = value
157
159 if isinstance(index, (int, slice)):
160 return list.__delitem__(self, index)
161 else:
162 if len(index) == 0:
163 raise IndexError('The tree position () may not be deleted.')
164 elif len(index) == 1:
165 del self[index[0]]
166 else:
167 del self[index[0]][index[1:]]
168
169
170
171
172
174 """
175 @return: a list containing this tree's leaves.
176 The order reflects the order of the
177 leaves in the tree's hierarchical structure.
178 @rtype: C{list}
179 """
180 leaves = []
181 for child in self:
182 if isinstance(child, Tree):
183 leaves.extend(child.leaves())
184 else:
185 leaves.append(child)
186 return leaves
187
189 """
190 @return: a tree consisting of this tree's root connected directly to
191 its leaves, omitting all intervening non-terminal nodes.
192 @rtype: C{Tree}
193 """
194 return Tree(self.node, self.leaves())
195
197 """
198 @return: The height of this tree. The height of a tree
199 containing no children is 1; the height of a tree
200 containing only leaves is 2; and the height of any other
201 tree is one plus the maximum of its children's
202 heights.
203 @rtype: C{int}
204 """
205 max_child_height = 0
206 for child in self:
207 if isinstance(child, Tree):
208 max_child_height = max(max_child_height, child.height())
209 else:
210 max_child_height = max(max_child_height, 1)
211 return 1 + max_child_height
212
214 """
215 @param order: One of: C{preorder}, C{postorder}, C{bothorder},
216 C{leaves}.
217 """
218 positions = []
219 if order in ('preorder', 'bothorder'): positions.append( () )
220 for i, child in enumerate(self):
221 if isinstance(child, Tree):
222 childpos = child.treepositions(order)
223 positions.extend((i,)+p for p in childpos)
224 else:
225 positions.append( (i,) )
226 if order in ('postorder', 'bothorder'): positions.append( () )
227 return positions
228
230 """
231 Generate all the subtrees of this tree, optionally restricted
232 to trees matching the filter function.
233 @type filter: C{function}
234 @param filter: the function to filter all local trees
235 """
236 if not filter or filter(self):
237 yield self
238 for child in self:
239 if isinstance(child, Tree):
240 for subtree in child.subtrees(filter):
241 yield subtree
242
244 """
245 Generate the productions that correspond to the non-terminal nodes of the tree.
246 For each subtree of the form (P: C1 C2 ... Cn) this produces a production of the
247 form P -> C1 C2 ... Cn.
248
249 @rtype: list of C{Production}s
250 """
251
252 if not isinstance(self.node, str):
253 raise TypeError, 'Productions can only be generated from trees having node labels that are strings'
254
255 prods = [nltk.cfg.Production(nltk.cfg.Nonterminal(self.node), _child_names(self))]
256 for child in self:
257 if isinstance(child, Tree):
258 prods += child.productions()
259 return prods
260
262 """
263 @return: a list of tuples containing leaves and pre-terminals (part-of-speech tags).
264 The order reflects the order of the
265 leaves in the tree's hierarchical structure.
266 @rtype: C{list} of C{tuples}
267 """
268 pos = []
269 for child in self:
270 if isinstance(child, Tree):
271 pos.extend(child.pos())
272 else:
273 pos.append((child, self.node))
274 return pos
275
277 """
278 @return: The tree position of the C{index}-th leaf in this
279 tree. I.e., if C{tp=self.leaf_treeposition(i)}, then
280 C{self[tp]==self.leaves()[i]}.
281
282 @raise IndexError: If this tree contains fewer than C{index+1}
283 leaves, or if C{index<0}.
284 """
285 if index < 0: raise IndexError('index must be non-negative')
286
287 stack = [(self, ())]
288 while stack:
289 value, treepos = stack.pop()
290 if not isinstance(value, Tree):
291 if index == 0: return treepos
292 else: index -= 1
293 else:
294 for i in range(len(value)-1, -1, -1):
295 stack.append( (value[i], treepos+(i,)) )
296
297 raise IndexError('index must be less than or equal to len(self)')
298
300 """
301 @return: The tree position of the lowest descendant of this
302 tree that dominates C{self.leaves()[start:end]}.
303 @raise ValueError: if C{end <= start}
304 """
305 if end <= start:
306 raise ValueError('end must be greater than start')
307
308
309 start_treepos = self.leaf_treeposition(start)
310 end_treepos = self.leaf_treeposition(end-1)
311
312 for i in range(len(start_treepos)):
313 if i == len(end_treepos) or start_treepos[i] != end_treepos[i]:
314 return start_treepos[:i]
315 return start_treepos
316
317
318
319
320
347
369
370 - def collapse_unary(self, collapsePOS = False, collapseRoot = False, joinChar = "+"):
371 """
372 Collapse subtrees with a single child (ie. unary productions)
373 into a new non-terminal (Tree node) joined by 'joinChar'.
374 This is useful when working with algorithms that do not allow
375 unary productions, and completely removing the unary productions
376 would require loss of useful information. The Tree is modified
377 directly (since it is passed by reference) and no value is returned.
378
379 @param collapsePOS: 'False' (default) will not collapse the parent of leaf nodes (ie.
380 Part-of-Speech tags) since they are always unary productions
381 @type collapsePOS: C{boolean}
382 @param collapseRoot: 'False' (default) will not modify the root production
383 if it is unary. For the Penn WSJ treebank corpus, this corresponds
384 to the TOP -> productions.
385 @type collapseRoot: C{boolean}
386 @param joinChar: A string used to connect collapsed node values (default = "+")
387 @type joinChar: C{string}
388 """
389 from treetransforms import collapse_unary
390 collapse_unary(self, collapsePOS, collapseRoot, joinChar)
391
392
393
394
395
396
398 """
399 Convert a tree between different subtypes of Tree. C{cls} determines
400 which class will be used to encode the new tree.
401
402 @type val: L{Tree}
403 @param val: The tree that should be converted.
404 @return: The new C{Tree}.
405 """
406 if isinstance(val, Tree):
407 children = [cls.convert(child) for child in val]
408 return cls(val.node, children)
409 else:
410 return val
411 convert = classmethod(convert)
412
413 - def copy(self, deep=False):
414 if not deep: return self.__class__(self.node, self)
415 else: return self.__class__.convert(self)
416
418 - def freeze(self, leaf_freezer=None):
419 frozen_class = self._frozen_class()
420 if leaf_freezer is None:
421 newcopy = frozen_class.convert(self)
422 else:
423 newcopy = self.copy(deep=True)
424 for pos in newcopy.treepositions('leaves'):
425 newcopy[pos] = leaf_freezer(newcopy[pos])
426 newcopy = frozen_class.convert(newcopy)
427 hash(newcopy)
428 return newcopy
429
430
431
432
433
434 @classmethod
435 - def parse(cls, s, brackets='()', parse_node=None, parse_leaf=None,
436 node_pattern=None, leaf_pattern=None,
437 remove_empty_top_bracketing=False):
438 """
439 Parse a bracketed tree string and return the resulting tree.
440 Trees are represented as nested brackettings, such as::
441
442 (S (NP (NNP John)) (VP (V runs)))
443
444 @type s: C{str}
445 @param s: The string to parse
446
447 @type brackets: length-2 C{str}
448 @param brackets: The bracket characters used to mark the
449 beginning and end of trees and subtrees.
450
451 @type parse_node: C{function}
452 @type parse_leaf: C{function}
453 @param parse_node, parse_leaf: If specified, these functions
454 are applied to the substrings of C{s} corresponding to
455 nodes and leaves (respectively) to obtain the values for
456 those nodes and leaves. They should have the following
457 signature:
458
459 >>> parse_node(str) -> value
460
461 For example, these functions could be used to parse nodes
462 and leaves whose values should be some type other than
463 string (such as L{FeatStruct <nltk.featstruct.FeatStruct>}).
464 Note that by default, node strings and leaf strings are
465 delimited by whitespace and brackets; to override this
466 default, use the C{node_pattern} and C{leaf_pattern}
467 arguments.
468
469 @type node_pattern: C{str}
470 @type leaf_pattern: C{str}
471 @param node_pattern, leaf_pattern: Regular expression patterns
472 used to find node and leaf substrings in C{s}. By
473 default, both nodes patterns are defined to match any
474 sequence of non-whitespace non-bracket characters.
475
476 @type remove_empty_top_bracketing: C{bool}
477 @param remove_empty_top_bracketing: If the resulting tree has
478 an empty node label, and is length one, then return its
479 single child instead. This is useful for treebank trees,
480 which sometimes contain an extra level of bracketing.
481
482 @return: A tree corresponding to the string representation C{s}.
483 If this class method is called using a subclass of C{Tree},
484 then it will return a tree of that type.
485 @rtype: C{Tree}
486 """
487 if not isinstance(brackets, basestring) or len(brackets) != 2:
488 raise TypeError('brackets must be a length-2 string')
489 if re.search('\s', brackets):
490 raise TypeError('whitespace brackets not allowed')
491
492 open_b, close_b = brackets
493 open_pattern, close_pattern = (re.escape(open_b), re.escape(close_b))
494 if node_pattern is None:
495 node_pattern = '[^\s%s%s]+' % (open_pattern, close_pattern)
496 if leaf_pattern is None:
497 leaf_pattern = '[^\s%s%s]+' % (open_pattern, close_pattern)
498 token_re = re.compile('%s\s*(%s)?|%s|(%s)' % (
499 open_pattern, node_pattern, close_pattern, leaf_pattern))
500
501 stack = [(None, [])]
502 for match in token_re.finditer(s):
503 token = match.group()
504
505 if token[0] == open_b:
506 if len(stack) == 1 and len(stack[0][1]) > 0:
507 cls._parse_error(s, match, 'end-of-string')
508 node = token[1:].lstrip()
509 if parse_node is not None: node = parse_node(node)
510 stack.append((node, []))
511
512 elif token == close_b:
513 if len(stack) == 1:
514 if len(stack[0][1]) == 0:
515 cls._parse_error(s, match, open_b)
516 else:
517 cls._parse_error(s, match, 'end-of-string')
518 node, children = stack.pop()
519 stack[-1][1].append(cls(node, children))
520
521 else:
522 if len(stack) == 1:
523 cls._parse_error(s, match, open_b)
524 if parse_leaf is not None: token = parse_leaf(token)
525 stack[-1][1].append(token)
526
527
528 if len(stack) > 1:
529 cls._parse_error(s, 'end-of-string', close_b)
530 elif len(stack[0][1]) == 0:
531 cls._parse_error(s, 'end-of-string', open_b)
532 else:
533 assert stack[0][0] is None
534 assert len(stack[0][1]) == 1
535 tree = stack[0][1][0]
536
537
538
539 if remove_empty_top_bracketing and tree.node == '' and len(tree) == 1:
540 tree = tree[0]
541
542 return tree
543
544 @classmethod
546 """
547 Display a friendly error message when parsing a tree string fails.
548 @param s: The string we're parsing.
549 @param match: regexp match of the problem token.
550 @param expecting: what we expected to see instead.
551 """
552
553 if match == 'end-of-string':
554 pos, token = len(s), 'end-of-string'
555 else:
556 pos, token = match.start(), match.group()
557 msg = '%s.parse(): expected %r but got %r\n%sat index %d.' % (
558 cls.__name__, expecting, token, ' '*12, pos)
559
560 s = s.replace('\n', ' ').replace('\t', ' ')
561 offset = pos
562 if len(s) > pos+10:
563 s = s[:pos+10]+'...'
564 if pos > 10:
565 s = '...'+s[pos-10:]
566 offset = 13
567 msg += '\n%s"%s"\n%s^' % (' '*16, s, ' '*(17+offset))
568 raise ValueError(msg)
569
570
571
572
579
581 childstr = ", ".join(repr(c) for c in self)
582 return '%s(%r, [%s])' % (self.__class__.__name__, self.node, childstr)
583
586
587 - def pprint(self, margin=70, indent=0, nodesep='', parens='()', quotes=False):
588 """
589 @return: A pretty-printed string representation of this tree.
590 @rtype: C{string}
591 @param margin: The right margin at which to do line-wrapping.
592 @type margin: C{int}
593 @param indent: The indentation level at which printing
594 begins. This number is used to decide how far to indent
595 subsequent lines.
596 @type indent: C{int}
597 @param nodesep: A string that is used to separate the node
598 from the children. E.g., the default value C{':'} gives
599 trees like C{(S: (NP: I) (VP: (V: saw) (NP: it)))}.
600 """
601
602
603 s = self._pprint_flat(nodesep, parens, quotes)
604 if len(s)+indent < margin:
605 return s
606
607
608 if isinstance(self.node, basestring):
609 s = '%s%s%s' % (parens[0], self.node, nodesep)
610 else:
611 s = '%s%r%s' % (parens[0], self.node, nodesep)
612 for child in self:
613 if isinstance(child, Tree):
614 s += '\n'+' '*(indent+2)+child.pprint(margin, indent+2,
615 nodesep, parens, quotes)
616 elif isinstance(child, tuple):
617 s += '\n'+' '*(indent+2)+ "/".join(child)
618 elif isinstance(child, str) and not quotes:
619 s += '\n'+' '*(indent+2)+ '%s' % child
620 else:
621 s += '\n'+' '*(indent+2)+ '%r' % child
622 return s+parens[1]
623
625 r"""
626 Returns a representation of the tree compatible with the
627 LaTeX qtree package. This consists of the string C{\Tree}
628 followed by the parse tree represented in bracketed notation.
629
630 For example, the following result was generated from a parse tree of
631 the sentence C{The announcement astounded us}::
632
633 \Tree [.I'' [.N'' [.D The ] [.N' [.N announcement ] ] ]
634 [.I' [.V'' [.V' [.V astounded ] [.N'' [.N' [.N us ] ] ] ] ] ] ]
635
636 See U{http://www.ling.upenn.edu/advice/latex.html} for the LaTeX
637 style file for the qtree package.
638
639 @return: A latex qtree representation of this tree.
640 @rtype: C{string}
641 """
642 return r'\Tree ' + self.pprint(indent=6, nodesep='', parens=('[.', ' ]'))
643
645 childstrs = []
646 for child in self:
647 if isinstance(child, Tree):
648 childstrs.append(child._pprint_flat(nodesep, parens, quotes))
649 elif isinstance(child, tuple):
650 childstrs.append("/".join(child))
651 elif isinstance(child, str) and not quotes:
652 childstrs.append('%s' % child)
653 else:
654 childstrs.append('%r' % child)
655 if isinstance(self.node, basestring):
656 return '%s%s%s %s%s' % (parens[0], self.node, nodesep,
657 string.join(childstrs), parens[1])
658 else:
659 return '%s%r%s %s%s' % (parens[0], self.node, nodesep,
660 string.join(childstrs), parens[1])
661
663 - def __init__(self, node_or_str, children=None):
664 if children is None: return
665 super(ImmutableTree, self).__init__(node_or_str, children)
666
667
668 try:
669 self._hash = hash( (self.node, tuple(self)) )
670 except (TypeError, ValueError):
671 raise ValueError("ImmutableTree's node value and children "
672 "must be immutable")
674 raise ValueError, 'ImmutableTrees may not be modified'
676 raise ValueError, 'ImmutableTrees may not be modified'
678 raise ValueError, 'ImmutableTrees may not be modified'
680 raise ValueError, 'ImmutableTrees may not be modified'
682 raise ValueError, 'ImmutableTrees may not be modified'
684 raise ValueError, 'ImmutableTrees may not be modified'
686 raise ValueError, 'ImmutableTrees may not be modified'
688 raise ValueError, 'ImmutableTrees may not be modified'
689 - def pop(self, v=None):
690 raise ValueError, 'ImmutableTrees may not be modified'
692 raise ValueError, 'ImmutableTrees may not be modified'
694 raise ValueError, 'ImmutableTrees may not be modified'
696 raise ValueError, 'ImmutableTrees may not be modified'
699
701 """Set self._node. This will only succeed the first time the
702 node value is set, which should occur in Tree.__init__()."""
703 if hasattr(self, 'node'):
704 raise ValueError, 'ImmutableTrees may not be modified'
705 self._node = node
708 node = property(_get_node, _set_node)
709
710
711
712
713
715 """
716 An abstract base class for L{Tree}s that automatically maintain
717 pointers to their parents. These parent pointers are updated
718 whenever any change is made to a tree's structure. Two subclasses
719 are currently defined:
720
721 - L{ParentedTree} is used for tree structures where each subtree
722 has at most one parent. This class should be used in cases
723 where there is no"sharing" of subtrees.
724
725 - L{MultiParentedTree} is used for tree structures where a
726 subtree may have zero or more parents. This class should be
727 used in cases where subtrees may be shared.
728
729 Subclassing
730 ===========
731 The C{AbstractParentedTree} class redefines all operations that
732 modify a tree's structure to call two methods, which are used by
733 subclasses to update parent information:
734
735 - L{_setparent()} is called whenever a new child is added.
736 - L{_delparent()} is called whenever a child is removed.
737 """
738 - def __init__(self, node_or_str, children=None):
749
750
751
752
753
754 - def _setparent(self, child, index, dry_run=False):
755 """
756 Update C{child}'s parent pointer to point to self. This
757 method is only called if C{child}'s type is L{Tree}; i.e., it
758 is not called when adding a leaf to a tree. This method is
759 always called before the child is actually added to C{self}'s
760 child list.
761
762 @type child: L{Tree}
763 @type index: C{int}
764 @param index: The index of C{child} in C{self}.
765 @raise TypeError: If C{child} is a tree with an impropriate
766 type. Typically, if C{child} is a tree, then its type needs
767 to match C{self}'s type. This prevents mixing of
768 different tree types (single-parented, multi-parented, and
769 non-parented).
770 @param dry_run: If true, the don't actually set the child's
771 parent pointer; just check for any error conditions, and
772 raise an exception if one is found.
773 """
774 raise AssertionError('Abstract base class')
775
777 """
778 Update C{child}'s parent pointer to not point to self. This
779 method is only called if C{child}'s type is L{Tree}; i.e., it
780 is not called when removing a leaf from a tree. This method
781 is always called before the child is actually removed from
782 C{self}'s child list.
783
784 @type child: L{Tree}
785 @type index: C{int}
786 @param index: The index of C{child} in C{self}.
787 """
788 raise AssertionError('Abstract base class')
789
790
791
792
793
794
795
797
798 if isinstance(index, slice):
799 start, stop = slice_bounds(self, index)
800
801 for i in xrange(start, stop):
802 if isinstance(self[i], Tree):
803 self._delparent(self[i], i)
804
805 super(AbstractParentedTree, self).__delitem__(index)
806
807
808 elif isinstance(index, int):
809 if index < 0: index += len(self)
810 if index < 0: raise IndexError('index out of range')
811
812 if isinstance(self[index], Tree):
813 self._delparent(self[index], index)
814
815 super(AbstractParentedTree, self).__delitem__(index)
816
817
818 elif len(index) == 0:
819 raise IndexError('The tree position () may not be deleted.')
820
821
822 elif len(index) == 1:
823 del self[index[0]]
824
825
826 else:
827 del self[index[0]][index[1:]]
828
830
831 if isinstance(index, slice):
832 start, stop = slice_bounds(self, index)
833
834 if not isinstance(value, (list, tuple)):
835 value = list(value)
836
837
838 for i, child in enumerate(value):
839 if isinstance(child, Tree):
840 self._setparent(child, start+i, dry_run=True)
841
842 for i in xrange(start, stop):
843 if isinstance(self[i], Tree):
844 self._delparent(self[i], i)
845
846
847
848 for i, child in enumerate(value):
849 if isinstance(child, Tree):
850 self._setparent(child, start+i)
851
852 super(AbstractParentedTree, self).__setitem__(index, value)
853
854
855 elif isinstance(index, int):
856 if index < 0: index += len(self)
857 if index < 0: raise IndexError('index out of range')
858
859 if value is self[index]:
860 return
861
862 if isinstance(value, Tree):
863 self._setparent(value, index)
864
865 if isinstance(self[index], Tree):
866 self._delparent(self[index], index)
867
868 super(AbstractParentedTree, self).__setitem__(index, value)
869
870
871 elif len(index) == 0:
872 raise IndexError('The tree position () may not be assigned to.')
873
874
875 elif len(index) == 1:
876 self[index[0]] = value
877
878
879 else:
880 self[index[0]][index[1:]] = value
881
886
892
893 - def insert(self, index, child):
903
904 - def pop(self, index=-1):
910
911
912
918
919
920
921
922
923
924
925 if hasattr(list, '__getslice__'):
932
934 """
935 A L{Tree} that automatically maintains parent pointers for
936 single-parented trees. The following read-only property values
937 are automatically updated whenever the structure of a parented
938 tree is modified: L{parent}, L{parent_index}, L{left_sibling},
939 L{right_sibling}, L{root}, L{treeposition}.
940
941 Each C{ParentedTree} may have at most one parent. In
942 particular, subtrees may not be shared. Any attempt to reuse a
943 single C{ParentedTree} as a child of more than one parent (or
944 as multiple children of the same parent) will cause a
945 C{ValueError} exception to be raised.
946
947 C{ParentedTrees} should never be used in the same tree as C{Trees}
948 or C{MultiParentedTrees}. Mixing tree implementations may result
949 in incorrect parent pointers and in C{TypeError} exceptions.
950 """
951 - def __init__(self, node_or_str, children=None):
952 if children is None: return
953
954 self._parent = None
955 """The parent of this Tree, or C{None} if it has no parent."""
956
957 super(ParentedTree, self).__init__(node_or_str, children)
958
960
961
962
963
964
966 if self._parent is None: return None
967 for i, child in enumerate(self._parent):
968 if child is self: return i
969 assert False, 'expected to find self in self._parent!'
970
976
982
987
989 if self._parent is None: return self
990 else: return self._parent._get_root()
991
992 parent = property(lambda self: self._parent, doc="""
993 The parent of this tree, or C{None} if it has no parent.""")
994
995 parent_index = property(_get_parent_index, doc="""
996 The index of this tree in its parent. I.e.,
997 C{ptree.parent[ptree.parent_index] is ptree}. Note that
998 C{ptree.parent_index} is not necessarily equal to
999 C{ptree.parent.index(ptree)}, since the C{index()} method
1000 returns the first child that is I{equal} to its argument.""")
1001
1002 left_sibling = property(_get_left_sibling, doc="""
1003 The left sibling of this tree, or C{None} if it has none.""")
1004
1005 right_sibling = property(_get_right_sibling, doc="""
1006 The right sibling of this tree, or C{None} if it has none.""")
1007
1008 root = property(_get_root, doc="""
1009 The root of this tree. I.e., the unique ancestor of this tree
1010 whose parent is C{None}. If C{ptree.parent} is C{None}, then
1011 C{ptree} is its own root.""")
1012
1013 treeposition = property(_get_treeposition, doc="""
1014 The tree position of this tree, relative to the root of the
1015 tree. I.e., C{ptree.root[ptree.treeposition] is ptree}.""")
1016 treepos = treeposition
1017
1018
1019
1020
1021
1030
1031 - def _setparent(self, child, index, dry_run=False):
1032
1033 if not isinstance(child, ParentedTree):
1034 raise TypeError('Can not insert a non-ParentedTree '+
1035 'into a ParentedTree')
1036
1037
1038 if child._parent is not None:
1039 raise ValueError('Can not insert a subtree that already '
1040 'has a parent.')
1041
1042
1043 if not dry_run:
1044 child._parent = self
1045
1047 """
1048 A L{Tree} that automatically maintains parent pointers for
1049 multi-parented trees. The following read-only property values are
1050 automatically updated whenever the structure of a multi-parented
1051 tree is modified: L{parents}, L{parent_indices}, L{left_siblings},
1052 L{right_siblings}, L{roots}, L{treepositions}.
1053
1054 Each C{MultiParentedTree} may have zero or more parents. In
1055 particular, subtrees may be shared. If a single
1056 C{MultiParentedTree} is used as multiple children of the same
1057 parent, then that parent will appear multiple times in its
1058 C{parents} property.
1059
1060 C{MultiParentedTrees} should never be used in the same tree as
1061 C{Trees} or C{ParentedTrees}. Mixing tree implementations may
1062 result in incorrect parent pointers and in C{TypeError} exceptions.
1063 """
1064 - def __init__(self, node_or_str, children=None):
1065 if children is None: return
1066
1067 self._parents = []
1068 """A list of this tree's parents. This list should not
1069 contain duplicates, even if a parent contains this tree
1070 multiple times."""
1071
1072 super(MultiParentedTree, self).__init__(node_or_str, children)
1073
1075
1076
1077
1078
1079
1085
1090
1095
1098
1100 if self._parents:
1101 for parent in self._parents:
1102 parent._get_roots_helper(result)
1103 else:
1104 result[id(self)] = self
1105 return result
1106
1107 parents = property(lambda self: list(self._parents), doc="""
1108 The set of parents of this tree. If this tree has no parents,
1109 then C{parents} is the empty set. To check if a tree is used
1110 as multiple children of the same parent, use the
1111 L{parent_indices} property.
1112
1113 @type: C{list} of L{MultiParentedTree}""")
1114
1115 left_siblings = property(_get_left_siblings, doc="""
1116 A list of all left siblings of this tree, in any of its parent
1117 trees. A tree may be its own left sibling if it is used as
1118 multiple contiguous children of the same parent. A tree may
1119 appear multiple times in this list if it is the left sibling
1120 of this tree with respect to multiple parents.
1121
1122 @type: C{list} of L{MultiParentedTree}""")
1123
1124 right_siblings = property(_get_right_siblings, doc="""
1125 A list of all right siblings of this tree, in any of its parent
1126 trees. A tree may be its own right sibling if it is used as
1127 multiple contiguous children of the same parent. A tree may
1128 appear multiple times in this list if it is the right sibling
1129 of this tree with respect to multiple parents.
1130
1131 @type: C{list} of L{MultiParentedTree}""")
1132
1133 roots = property(_get_roots, doc="""
1134 The set of all roots of this tree. This set is formed by
1135 tracing all possible parent paths until trees with no parents
1136 are found.
1137
1138 @type: C{list} of L{MultiParentedTree}""")
1139
1141 """
1142 Return a list of the indices where this tree occurs as a child
1143 of C{parent}. If this child does not occur as a child of
1144 C{parent}, then the empty list is returned. The following is
1145 always true::
1146
1147 for parent_index in ptree.parent_indices(parent):
1148 parent[parent_index] is ptree
1149 """
1150 if parent not in self._parents: return []
1151 else: return [index for (index, child) in enumerate(parent)
1152 if child is self]
1153
1155 """
1156 Return a list of all tree positions that can be used to reach
1157 this multi-parented tree starting from C{root}. I.e., the
1158 following is always true::
1159
1160 for treepos in ptree.treepositions(root):
1161 root[treepos] is ptree
1162 """
1163 if self is root:
1164 return [()]
1165 else:
1166 return [treepos+(index,)
1167 for parent in self._parents
1168 for treepos in parent.treepositions(root)
1169 for (index, child) in enumerate(parent) if child is self]
1170
1171
1172
1173
1174
1175
1188
1189 - def _setparent(self, child, index, dry_run=False):
1190
1191 if not isinstance(child, MultiParentedTree):
1192 raise TypeError('Can not insert a non-MultiParentedTree '+
1193 'into a MultiParentedTree')
1194
1195
1196 if not dry_run:
1197 for parent in child._parents:
1198 if parent is self: break
1199 else:
1200 child._parents.append(self)
1201
1203 - def __init__(self, node_or_str, children=None):
1206
1208 - def __init__(self, node_or_str, children=None):
1211
1212
1213
1214
1216 - def __new__(cls, node_or_str, children=None, **prob_kwargs):
1219 - def __init__(self, node_or_str, children=None, **prob_kwargs):
1223
1224
1229 return '%s (p=%s)' % (self.pprint(margin=60), self.prob())
1235 if not isinstance(other, Tree): return False
1236 return Tree.__eq__(self, other) and self.prob()==other.prob()
1238 return not (self == other)
1239 - def copy(self, deep=False):
1240 if not deep: return self.__class__(self.node, self, prob=self.prob())
1241 else: return self.__class__.convert(self)
1251 convert = classmethod(convert)
1252
1254 - def __new__(cls, node_or_str, children=None, **prob_kwargs):
1257 - def __init__(self, node_or_str, children=None, **prob_kwargs):
1261
1262
1267 return '%s [%s]' % (self.pprint(margin=60), self.prob())
1273 if not isinstance(other, Tree): return False
1274 return Tree.__eq__(self, other) and self.prob()==other.prob()
1276 return not (self == other)
1277 - def copy(self, deep=False):
1278 if not deep: return self.__class__(self.node, self, prob=self.prob())
1279 else: return self.__class__.convert(self)
1289 convert = classmethod(convert)
1290
1291
1300
1301
1302
1303
1304
1305
1306
1308 """
1309 Parse a treebank string and return a tree. Trees are represented
1310 as nested brackettings, e.g. (S (NP (NNP John)) (VP (V runs))).
1311
1312 @return: A tree corresponding to the string representation.
1313 @rtype: C{tree}
1314 @param s: The string to be converted
1315 @type s: C{string}
1316 """
1317 return Tree.parse(s, remove_empty_top_bracketing=True)
1318
1320 """
1321 Parse a Sinica Treebank string and return a tree. Trees are represented as nested brackettings,
1322 as shown in the following example (X represents a Chinese character):
1323 S(goal:NP(Head:Nep:XX)|theme:NP(Head:Nhaa:X)|quantity:Dab:X|Head:VL2:X)#0(PERIODCATEGORY)
1324
1325 @return: A tree corresponding to the string representation.
1326 @rtype: C{tree}
1327 @param s: The string to be converted
1328 @type s: C{string}
1329 """
1330 tokens = re.split(r'([()| ])', s)
1331 for i in range(len(tokens)):
1332 if tokens[i] == '(':
1333 tokens[i-1], tokens[i] = tokens[i], tokens[i-1]
1334 elif ':' in tokens[i]:
1335 fields = tokens[i].split(':')
1336 if len(fields) == 2:
1337 tokens[i] = fields[1]
1338 else:
1339 tokens[i] = "(" + fields[-2] + " " + fields[-1] + ")"
1340 elif tokens[i] == '|':
1341 tokens[i] = ''
1342
1343 treebank_string = string.join(tokens)
1344 return bracket_parse(treebank_string)
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1356 """
1357 A demonstration showing how C{Tree}s and C{Tree}s can be
1358 used. This demonstration creates a C{Tree}, and loads a
1359 C{Tree} from the L{treebank<nltk.corpus.treebank>} corpus,
1360 and shows the results of calling several of their methods.
1361 """
1362
1363 from nltk import tree
1364
1365
1366 s = '(S (NP (DT the) (NN cat)) (VP (VBD ate) (NP (DT a) (NN cookie))))'
1367 t = tree.bracket_parse(s)
1368 print "Convert bracketed string into tree:"
1369 print t
1370 print t.__repr__()
1371
1372 print "Display tree properties:"
1373 print t.node
1374 print t[0]
1375 print t[1]
1376 print t.height()
1377 print t.leaves()
1378 print t[1]
1379 print t[1,1]
1380 print t[1,1,0]
1381
1382
1383 the_cat = t[0]
1384 the_cat.insert(1, tree.bracket_parse('(JJ big)'))
1385 print "Tree modification:"
1386 print t
1387 t[1,1,1] = tree.bracket_parse('(NN cake)')
1388 print t
1389 print
1390
1391
1392 print "Collapse unary:"
1393 t.collapse_unary()
1394 print t
1395 print "Chomsky normal form:"
1396 t.chomsky_normal_form()
1397 print t
1398 print
1399
1400
1401 pt = tree.ProbabilisticTree('x', ['y', 'z'], prob=0.5)
1402 print "Probabilistic Tree:"
1403 print pt
1404 print
1405
1406
1407 t = tree.bracket_parse(t.pprint())
1408 print "Convert tree to bracketed string and back again:"
1409 print t
1410 print
1411
1412
1413 print "LaTeX output:"
1414 print t.pprint_latex_qtree()
1415 print
1416
1417
1418 print "Production output:"
1419 print t.productions()
1420 print
1421
1422
1423 t.node = ('test', 3)
1424 print t
1425
1426 if __name__ == '__main__':
1427 demo()
1428
1429 __all__ = ['ImmutableProbabilisticTree', 'ImmutableTree', 'ProbabilisticMixIn',
1430 'ProbabilisticTree', 'Tree', 'bracket_parse', 'demo',
1431 'sinica_parse', 'ParentedTree', 'MultiParentedTree',
1432 'ImmutableParentedTree', 'ImmutableMultiParentedTree']
1433