Package nltk :: Module tree
[hide private]
[frames] | no frames]

Source Code for Module nltk.tree

   1  # Natural Language Toolkit: Text Trees 
   2  # 
   3  # Copyright (C) 2001-2008 NLTK Project 
   4  # Author: Edward Loper <[email protected]> 
   5  #         Steven Bird <[email protected]> 
   6  #         Nathan Bodenstab <[email protected]> (tree transforms) 
   7  # URL: <http://nltk.org> 
   8  # For license information, see LICENSE.TXT 
   9   
  10  """ 
  11  Class for representing hierarchical language structures, such as 
  12  syntax trees and morphological trees. 
  13  """ 
  14   
  15  import re 
  16  import string 
  17   
  18  import nltk.cfg 
  19  from probability import ProbabilisticMixIn 
  20  from util import slice_bounds 
  21   
  22  ###################################################################### 
  23  ## Trees 
  24  ###################################################################### 
  25   
26 -class Tree(list):
27 """ 28 A hierarchical structure. 29 30 Each C{Tree} represents a single hierarchical grouping of 31 leaves and subtrees. For example, each constituent in a syntax 32 tree is represented by a single C{Tree}. 33 34 A tree's children are encoded as a C{list} of leaves and subtrees, 35 where a X{leaf} is a basic (non-tree) value; and a X{subtree} is a 36 nested C{Tree}. 37 38 Any other properties that a C{Tree} defines are known as 39 X{node properties}, and are used to add information about 40 individual hierarchical groupings. For example, syntax trees use a 41 NODE property to label syntactic constituents with phrase tags, 42 such as \"NP\" and\"VP\". 43 44 Several C{Tree} methods use X{tree positions} to specify 45 children or descendants of a tree. Tree positions are defined as 46 follows: 47 48 - The tree position M{i} specifies a C{Tree}'s M{i}th child. 49 - The tree position C{()} specifies the C{Tree} itself. 50 - If C{M{p}} is the tree position of descendant M{d}, then 51 C{M{p}+(M{i})} specifies the C{M{i}}th child of M{d}. 52 53 I.e., every tree position is either a single index C{M{i}}, 54 specifying C{self[M{i}]}; or a sequence C{(M{i1}, M{i2}, ..., 55 M{iN})}, specifying 56 C{self[M{i1}][M{i2}]...[M{iN}]}. 57 """
58 - def __new__(cls, node_or_str=None, children=None):
59 if node_or_str is None: 60 return list.__new__(cls) # used by copy.deepcopy 61 if children is None: 62 if not isinstance(node_or_str, basestring): 63 raise TypeError("%s: Expected a node value and child list " 64 "or a single string" % cls.__name__) 65 return cls.parse(node_or_str) 66 else: 67 if (isinstance(children, basestring) or 68 not hasattr(children, '__iter__')): 69 raise TypeError("%s() argument 2 should be a list, not a " 70 "string" % cls.__name__) 71 return list.__new__(cls, node_or_str, children)
72
73 - def __init__(self, node_or_str, children=None):
74 """ 75 Construct a new tree. This constructor can be called in one 76 of two ways: 77 78 - C{Tree(node, children)} constructs a new tree with the 79 specified node value and list of children. 80 81 - C{Tree(s)} constructs a new tree by parsing the string 82 C{s}. It is equivalent to calling the class method 83 C{Tree.parse(s)}. 84 """ 85 # Because __new__ may delegate to Tree.parse(), the __init__ 86 # method may end up getting called more than once (once when 87 # constructing the return value for Tree.parse; and again when 88 # __new__ returns). We therefore check if `children` is None 89 # (which will cause __new__ to call Tree.parse()); if so, then 90 # __init__ has already been called once, so just return. 91 if children is None: return 92 93 list.__init__(self, children) 94 self.node = node_or_str
95 96 #//////////////////////////////////////////////////////////// 97 # Comparison operators 98 #//////////////////////////////////////////////////////////// 99
100 - def __eq__(self, other):
101 if not isinstance(other, Tree): return False 102 return self.node == other.node and list.__eq__(self, other)
103 - def __ne__(self, other):
104 return not (self == other)
105 - def __lt__(self, other):
106 if not isinstance(other, Tree): return False 107 return self.node < other.node or list.__lt__(self, other)
108 - def __le__(self, other):
109 if not isinstance(other, Tree): return False 110 return self.node <= other.node or list.__le__(self, other)
111 - def __gt__(self, other):
112 if not isinstance(other, Tree): return True 113 return self.node > other.node or list.__gt__(self, other)
114 - def __ge__(self, other):
115 if not isinstance(other, Tree): return False 116 return self.node >= other.node or list.__ge__(self, other)
117 118 #//////////////////////////////////////////////////////////// 119 # Disabled list operations 120 #//////////////////////////////////////////////////////////// 121
122 - def __mul__(self, v):
123 raise TypeError('Tree does not support multiplication')
124 - def __rmul__(self, v):
125 raise TypeError('Tree does not support multiplication')
126 - def __add__(self, v):
127 raise TypeError('Tree does not support addition')
128 - def __radd__(self, v):
129 raise TypeError('Tree does not support addition')
130 131 #//////////////////////////////////////////////////////////// 132 # Indexing (with support for tree positions) 133 #//////////////////////////////////////////////////////////// 134
135 - def __getitem__(self, index):
136 if isinstance(index, (int, slice)): 137 return list.__getitem__(self, index) 138 else: 139 if len(index) == 0: 140 return self 141 elif len(index) == 1: 142 return self[int(index[0])] 143 else: 144 return self[int(index[0])][index[1:]]
145
146 - def __setitem__(self, index, value):
147 if isinstance(index, (int, slice)): 148 return list.__setitem__(self, index, value) 149 else: 150 if len(index) == 0: 151 raise IndexError('The tree position () may not be ' 152 'assigned to.') 153 elif len(index) == 1: 154 self[index[0]] = value 155 else: 156 self[index[0]][index[1:]] = value
157
158 - def __delitem__(self, index):
159 if isinstance(index, (int, slice)): 160 return list.__delitem__(self, index) 161 else: 162 if len(index) == 0: 163 raise IndexError('The tree position () may not be deleted.') 164 elif len(index) == 1: 165 del self[index[0]] 166 else: 167 del self[index[0]][index[1:]]
168 169 #//////////////////////////////////////////////////////////// 170 # Basic tree operations 171 #//////////////////////////////////////////////////////////// 172
173 - def leaves(self):
174 """ 175 @return: a list containing this tree's leaves. 176 The order reflects the order of the 177 leaves in the tree's hierarchical structure. 178 @rtype: C{list} 179 """ 180 leaves = [] 181 for child in self: 182 if isinstance(child, Tree): 183 leaves.extend(child.leaves()) 184 else: 185 leaves.append(child) 186 return leaves
187
188 - def flatten(self):
189 """ 190 @return: a tree consisting of this tree's root connected directly to 191 its leaves, omitting all intervening non-terminal nodes. 192 @rtype: C{Tree} 193 """ 194 return Tree(self.node, self.leaves())
195
196 - def height(self):
197 """ 198 @return: The height of this tree. The height of a tree 199 containing no children is 1; the height of a tree 200 containing only leaves is 2; and the height of any other 201 tree is one plus the maximum of its children's 202 heights. 203 @rtype: C{int} 204 """ 205 max_child_height = 0 206 for child in self: 207 if isinstance(child, Tree): 208 max_child_height = max(max_child_height, child.height()) 209 else: 210 max_child_height = max(max_child_height, 1) 211 return 1 + max_child_height
212
213 - def treepositions(self, order='preorder'):
214 """ 215 @param order: One of: C{preorder}, C{postorder}, C{bothorder}, 216 C{leaves}. 217 """ 218 positions = [] 219 if order in ('preorder', 'bothorder'): positions.append( () ) 220 for i, child in enumerate(self): 221 if isinstance(child, Tree): 222 childpos = child.treepositions(order) 223 positions.extend((i,)+p for p in childpos) 224 else: 225 positions.append( (i,) ) 226 if order in ('postorder', 'bothorder'): positions.append( () ) 227 return positions
228
229 - def subtrees(self, filter=None):
230 """ 231 Generate all the subtrees of this tree, optionally restricted 232 to trees matching the filter function. 233 @type filter: C{function} 234 @param filter: the function to filter all local trees 235 """ 236 if not filter or filter(self): 237 yield self 238 for child in self: 239 if isinstance(child, Tree): 240 for subtree in child.subtrees(filter): 241 yield subtree
242
243 - def productions(self):
244 """ 245 Generate the productions that correspond to the non-terminal nodes of the tree. 246 For each subtree of the form (P: C1 C2 ... Cn) this produces a production of the 247 form P -> C1 C2 ... Cn. 248 249 @rtype: list of C{Production}s 250 """ 251 252 if not isinstance(self.node, str): 253 raise TypeError, 'Productions can only be generated from trees having node labels that are strings' 254 255 prods = [nltk.cfg.Production(nltk.cfg.Nonterminal(self.node), _child_names(self))] 256 for child in self: 257 if isinstance(child, Tree): 258 prods += child.productions() 259 return prods
260
261 - def pos(self):
262 """ 263 @return: a list of tuples containing leaves and pre-terminals (part-of-speech tags). 264 The order reflects the order of the 265 leaves in the tree's hierarchical structure. 266 @rtype: C{list} of C{tuples} 267 """ 268 pos = [] 269 for child in self: 270 if isinstance(child, Tree): 271 pos.extend(child.pos()) 272 else: 273 pos.append((child, self.node)) 274 return pos
275
276 - def leaf_treeposition(self, index):
277 """ 278 @return: The tree position of the C{index}-th leaf in this 279 tree. I.e., if C{tp=self.leaf_treeposition(i)}, then 280 C{self[tp]==self.leaves()[i]}. 281 282 @raise IndexError: If this tree contains fewer than C{index+1} 283 leaves, or if C{index<0}. 284 """ 285 if index < 0: raise IndexError('index must be non-negative') 286 287 stack = [(self, ())] 288 while stack: 289 value, treepos = stack.pop() 290 if not isinstance(value, Tree): 291 if index == 0: return treepos 292 else: index -= 1 293 else: 294 for i in range(len(value)-1, -1, -1): 295 stack.append( (value[i], treepos+(i,)) ) 296 297 raise IndexError('index must be less than or equal to len(self)')
298
299 - def treeposition_spanning_leaves(self, start, end):
300 """ 301 @return: The tree position of the lowest descendant of this 302 tree that dominates C{self.leaves()[start:end]}. 303 @raise ValueError: if C{end <= start} 304 """ 305 if end <= start: 306 raise ValueError('end must be greater than start') 307 # Find the tree positions of the start & end leaves, and 308 # take the longest common subsequence. 309 start_treepos = self.leaf_treeposition(start) 310 end_treepos = self.leaf_treeposition(end-1) 311 # Find the first index where they mismatch: 312 for i in range(len(start_treepos)): 313 if i == len(end_treepos) or start_treepos[i] != end_treepos[i]: 314 return start_treepos[:i] 315 return start_treepos
316 317 #//////////////////////////////////////////////////////////// 318 # Transforms 319 #//////////////////////////////////////////////////////////// 320
321 - def chomsky_normal_form(self, factor = "right", horzMarkov = None, vertMarkov = 0, childChar = "|", parentChar = "^"):
322 """ 323 This method can modify a tree in three ways: 324 325 1. Convert a tree into its Chomsky Normal Form (CNF) 326 equivalent -- Every subtree has either two non-terminals 327 or one terminal as its children. This process requires 328 the creation of more"artificial" non-terminal nodes. 329 2. Markov (vertical) smoothing of children in new artificial 330 nodes 331 3. Horizontal (parent) annotation of nodes 332 333 @param factor: Right or left factoring method (default = "right") 334 @type factor: C{string} = [left|right] 335 @param horzMarkov: Markov order for sibling smoothing in artificial nodes (None (default) = include all siblings) 336 @type horzMarkov: C{int} | None 337 @param vertMarkov: Markov order for parent smoothing (0 (default) = no vertical annotation) 338 @type vertMarkov: C{int} | None 339 @param childChar: A string used in construction of the artificial nodes, separating the head of the 340 original subtree from the child nodes that have yet to be expanded (default = "|") 341 @type childChar: C{string} 342 @param parentChar: A string used to separate the node representation from its vertical annotation 343 @type parentChar: C{string} 344 """ 345 from treetransforms import chomsky_normal_form 346 chomsky_normal_form(self, factor, horzMarkov, vertMarkov, childChar, parentChar)
347
348 - def un_chomsky_normal_form(self, expandUnary = True, childChar = "|", parentChar = "^", unaryChar = "+"):
349 """ 350 This method modifies the tree in three ways: 351 352 1. Transforms a tree in Chomsky Normal Form back to its 353 original structure (branching greater than two) 354 2. Removes any parent annotation (if it exists) 355 3. (optional) expands unary subtrees (if previously 356 collapsed with collapseUnary(...) ) 357 358 @param expandUnary: Flag to expand unary or not (default = True) 359 @type expandUnary: C{boolean} 360 @param childChar: A string separating the head node from its children in an artificial node (default = "|") 361 @type childChar: C{string} 362 @param parentChar: A sting separating the node label from its parent annotation (default = "^") 363 @type parentChar: C{string} 364 @param unaryChar: A string joining two non-terminals in a unary production (default = "+") 365 @type unaryChar: C{string} 366 """ 367 from treetransforms import un_chomsky_normal_form 368 un_chomsky_normal_form(self, expandUnary, childChar, parentChar, unaryChar)
369
370 - def collapse_unary(self, collapsePOS = False, collapseRoot = False, joinChar = "+"):
371 """ 372 Collapse subtrees with a single child (ie. unary productions) 373 into a new non-terminal (Tree node) joined by 'joinChar'. 374 This is useful when working with algorithms that do not allow 375 unary productions, and completely removing the unary productions 376 would require loss of useful information. The Tree is modified 377 directly (since it is passed by reference) and no value is returned. 378 379 @param collapsePOS: 'False' (default) will not collapse the parent of leaf nodes (ie. 380 Part-of-Speech tags) since they are always unary productions 381 @type collapsePOS: C{boolean} 382 @param collapseRoot: 'False' (default) will not modify the root production 383 if it is unary. For the Penn WSJ treebank corpus, this corresponds 384 to the TOP -> productions. 385 @type collapseRoot: C{boolean} 386 @param joinChar: A string used to connect collapsed node values (default = "+") 387 @type joinChar: C{string} 388 """ 389 from treetransforms import collapse_unary 390 collapse_unary(self, collapsePOS, collapseRoot, joinChar)
391 392 #//////////////////////////////////////////////////////////// 393 # Convert, copy 394 #//////////////////////////////////////////////////////////// 395 396 # [classmethod]
397 - def convert(cls, val):
398 """ 399 Convert a tree between different subtypes of Tree. C{cls} determines 400 which class will be used to encode the new tree. 401 402 @type val: L{Tree} 403 @param val: The tree that should be converted. 404 @return: The new C{Tree}. 405 """ 406 if isinstance(val, Tree): 407 children = [cls.convert(child) for child in val] 408 return cls(val.node, children) 409 else: 410 return val
411 convert = classmethod(convert) 412
413 - def copy(self, deep=False):
414 if not deep: return self.__class__(self.node, self) 415 else: return self.__class__.convert(self)
416
417 - def _frozen_class(self): return ImmutableTree
418 - def freeze(self, leaf_freezer=None):
419 frozen_class = self._frozen_class() 420 if leaf_freezer is None: 421 newcopy = frozen_class.convert(self) 422 else: 423 newcopy = self.copy(deep=True) 424 for pos in newcopy.treepositions('leaves'): 425 newcopy[pos] = leaf_freezer(newcopy[pos]) 426 newcopy = frozen_class.convert(newcopy) 427 hash(newcopy) # Make sure the leaves are hashable. 428 return newcopy
429 430 #//////////////////////////////////////////////////////////// 431 # Parsing 432 #//////////////////////////////////////////////////////////// 433 434 @classmethod
435 - def parse(cls, s, brackets='()', parse_node=None, parse_leaf=None, 436 node_pattern=None, leaf_pattern=None, 437 remove_empty_top_bracketing=False):
438 """ 439 Parse a bracketed tree string and return the resulting tree. 440 Trees are represented as nested brackettings, such as:: 441 442 (S (NP (NNP John)) (VP (V runs))) 443 444 @type s: C{str} 445 @param s: The string to parse 446 447 @type brackets: length-2 C{str} 448 @param brackets: The bracket characters used to mark the 449 beginning and end of trees and subtrees. 450 451 @type parse_node: C{function} 452 @type parse_leaf: C{function} 453 @param parse_node, parse_leaf: If specified, these functions 454 are applied to the substrings of C{s} corresponding to 455 nodes and leaves (respectively) to obtain the values for 456 those nodes and leaves. They should have the following 457 signature: 458 459 >>> parse_node(str) -> value 460 461 For example, these functions could be used to parse nodes 462 and leaves whose values should be some type other than 463 string (such as L{FeatStruct <nltk.featstruct.FeatStruct>}). 464 Note that by default, node strings and leaf strings are 465 delimited by whitespace and brackets; to override this 466 default, use the C{node_pattern} and C{leaf_pattern} 467 arguments. 468 469 @type node_pattern: C{str} 470 @type leaf_pattern: C{str} 471 @param node_pattern, leaf_pattern: Regular expression patterns 472 used to find node and leaf substrings in C{s}. By 473 default, both nodes patterns are defined to match any 474 sequence of non-whitespace non-bracket characters. 475 476 @type remove_empty_top_bracketing: C{bool} 477 @param remove_empty_top_bracketing: If the resulting tree has 478 an empty node label, and is length one, then return its 479 single child instead. This is useful for treebank trees, 480 which sometimes contain an extra level of bracketing. 481 482 @return: A tree corresponding to the string representation C{s}. 483 If this class method is called using a subclass of C{Tree}, 484 then it will return a tree of that type. 485 @rtype: C{Tree} 486 """ 487 if not isinstance(brackets, basestring) or len(brackets) != 2: 488 raise TypeError('brackets must be a length-2 string') 489 if re.search('\s', brackets): 490 raise TypeError('whitespace brackets not allowed') 491 # Construct a regexp that will tokenize the string. 492 open_b, close_b = brackets 493 open_pattern, close_pattern = (re.escape(open_b), re.escape(close_b)) 494 if node_pattern is None: 495 node_pattern = '[^\s%s%s]+' % (open_pattern, close_pattern) 496 if leaf_pattern is None: 497 leaf_pattern = '[^\s%s%s]+' % (open_pattern, close_pattern) 498 token_re = re.compile('%s\s*(%s)?|%s|(%s)' % ( 499 open_pattern, node_pattern, close_pattern, leaf_pattern)) 500 # Walk through each token, updating a stack of trees. 501 stack = [(None, [])] # list of (node, children) tuples 502 for match in token_re.finditer(s): 503 token = match.group() 504 # Beginning of a tree/subtree 505 if token[0] == open_b: 506 if len(stack) == 1 and len(stack[0][1]) > 0: 507 cls._parse_error(s, match, 'end-of-string') 508 node = token[1:].lstrip() 509 if parse_node is not None: node = parse_node(node) 510 stack.append((node, [])) 511 # End of a tree/subtree 512 elif token == close_b: 513 if len(stack) == 1: 514 if len(stack[0][1]) == 0: 515 cls._parse_error(s, match, open_b) 516 else: 517 cls._parse_error(s, match, 'end-of-string') 518 node, children = stack.pop() 519 stack[-1][1].append(cls(node, children)) 520 # Leaf node 521 else: 522 if len(stack) == 1: 523 cls._parse_error(s, match, open_b) 524 if parse_leaf is not None: token = parse_leaf(token) 525 stack[-1][1].append(token) 526 527 # check that we got exactly one complete tree. 528 if len(stack) > 1: 529 cls._parse_error(s, 'end-of-string', close_b) 530 elif len(stack[0][1]) == 0: 531 cls._parse_error(s, 'end-of-string', open_b) 532 else: 533 assert stack[0][0] is None 534 assert len(stack[0][1]) == 1 535 tree = stack[0][1][0] 536 537 # If the tree has an extra level with node='', then get rid of 538 # it. E.g.: "((S (NP ...) (VP ...)))" 539 if remove_empty_top_bracketing and tree.node == '' and len(tree) == 1: 540 tree = tree[0] 541 # return the tree. 542 return tree
543 544 @classmethod
545 - def _parse_error(cls, s, match, expecting):
546 """ 547 Display a friendly error message when parsing a tree string fails. 548 @param s: The string we're parsing. 549 @param match: regexp match of the problem token. 550 @param expecting: what we expected to see instead. 551 """ 552 # Construct a basic error message 553 if match == 'end-of-string': 554 pos, token = len(s), 'end-of-string' 555 else: 556 pos, token = match.start(), match.group() 557 msg = '%s.parse(): expected %r but got %r\n%sat index %d.' % ( 558 cls.__name__, expecting, token, ' '*12, pos) 559 # Add a display showing the error token itsels: 560 s = s.replace('\n', ' ').replace('\t', ' ') 561 offset = pos 562 if len(s) > pos+10: 563 s = s[:pos+10]+'...' 564 if pos > 10: 565 s = '...'+s[pos-10:] 566 offset = 13 567 msg += '\n%s"%s"\n%s^' % (' '*16, s, ' '*(17+offset)) 568 raise ValueError(msg)
569 #//////////////////////////////////////////////////////////// 570 # Visualization & String Representation 571 #//////////////////////////////////////////////////////////// 572
573 - def draw(self):
574 """ 575 Open a new window containing a graphical diagram of this tree. 576 """ 577 from nltk.draw.tree import draw_trees 578 draw_trees(self)
579
580 - def __repr__(self):
581 childstr = ", ".join(repr(c) for c in self) 582 return '%s(%r, [%s])' % (self.__class__.__name__, self.node, childstr)
583
584 - def __str__(self):
585 return self.pprint()
586
587 - def pprint(self, margin=70, indent=0, nodesep='', parens='()', quotes=False):
588 """ 589 @return: A pretty-printed string representation of this tree. 590 @rtype: C{string} 591 @param margin: The right margin at which to do line-wrapping. 592 @type margin: C{int} 593 @param indent: The indentation level at which printing 594 begins. This number is used to decide how far to indent 595 subsequent lines. 596 @type indent: C{int} 597 @param nodesep: A string that is used to separate the node 598 from the children. E.g., the default value C{':'} gives 599 trees like C{(S: (NP: I) (VP: (V: saw) (NP: it)))}. 600 """ 601 602 # Try writing it on one line. 603 s = self._pprint_flat(nodesep, parens, quotes) 604 if len(s)+indent < margin: 605 return s 606 607 # If it doesn't fit on one line, then write it on multi-lines. 608 if isinstance(self.node, basestring): 609 s = '%s%s%s' % (parens[0], self.node, nodesep) 610 else: 611 s = '%s%r%s' % (parens[0], self.node, nodesep) 612 for child in self: 613 if isinstance(child, Tree): 614 s += '\n'+' '*(indent+2)+child.pprint(margin, indent+2, 615 nodesep, parens, quotes) 616 elif isinstance(child, tuple): 617 s += '\n'+' '*(indent+2)+ "/".join(child) 618 elif isinstance(child, str) and not quotes: 619 s += '\n'+' '*(indent+2)+ '%s' % child 620 else: 621 s += '\n'+' '*(indent+2)+ '%r' % child 622 return s+parens[1]
623
624 - def pprint_latex_qtree(self):
625 r""" 626 Returns a representation of the tree compatible with the 627 LaTeX qtree package. This consists of the string C{\Tree} 628 followed by the parse tree represented in bracketed notation. 629 630 For example, the following result was generated from a parse tree of 631 the sentence C{The announcement astounded us}:: 632 633 \Tree [.I'' [.N'' [.D The ] [.N' [.N announcement ] ] ] 634 [.I' [.V'' [.V' [.V astounded ] [.N'' [.N' [.N us ] ] ] ] ] ] ] 635 636 See U{http://www.ling.upenn.edu/advice/latex.html} for the LaTeX 637 style file for the qtree package. 638 639 @return: A latex qtree representation of this tree. 640 @rtype: C{string} 641 """ 642 return r'\Tree ' + self.pprint(indent=6, nodesep='', parens=('[.', ' ]'))
643
644 - def _pprint_flat(self, nodesep, parens, quotes):
645 childstrs = [] 646 for child in self: 647 if isinstance(child, Tree): 648 childstrs.append(child._pprint_flat(nodesep, parens, quotes)) 649 elif isinstance(child, tuple): 650 childstrs.append("/".join(child)) 651 elif isinstance(child, str) and not quotes: 652 childstrs.append('%s' % child) 653 else: 654 childstrs.append('%r' % child) 655 if isinstance(self.node, basestring): 656 return '%s%s%s %s%s' % (parens[0], self.node, nodesep, 657 string.join(childstrs), parens[1]) 658 else: 659 return '%s%r%s %s%s' % (parens[0], self.node, nodesep, 660 string.join(childstrs), parens[1])
661
662 -class ImmutableTree(Tree):
663 - def __init__(self, node_or_str, children=None):
664 if children is None: return # see note in Tree.__init__() 665 super(ImmutableTree, self).__init__(node_or_str, children) 666 # Precompute our hash value. This ensures that we're really 667 # immutable. It also means we only have to calculate it once. 668 try: 669 self._hash = hash( (self.node, tuple(self)) ) 670 except (TypeError, ValueError): 671 raise ValueError("ImmutableTree's node value and children " 672 "must be immutable")
673 - def __setitem__(self):
674 raise ValueError, 'ImmutableTrees may not be modified'
675 - def __setslice__(self):
676 raise ValueError, 'ImmutableTrees may not be modified'
677 - def __delitem__(self):
678 raise ValueError, 'ImmutableTrees may not be modified'
679 - def __delslice__(self):
680 raise ValueError, 'ImmutableTrees may not be modified'
681 - def __iadd__(self):
682 raise ValueError, 'ImmutableTrees may not be modified'
683 - def __imul__(self):
684 raise ValueError, 'ImmutableTrees may not be modified'
685 - def append(self, v):
686 raise ValueError, 'ImmutableTrees may not be modified'
687 - def extend(self, v):
688 raise ValueError, 'ImmutableTrees may not be modified'
689 - def pop(self, v=None):
690 raise ValueError, 'ImmutableTrees may not be modified'
691 - def remove(self, v):
692 raise ValueError, 'ImmutableTrees may not be modified'
693 - def reverse(self):
694 raise ValueError, 'ImmutableTrees may not be modified'
695 - def sort(self):
696 raise ValueError, 'ImmutableTrees may not be modified'
697 - def __hash__(self):
698 return self._hash
699
700 - def _set_node(self, node):
701 """Set self._node. This will only succeed the first time the 702 node value is set, which should occur in Tree.__init__().""" 703 if hasattr(self, 'node'): 704 raise ValueError, 'ImmutableTrees may not be modified' 705 self._node = node
706 - def _get_node(self):
707 return self._node
708 node = property(_get_node, _set_node)
709 710 ###################################################################### 711 ## Parented trees 712 ###################################################################### 713
714 -class AbstractParentedTree(Tree):
715 """ 716 An abstract base class for L{Tree}s that automatically maintain 717 pointers to their parents. These parent pointers are updated 718 whenever any change is made to a tree's structure. Two subclasses 719 are currently defined: 720 721 - L{ParentedTree} is used for tree structures where each subtree 722 has at most one parent. This class should be used in cases 723 where there is no"sharing" of subtrees. 724 725 - L{MultiParentedTree} is used for tree structures where a 726 subtree may have zero or more parents. This class should be 727 used in cases where subtrees may be shared. 728 729 Subclassing 730 =========== 731 The C{AbstractParentedTree} class redefines all operations that 732 modify a tree's structure to call two methods, which are used by 733 subclasses to update parent information: 734 735 - L{_setparent()} is called whenever a new child is added. 736 - L{_delparent()} is called whenever a child is removed. 737 """
738 - def __init__(self, node_or_str, children=None):
739 if children is None: return # see note in Tree.__init__() 740 super(AbstractParentedTree, self).__init__(node_or_str, children) 741 # iterate over self, and *not* children, because children 742 # might be an iterator. 743 for i, child in enumerate(self): 744 if isinstance(child, Tree): 745 self._setparent(child, i, dry_run=True) 746 for i, child in enumerate(self): 747 if isinstance(child, Tree): 748 self._setparent(child, i)
749 750 #//////////////////////////////////////////////////////////// 751 # Parent management 752 #//////////////////////////////////////////////////////////// 753
754 - def _setparent(self, child, index, dry_run=False):
755 """ 756 Update C{child}'s parent pointer to point to self. This 757 method is only called if C{child}'s type is L{Tree}; i.e., it 758 is not called when adding a leaf to a tree. This method is 759 always called before the child is actually added to C{self}'s 760 child list. 761 762 @type child: L{Tree} 763 @type index: C{int} 764 @param index: The index of C{child} in C{self}. 765 @raise TypeError: If C{child} is a tree with an impropriate 766 type. Typically, if C{child} is a tree, then its type needs 767 to match C{self}'s type. This prevents mixing of 768 different tree types (single-parented, multi-parented, and 769 non-parented). 770 @param dry_run: If true, the don't actually set the child's 771 parent pointer; just check for any error conditions, and 772 raise an exception if one is found. 773 """ 774 raise AssertionError('Abstract base class')
775
776 - def _delparent(self, child, index):
777 """ 778 Update C{child}'s parent pointer to not point to self. This 779 method is only called if C{child}'s type is L{Tree}; i.e., it 780 is not called when removing a leaf from a tree. This method 781 is always called before the child is actually removed from 782 C{self}'s child list. 783 784 @type child: L{Tree} 785 @type index: C{int} 786 @param index: The index of C{child} in C{self}. 787 """ 788 raise AssertionError('Abstract base class')
789 790 #//////////////////////////////////////////////////////////// 791 # Methods that add/remove children 792 #//////////////////////////////////////////////////////////// 793 # Every method that adds or removes a child must make 794 # appropriate calls to _setparent() and _delparent(). 795
796 - def __delitem__(self, index):
797 # del ptree[start:stop] 798 if isinstance(index, slice): 799 start, stop = slice_bounds(self, index) 800 # Clear all the children pointers. 801 for i in xrange(start, stop): 802 if isinstance(self[i], Tree): 803 self._delparent(self[i], i) 804 # Delete the children from our child list. 805 super(AbstractParentedTree, self).__delitem__(index) 806 807 # del ptree[i] 808 elif isinstance(index, int): 809 if index < 0: index += len(self) 810 if index < 0: raise IndexError('index out of range') 811 # Clear the child's parent pointer. 812 if isinstance(self[index], Tree): 813 self._delparent(self[index], index) 814 # Remove the child from our child list. 815 super(AbstractParentedTree, self).__delitem__(index) 816 817 # del ptree[()] 818 elif len(index) == 0: 819 raise IndexError('The tree position () may not be deleted.') 820 821 # del ptree[(i,)] 822 elif len(index) == 1: 823 del self[index[0]] 824 825 # del ptree[i1, i2, i3] 826 else: 827 del self[index[0]][index[1:]]
828
829 - def __setitem__(self, index, value):
830 # ptree[start:stop] = value 831 if isinstance(index, slice): 832 start, stop = slice_bounds(self, index) 833 # make a copy of value, in case it's an iterator 834 if not isinstance(value, (list, tuple)): 835 value = list(value) 836 # Check for any error conditions, so we can avoid ending 837 # up in an inconsistent state if an error does occur. 838 for i, child in enumerate(value): 839 if isinstance(child, Tree): 840 self._setparent(child, start+i, dry_run=True) 841 # clear the child pointers of all parents we're removing 842 for i in xrange(start, stop): 843 if isinstance(self[i], Tree): 844 self._delparent(self[i], i) 845 # set the child pointers of the new children. We do this 846 # after clearing *all* child pointers, in case we're e.g. 847 # reversing the elements in a tree. 848 for i, child in enumerate(value): 849 if isinstance(child, Tree): 850 self._setparent(child, start+i) 851 # finally, update the content of the child list itself. 852 super(AbstractParentedTree, self).__setitem__(index, value) 853 854 # ptree[i] = value 855 elif isinstance(index, int): 856 if index < 0: index += len(self) 857 if index < 0: raise IndexError('index out of range') 858 # if the value is not changing, do nothing. 859 if value is self[index]: 860 return 861 # Set the new child's parent pointer. 862 if isinstance(value, Tree): 863 self._setparent(value, index) 864 # Remove the old child's parent pointer 865 if isinstance(self[index], Tree): 866 self._delparent(self[index], index) 867 # Update our child list. 868 super(AbstractParentedTree, self).__setitem__(index, value) 869 870 # ptree[()] = value 871 elif len(index) == 0: 872 raise IndexError('The tree position () may not be assigned to.') 873 874 # ptree[(i,)] = value 875 elif len(index) == 1: 876 self[index[0]] = value 877 878 # ptree[i1, i2, i3] = value 879 else: 880 self[index[0]][index[1:]] = value
881
882 - def append(self, child):
883 if isinstance(child, Tree): 884 self._setparent(child, len(self)) 885 super(AbstractParentedTree, self).append(child)
886
887 - def extend(self, children):
888 for child in children: 889 if isinstance(child, Tree): 890 self._setparent(child, len(self)) 891 super(AbstractParentedTree, self).append(child)
892
893 - def insert(self, index, child):
894 # Handle negative indexes. Note that if index < -len(self), 895 # we do *not* raise an IndexError, unlike __getitem__. This 896 # is done for consistency with list.__getitem__ and list.index. 897 if index < 0: index += len(self) 898 if index < 0: index = 0 899 # Set the child's parent, and update our child list. 900 if isinstance(child, Tree): 901 self._setparent(child, index) 902 super(AbstractParentedTree, self).insert(index, child)
903
904 - def pop(self, index=-1):
905 if index < 0: index += len(self) 906 if index < 0: raise IndexError('index out of range') 907 if isinstance(self[index], Tree): 908 self._delparent(self[index], index) 909 return super(AbstractParentedTree, self).pop(index)
910 911 # n.b.: like `list`, this is done by equality, not identity! 912 # To remove a specific child, use del ptree[i].
913 - def remove(self, child):
914 index = self.index(child) 915 if isinstance(self[index], Tree): 916 self._delparent(self[index], index) 917 super(AbstractParentedTree, self).remove(child)
918 919 # We need to implement __getslice__ and friends, even though 920 # they're deprecated, because otherwise list.__getslice__ will get 921 # called (since we're subclassing from list). Just delegate to 922 # __getitem__ etc., but use max(0, start) and max(0, stop) because 923 # because negative indices are already handled *before* 924 # __getslice__ is called; and we don't want to double-count them. 925 if hasattr(list, '__getslice__'):
926 - def __getslice__(self, start, stop):
927 return self.__getitem__(slice(max(0, start), max(0, stop)))
928 - def __delslice__(self, start, stop):
929 return self.__delitem__(slice(max(0, start), max(0, stop)))
930 - def __setslice__(self, start, stop, value):
931 return self.__setitem__(slice(max(0, start), max(0, stop)), value)
932
933 -class ParentedTree(AbstractParentedTree):
934 """ 935 A L{Tree} that automatically maintains parent pointers for 936 single-parented trees. The following read-only property values 937 are automatically updated whenever the structure of a parented 938 tree is modified: L{parent}, L{parent_index}, L{left_sibling}, 939 L{right_sibling}, L{root}, L{treeposition}. 940 941 Each C{ParentedTree} may have at most one parent. In 942 particular, subtrees may not be shared. Any attempt to reuse a 943 single C{ParentedTree} as a child of more than one parent (or 944 as multiple children of the same parent) will cause a 945 C{ValueError} exception to be raised. 946 947 C{ParentedTrees} should never be used in the same tree as C{Trees} 948 or C{MultiParentedTrees}. Mixing tree implementations may result 949 in incorrect parent pointers and in C{TypeError} exceptions. 950 """
951 - def __init__(self, node_or_str, children=None):
952 if children is None: return # see note in Tree.__init__() 953 954 self._parent = None 955 """The parent of this Tree, or C{None} if it has no parent.""" 956 957 super(ParentedTree, self).__init__(node_or_str, children)
958
959 - def _frozen_class(self): return ImmutableParentedTree
960 961 #///////////////////////////////////////////////////////////////// 962 # Properties 963 #///////////////////////////////////////////////////////////////// 964
965 - def _get_parent_index(self):
966 if self._parent is None: return None 967 for i, child in enumerate(self._parent): 968 if child is self: return i 969 assert False, 'expected to find self in self._parent!'
970
971 - def _get_left_sibling(self):
972 parent_index = self._get_parent_index() 973 if self._parent and parent_index > 0: 974 return self._parent[parent_index-1] 975 return None # no left sibling
976
977 - def _get_right_sibling(self):
978 parent_index = self._get_parent_index() 979 if self._parent and parent_index < (len(self._parent)-1): 980 return self._parent[parent_index+1] 981 return None # no right sibling
982
983 - def _get_treeposition(self):
984 if self._parent is None: return () 985 else: return (self._parent._get_treeposition() + 986 (self._get_parent_index(),))
987
988 - def _get_root(self):
989 if self._parent is None: return self 990 else: return self._parent._get_root()
991 992 parent = property(lambda self: self._parent, doc=""" 993 The parent of this tree, or C{None} if it has no parent.""") 994 995 parent_index = property(_get_parent_index, doc=""" 996 The index of this tree in its parent. I.e., 997 C{ptree.parent[ptree.parent_index] is ptree}. Note that 998 C{ptree.parent_index} is not necessarily equal to 999 C{ptree.parent.index(ptree)}, since the C{index()} method 1000 returns the first child that is I{equal} to its argument.""") 1001 1002 left_sibling = property(_get_left_sibling, doc=""" 1003 The left sibling of this tree, or C{None} if it has none.""") 1004 1005 right_sibling = property(_get_right_sibling, doc=""" 1006 The right sibling of this tree, or C{None} if it has none.""") 1007 1008 root = property(_get_root, doc=""" 1009 The root of this tree. I.e., the unique ancestor of this tree 1010 whose parent is C{None}. If C{ptree.parent} is C{None}, then 1011 C{ptree} is its own root.""") 1012 1013 treeposition = property(_get_treeposition, doc=""" 1014 The tree position of this tree, relative to the root of the 1015 tree. I.e., C{ptree.root[ptree.treeposition] is ptree}.""") 1016 treepos = treeposition # [xx] alias -- which name should we use? 1017 1018 #///////////////////////////////////////////////////////////////// 1019 # Parent Management 1020 #///////////////////////////////////////////////////////////////// 1021
1022 - def _delparent(self, child, index):
1023 # Sanity checks 1024 assert isinstance(child, ParentedTree) 1025 assert self[index] is child 1026 assert child._parent is self 1027 1028 # Delete child's parent pointer. 1029 child._parent = None
1030
1031 - def _setparent(self, child, index, dry_run=False):
1032 # If the child's type is incorrect, then complain. 1033 if not isinstance(child, ParentedTree): 1034 raise TypeError('Can not insert a non-ParentedTree '+ 1035 'into a ParentedTree') 1036 1037 # If child already has a parent, then complain. 1038 if child._parent is not None: 1039 raise ValueError('Can not insert a subtree that already ' 1040 'has a parent.') 1041 1042 # Set child's parent pointer & index. 1043 if not dry_run: 1044 child._parent = self
1045
1046 -class MultiParentedTree(AbstractParentedTree):
1047 """ 1048 A L{Tree} that automatically maintains parent pointers for 1049 multi-parented trees. The following read-only property values are 1050 automatically updated whenever the structure of a multi-parented 1051 tree is modified: L{parents}, L{parent_indices}, L{left_siblings}, 1052 L{right_siblings}, L{roots}, L{treepositions}. 1053 1054 Each C{MultiParentedTree} may have zero or more parents. In 1055 particular, subtrees may be shared. If a single 1056 C{MultiParentedTree} is used as multiple children of the same 1057 parent, then that parent will appear multiple times in its 1058 C{parents} property. 1059 1060 C{MultiParentedTrees} should never be used in the same tree as 1061 C{Trees} or C{ParentedTrees}. Mixing tree implementations may 1062 result in incorrect parent pointers and in C{TypeError} exceptions. 1063 """
1064 - def __init__(self, node_or_str, children=None):
1065 if children is None: return # see note in Tree.__init__() 1066 1067 self._parents = [] 1068 """A list of this tree's parents. This list should not 1069 contain duplicates, even if a parent contains this tree 1070 multiple times.""" 1071 1072 super(MultiParentedTree, self).__init__(node_or_str, children)
1073 1075 1076 #///////////////////////////////////////////////////////////////// 1077 # Properties 1078 #///////////////////////////////////////////////////////////////// 1079
1080 - def _get_parent_indices(self):
1081 return [(parent, index) 1082 for parent in self._parents 1083 for index, child in enumerate(parent) 1084 if child is self]
1085
1086 - def _get_left_siblings(self):
1087 return [parent[index-1] 1088 for (parent, index) in self._get_parent_indices() 1089 if index > 0]
1090
1091 - def _get_right_siblings(self):
1092 return [parent[index+1] 1093 for (parent, index) in self._get_parent_indices() 1094 if index < (len(parent)-1)]
1095
1096 - def _get_roots(self):
1097 return self._get_roots_helper({}).values()
1098
1099 - def _get_roots_helper(self, result):
1100 if self._parents: 1101 for parent in self._parents: 1102 parent._get_roots_helper(result) 1103 else: 1104 result[id(self)] = self 1105 return result
1106 1107 parents = property(lambda self: list(self._parents), doc=""" 1108 The set of parents of this tree. If this tree has no parents, 1109 then C{parents} is the empty set. To check if a tree is used 1110 as multiple children of the same parent, use the 1111 L{parent_indices} property. 1112 1113 @type: C{list} of L{MultiParentedTree}""") 1114 1115 left_siblings = property(_get_left_siblings, doc=""" 1116 A list of all left siblings of this tree, in any of its parent 1117 trees. A tree may be its own left sibling if it is used as 1118 multiple contiguous children of the same parent. A tree may 1119 appear multiple times in this list if it is the left sibling 1120 of this tree with respect to multiple parents. 1121 1122 @type: C{list} of L{MultiParentedTree}""") 1123 1124 right_siblings = property(_get_right_siblings, doc=""" 1125 A list of all right siblings of this tree, in any of its parent 1126 trees. A tree may be its own right sibling if it is used as 1127 multiple contiguous children of the same parent. A tree may 1128 appear multiple times in this list if it is the right sibling 1129 of this tree with respect to multiple parents. 1130 1131 @type: C{list} of L{MultiParentedTree}""") 1132 1133 roots = property(_get_roots, doc=""" 1134 The set of all roots of this tree. This set is formed by 1135 tracing all possible parent paths until trees with no parents 1136 are found. 1137 1138 @type: C{list} of L{MultiParentedTree}""") 1139
1140 - def parent_indices(self, parent):
1141 """ 1142 Return a list of the indices where this tree occurs as a child 1143 of C{parent}. If this child does not occur as a child of 1144 C{parent}, then the empty list is returned. The following is 1145 always true:: 1146 1147 for parent_index in ptree.parent_indices(parent): 1148 parent[parent_index] is ptree 1149 """ 1150 if parent not in self._parents: return [] 1151 else: return [index for (index, child) in enumerate(parent) 1152 if child is self]
1153
1154 - def treepositions(self, root):
1155 """ 1156 Return a list of all tree positions that can be used to reach 1157 this multi-parented tree starting from C{root}. I.e., the 1158 following is always true:: 1159 1160 for treepos in ptree.treepositions(root): 1161 root[treepos] is ptree 1162 """ 1163 if self is root: 1164 return [()] 1165 else: 1166 return [treepos+(index,) 1167 for parent in self._parents 1168 for treepos in parent.treepositions(root) 1169 for (index, child) in enumerate(parent) if child is self]
1170 1171 1172 #///////////////////////////////////////////////////////////////// 1173 # Parent Management 1174 #///////////////////////////////////////////////////////////////// 1175
1176 - def _delparent(self, child, index):
1177 # Sanity checks 1178 assert isinstance(child, MultiParentedTree) 1179 assert self[index] is child 1180 assert len([p for p in child._parents if p is self]) == 1 1181 1182 # If the only copy of child in self is at index, then delete 1183 # self from child's parent list. 1184 for i, c in enumerate(self): 1185 if c is child and i != index: break 1186 else: 1187 child._parents.remove(self)
1188
1189 - def _setparent(self, child, index, dry_run=False):
1190 # If the child's type is incorrect, then complain. 1191 if not isinstance(child, MultiParentedTree): 1192 raise TypeError('Can not insert a non-MultiParentedTree '+ 1193 'into a MultiParentedTree') 1194 1195 # Add self as a parent pointer if it's not already listed. 1196 if not dry_run: 1197 for parent in child._parents: 1198 if parent is self: break 1199 else: 1200 child._parents.append(self)
1201
1202 -class ImmutableParentedTree(ImmutableTree, ParentedTree):
1203 - def __init__(self, node_or_str, children=None):
1204 if children is None: return # see note in Tree.__init__() 1205 super(ImmutableParentedTree, self).__init__(node_or_str, children)
1206
1207 -class ImmutableMultiParentedTree(ImmutableTree, MultiParentedTree):
1208 - def __init__(self, node_or_str, children=None):
1209 if children is None: return # see note in Tree.__init__() 1210 super(ImmutableMultiParentedTree, self).__init__(node_or_str, children)
1211 1212 ###################################################################### 1213 ## Probabilistic trees 1214 ######################################################################
1215 -class ProbabilisticTree(Tree, ProbabilisticMixIn):
1216 - def __new__(cls, node_or_str, children=None, **prob_kwargs):
1217 return super(ProbabilisticTree, cls).__new__( 1218 cls, node_or_str, children)
1219 - def __init__(self, node_or_str, children=None, **prob_kwargs):
1220 if children is None: return # see note in Tree.__init__() 1221 Tree.__init__(self, node_or_str, children) 1222 ProbabilisticMixIn.__init__(self, **prob_kwargs)
1223 1224 # We have to patch up these methods to make them work right:
1226 - def __repr__(self):
1227 return '%s (p=%s)' % (Tree.__repr__(self), self.prob())
1228 - def __str__(self):
1229 return '%s (p=%s)' % (self.pprint(margin=60), self.prob())
1230 - def __cmp__(self, other):
1231 c = Tree.__cmp__(self, other) 1232 if c != 0: return c 1233 return cmp(self.prob(), other.prob())
1234 - def __eq__(self, other):
1235 if not isinstance(other, Tree): return False 1236 return Tree.__eq__(self, other) and self.prob()==other.prob()
1237 - def __ne__(self, other):
1238 return not (self == other)
1239 - def copy(self, deep=False):
1240 if not deep: return self.__class__(self.node, self, prob=self.prob()) 1241 else: return self.__class__.convert(self)
1242 - def convert(cls, val):
1243 if isinstance(val, Tree): 1244 children = [cls.convert(child) for child in val] 1245 if isinstance(val, ProbabilisticMixIn): 1246 return cls(val.node, children, prob=val.prob()) 1247 else: 1248 return cls(val.node, children, prob=1.0) 1249 else: 1250 return val
1251 convert = classmethod(convert)
1252
1253 -class ImmutableProbabilisticTree(ImmutableTree, ProbabilisticMixIn):
1254 - def __new__(cls, node_or_str, children=None, **prob_kwargs):
1255 return super(ImmutableProbabilisticTree, cls).__new__( 1256 cls, node_or_str, children)
1257 - def __init__(self, node_or_str, children=None, **prob_kwargs):
1258 if children is None: return # see note in Tree.__init__() 1259 ImmutableTree.__init__(self, node_or_str, children) 1260 ProbabilisticMixIn.__init__(self, **prob_kwargs)
1261 1262 # We have to patch up these methods to make them work right:
1264 - def __repr__(self):
1265 return '%s [%s]' % (Tree.__repr__(self), self.prob())
1266 - def __str__(self):
1267 return '%s [%s]' % (self.pprint(margin=60), self.prob())
1268 - def __cmp__(self, other):
1269 c = Tree.__cmp__(self, other) 1270 if c != 0: return c 1271 return cmp(self.prob(), other.prob())
1272 - def __eq__(self, other):
1273 if not isinstance(other, Tree): return False 1274 return Tree.__eq__(self, other) and self.prob()==other.prob()
1275 - def __ne__(self, other):
1276 return not (self == other)
1277 - def copy(self, deep=False):
1278 if not deep: return self.__class__(self.node, self, prob=self.prob()) 1279 else: return self.__class__.convert(self)
1280 - def convert(cls, val):
1281 if isinstance(val, Tree): 1282 children = [cls.convert(child) for child in val] 1283 if isinstance(val, ProbabilisticMixIn): 1284 return cls(val.node, children, prob=val.prob()) 1285 else: 1286 return cls(val.node, children, prob=1) 1287 else: 1288 return val
1289 convert = classmethod(convert)
1290 1291
1292 -def _child_names(tree):
1293 names = [] 1294 for child in tree: 1295 if isinstance(child, Tree): 1296 names.append(nltk.cfg.Nonterminal(child.node)) 1297 else: 1298 names.append(child) 1299 return names
1300 1301 ###################################################################### 1302 ## Parsing 1303 ###################################################################### 1304 1305 # We should consider deprecating this function: 1306 #@deprecated('Use Tree.parse(s, remove_top_empty_bracketing=True) instead.')
1307 -def bracket_parse(s):
1308 """ 1309 Parse a treebank string and return a tree. Trees are represented 1310 as nested brackettings, e.g. (S (NP (NNP John)) (VP (V runs))). 1311 1312 @return: A tree corresponding to the string representation. 1313 @rtype: C{tree} 1314 @param s: The string to be converted 1315 @type s: C{string} 1316 """ 1317 return Tree.parse(s, remove_empty_top_bracketing=True)
1318
1319 -def sinica_parse(s):
1320 """ 1321 Parse a Sinica Treebank string and return a tree. Trees are represented as nested brackettings, 1322 as shown in the following example (X represents a Chinese character): 1323 S(goal:NP(Head:Nep:XX)|theme:NP(Head:Nhaa:X)|quantity:Dab:X|Head:VL2:X)#0(PERIODCATEGORY) 1324 1325 @return: A tree corresponding to the string representation. 1326 @rtype: C{tree} 1327 @param s: The string to be converted 1328 @type s: C{string} 1329 """ 1330 tokens = re.split(r'([()| ])', s) 1331 for i in range(len(tokens)): 1332 if tokens[i] == '(': 1333 tokens[i-1], tokens[i] = tokens[i], tokens[i-1] # pull nonterminal inside parens 1334 elif ':' in tokens[i]: 1335 fields = tokens[i].split(':') 1336 if len(fields) == 2: # non-terminal 1337 tokens[i] = fields[1] 1338 else: 1339 tokens[i] = "(" + fields[-2] + " " + fields[-1] + ")" 1340 elif tokens[i] == '|': 1341 tokens[i] = '' 1342 1343 treebank_string = string.join(tokens) 1344 return bracket_parse(treebank_string)
1345 1346 # s = re.sub(r'^#[^\s]*\s', '', s) # remove leading identifier 1347 # s = re.sub(r'\w+:', '', s) # remove role tags 1348 1349 # return s 1350 1351 ###################################################################### 1352 ## Demonstration 1353 ###################################################################### 1354
1355 -def demo():
1356 """ 1357 A demonstration showing how C{Tree}s and C{Tree}s can be 1358 used. This demonstration creates a C{Tree}, and loads a 1359 C{Tree} from the L{treebank<nltk.corpus.treebank>} corpus, 1360 and shows the results of calling several of their methods. 1361 """ 1362 1363 from nltk import tree 1364 1365 # Demonstrate tree parsing. 1366 s = '(S (NP (DT the) (NN cat)) (VP (VBD ate) (NP (DT a) (NN cookie))))' 1367 t = tree.bracket_parse(s) 1368 print "Convert bracketed string into tree:" 1369 print t 1370 print t.__repr__() 1371 1372 print "Display tree properties:" 1373 print t.node # tree's constituent type 1374 print t[0] # tree's first child 1375 print t[1] # tree's second child 1376 print t.height() 1377 print t.leaves() 1378 print t[1] 1379 print t[1,1] 1380 print t[1,1,0] 1381 1382 # Demonstrate tree modification. 1383 the_cat = t[0] 1384 the_cat.insert(1, tree.bracket_parse('(JJ big)')) 1385 print "Tree modification:" 1386 print t 1387 t[1,1,1] = tree.bracket_parse('(NN cake)') 1388 print t 1389 print 1390 1391 # Tree transforms 1392 print "Collapse unary:" 1393 t.collapse_unary() 1394 print t 1395 print "Chomsky normal form:" 1396 t.chomsky_normal_form() 1397 print t 1398 print 1399 1400 # Demonstrate probabilistic trees. 1401 pt = tree.ProbabilisticTree('x', ['y', 'z'], prob=0.5) 1402 print "Probabilistic Tree:" 1403 print pt 1404 print 1405 1406 # Demonstrate parsing of treebank output format. 1407 t = tree.bracket_parse(t.pprint()) 1408 print "Convert tree to bracketed string and back again:" 1409 print t 1410 print 1411 1412 # Demonstrate LaTeX output 1413 print "LaTeX output:" 1414 print t.pprint_latex_qtree() 1415 print 1416 1417 # Demonstrate Productions 1418 print "Production output:" 1419 print t.productions() 1420 print 1421 1422 # Demonstrate tree nodes containing objects other than strings 1423 t.node = ('test', 3) 1424 print t
1425 1426 if __name__ == '__main__': 1427 demo() 1428 1429 __all__ = ['ImmutableProbabilisticTree', 'ImmutableTree', 'ProbabilisticMixIn', 1430 'ProbabilisticTree', 'Tree', 'bracket_parse', 'demo', 1431 'sinica_parse', 'ParentedTree', 'MultiParentedTree', 1432 'ImmutableParentedTree', 'ImmutableMultiParentedTree'] 1433