Package nltk :: Package etree :: Module ElementTree
[hide private]
[frames | no frames]

Source Code for Module nltk.etree.ElementTree

   1  # 
   2  # ElementTree 
   3  # $Id: ElementTree.py 2326 2005-03-17 07:45:21Z fredrik $ 
   4  # 
   5  # light-weight XML support for Python 1.5.2 and later. 
   6  # 
   7  # history: 
   8  # 2001-10-20 fl   created (from various sources) 
   9  # 2001-11-01 fl   return root from parse method 
  10  # 2002-02-16 fl   sort attributes in lexical order 
  11  # 2002-04-06 fl   TreeBuilder refactoring, added PythonDoc markup 
  12  # 2002-05-01 fl   finished TreeBuilder refactoring 
  13  # 2002-07-14 fl   added basic namespace support to ElementTree.write 
  14  # 2002-07-25 fl   added QName attribute support 
  15  # 2002-10-20 fl   fixed encoding in write 
  16  # 2002-11-24 fl   changed default encoding to ascii; fixed attribute encoding 
  17  # 2002-11-27 fl   accept file objects or file names for parse/write 
  18  # 2002-12-04 fl   moved XMLTreeBuilder back to this module 
  19  # 2003-01-11 fl   fixed entity encoding glitch for us-ascii 
  20  # 2003-02-13 fl   added XML literal factory 
  21  # 2003-02-21 fl   added ProcessingInstruction/PI factory 
  22  # 2003-05-11 fl   added tostring/fromstring helpers 
  23  # 2003-05-26 fl   added ElementPath support 
  24  # 2003-07-05 fl   added makeelement factory method 
  25  # 2003-07-28 fl   added more well-known namespace prefixes 
  26  # 2003-08-15 fl   fixed typo in ElementTree.findtext (Thomas Dartsch) 
  27  # 2003-09-04 fl   fall back on emulator if ElementPath is not installed 
  28  # 2003-10-31 fl   markup updates 
  29  # 2003-11-15 fl   fixed nested namespace bug 
  30  # 2004-03-28 fl   added XMLID helper 
  31  # 2004-06-02 fl   added default support to findtext 
  32  # 2004-06-08 fl   fixed encoding of non-ascii element/attribute names 
  33  # 2004-08-23 fl   take advantage of post-2.1 expat features 
  34  # 2005-02-01 fl   added iterparse implementation 
  35  # 2005-03-02 fl   fixed iterparse support for pre-2.2 versions 
  36  # 
  37  # Copyright (c) 1999-2005 by Fredrik Lundh.  All rights reserved. 
  38  # 
  39  # [email protected] 
  40  # http://www.pythonware.com 
  41  # 
  42  # -------------------------------------------------------------------- 
  43  # The ElementTree toolkit is 
  44  # 
  45  # Copyright (c) 1999-2005 by Fredrik Lundh 
  46  # 
  47  # By obtaining, using, and/or copying this software and/or its 
  48  # associated documentation, you agree that you have read, understood, 
  49  # and will comply with the following terms and conditions: 
  50  # 
  51  # Permission to use, copy, modify, and distribute this software and 
  52  # its associated documentation for any purpose and without fee is 
  53  # hereby granted, provided that the above copyright notice appears in 
  54  # all copies, and that both that copyright notice and this permission 
  55  # notice appear in supporting documentation, and that the name of 
  56  # Secret Labs AB or the author not be used in advertising or publicity 
  57  # pertaining to distribution of the software without specific, written 
  58  # prior permission. 
  59  # 
  60  # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD 
  61  # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT- 
  62  # ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR 
  63  # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY 
  64  # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, 
  65  # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS 
  66  # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
  67  # OF THIS SOFTWARE. 
  68  # -------------------------------------------------------------------- 
  69   
  70  # Licensed to PSF under a Contributor Agreement. 
  71  # See http://www.python.org/2.4/license for licensing details. 
  72   
  73  __all__ = [ 
  74      # public symbols 
  75      "Comment", 
  76      "dump", 
  77      "Element", "ElementTree", 
  78      "fromstring", 
  79      "iselement", "iterparse", 
  80      "parse", 
  81      "PI", "ProcessingInstruction", 
  82      "QName", 
  83      "SubElement", 
  84      "tostring", 
  85      "TreeBuilder", 
  86      "VERSION", "XML", 
  87      "XMLParser", "XMLTreeBuilder", 
  88      ] 
  89   
  90  ## 
  91  # The <b>Element</b> type is a flexible container object, designed to 
  92  # store hierarchical data structures in memory. The type can be 
  93  # described as a cross between a list and a dictionary. 
  94  # <p> 
  95  # Each element has a number of properties associated with it: 
  96  # <ul> 
  97  # <li>a <i>tag</i>. This is a string identifying what kind of data 
  98  # this element represents (the element type, in other words).</li> 
  99  # <li>a number of <i>attributes</i>, stored in a Python dictionary.</li> 
 100  # <li>a <i>text</i> string.</li> 
 101  # <li>an optional <i>tail</i> string.</li> 
 102  # <li>a number of <i>child elements</i>, stored in a Python sequence</li> 
 103  # </ul> 
 104  # 
 105  # To create an element instance, use the {@link #Element} or {@link 
 106  # #SubElement} factory functions. 
 107  # <p> 
 108  # The {@link #ElementTree} class can be used to wrap an element 
 109  # structure, and convert it from and to XML. 
 110  ## 
 111   
 112  import string, sys, re 
 113   
class _SimpleElementPath:
    # Minimal stand-in for the ElementPath module: reproduces the
    # pre-1.2 find/findtext/findall behaviour, which only matches
    # direct children by exact tag (plus the ".//tag" shortcut).

    def find(self, element, tag):
        # first direct child whose tag matches, or None
        for child in element:
            if child.tag != tag:
                continue
            return child
        return None

    def findtext(self, element, tag, default=None):
        # text of the first matching direct child; an element without
        # text yields "", a missing element yields the default
        match = self.find(element, tag)
        if match is None:
            return default
        return match.text or ""

    def findall(self, element, tag):
        # ".//tag" searches the whole subtree via getiterator
        if tag[:3] == ".//":
            return element.getiterator(tag[3:])
        matches = []
        for child in element:
            if child.tag == tag:
                matches.append(child)
        return matches
134 135 try: 136 import ElementPath 137 except ImportError: 138 # FIXME: issue warning in this case? 139 ElementPath = _SimpleElementPath() 140 141 # TODO: add support for custom namespace resolvers/default namespaces 142 # TODO: add improved support for incremental parsing 143 144 VERSION = "1.2.6" 145 146 ## 147 # Internal element class. This class defines the Element interface, 148 # and provides a reference implementation of this interface. 149 # <p> 150 # You should not create instances of this class directly. Use the 151 # appropriate factory functions instead, such as {@link #Element} 152 # and {@link #SubElement}. 153 # 154 # @see Element 155 # @see SubElement 156 # @see Comment 157 # @see ProcessingInstruction 158
class _ElementInterface:
    # <tag attrib>text<child/>...</tag>tail

    ##
    # (Attribute) Element tag.

    tag = None

    ##
    # (Attribute) Element attribute dictionary.  Where possible, use
    # {@link #_ElementInterface.get},
    # {@link #_ElementInterface.set},
    # {@link #_ElementInterface.keys}, and
    # {@link #_ElementInterface.items} to access
    # element attributes.

    attrib = None

    ##
    # (Attribute) Text before first subelement.  This is either a
    # string or the value None, if there was no text.

    text = None

    ##
    # (Attribute) Text after this element's end tag, but before the
    # next sibling element's start tag.  This is either a string or
    # the value None, if there was no text.

    tail = None # text after end tag, if any

    def __init__(self, tag, attrib):
        # the attribute dictionary is stored as given (not copied)
        self.tag = tag
        self.attrib = attrib
        self._children = []

    def __repr__(self):
        return "<Element %s at %x>" % (self.tag, id(self))

    ##
    # Creates a new element object of the same type as this element.
    #
    # @param tag Element tag.
    # @param attrib Element attributes, given as a dictionary.
    # @return A new element instance.

    def makeelement(self, tag, attrib):
        return Element(tag, attrib)

    ##
    # Returns the number of subelements.
    #
    # @return The number of subelements.

    def __len__(self):
        return len(self._children)

    ##
    # Returns the given subelement.
    #
    # @param index What subelement to return.
    # @return The given subelement.
    # @exception IndexError If the given element does not exist.

    def __getitem__(self, index):
        return self._children[index]

    ##
    # Replaces the given subelement.
    #
    # @param index What subelement to replace.
    # @param element The new element value.
    # @exception IndexError If the given element does not exist.
    # @exception AssertionError If element is not a valid object.

    def __setitem__(self, index, element):
        assert iselement(element)
        self._children[index] = element

    ##
    # Deletes the given subelement.
    #
    # @param index What subelement to delete.
    # @exception IndexError If the given element does not exist.

    def __delitem__(self, index):
        del self._children[index]

    ##
    # Returns a list containing subelements in the given range.
    #
    # @param start The first subelement to return.
    # @param stop The first subelement that shouldn't be returned.
    # @return A sequence object containing subelements.

    def __getslice__(self, start, stop):
        return self._children[start:stop]

    ##
    # Replaces a number of subelements with elements from a sequence.
    #
    # @param start The first subelement to replace.
    # @param stop The first subelement that shouldn't be replaced.
    # @param elements A sequence (or any iterable) with zero or more
    #     elements.
    # @exception AssertionError If a sequence member is not a valid object.

    def __setslice__(self, start, stop, elements):
        # Materialize the input exactly once: validating a one-shot
        # iterator and then calling list() on it again would store an
        # empty list, silently dropping the new children.
        elements = list(elements)
        for element in elements:
            assert iselement(element)
        self._children[start:stop] = elements

    ##
    # Deletes a number of subelements.
    #
    # @param start The first subelement to delete.
    # @param stop The first subelement to leave in there.

    def __delslice__(self, start, stop):
        del self._children[start:stop]

    ##
    # Adds a subelement to the end of this element.
    #
    # @param element The element to add.
    # @exception AssertionError If the element is not a valid object.

    def append(self, element):
        assert iselement(element)
        self._children.append(element)

    ##
    # Inserts a subelement at the given position in this element.
    #
    # @param index Where to insert the new subelement.
    # @param element The element to insert.
    # @exception AssertionError If the element is not a valid object.

    def insert(self, index, element):
        assert iselement(element)
        self._children.insert(index, element)

    ##
    # Removes a matching subelement.  Unlike the <b>find</b> methods,
    # this method compares elements based on identity, not on tag
    # value or contents.
    #
    # @param element What element to remove.
    # @exception ValueError If a matching element could not be found.
    # @exception AssertionError If the element is not a valid object.

    def remove(self, element):
        assert iselement(element)
        self._children.remove(element)

    ##
    # Returns all subelements.  The elements are returned in document
    # order.  The returned list is the element's own child list, not
    # a copy.
    #
    # @return A list of subelements.
    # @defreturn list of Element instances

    def getchildren(self):
        return self._children

    ##
    # Finds the first matching subelement, by tag name or path.
    #
    # @param path What element to look for.
    # @return The first matching element, or None if no element was found.
    # @defreturn Element or None

    def find(self, path):
        return ElementPath.find(self, path)

    ##
    # Finds text for the first matching subelement, by tag name or path.
    #
    # @param path What element to look for.
    # @param default What to return if the element was not found.
    # @return The text content of the first matching element, or the
    #     default value if no element was found.  Note that if the
    #     element is found, but has no text content, this method
    #     returns an empty string.
    # @defreturn string

    def findtext(self, path, default=None):
        return ElementPath.findtext(self, path, default)

    ##
    # Finds all matching subelements, by tag name or path.
    #
    # @param path What element to look for.
    # @return A list or iterator containing all matching elements,
    #    in document order.
    # @defreturn list of Element instances

    def findall(self, path):
        return ElementPath.findall(self, path)

    ##
    # Resets an element.  This function removes all subelements, clears
    # all attributes, and sets the text and tail attributes to None.

    def clear(self):
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    ##
    # Gets an element attribute.
    #
    # @param key What attribute to look for.
    # @param default What to return if the attribute was not found.
    # @return The attribute value, or the default value, if the
    #     attribute was not found.
    # @defreturn string or None

    def get(self, key, default=None):
        return self.attrib.get(key, default)

    ##
    # Sets an element attribute.
    #
    # @param key What attribute to set.
    # @param value The attribute value.

    def set(self, key, value):
        self.attrib[key] = value

    ##
    # Gets a list of attribute names.  The names are returned in an
    # arbitrary order (just like for an ordinary Python dictionary).
    #
    # @return A list of element attribute names.
    # @defreturn list of strings

    def keys(self):
        return self.attrib.keys()

    ##
    # Gets element attributes, as a sequence.  The attributes are
    # returned in an arbitrary order.
    #
    # @return A list of (name, value) tuples for all attributes.
    # @defreturn list of (string, string) tuples

    def items(self):
        return self.attrib.items()

    ##
    # Creates a tree iterator.  The iterator loops over this element
    # and all subelements, in document order, and returns all elements
    # with a matching tag.
    # <p>
    # If the tree structure is modified during iteration, the result
    # is undefined.
    #
    # @param tag What tags to look for (default is to return all elements).
    # @return A list or iterator containing all the matching elements.
    # @defreturn list or iterator

    def getiterator(self, tag=None):
        nodes = []
        if tag == "*":
            tag = None # "*" is a wildcard, same as no filter
        if tag is None or self.tag == tag:
            nodes.append(self)
        # depth-first recursion keeps the result in document order
        for node in self._children:
            nodes.extend(node.getiterator(tag))
        return nodes
428 429 # compatibility 430 _Element = _ElementInterface 431 432 ## 433 # Element factory. This function returns an object implementing the 434 # standard Element interface. The exact class or type of that object 435 # is implementation dependent, but it will always be compatible with 436 # the {@link #_ElementInterface} class in this module. 437 # <p> 438 # The element name, attribute names, and attribute values can be 439 # either 8-bit ASCII strings or Unicode strings. 440 # 441 # @param tag The element name. 442 # @param attrib An optional dictionary, containing element attributes. 443 # @param **extra Additional attributes, given as keyword arguments. 444 # @return An element instance. 445 # @defreturn Element 446
def Element(tag, attrib={}, **extra):
    # Work on a private copy so that neither the caller's dictionary
    # nor the shared default is ever modified; keyword arguments
    # override entries from the dictionary.
    merged = attrib.copy()
    merged.update(extra)
    return _ElementInterface(tag, merged)
451 452 ## 453 # Subelement factory. This function creates an element instance, and 454 # appends it to an existing element. 455 # <p> 456 # The element name, attribute names, and attribute values can be 457 # either 8-bit ASCII strings or Unicode strings. 458 # 459 # @param parent The parent element. 460 # @param tag The subelement name. 461 # @param attrib An optional dictionary, containing element attributes. 462 # @param **extra Additional attributes, given as keyword arguments. 463 # @return An element instance. 464 # @defreturn Element 465
def SubElement(parent, tag, attrib={}, **extra):
    # Merge into a private copy; the caller's dictionary is untouched.
    merged = attrib.copy()
    merged.update(extra)
    # Let the parent build the child, so the new element has the same
    # implementation type as the parent.
    child = parent.makeelement(tag, merged)
    parent.append(child)
    return child
472 473 ## 474 # Comment element factory. This factory function creates a special 475 # element that will be serialized as an XML comment. 476 # <p> 477 # The comment string can be either an 8-bit ASCII string or a Unicode 478 # string. 479 # 480 # @param text A string containing the comment string. 481 # @return An element instance, representing a comment. 482 # @defreturn Element 483
def Comment(text=None):
    # The factory function itself serves as the tag; the serializer
    # recognizes comment nodes by identity ("tag is Comment").
    node = Element(Comment)
    node.text = text
    return node
488 489 ## 490 # PI element factory. This factory function creates a special element 491 # that will be serialized as an XML processing instruction. 492 # 493 # @param target A string containing the PI target. 494 # @param text A string containing the PI contents, if any. 495 # @return An element instance, representing a PI. 496 # @defreturn Element 497
498 -def ProcessingInstruction(target, text=None):
499 element = Element(ProcessingInstruction) 500 element.text = target 501 if text: 502 element.text = element.text + " " + text 503 return element
504 505 PI = ProcessingInstruction 506 507 ## 508 # QName wrapper. This can be used to wrap a QName attribute value, in 509 # order to get proper namespace handling on output. 510 # 511 # @param text A string containing the QName value, in the form {uri}local, 512 # or, if the tag argument is given, the URI part of a QName. 513 # @param tag Optional tag. If given, the first argument is interpreted as 514 # an URI, and this argument is interpreted as a local name. 515 # @return An opaque object, representing the QName. 516
class QName:
    # Wraps a qualified name, stored in "{uri}local" form in self.text.

    def __init__(self, text_or_uri, tag=None):
        if tag:
            # two-argument form: (namespace uri, local name)
            self.text = "{%s}%s" % (text_or_uri, tag)
        else:
            # one-argument form: already a complete "{uri}local" string
            self.text = text_or_uri

    def __str__(self):
        return self.text

    def __hash__(self):
        # hash like the wrapped text
        return hash(self.text)

    def __cmp__(self, other):
        # compare by text, against other QNames and plain strings alike
        if isinstance(other, QName):
            other = other.text
        return cmp(self.text, other)
530 531 ## 532 # ElementTree wrapper class. This class represents an entire element 533 # hierarchy, and adds some extra support for serialization to and from 534 # standard XML. 535 # 536 # @param element Optional root element. 537 # @keyparam file Optional file handle or name. If given, the 538 # tree is initialized with the contents of this XML file. 539
class ElementTree:

    def __init__(self, element=None, file=None):
        assert element is None or iselement(element)
        self._root = element # first node
        if file:
            self.parse(file)

    ##
    # Gets the root element for this tree.
    #
    # @return An element instance.
    # @defreturn Element

    def getroot(self):
        return self._root

    ##
    # Replaces the root element for this tree.  This discards the
    # current contents of the tree, and replaces it with the given
    # element.  Use with care.
    #
    # @param element An element instance.

    def _setroot(self, element):
        assert iselement(element)
        self._root = element

    ##
    # Loads an external XML document into this element tree.
    # <p>
    # If a file name is given, the file is opened here and closed again
    # before this method returns (also on errors).  A file object passed
    # in by the caller is left open.
    #
    # @param source A file name or file object.
    # @param parser An optional parser instance.  If not given, the
    #     standard {@link XMLTreeBuilder} parser is used.
    # @return The document root element.
    # @defreturn Element

    def parse(self, source, parser=None):
        close_source = not hasattr(source, "read")
        if close_source:
            source = open(source, "rb")
        try:
            if not parser:
                parser = XMLTreeBuilder()
            # feed the parser in chunks, to keep memory use bounded
            while 1:
                data = source.read(32768)
                if not data:
                    break
                parser.feed(data)
            self._root = parser.close()
            return self._root
        finally:
            # only close what we opened ourselves
            if close_source:
                source.close()

    ##
    # Creates a tree iterator for the root element.  The iterator loops
    # over all elements in this tree, in document order.
    #
    # @param tag What tags to look for (default is to return all elements)
    # @return An iterator.
    # @defreturn iterator

    def getiterator(self, tag=None):
        assert self._root is not None
        return self._root.getiterator(tag)

    ##
    # Finds the first toplevel element with given tag.
    # Same as getroot().find(path).
    #
    # @param path What element to look for.
    # @return The first matching element, or None if no element was found.
    # @defreturn Element or None

    def find(self, path):
        assert self._root is not None
        if path[:1] == "/":
            # make absolute paths relative to the root element
            path = "." + path
        return self._root.find(path)

    ##
    # Finds the element text for the first toplevel element with given
    # tag.  Same as getroot().findtext(path).
    #
    # @param path What toplevel element to look for.
    # @param default What to return if the element was not found.
    # @return The text content of the first matching element, or the
    #     default value if no element was found.  Note that if the
    #     element is found, but has no text content, this method
    #     returns an empty string.
    # @defreturn string

    def findtext(self, path, default=None):
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.findtext(path, default)

    ##
    # Finds all toplevel elements with the given tag.
    # Same as getroot().findall(path).
    #
    # @param path What element to look for.
    # @return A list or iterator containing all matching elements,
    #    in document order.
    # @defreturn list of Element instances

    def findall(self, path):
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.findall(path)

    ##
    # Writes the element tree to a file, as XML.
    # <p>
    # If a file name is given, the file is opened here and closed again
    # before this method returns (also on errors), so that the output
    # is flushed to disk.  A file object passed in by the caller is
    # left open.
    #
    # @param file A file name, or a file object opened for writing.
    # @param encoding Optional output encoding (default is US-ASCII).

    def write(self, file, encoding="us-ascii"):
        assert self._root is not None
        close_file = not hasattr(file, "write")
        if close_file:
            file = open(file, "wb")
        try:
            if not encoding:
                encoding = "us-ascii"
            elif encoding != "utf-8" and encoding != "us-ascii":
                # an XML declaration is only written for encodings
                # other than utf-8 and us-ascii
                file.write("<?xml version='1.0' encoding='%s'?>\n" % encoding)
            self._write(file, self._root, encoding, {})
        finally:
            # only close what we opened ourselves
            if close_file:
                file.close()

    def _write(self, file, node, encoding, namespaces):
        # write XML to file
        #
        # 'namespaces' maps namespace uri -> prefix for all declarations
        # currently in scope; entries added for this node are removed
        # again once the node has been written.
        tag = node.tag
        if tag is Comment:
            file.write("<!-- %s -->" % _escape_cdata(node.text, encoding))
        elif tag is ProcessingInstruction:
            file.write("<?%s?>" % _escape_cdata(node.text, encoding))
        else:
            items = node.items()
            xmlns_items = [] # new namespaces in this scope
            try:
                if isinstance(tag, QName) or tag[:1] == "{":
                    tag, xmlns = fixtag(tag, namespaces)
                    if xmlns: xmlns_items.append(xmlns)
            except TypeError:
                _raise_serialization_error(tag)
            file.write("<" + _encode(tag, encoding))
            if items or xmlns_items:
                items.sort() # lexical order
                for k, v in items:
                    try:
                        if isinstance(k, QName) or k[:1] == "{":
                            k, xmlns = fixtag(k, namespaces)
                            if xmlns: xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(k)
                    try:
                        if isinstance(v, QName):
                            v, xmlns = fixtag(v, namespaces)
                            if xmlns: xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(v)
                    file.write(" %s=\"%s\"" % (_encode(k, encoding),
                                               _escape_attrib(v, encoding)))
                # namespace declarations go after the ordinary attributes
                for k, v in xmlns_items:
                    file.write(" %s=\"%s\"" % (_encode(k, encoding),
                                               _escape_attrib(v, encoding)))
            if node.text or len(node):
                file.write(">")
                if node.text:
                    file.write(_escape_cdata(node.text, encoding))
                for n in node:
                    self._write(file, n, encoding, namespaces)
                file.write("</" + _encode(tag, encoding) + ">")
            else:
                # no text and no children: use the short empty-tag form
                file.write(" />")
            # declarations made by this node go out of scope here
            for k, v in xmlns_items:
                del namespaces[v]
        if node.tail:
            file.write(_escape_cdata(node.tail, encoding))
715 716 # -------------------------------------------------------------------- 717 # helpers 718 719 ## 720 # Checks if an object appears to be a valid element object. 721 # 722 # @param element An element instance. 723 # @return A true value if this is an element object. 724 # @defreturn flag 725
def iselement(element):
    # FIXME: not sure about this; might be a better idea to look
    # for tag/attrib/text attributes
    #
    # Instances of the reference implementation always qualify; any
    # other object qualifies if it at least has a "tag" attribute.
    is_reference_type = isinstance(element, _ElementInterface)
    return is_reference_type or hasattr(element, "tag")
730 731 ## 732 # Writes an element tree or element structure to sys.stdout. This 733 # function should be used for debugging only. 734 # <p> 735 # The exact output format is implementation dependent. In this 736 # version, it's written as an ordinary XML file. 737 # 738 # @param elem An element tree or an individual element. 739
def dump(elem):
    # debugging
    tree = elem
    if not isinstance(tree, ElementTree):
        # bare element: wrap it so that it can be serialized
        tree = ElementTree(tree)
    tree.write(sys.stdout)
    # make sure the output ends with a newline
    tail = tree.getroot().tail
    if tail and tail[-1] == "\n":
        return
    sys.stdout.write("\n")
748
def _encode(s, encoding):
    # Encode s if it supports it; objects without an encode method
    # are passed through unchanged.
    try:
        encoded = s.encode(encoding)
    except AttributeError:
        encoded = s # 1.5.2: assume the string uses the right encoding
    return encoded
754 755 if sys.version[:3] == "1.5": 756 _escape = re.compile(r"[&<>\"\x80-\xff]+") # 1.5.2 757 else: 758 _escape = re.compile(eval(r'u"[&<>\"\u0080-\uffff]+"')) 759 760 _escape_map = { 761 "&": "&amp;", 762 "<": "&lt;", 763 ">": "&gt;", 764 '"': "&quot;", 765 } 766 767 _namespace_map = { 768 # "well-known" namespace prefixes 769 "http://www.w3.org/XML/1998/namespace": "xml", 770 "http://www.w3.org/1999/xhtml": "html", 771 "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf", 772 "http://schemas.xmlsoap.org/wsdl/": "wsdl", 773 } 774
def _raise_serialization_error(text):
    # Always raises TypeError, naming the offending value and its type.
    type_name = type(text).__name__
    message = "cannot serialize %r (type %s)" % (text, type_name)
    raise TypeError(message)
779
def _encode_entity(text, pattern=_escape):
    # map reserved and non-ascii characters to numerical entities
    def escape_entities(m, table=_escape_map):
        # called once per matched run; translates it char by char
        pieces = []
        for char in m.group():
            entity = table.get(char)
            if entity is None:
                # no named entity: use a numeric character reference
                entity = "&#%d;" % ord(char)
            pieces.append(entity)
        return string.join(pieces, "")
    try:
        escaped = pattern.sub(escape_entities, text)
        return _encode(escaped, "ascii")
    except TypeError:
        _raise_serialization_error(text)

#
# the following functions assume an ascii-compatible encoding
# (or "utf-16")

def _escape_cdata(text, encoding=None, replace=string.replace):
    # escape character data
    try:
        if encoding:
            try:
                text = _encode(text, encoding)
            except UnicodeError:
                # not representable in the target encoding: fall back
                # to character references
                return _encode_entity(text)
        # "&" must come first, so that the entities inserted by the
        # later substitutions are not escaped again
        for char, entity in (("&", "&amp;"), ("<", "&lt;"), (">", "&gt;")):
            text = replace(text, char, entity)
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
814
def _escape_attrib(text, encoding=None, replace=string.replace):
    # escape attribute value
    try:
        if encoding:
            try:
                text = _encode(text, encoding)
            except UnicodeError:
                # not representable in the target encoding: fall back
                # to character references
                return _encode_entity(text)
        # "&" must come first, so that the entities inserted by the
        # later substitutions are not escaped again
        substitutions = (
            ("&", "&amp;"),
            ("'", "&apos;"), # FIXME: overkill
            ("\"", "&quot;"),
            ("<", "&lt;"),
            (">", "&gt;"),
        )
        for char, entity in substitutions:
            text = replace(text, char, entity)
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
831
def fixtag(tag, namespaces):
    # given a decorated tag (of the form {uri}tag), return prefixed
    # tag and namespace declaration, if any
    if isinstance(tag, QName):
        tag = tag.text
    namespace_uri, local = string.split(tag[1:], "}", 1)
    prefix = namespaces.get(namespace_uri)
    if prefix is not None:
        # already declared in an enclosing scope; nothing to add
        return "%s:%s" % (prefix, local), None
    # first use in this scope: pick a well-known prefix if there is
    # one, otherwise generate "nsN", and record the new mapping
    prefix = _namespace_map.get(namespace_uri)
    if prefix is None:
        prefix = "ns%d" % len(namespaces)
    namespaces[namespace_uri] = prefix
    xmlns = None
    if prefix != "xml":
        # the "xml" prefix gets no declaration
        xmlns = ("xmlns:%s" % prefix, namespace_uri)
    return "%s:%s" % (prefix, local), xmlns
851 852 ## 853 # Parses an XML document into an element tree. 854 # 855 # @param source A filename or file object containing XML data. 856 # @param parser An optional parser instance. If not given, the 857 # standard {@link XMLTreeBuilder} parser is used. 858 # @return An ElementTree instance 859
def parse(source, parser=None):
    # Build an empty tree wrapper and let it load the document; the
    # wrapper is returned, not the root element.
    document = ElementTree()
    document.parse(source, parser)
    return document
864 865 ## 866 # Parses an XML document into an element tree incrementally, and reports 867 # what's going on to the user. 868 # 869 # @param source A filename or file object containing XML data. 870 # @param events A list of events to report back. If omitted, only "end" 871 # events are reported. 872 # @return A (event, elem) iterator. 873
class iterparse:
    # Incremental parser: reads the source in chunks and hands out the
    # (event, payload) pairs that the parser callbacks queued up.
    # A file opened here from a file name is closed once the parser
    # has been finalized; a file object passed in by the caller is
    # left open.

    def __init__(self, source, events=None):
        # remember whether the file is ours to close
        self._close_file = not hasattr(source, "read")
        if self._close_file:
            source = open(source, "rb")
        self._file = source
        self._events = []
        self._index = 0
        self.root = self._root = None
        self._parser = XMLTreeBuilder()
        # wire up the parser for event reporting
        parser = self._parser._parser
        append = self._events.append
        if events is None:
            events = ["end"]
        # handlers get their context via default arguments, which
        # binds the current values at definition time
        for event in events:
            if event == "start":
                try:
                    parser.ordered_attributes = 1
                    parser.specified_attributes = 1
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start_list):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
                except AttributeError:
                    # fall back to the dictionary-based start handler
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
            elif event == "end":
                def handler(tag, event=event, append=append,
                            end=self._parser._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event == "start-ns":
                def handler(prefix, uri, event=event, append=append):
                    try:
                        uri = _encode(uri, "ascii")
                    except UnicodeError:
                        pass # keep the uri as is
                    append((event, (prefix or "", uri)))
                parser.StartNamespaceDeclHandler = handler
            elif event == "end-ns":
                def handler(prefix, event=event, append=append):
                    append((event, None))
                parser.EndNamespaceDeclHandler = handler

    def next(self):
        while 1:
            try:
                item = self._events[self._index]
            except IndexError:
                if self._parser is None:
                    # all input consumed: publish the root and stop
                    self.root = self._root
                    try:
                        raise StopIteration
                    except NameError:
                        # no StopIteration builtin: end the old
                        # __getitem__ iteration protocol instead
                        raise IndexError
                # load event buffer
                del self._events[:]
                self._index = 0
                data = self._file.read(16384)
                if data:
                    self._parser.feed(data)
                else:
                    try:
                        self._root = self._parser.close()
                    finally:
                        # end of input: release the file if we opened it
                        if self._close_file:
                            self._file.close()
                    self._parser = None
            else:
                self._index = self._index + 1
                return item

    # use the iterator protocol where available, and sequence-style
    # iteration otherwise
    try:
        iter
        def __iter__(self):
            return self
    except NameError:
        def __getitem__(self, index):
            return self.next()
952 953 ## 954 # Parses an XML document from a string constant. This function can 955 # be used to embed "XML literals" in Python code. 956 # 957 # @param text A string containing XML data. 958 # @return An Element instance. 959 # @defreturn Element 960
def XML(text):
    # Feed the whole string to a fresh builder and return whatever
    # its close() method returns.
    builder = XMLTreeBuilder()
    builder.feed(text)
    root = builder.close()
    return root
965 966 ## 967 # Parses an XML document from a string constant, and also returns 968 # a dictionary which maps from element id:s to elements. 969 # 970 # @param text A string containing XML data. 971 # @return A tuple containing an Element instance and a dictionary. 972 # @defreturn (Element, dictionary) 973
def XMLID(text):
    builder = XMLTreeBuilder()
    builder.feed(text)
    root = builder.close()
    # index every element that carries a non-empty "id" attribute;
    # if an id occurs more than once, the last element wins
    by_id = {}
    for elem in root.getiterator():
        elem_id = elem.get("id")
        if elem_id:
            by_id[elem_id] = elem
    return root, by_id
##
# Parses an XML document from a string constant.  Same as {@link #XML}.
#
# @def fromstring(text)
# @param text A string containing XML data.
# @return An Element instance.
# @defreturn Element

fromstring = XML

##
# Generates a string representation of an XML element, including all
# subelements.
#
# @param element An Element instance.
# @param encoding Optional output encoding; passed on unchanged to the
#     write method of the ElementTree wrapper.
# @return An encoded string containing the XML data.
# @defreturn string

def tostring(element, encoding=None):
    # Serialize the element by wrapping it in an ElementTree and letting
    # its write method emit into an in-memory collector.
    class dummy:
        # bare stand-in for a file object; write() is attached below
        pass
    data = []
    sink = dummy()
    sink.write = data.append  # every write() call stores one fragment
    ElementTree(element).write(sink, encoding)
    # join via the string method; the string.join() module function is
    # deprecated
    return "".join(data)

##
# Generic element structure builder.  This builder converts a sequence
# of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link
# #TreeBuilder.end} method calls to a well-formed element structure.
# <p>
# You can use this class to build an element structure using a custom XML
# parser, or a parser for some other XML-like format.
#
# @param element_factory Optional element factory.  This factory
#    is called to create new Element instances, as necessary.

class TreeBuilder:

    def __init__(self, element_factory=None):
        self._data = [] # data collector
        self._elem = [] # element stack
        self._last = None # last element
        self._tail = None # true if we're after an end tag
        if element_factory is None:
            element_factory = _ElementInterface
        # called as factory(tag, attrs) for every opened element
        self._factory = element_factory

    ##
    # Flushes the parser buffers, and returns the toplevel document
    # element.
    #
    # @return An Element instance.
    # @defreturn Element
    # @exception AssertionError If elements are still open, or if no
    #     element was ever built.

    def close(self):
        assert not self._elem, "missing end tags"
        assert self._last is not None, "missing toplevel element"
        return self._last

    # Moves the collected character data onto the most recent element:
    # into its tail if that element has already been closed, into its
    # text otherwise.  Data collected before any element exists is
    # discarded.

    def _flush(self):
        if self._data:
            if self._last is not None:
                text = "".join(self._data)
                if self._tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
            self._data = []

    ##
    # Adds text to the current element.
    #
    # @param data A string.  This should be either an 8-bit string
    #    containing ASCII text, or a Unicode string.

    def data(self, data):
        self._data.append(data)

    ##
    # Opens a new element.
    #
    # @param tag The element name.
    # @param attrs A dictionary containing element attributes.
    # @return The opened element.
    # @defreturn Element

    def start(self, tag, attrs):
        self._flush()
        self._last = elem = self._factory(tag, attrs)
        if self._elem:
            # nested element: attach it to the innermost open element
            self._elem[-1].append(elem)
        self._elem.append(elem)
        self._tail = 0
        return elem

    ##
    # Closes the current element.
    #
    # @param tag The element name.
    # @return The closed element.
    # @defreturn Element
    # @exception AssertionError If the tag does not match the element
    #     being closed.

    def end(self, tag):
        self._flush()
        self._last = self._elem.pop()
        assert self._last.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   self._last.tag, tag)
        self._tail = 1
        return self._last
1099 1100 ## 1101 # Element structure builder for XML source data, based on the 1102 # <b>expat</b> parser. 1103 # 1104 # @keyparam target Target object. If omitted, the builder uses an 1105 # instance of the standard {@link #TreeBuilder} class. 1106 # @keyparam html Predefine HTML entities. This flag is not supported 1107 # by the current implementation. 1108 # @see #ElementTree 1109 # @see #TreeBuilder 1110
class XMLTreeBuilder:

    # The html flag is accepted but not used by this implementation.

    def __init__(self, html=0, target=None):
        try:
            from xml.parsers import expat
        except ImportError:
            raise ImportError(
                "No module named expat; use SimpleXMLTreeBuilder instead"
                )
        # "}" separates namespace uri and local name in the names
        # reported by expat; _fixname adds the leading "{"
        self._parser = parser = expat.ParserCreate(None, "}")
        if target is None:
            target = TreeBuilder()
        self._target = target
        self._names = {} # name memo cache
        # callbacks
        parser.DefaultHandlerExpand = self._default
        parser.StartElementHandler = self._start
        parser.EndElementHandler = self._end
        parser.CharacterDataHandler = self._data
        # let expat do the buffering, if supported
        try:
            self._parser.buffer_text = 1
        except AttributeError:
            pass
        # use new-style attribute handling, if supported
        try:
            self._parser.ordered_attributes = 1
            self._parser.specified_attributes = 1
            parser.StartElementHandler = self._start_list
        except AttributeError:
            pass
        self._doctype = None # token list while inside a doctype
        self.entity = {} # entity name -> replacement text

    def _fixtext(self, text):
        # convert text string to ascii, if possible
        try:
            return _encode(text, "ascii")
        except UnicodeError:
            return text

    def _fixname(self, key):
        # expand qname, and convert name string to ascii, if possible
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name = self._fixtext(name)
        return name

    def _start(self, tag, attrib_in):
        # start handler for dictionary-style attributes
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        for key, value in attrib_in.items():
            attrib[fixname(key)] = self._fixtext(value)
        return self._target.start(tag, attrib)

    def _start_list(self, tag, attrib_in):
        # start handler for ordered attributes: attrib_in is a flat
        # sequence of alternating names and values
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attrib_in:
            for i in range(0, len(attrib_in), 2):
                attrib[fixname(attrib_in[i])] = self._fixtext(attrib_in[i+1])
        return self._target.start(tag, attrib)

    def _data(self, text):
        return self._target.data(self._fixtext(text))

    def _end(self, tag):
        return self._target.end(self._fixname(tag))

    def _default(self, text):
        # Default handler: resolves entity references through
        # self.entity, and collects the tokens of a doctype declaration
        # until name, PUBLIC/SYSTEM and the identifiers are complete.
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                self._target.data(self.entity[text[1:-1]])
            except KeyError:
                from xml.parsers import expat
                raise expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self._parser.ErrorLineNumber,
                    self._parser.ErrorColumnNumber)
                    )
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, kind, pubid, system = self._doctype
                elif kind == "SYSTEM" and n == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    return
                # the identifiers arrive quoted; drop the quotes
                if pubid:
                    pubid = pubid[1:-1]
                self.doctype(name, pubid, system[1:-1])
                self._doctype = None

    ##
    # Handles a doctype declaration.
    #
    # @param name Doctype name.
    # @param pubid Public identifier.
    # @param system System identifier.

    def doctype(self, name, pubid, system):
        pass

    ##
    # Feeds data to the parser.
    #
    # @param data Encoded data.

    def feed(self, data):
        self._parser.Parse(data, 0)

    ##
    # Finishes feeding data to the parser.  The builder cannot be used
    # again afterwards.
    #
    # @return An element structure.
    # @defreturn Element

    def close(self):
        try:
            self._parser.Parse("", 1) # end of data
            tree = self._target.close()
        finally:
            # get rid of circular references, also when the final parse
            # step or the target fails
            del self._target, self._parser
        return tree
# compatibility alias: XMLParser is a second name for XMLTreeBuilder
XMLParser = XMLTreeBuilder
