nltk.etree.ElementTree

1 # 2 # ElementTree 3 # $Id: ElementTree.py 2326 2005-03-17 07:45:21Z fredrik $ 4 # 5 # light-weight XML support for Python 1.5.2 and later. 6 # 7 # history: 8 # 2001-10-20 fl created (from various sources) 9 # 2001-11-01 fl return root from parse method 10 # 2002-02-16 fl sort attributes in lexical order 11 # 2002-04-06 fl TreeBuilder refactoring, added PythonDoc markup 12 # 2002-05-01 fl finished TreeBuilder refactoring 13 # 2002-07-14 fl added basic namespace support to ElementTree.write 14 # 2002-07-25 fl added QName attribute support 15 # 2002-10-20 fl fixed encoding in write 16 # 2002-11-24 fl changed default encoding to ascii; fixed attribute encoding 17 # 2002-11-27 fl accept file objects or file names for parse/write 18 # 2002-12-04 fl moved XMLTreeBuilder back to this module 19 # 2003-01-11 fl fixed entity encoding glitch for us-ascii 20 # 2003-02-13 fl added XML literal factory 21 # 2003-02-21 fl added ProcessingInstruction/PI factory 22 # 2003-05-11 fl added tostring/fromstring helpers 23 # 2003-05-26 fl added ElementPath support 24 # 2003-07-05 fl added makeelement factory method 25 # 2003-07-28 fl added more well-known namespace prefixes 26 # 2003-08-15 fl fixed typo in ElementTree.findtext (Thomas Dartsch) 27 # 2003-09-04 fl fall back on emulator if ElementPath is not installed 28 # 2003-10-31 fl markup updates 29 # 2003-11-15 fl fixed nested namespace bug 30 # 2004-03-28 fl added XMLID helper 31 # 2004-06-02 fl added default support to findtext 32 # 2004-06-08 fl fixed encoding of non-ascii element/attribute names 33 # 2004-08-23 fl take advantage of post-2.1 expat features 34 # 2005-02-01 fl added iterparse implementation 35 # 2005-03-02 fl fixed iterparse support for pre-2.2 versions 36 # 37 # Copyright (c) 1999-2005 by Fredrik Lundh. All rights reserved. 
38 # 39 # [email protected] 40 # http://www.pythonware.com 41 # 42 # -------------------------------------------------------------------- 43 # The ElementTree toolkit is 44 # 45 # Copyright (c) 1999-2005 by Fredrik Lundh 46 # 47 # By obtaining, using, and/or copying this software and/or its 48 # associated documentation, you agree that you have read, understood, 49 # and will comply with the following terms and conditions: 50 # 51 # Permission to use, copy, modify, and distribute this software and 52 # its associated documentation for any purpose and without fee is 53 # hereby granted, provided that the above copyright notice appears in 54 # all copies, and that both that copyright notice and this permission 55 # notice appear in supporting documentation, and that the name of 56 # Secret Labs AB or the author not be used in advertising or publicity 57 # pertaining to distribution of the software without specific, written 58 # prior permission. 59 # 60 # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD 61 # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT- 62 # ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR 63 # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY 64 # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, 65 # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS 66 # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 67 # OF THIS SOFTWARE. 68 # -------------------------------------------------------------------- 69 70 # Licensed to PSF under a Contributor Agreement. 71 # See http://www.python.org/2.4/license for licensing details. 
72 73 __all__ = [ 74 # public symbols 75 "Comment", 76 "dump", 77 "Element", "ElementTree", 78 "fromstring", 79 "iselement", "iterparse", 80 "parse", 81 "PI", "ProcessingInstruction", 82 "QName", 83 "SubElement", 84 "tostring", 85 "TreeBuilder", 86 "VERSION", "XML", 87 "XMLParser", "XMLTreeBuilder", 88 ] 89 90 ## 91 # The Element type is a flexible container object, designed to 92 # store hierarchical data structures in memory. The type can be 93 # described as a cross between a list and a dictionary. 94 #  95 # Each element has a number of properties associated with it: 96 # <ul> 97 # <li>a tag. This is a string identifying what kind of data 98 # this element represents (the element type, in other words).</li> 99 # <li>a number of attributes, stored in a Python dictionary.</li> 100 # <li>a text string.</li> 101 # <li>an optional tail string.</li> 102 # <li>a number of child elements, stored in a Python sequence</li> 103 # </ul> 104 # 105 # To create an element instance, use the {@link #Element} or {@link 106 # #SubElement} factory functions. 107 #  108 # The {@link #ElementTree} class can be used to wrap an element 109 # structure, and convert it from and to XML. 110 ## 111 112 import string, sys, re 113

114 -class _SimpleElementPath:

115 # emulate pre-1.2 find/findtext/findall behaviour

116 - def find(self, element, tag):

117 for elem in element: 118 if elem.tag == tag: 119 return elem 120 return None

121 - def findtext(self, element, tag, default=None):

122 for elem in element: 123 if elem.tag == tag: 124 return elem.text or "" 125 return default

126 - def findall(self, element, tag):

127 if tag[:3] == ".//": 128 return element.getiterator(tag[3:]) 129 result = [] 130 for elem in element: 131 if elem.tag == tag: 132 result.append(elem) 133 return result

134 135 try: 136 import ElementPath 137 except ImportError: 138 # FIXME: issue warning in this case? 139 ElementPath = _SimpleElementPath() 140 141 # TODO: add support for custom namespace resolvers/default namespaces 142 # TODO: add improved support for incremental parsing 143 144 VERSION = "1.2.6" 145 146 ## 147 # Internal element class. This class defines the Element interface, 148 # and provides a reference implementation of this interface. 149 #  150 # You should not create instances of this class directly. Use the 151 # appropriate factory functions instead, such as {@link #Element} 152 # and {@link #SubElement}. 153 # 154 # @see Element 155 # @see SubElement 156 # @see Comment 157 # @see ProcessingInstruction 158

159 -class _ElementInterface:

160 # <tag attrib>text<child/>...</tag>tail 161 162 ## 163 # (Attribute) Element tag. 164 165 tag = None 166 167 ## 168 # (Attribute) Element attribute dictionary. Where possible, use 169 # {@link #_ElementInterface.get}, 170 # {@link #_ElementInterface.set}, 171 # {@link #_ElementInterface.keys}, and 172 # {@link #_ElementInterface.items} to access 173 # element attributes. 174 175 attrib = None 176 177 ## 178 # (Attribute) Text before first subelement. This is either a 179 # string or the value None, if there was no text. 180 181 text = None 182 183 ## 184 # (Attribute) Text after this element's end tag, but before the 185 # next sibling element's start tag. This is either a string or 186 # the value None, if there was no text. 187 188 tail = None # text after end tag, if any 189

190 - def __init__(self, tag, attrib):

191 self.tag = tag 192 self.attrib = attrib 193 self._children = []

194

195 - def __repr__(self):

196 return "<Element %s at %x>" % (self.tag, id(self))

197 198 ## 199 # Creates a new element object of the same type as this element. 200 # 201 # @param tag Element tag. 202 # @param attrib Element attributes, given as a dictionary. 203 # @return A new element instance. 204

205 - def makeelement(self, tag, attrib):

206 return Element(tag, attrib)

207 208 ## 209 # Returns the number of subelements. 210 # 211 # @return The number of subelements. 212

213 - def __len__(self):

214 return len(self._children)

215 216 ## 217 # Returns the given subelement. 218 # 219 # @param index What subelement to return. 220 # @return The given subelement. 221 # @exception IndexError If the given element does not exist. 222

223 - def __getitem__(self, index):

224 return self._children[index]

225 226 ## 227 # Replaces the given subelement. 228 # 229 # @param index What subelement to replace. 230 # @param element The new element value. 231 # @exception IndexError If the given element does not exist. 232 # @exception AssertionError If element is not a valid object. 233

234 - def __setitem__(self, index, element):

235 assert iselement(element) 236 self._children[index] = element

237 238 ## 239 # Deletes the given subelement. 240 # 241 # @param index What subelement to delete. 242 # @exception IndexError If the given element does not exist. 243

244 - def __delitem__(self, index):

245 del self._children[index]

246 247 ## 248 # Returns a list containing subelements in the given range. 249 # 250 # @param start The first subelement to return. 251 # @param stop The first subelement that shouldn't be returned. 252 # @return A sequence object containing subelements. 253

254 - def __getslice__(self, start, stop):

255 return self._children[start:stop]

256 257 ## 258 # Replaces a number of subelements with elements from a sequence. 259 # 260 # @param start The first subelement to replace. 261 # @param stop The first subelement that shouldn't be replaced. 262 # @param elements A sequence object with zero or more elements. 263 # @exception AssertionError If a sequence member is not a valid object. 264

265 - def __setslice__(self, start, stop, elements):

266 for element in elements: 267 assert iselement(element) 268 self._children[start:stop] = list(elements)

269 270 ## 271 # Deletes a number of subelements. 272 # 273 # @param start The first subelement to delete. 274 # @param stop The first subelement to leave in there. 275

276 - def __delslice__(self, start, stop):

277 del self._children[start:stop]

278 279 ## 280 # Adds a subelement to the end of this element. 281 # 282 # @param element The element to add. 283 # @exception AssertionError If a sequence member is not a valid object. 284

285 - def append(self, element):

286 assert iselement(element) 287 self._children.append(element)

288 289 ## 290 # Inserts a subelement at the given position in this element. 291 # 292 # @param index Where to insert the new subelement. 293 # @exception AssertionError If the element is not a valid object. 294

295 - def insert(self, index, element):

296 assert iselement(element) 297 self._children.insert(index, element)

298 299 ## 300 # Removes a matching subelement. Unlike the find methods, 301 # this method compares elements based on identity, not on tag 302 # value or contents. 303 # 304 # @param element What element to remove. 305 # @exception ValueError If a matching element could not be found. 306 # @exception AssertionError If the element is not a valid object. 307

308 - def remove(self, element):

309 assert iselement(element) 310 self._children.remove(element)

311 312 ## 313 # Returns all subelements. The elements are returned in document 314 # order. 315 # 316 # @return A list of subelements. 317 # @defreturn list of Element instances 318

319 - def getchildren(self):

320 return self._children

321 322 ## 323 # Finds the first matching subelement, by tag name or path. 324 # 325 # @param path What element to look for. 326 # @return The first matching element, or None if no element was found. 327 # @defreturn Element or None 328

329 - def find(self, path):

330 return ElementPath.find(self, path)

331 332 ## 333 # Finds text for the first matching subelement, by tag name or path. 334 # 335 # @param path What element to look for. 336 # @param default What to return if the element was not found. 337 # @return The text content of the first matching element, or the 338 # default value no element was found. Note that if the element 339 # has is found, but has no text content, this method returns an 340 # empty string. 341 # @defreturn string 342

343 - def findtext(self, path, default=None):

344 return ElementPath.findtext(self, path, default)

345 346 ## 347 # Finds all matching subelements, by tag name or path. 348 # 349 # @param path What element to look for. 350 # @return A list or iterator containing all matching elements, 351 # in document order. 352 # @defreturn list of Element instances 353

354 - def findall(self, path):

355 return ElementPath.findall(self, path)

356 357 ## 358 # Resets an element. This function removes all subelements, clears 359 # all attributes, and sets the text and tail attributes to None. 360

361 - def clear(self):

362 self.attrib.clear() 363 self._children = [] 364 self.text = self.tail = None

365 366 ## 367 # Gets an element attribute. 368 # 369 # @param key What attribute to look for. 370 # @param default What to return if the attribute was not found. 371 # @return The attribute value, or the default value, if the 372 # attribute was not found. 373 # @defreturn string or None 374

375 - def get(self, key, default=None):

376 return self.attrib.get(key, default)

377 378 ## 379 # Sets an element attribute. 380 # 381 # @param key What attribute to set. 382 # @param value The attribute value. 383

384 - def set(self, key, value):

385 self.attrib[key] = value

386 387 ## 388 # Gets a list of attribute names. The names are returned in an 389 # arbitrary order (just like for an ordinary Python dictionary). 390 # 391 # @return A list of element attribute names. 392 # @defreturn list of strings 393

394 - def keys(self):

395 return self.attrib.keys()

396 397 ## 398 # Gets element attributes, as a sequence. The attributes are 399 # returned in an arbitrary order. 400 # 401 # @return A list of (name, value) tuples for all attributes. 402 # @defreturn list of (string, string) tuples 403

404 - def items(self):

405 return self.attrib.items()

406 407 ## 408 # Creates a tree iterator. The iterator loops over this element 409 # and all subelements, in document order, and returns all elements 410 # with a matching tag. 411 #  412 # If the tree structure is modified during iteration, the result 413 # is undefined. 414 # 415 # @param tag What tags to look for (default is to return all elements). 416 # @return A list or iterator containing all the matching elements. 417 # @defreturn list or iterator 418

419 - def getiterator(self, tag=None):

420 nodes = [] 421 if tag == "*": 422 tag = None 423 if tag is None or self.tag == tag: 424 nodes.append(self) 425 for node in self._children: 426 nodes.extend(node.getiterator(tag)) 427 return nodes

428 429 # compatibility 430 _Element = _ElementInterface 431 432 ## 433 # Element factory. This function returns an object implementing the 434 # standard Element interface. The exact class or type of that object 435 # is implementation dependent, but it will always be compatible with 436 # the {@link #_ElementInterface} class in this module. 437 #  438 # The element name, attribute names, and attribute values can be 439 # either 8-bit ASCII strings or Unicode strings. 440 # 441 # @param tag The element name. 442 # @param attrib An optional dictionary, containing element attributes. 443 # @param **extra Additional attributes, given as keyword arguments. 444 # @return An element instance. 445 # @defreturn Element 446

def Element(tag, attrib={}, **extra):
    """Element factory: build an element from a tag and attributes.

    attrib is an optional attribute dictionary; extra keyword arguments
    are merged in as additional attributes.
    """
    # merge into a private copy so neither the caller's dictionary nor
    # the shared default is ever modified
    merged = attrib.copy()
    merged.update(extra)
    return _ElementInterface(tag, merged)

451 452 ## 453 # Subelement factory. This function creates an element instance, and 454 # appends it to an existing element. 455 #  456 # The element name, attribute names, and attribute values can be 457 # either 8-bit ASCII strings or Unicode strings. 458 # 459 # @param parent The parent element. 460 # @param tag The subelement name. 461 # @param attrib An optional dictionary, containing element attributes. 462 # @param **extra Additional attributes, given as keyword arguments. 463 # @return An element instance. 464 # @defreturn Element 465

def SubElement(parent, tag, attrib={}, **extra):
    """Subelement factory: create an element and append it to parent.

    The element is created through parent.makeelement, using a copy of
    attrib merged with the extra keyword arguments.  Returns the new
    element.
    """
    merged = attrib.copy()
    merged.update(extra)
    child = parent.makeelement(tag, merged)
    parent.append(child)
    return child

472 473 ## 474 # Comment element factory. This factory function creates a special 475 # element that will be serialized as an XML comment. 476 #  477 # The comment string can be either an 8-bit ASCII string or a Unicode 478 # string. 479 # 480 # @param text A string containing the comment string. 481 # @return An element instance, representing a comment. 482 # @defreturn Element 483

def Comment(text=None):
    """Comment element factory.

    Returns an element whose tag is the Comment function itself and
    whose text is the given comment string.
    """
    node = Element(Comment)
    node.text = text
    return node

488 489 ## 490 # PI element factory. This factory function creates a special element 491 # that will be serialized as an XML processing instruction. 492 # 493 # @param target A string containing the PI target. 494 # @param text A string containing the PI contents, if any. 495 # @return An element instance, representing a PI. 496 # @defreturn Element 497

def ProcessingInstruction(target, text=None):
    """PI element factory.

    Returns an element whose tag is the ProcessingInstruction function
    itself.  Its text is the target, followed by a space and the PI
    contents when text is given.
    """
    node = Element(ProcessingInstruction)
    if text:
        node.text = target + " " + text
    else:
        node.text = target
    return node

504 505 PI = ProcessingInstruction 506 507 ## 508 # QName wrapper. This can be used to wrap a QName attribute value, in 509 # order to get proper namespace handling on output. 510 # 511 # @param text A string containing the QName value, in the form {uri}local, 512 # or, if the tag argument is given, the URI part of a QName. 513 # @param tag Optional tag. If given, the first argument is interpreted as 514 # an URI, and this argument is interpreted as a local name. 515 # @return An opaque object, representing the QName. 516

class QName:
    """QName wrapper, used to get proper namespace handling on output.

    With one argument, text_or_uri is the QName in {uri}local form.
    When tag is given, text_or_uri is taken as the URI and tag as the
    local name.
    """

    def __init__(self, text_or_uri, tag=None):
        if tag:
            self.text = "{%s}%s" % (text_or_uri, tag)
        else:
            self.text = text_or_uri

    def __str__(self):
        return self.text

    def __hash__(self):
        # hash like the underlying string
        return hash(self.text)

    def __cmp__(self, other):
        # compare by text, against another QName or against a plain value
        rhs = other
        if isinstance(other, QName):
            rhs = other.text
        return cmp(self.text, rhs)

530 531 ## 532 # ElementTree wrapper class. This class represents an entire element 533 # hierarchy, and adds some extra support for serialization to and from 534 # standard XML. 535 # 536 # @param element Optional root element. 537 # @keyparam file Optional file handle or name. If given, the 538 # tree is initialized with the contents of this XML file. 539

class ElementTree:
    """Wrapper for an entire element hierarchy.

    Adds support for serialization to and from standard XML.

    element is an optional root element.  file is an optional file
    handle or name; if given, the tree is initialized with the contents
    of that XML file.
    """

    def __init__(self, element=None, file=None):
        assert element is None or iselement(element)
        self._root = element # first node
        if file:
            self.parse(file)

    def getroot(self):
        """Return the root element of this tree."""
        return self._root

    def _setroot(self, element):
        """Replace the root element of this tree.

        This discards the current contents of the tree.  Use with care.
        Raises AssertionError if element is not a valid element object.
        """
        assert iselement(element)
        self._root = element

    def parse(self, source, parser=None):
        """Load an external XML document into this element tree.

        source is a file name or a file object.  parser is an optional
        parser instance; if not given, XMLTreeBuilder is used.  Returns
        the document root element.

        A file opened here from a file name is closed before returning,
        also when parsing fails.  A file object passed in by the caller
        is left open.
        """
        opened_here = not hasattr(source, "read")
        if opened_here:
            source = open(source, "rb")
        try:
            if not parser:
                parser = XMLTreeBuilder()
            while 1:
                data = source.read(32768)
                if not data:
                    break
                parser.feed(data)
            self._root = parser.close()
            return self._root
        finally:
            if opened_here:
                source.close()

    def getiterator(self, tag=None):
        """Create a tree iterator for the root element.

        tag selects which tags to look for (default: all elements).
        """
        assert self._root is not None
        return self._root.getiterator(tag)

    def find(self, path):
        """Find the first toplevel element with the given tag.

        Same as getroot().find(path), except that a leading "/" is
        rewritten to "./".  Returns the element, or None.
        """
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.find(path)

    def findtext(self, path, default=None):
        """Find the text of the first toplevel element with the given tag.

        Same as getroot().findtext(path, default), except that a leading
        "/" is rewritten to "./".
        """
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.findtext(path, default)

    def findall(self, path):
        """Find all toplevel elements with the given tag.

        Same as getroot().findall(path), except that a leading "/" is
        rewritten to "./".
        """
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.findall(path)

    def write(self, file, encoding="us-ascii"):
        """Write the element tree to a file, as XML.

        file is a file name, or a file object opened for writing.
        encoding is the output encoding (default US-ASCII).  An XML
        declaration is written only for encodings other than utf-8 and
        us-ascii.

        A file opened here from a file name is closed before returning,
        so the output is flushed; a caller-supplied file object is left
        open.
        """
        assert self._root is not None
        opened_here = not hasattr(file, "write")
        if opened_here:
            file = open(file, "wb")
        try:
            if not encoding:
                encoding = "us-ascii"
            elif encoding != "utf-8" and encoding != "us-ascii":
                file.write("<?xml version='1.0' encoding='%s'?>\n" % encoding)
            self._write(file, self._root, encoding, {})
        finally:
            if opened_here:
                file.close()

    def _write(self, file, node, encoding, namespaces):
        """Serialize node and its subtree to file.

        namespaces maps namespace URIs to the prefixes currently in
        scope.  Entries added for this node are removed again once its
        end tag has been written.
        """
        # write XML to file
        tag = node.tag
        if tag is Comment:
            file.write("<!-- %s -->" % _escape_cdata(node.text, encoding))
        elif tag is ProcessingInstruction:
            file.write("<?%s?>" % _escape_cdata(node.text, encoding))
        else:
            items = node.items()
            xmlns_items = [] # new namespaces in this scope
            try:
                if isinstance(tag, QName) or tag[:1] == "{":
                    tag, xmlns = fixtag(tag, namespaces)
                    if xmlns: xmlns_items.append(xmlns)
            except TypeError:
                _raise_serialization_error(tag)
            file.write("<" + _encode(tag, encoding))
            if items or xmlns_items:
                items.sort() # lexical order
                for k, v in items:
                    try:
                        if isinstance(k, QName) or k[:1] == "{":
                            k, xmlns = fixtag(k, namespaces)
                            if xmlns: xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(k)
                    try:
                        if isinstance(v, QName):
                            v, xmlns = fixtag(v, namespaces)
                            if xmlns: xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(v)
                    file.write(" %s=\"%s\"" % (_encode(k, encoding),
                                               _escape_attrib(v, encoding)))
                for k, v in xmlns_items:
                    file.write(" %s=\"%s\"" % (_encode(k, encoding),
                                               _escape_attrib(v, encoding)))
            if node.text or len(node):
                file.write(">")
                if node.text:
                    file.write(_escape_cdata(node.text, encoding))
                for n in node:
                    self._write(file, n, encoding, namespaces)
                file.write("</" + _encode(tag, encoding) + ">")
            else:
                file.write(" />")
            # namespaces declared on this element go out of scope here
            for k, v in xmlns_items:
                del namespaces[v]
        if node.tail:
            file.write(_escape_cdata(node.tail, encoding))

715 716 # -------------------------------------------------------------------- 717 # helpers 718 719 ## 720 # Checks if an object appears to be a valid element object. 721 # 722 # @param An element instance. 723 # @return A true value if this is an element object. 724 # @defreturn flag 725

def iselement(element):
    """Check whether an object appears to be a valid element object.

    Returns a true value for _ElementInterface instances and for any
    other object that has a "tag" attribute.
    """
    # FIXME: not sure about this; might be a better idea to look
    # for tag/attrib/text attributes
    is_native = isinstance(element, _ElementInterface)
    return is_native or hasattr(element, "tag")

730 731 ## 732 # Writes an element tree or element structure to sys.stdout. This 733 # function should be used for debugging only. 734 #  735 # The exact output format is implementation dependent. In this 736 # version, it's written as an ordinary XML file. 737 # 738 # @param elem An element tree or an individual element. 739

def dump(elem):
    """Write an element tree or element structure to sys.stdout.

    Intended for debugging only.  The output is written as ordinary XML,
    followed by a newline unless the root element's tail already ends
    with one.
    """
    # debugging
    if isinstance(elem, ElementTree):
        tree = elem
    else:
        tree = ElementTree(elem)
    tree.write(sys.stdout)
    tail = tree.getroot().tail
    if not (tail and tail[-1] == "\n"):
        sys.stdout.write("\n")

748

749 -def _encode(s, encoding):

750 try: 751 return s.encode(encoding) 752 except AttributeError: 753 return s # 1.5.2: assume the string uses the right encoding

754 755 if sys.version[:3] == "1.5": 756 _escape = re.compile(r"[&<>\"\x80-\xff]+") # 1.5.2 757 else: 758 _escape = re.compile(eval(r'u"[&<>\"\u0080-\uffff]+"')) 759 760 _escape_map = { 761 "&": "&", 762 "<": "<", 763 ">": ">", 764 '"': """, 765 } 766 767 _namespace_map = { 768 # "well-known" namespace prefixes 769 "http://www.w3.org/XML/1998/namespace": "xml", 770 "http://www.w3.org/1999/xhtml": "html", 771 "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf", 772 "http://schemas.xmlsoap.org/wsdl/": "wsdl", 773 } 774

775 -def _raise_serialization_error(text):

776 raise TypeError( 777 "cannot serialize %r (type %s)" % (text, type(text).__name__) 778 )

779

def _encode_entity(text, pattern=_escape):
    """Map reserved and non-ascii characters to entities.

    Every run matched by pattern is rewritten character by character:
    characters listed in _escape_map get their mapped entity, all others
    a numerical character reference.  The result is encoded as ascii.
    A TypeError is reported through _raise_serialization_error.
    """
    def escape_entities(m, map=_escape_map):
        pieces = []
        for ch in m.group():
            entity = map.get(ch)
            if entity is None:
                entity = "&#%d;" % ord(ch)
            pieces.append(entity)
        return string.join(pieces, "")
    try:
        escaped = pattern.sub(escape_entities, text)
        return _encode(escaped, "ascii")
    except TypeError:
        _raise_serialization_error(text)

#
# the following functions assume an ascii-compatible encoding
# (or "utf-16")

def _escape_cdata(text, encoding=None, replace=string.replace):
    """Escape character data for use as element content.

    When encoding is given, text is encoded first; if that fails with a
    UnicodeError, the result of _encode_entity(text) is returned
    instead.  Otherwise "&", "<" and ">" are replaced with the
    predefined XML entities.  The ampersand is handled first so the
    entities inserted afterwards are not escaped a second time.

    A TypeError or AttributeError is reported through
    _raise_serialization_error.
    """
    try:
        if encoding:
            try:
                text = _encode(text, encoding)
            except UnicodeError:
                return _encode_entity(text)
        text = replace(text, "&", "&amp;")
        text = replace(text, "<", "&lt;")
        text = replace(text, ">", "&gt;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)

814

def _escape_attrib(text, encoding=None, replace=string.replace):
    """Escape a string for use as an attribute value.

    When encoding is given, text is encoded first; if that fails with a
    UnicodeError, the result of _encode_entity(text) is returned
    instead.  Otherwise "&", "'", '"', "<" and ">" are replaced with
    the predefined XML entities.  The ampersand is handled first so the
    entities inserted afterwards are not escaped a second time.

    A TypeError or AttributeError is reported through
    _raise_serialization_error.
    """
    try:
        if encoding:
            try:
                text = _encode(text, encoding)
            except UnicodeError:
                return _encode_entity(text)
        text = replace(text, "&", "&amp;")
        text = replace(text, "'", "&apos;") # FIXME: overkill
        text = replace(text, "\"", "&quot;")
        text = replace(text, "<", "&lt;")
        text = replace(text, ">", "&gt;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)

831

def fixtag(tag, namespaces):
    """Turn a decorated tag ({uri}tag) into a prefixed tag.

    namespaces maps namespace URIs to prefixes already in scope; a newly
    assigned prefix is recorded in it.  Returns a tuple of the
    "prefix:tag" string and the namespace declaration to emit, which is
    an (attribute name, uri) pair, or None when no declaration is needed.
    """
    if isinstance(tag, QName):
        tag = tag.text
    namespace_uri, local = string.split(tag[1:], "}", 1)
    prefix = namespaces.get(namespace_uri)
    if prefix is not None:
        # already declared in an enclosing scope
        return "%s:%s" % (prefix, local), None
    prefix = _namespace_map.get(namespace_uri)
    if prefix is None:
        prefix = "ns%d" % len(namespaces)
    namespaces[namespace_uri] = prefix
    xmlns = None
    if prefix != "xml":
        # the "xml" prefix is never declared
        xmlns = ("xmlns:%s" % prefix, namespace_uri)
    return "%s:%s" % (prefix, local), xmlns

851 852 ## 853 # Parses an XML document into an element tree. 854 # 855 # @param source A filename or file object containing XML data. 856 # @param parser An optional parser instance. If not given, the 857 # standard {@link XMLTreeBuilder} parser is used. 858 # @return An ElementTree instance 859

def parse(source, parser=None):
    """Parse an XML document into an element tree.

    source is a filename or file object containing XML data.  parser is
    an optional parser instance, passed on to ElementTree.parse.
    Returns an ElementTree instance.
    """
    document = ElementTree()
    document.parse(source, parser)
    return document

864 865 ## 866 # Parses an XML document into an element tree incrementally, and reports 867 # what's going on to the user. 868 # 869 # @param source A filename or file object containing XML data. 870 # @param events A list of events to report back. If omitted, only "end" 871 # events are reported. 872 # @return A (event, elem) iterator. 873

class iterparse:
    """Parse an XML document incrementally, reporting events as it goes.

    source is a filename or file object containing XML data.  events is
    a list of events to report back ("start", "end", "start-ns",
    "end-ns"); if omitted, only "end" events are reported.  Iterating
    yields (event, elem) pairs.  Once the input is exhausted, the root
    element is available as the root attribute.

    A file opened here from a file name is closed when the end of the
    input is reached; a caller-supplied file object is left open.
    """

    def __init__(self, source, events=None):
        # remember whether the file is ours to close
        self._close_file = not hasattr(source, "read")
        if self._close_file:
            source = open(source, "rb")
        self._file = source
        self._events = []
        self._index = 0
        self.root = self._root = None
        self._parser = XMLTreeBuilder()
        # wire up the parser for event reporting
        parser = self._parser._parser
        append = self._events.append
        if events is None:
            events = ["end"]
        for event in events:
            if event == "start":
                try:
                    parser.ordered_attributes = 1
                    parser.specified_attributes = 1
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start_list):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
                except AttributeError:
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
            elif event == "end":
                def handler(tag, event=event, append=append,
                            end=self._parser._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event == "start-ns":
                def handler(prefix, uri, event=event, append=append):
                    try:
                        uri = _encode(uri, "ascii")
                    except UnicodeError:
                        pass
                    append((event, (prefix or "", uri)))
                parser.StartNamespaceDeclHandler = handler
            elif event == "end-ns":
                def handler(prefix, event=event, append=append):
                    append((event, None))
                parser.EndNamespaceDeclHandler = handler

    def next(self):
        """Return the next (event, elem) pair.

        Buffered events are handed out first; when the buffer is empty,
        another chunk of the file is fed to the parser.  Raises
        StopIteration (IndexError where StopIteration does not exist)
        once the input is exhausted.
        """
        while 1:
            try:
                item = self._events[self._index]
            except IndexError:
                if self._parser is None:
                    self.root = self._root
                    try:
                        raise StopIteration
                    except NameError:
                        raise IndexError
                # load event buffer
                del self._events[:]
                self._index = 0
                data = self._file.read(16384)
                if data:
                    self._parser.feed(data)
                else:
                    # end of input: finish parsing and release the file
                    # if it was opened by this object
                    try:
                        self._root = self._parser.close()
                        self._parser = None
                    finally:
                        if self._close_file:
                            self._file.close()
                            self._close_file = 0
            else:
                self._index = self._index + 1
                return item

    # use the iterator protocol where available, and fall back on the
    # sequence protocol otherwise
    try:
        iter
        def __iter__(self):
            return self
    except NameError:
        def __getitem__(self, index):
            return self.next()

952 953 ## 954 # Parses an XML document from a string constant. This function can 955 # be used to embed "XML literals" in Python code. 956 # 957 # @param source A string containing XML data. 958 # @return An Element instance. 959 # @defreturn Element 960

def XML(text):
    """Parse an XML document from a string constant.

    Can be used to embed "XML literals" in Python code.  Returns the
    element produced by closing the parser.
    """
    builder = XMLTreeBuilder()
    builder.feed(text)
    return builder.close()

965 966 ## 967 # Parses an XML document from a string constant, and also returns 968 # a dictionary which maps from element id:s to elements. 969 # 970 # @param source A string containing XML data. 971 # @return A tuple containing an Element instance and a dictionary. 972 # @defreturn (Element, dictionary) 973

def XMLID(text):
    """Parse an XML document from a string constant, collecting ids.

    Returns a tuple of the root element and a dictionary that maps each
    non-empty "id" attribute value to its element.
    """
    builder = XMLTreeBuilder()
    builder.feed(text)
    root = builder.close()
    by_id = {}
    for node in root.getiterator():
        ident = node.get("id")
        if ident:
            by_id[ident] = node
    return root, by_id

984 985 ## 986 # Parses an XML document from a string constant. Same as {@link #XML}. 987 # 988 # @def fromstring(text) 989 # @param source A string containing XML data. 990 # @return An Element instance. 991 # @defreturn Element 992 993 fromstring = XML 994 995 ## 996 # Generates a string representation of an XML element, including all 997 # subelements. 998 # 999 # @param element An Element instance. 1000 # @return An encoded string containing the XML data. 1001 # @defreturn string 1002

def tostring(element, encoding=None):
    # Serialise `element` (wrapped in an ElementTree) into an in-memory
    # list of chunks and return the chunks joined into one string.
    #
    # The collector object exposes nothing but a write() method, bound
    # directly to list.append, so every chunk written by
    # ElementTree.write ends up in `chunks` in order.
    class dummy:
        pass

    chunks = []
    sink = dummy()
    sink.write = chunks.append
    ElementTree(element).write(sink, encoding)
    # str.join instead of string.join(): the module-level function was
    # removed from the `string` module in Python 3, the method form works
    # on every interpreter that has string methods.
    return "".join(chunks)

##
# Generic element structure builder.  This builder converts a sequence
# of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link
# #TreeBuilder.end} method calls to a well-formed element structure.
#
# You can use this class to build an element structure using a custom XML
# parser, or a parser for some other XML-like format.
#
# @param element_factory Optional element factory.  This factory
#    is called to create new Element instances, as necessary.


class TreeBuilder:

    def __init__(self, element_factory=None):
        self._data = [] # data collector
        self._elem = [] # element stack
        self._last = None # last element
        self._tail = None # true if we're after an end tag
        if element_factory is None:
            element_factory = _ElementInterface
        # called as factory(tag, attrs) for every start() call
        self._factory = element_factory

    ##
    # Finishes building and returns the toplevel document element.
    # Character data still sitting in the collector is not flushed
    # by this method.
    #
    # @return An Element instance.
    # @defreturn Element
    # @exception AssertionError If elements are still open, or if no
    #     element was ever started.

    def close(self):
        assert len(self._elem) == 0, "missing end tags"
        # identity test: never dispatch to the element's own comparison
        # methods just to find out whether an element exists
        assert self._last is not None, "missing toplevel element"
        return self._last

    ##
    # (Internal) Joins the collected character data and stores it as
    # the text of the last element (after a start tag) or as its tail
    # (after an end tag).  Data collected before any element has been
    # started is discarded.

    def _flush(self):
        if self._data:
            if self._last is not None:
                # str.join instead of string.join(), which no longer
                # exists in the Python 3 `string` module
                text = "".join(self._data)
                if self._tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
            self._data = []

    ##
    # Adds text to the current element.
    #
    # @param data A string.  This should be either an 8-bit string
    #    containing ASCII text, or a Unicode string.

    def data(self, data):
        self._data.append(data)

    ##
    # Opens a new element.
    #
    # @param tag The element name.
    # @param attrs A dictionary containing element attributes.
    # @return The opened element.
    # @defreturn Element

    def start(self, tag, attrs):
        self._flush()
        self._last = elem = self._factory(tag, attrs)
        if self._elem:
            # attach to the innermost element that is still open
            self._elem[-1].append(elem)
        self._elem.append(elem)
        self._tail = 0
        return elem

    ##
    # Closes the current element.
    #
    # @param tag The element name.
    # @return The closed element.
    # @defreturn Element
    # @exception AssertionError If the tag does not match the element
    #     being closed.

    def end(self, tag):
        self._flush()
        self._last = self._elem.pop()
        assert self._last.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   self._last.tag, tag)
        self._tail = 1
        return self._last


1099 1100 ## 1101 # Element structure builder for XML source data, based on the 1102 # expat parser. 1103 # 1104 # @keyparam target Target object. If omitted, the builder uses an 1105 # instance of the standard {@link #TreeBuilder} class. 1106 # @keyparam html Predefine HTML entities. This flag is not supported 1107 # by the current implementation. 1108 # @see #ElementTree 1109 # @see #TreeBuilder 1110

class XMLTreeBuilder:

    def __init__(self, html=0, target=None):
        try:
            from xml.parsers import expat
        except ImportError:
            raise ImportError(
                "No module named expat; use SimpleXMLTreeBuilder instead"
                )
        # "}" is the namespace separator; see _fixname for how the
        # resulting names are completed with a leading "{"
        self._parser = parser = expat.ParserCreate(None, "}")
        if target is None:
            target = TreeBuilder()
        self._target = target
        self._names = {} # name memo cache
        # callbacks
        parser.DefaultHandlerExpand = self._default
        parser.StartElementHandler = self._start
        parser.EndElementHandler = self._end
        parser.CharacterDataHandler = self._data
        # let expat do the buffering, if supported
        try:
            self._parser.buffer_text = 1
        except AttributeError:
            pass
        # use new-style attribute handling, if supported
        try:
            self._parser.ordered_attributes = 1
            self._parser.specified_attributes = 1
            parser.StartElementHandler = self._start_list
        except AttributeError:
            pass
        # NOTE: an `encoding` local used to be computed here from
        # parser.returns_unicode, but its only consumer was commented
        # out.  The dead read is gone because parsers that lack that
        # attribute made the constructor fail with AttributeError.
        self._doctype = None # list of doctype tokens while inside one
        self.entity = {} # entity name -> replacement text

    def _fixtext(self, text):
        # convert text string to ascii, if possible
        try:
            return _encode(text, "ascii")
        except UnicodeError:
            return text

    def _fixname(self, key):
        # expand qname, and convert name string to ascii, if possible
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name = self._fixtext(name)
        return name

    def _start(self, tag, attrib_in):
        # start handler used when attributes arrive as a dictionary
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        for key, value in attrib_in.items():
            attrib[fixname(key)] = self._fixtext(value)
        return self._target.start(tag, attrib)

    def _start_list(self, tag, attrib_in):
        # start handler used when attributes arrive as a flat
        # [name, value, name, value, ...] sequence
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attrib_in:
            for i in range(0, len(attrib_in), 2):
                attrib[fixname(attrib_in[i])] = self._fixtext(attrib_in[i+1])
        return self._target.start(tag, attrib)

    def _data(self, text):
        return self._target.data(self._fixtext(text))

    def _end(self, tag):
        return self._target.end(self._fixname(tag))

    def _default(self, text):
        # Handles everything the other callbacks do not: entity
        # references are resolved through self.entity, and the pieces
        # of a doctype declaration are collected until the name and
        # identifiers are complete, at which point doctype() is called.
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                # only the lookup is guarded, so that a KeyError raised
                # inside the target's data() is not misreported as an
                # undefined entity
                replacement = self.entity[text[1:-1]]
            except KeyError:
                from xml.parsers import expat
                raise expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self._parser.ErrorLineNumber,
                    self._parser.ErrorColumnNumber)
                    )
            self._target.data(replacement)
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            # string method instead of string.strip(), which no longer
            # exists in the Python 3 `string` module
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, kind, pubid, system = self._doctype
                elif kind == "SYSTEM" and n == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    return
                if pubid:
                    pubid = pubid[1:-1] # drop the surrounding quotes
                self.doctype(name, pubid, system[1:-1])
                self._doctype = None

    ##
    # Handles a doctype declaration.  The default implementation does
    # nothing; override it in a subclass to receive the values.
    #
    # @param name Doctype name.
    # @param pubid Public identifier.
    # @param system System identifier.

    def doctype(self, name, pubid, system):
        pass

    ##
    # Feeds data to the parser.
    #
    # @param data Encoded data.

    def feed(self, data):
        self._parser.Parse(data, 0)

    ##
    # Finishes feeding data to the parser.
    #
    # @return An element structure.
    # @defreturn Element

    def close(self):
        self._parser.Parse("", 1) # end of data
        tree = self._target.close()
        del self._target, self._parser # get rid of circular references
        return tree



# compatibility: XMLParser is a second name bound to the XMLTreeBuilder
# class itself (not a subclass or wrapper)
XMLParser = XMLTreeBuilder


Source Code for Module nltk.etree.ElementTree