Package nltk :: Package corpus :: Package reader :: Module xmldocs
[hide private]
[frames | no frames]

Source Code for Module nltk.corpus.reader.xmldocs

  1  # Natural Language Toolkit: XML Corpus Reader 
  2  # 
  3  # Copyright (C) 2001-2008 NLTK Project 
  4  # Author: Steven Bird <[email protected]> 
  5  # URL: <http://nltk.org> 
  6  # For license information, see LICENSE.TXT 
  7   
  8  """ 
  9  Corpus reader for corpora whose documents are xml files. 
 10   
 11  (note -- not named 'xml' to avoid conflicting w/ standard xml package) 
 12  """ 
 13   
 14  from nltk.corpus.reader.api import CorpusReader 
 15  from nltk.corpus.reader.util import * 
 16  from nltk.data import SeekableUnicodeStreamReader 
 17  from nltk.internals import deprecated, ElementWrapper 
 18  import codecs 
 19   
 20  # Use the c version of ElementTree, which is faster, if possible: 
 21  try: from xml.etree import cElementTree as ElementTree 
 22  except ImportError: from nltk.etree import ElementTree 
 23   
class XMLCorpusReader(CorpusReader):
    """
    Corpus reader for corpora whose documents are xml files.

    Note that the C{XMLCorpusReader} constructor does not take an
    C{encoding} argument, because the unicode encoding is specified by
    the XML files themselves.  See the XML specs for more info.
    """
    def __init__(self, root, files, wrap_etree=False):
        """
        @param root: The corpus root; passed unchanged to
            C{CorpusReader.__init__}.
        @param files: The corpus files; passed unchanged to
            C{CorpusReader.__init__}.
        @param wrap_etree: If true, then L{xml()} wraps the parsed root
            element in an C{ElementWrapper} before returning it.
        """
        self._wrap_etree = wrap_etree
        CorpusReader.__init__(self, root, files)

    def xml(self, fileid=None):
        """
        Parse a single corpus file and return its root element.

        @param fileid: The identifier of the file to parse.  It may be
            omitted only if this corpus consists of exactly one file.
        @return: The root ElementTree element of the file (wrapped in
            an C{ElementWrapper} if C{wrap_etree} was requested).
        @raise TypeError: If C{fileid} is not a single string (this
            includes omitting it for a corpus with several files).
        """
        # Make sure we have exactly one file -- no concatenating xml.
        if fileid is None and len(self._files) == 1:
            fileid = self._files[0]
        if not isinstance(fileid, basestring):
            raise TypeError('Expected a single file identifier string')
        # Read the XML in using ElementTree.  ElementTree.parse() does
        # not close a stream that it did not open itself, so close it
        # here (assumes the stream provides close(), as file-like
        # objects do).
        stream = self.abspath(fileid).open()
        try:
            elt = ElementTree.parse(stream).getroot()
        finally:
            stream.close()
        # If requested, wrap it.
        if self._wrap_etree:
            elt = ElementWrapper(elt)
        # Return the ElementTree element.
        return elt

    def raw(self, files=None):
        """
        Return the unparsed contents of the given file(s).

        @param files: A single file identifier, a list of file
            identifiers, or C{None} for every file in the corpus.
        @return: The result of C{concat()} applied to the contents of
            each file, in order.
        """
        if files is None: files = self._files
        elif isinstance(files, basestring): files = [files]
        contents = []
        for f in files:
            stream = self.open(f)
            # Close each stream as soon as it has been read, rather
            # than leaving it to the garbage collector.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    #{ Deprecated since 0.8
    @deprecated("Use .raw() or .xml() instead.")
    def read(self, items=None, format='xml'):
        """
        Deprecated dispatcher: calls L{raw()} when C{format} is
        C{'raw'} and L{xml()} when it is C{'xml'}.

        @raise ValueError: For any other value of C{format}.
        """
        if format == 'raw': return self.raw(items)
        if format == 'xml': return self.xml(items)
        raise ValueError('bad format %r' % format)
    #}
class XMLCorpusView(StreamBackedCorpusView):
    """
    A corpus view that selects out specified elements from an XML
    file, and provides a flat list-like interface for accessing them.
    (Note: C{XMLCorpusView} is not used by L{XMLCorpusReader} itself,
    but may be used by subclasses of L{XMLCorpusReader}.)

    Every XML corpus view has a X{tag specification}, indicating what
    XML elements should be included in the view; and each (non-nested)
    element that matches this specification corresponds to one item in
    the view.  Tag specifications are regular expressions over tag
    paths, where a tag path is a list of element tag names, separated
    by '/', indicating the ancestry of the element.  Some examples:

      - C{'foo'}: A top-level element whose tag is C{foo}.
      - C{'foo/bar'}: An element whose tag is C{bar} and whose parent
        is a top-level element whose tag is C{foo}.
      - C{'.*/foo'}: An element whose tag is C{foo}, appearing anywhere
        in the xml tree.
      - C{'.*/(foo|bar)'}: An element whose tag is C{foo} or C{bar},
        appearing anywhere in the xml tree.

    The view items are generated from the selected XML elements via
    the method L{handle_elt()}.  By default, this method returns the
    element as-is (i.e., as an ElementTree object); but it can be
    overridden, either via subclassing or via the C{elt_handler}
    constructor parameter.
    """

    #: If true, then display debugging output to stdout when reading
    #: blocks.
    _DEBUG = False

    #: The number of characters read at a time by this corpus reader.
    _BLOCK_SIZE = 1024

    def __init__(self, filename, tagspec, elt_handler=None):
        """
        Create a new corpus view based on a specified XML file.

        Note that the C{XMLCorpusView} constructor does not take an
        C{encoding} argument, because the unicode encoding is
        specified by the XML files themselves.

        @param filename: The XML file to read: either a C{PathPointer}
            or a path that can be given to the builtin C{open()}.

        @type tagspec: C{str}
        @param tagspec: A tag specification, indicating what XML
            elements should be included in the view.  Each non-nested
            element that matches this specification corresponds to one
            item in the view.

        @param elt_handler: A function used to transform each element
            to a value for the view.  If no handler is specified, then
            L{self.handle_elt()} is called, which returns the element
            as an ElementTree object.  The signature of elt_handler is::

                elt_handler(elt, tagspec) -> value
        """
        if elt_handler: self.handle_elt = elt_handler

        # The tag specification for this corpus view.  It is anchored
        # with \Z so that it has to match a complete tag path.
        self._tagspec = re.compile(tagspec+r'\Z')

        # A dictionary mapping from file positions (as returned by
        # stream.tell()) to XML contexts.  An XML context is a tuple
        # of XML tag names, indicating which tags have not yet been
        # closed.
        self._tag_context = {0: ()}

        encoding = self._detect_encoding(filename)
        StreamBackedCorpusView.__init__(self, filename, encoding=encoding)

    def _detect_encoding(self, filename):
        """
        Guess the character encoding of an XML file from its first
        line: first from a byte order mark, then from the C{encoding}
        attribute of the XML declaration.

        @param filename: A C{PathPointer} or a path accepted by the
            builtin C{open()}.
        @return: The name of the detected encoding, or C{'utf-8'} if
            neither a byte order mark nor an encoding declaration is
            found.
        """
        if isinstance(filename, PathPointer):
            infile = filename.open()
        else:
            infile = open(filename, 'rb')
        try:
            s = infile.readline()
        finally:
            infile.close()

        # The UTF-32 marks must be tested before the UTF-16 ones: the
        # UTF-16-LE mark (FF FE) is a prefix of the UTF-32-LE mark
        # (FF FE 00 00), so testing UTF-16 first would misreport
        # UTF-32-LE files.
        if s.startswith(codecs.BOM_UTF32_BE):
            return 'utf-32-be'
        if s.startswith(codecs.BOM_UTF32_LE):
            return 'utf-32-le'
        if s.startswith(codecs.BOM_UTF16_BE):
            return 'utf-16-be'
        if s.startswith(codecs.BOM_UTF16_LE):
            return 'utf-16-le'
        if s.startswith(codecs.BOM_UTF8):
            return 'utf-8'
        # The '?' of '<?xml' has to be escaped; unescaped it merely
        # makes the '<' optional, and the pattern can then never match
        # a real XML declaration.
        m = re.match(r'\s*<\?xml\b.*\bencoding="([^"]+)"', s)
        if m: return m.group(1)
        m = re.match(r"\s*<\?xml\b.*\bencoding='([^']+)'", s)
        if m: return m.group(1)
        # No encoding found -- what should the default be?
        return 'utf-8'

    def handle_elt(self, elt, context):
        """
        Convert an element into an appropriate value for inclusion in
        the view.  Unless overridden by a subclass or by the
        C{elt_handler} constructor argument, this method simply
        returns C{elt}.

        @return: The view value corresponding to C{elt}.

        @type elt: C{ElementTree}
        @param elt: The element that should be converted.

        @type context: C{str}
        @param context: A string composed of element tags separated by
            forward slashes, indicating the XML context of the given
            element.  For example, the string C{'foo/bar/baz'}
            indicates that the element is a C{baz} element whose
            parent is a C{bar} element and whose grandparent is a
            top-level C{foo} element.
        """
        return elt

    #: A regular expression that matches XML fragments that do not
    #: contain any un-closed tags.
    _VALID_XML_RE = re.compile(r"""
        [^<]*
        (
          ((<!--.*?-->)                         |  # comment
           (<!\[CDATA\[.*?\]\]>)                |  # raw character data
           (<!DOCTYPE\s+[^\[]*(\[[^\]]*])?\s*>) |  # doctype decl
           (<[^>]*>))                              # tag or PI
          [^<]*)*
        \Z""",
        re.DOTALL|re.VERBOSE)

    #: A regular expression used to extract the tag name from a start tag,
    #: end tag, or empty-elt tag string.  The name may not contain '/',
    #: so that C{<foo/>} yields C{foo} rather than C{foo/}.
    _XML_TAG_NAME = re.compile(r'<\s*/?\s*([^\s>/]+)')

    #: A regular expression used to find all start-tags, end-tags, and
    #: empty-elt tags in an XML file.  This regexp is more lenient than
    #: the XML spec -- e.g., it allows spaces in some places where the
    #: spec does not.
    _XML_PIECE = re.compile(r"""
        # Include these so we can skip them:
        (?P<COMMENT>        <!--.*?-->                          )|
        (?P<CDATA>          <!\[CDATA\[.*?\]\]>                 )|
        (?P<PI>             <\?.*?\?>                           )|
        (?P<DOCTYPE>        <!DOCTYPE\s+[^\[]*(\[[^\]]*])?\s*>  )|
        # These are the ones we actually care about:
        (?P<EMPTY_ELT_TAG>  <\s*[^>/\?!\s][^>]*/\s*>            )|
        (?P<START_TAG>      <\s*[^>/\?!\s][^>]*>                )|
        (?P<END_TAG>        <\s*/[^>/\?!\s][^>]*>               )""",
        re.DOTALL|re.VERBOSE)

    def _read_xml_fragment(self, stream):
        """
        Read a string from the given stream that does not contain any
        un-closed tags.  In particular, this function first reads a
        block from the stream of size L{self._BLOCK_SIZE}.  It then
        checks if that block contains an un-closed tag.  If it does,
        then this function either backtracks to the last '<', or reads
        another block.

        @return: The fragment read; the empty string at end of file.
        @raise ValueError: If a C{'>'} precedes the first C{'<'} of a
            fragment that is not well-formed, or if the file ends
            inside an un-closed tag.
        """
        fragment = ''

        # Remember where the fragment begins.  This is recorded once,
        # before the loop, because the backtracking offset computed
        # below is relative to the start of the whole accumulated
        # fragment, not to the start of the most recent block.
        if isinstance(stream, SeekableUnicodeStreamReader):
            startpos = stream.tell()

        while True:
            # Read a block and add it to the fragment.
            xml_block = stream.read(self._BLOCK_SIZE)
            fragment += xml_block

            # Do we have a well-formed xml fragment?
            if self._VALID_XML_RE.match(fragment):
                return fragment

            # Do we have a fragment that will never be well-formed?
            # (A fragment that failed the check above always contains
            # a '<', so this search cannot come back empty.)
            first_bracket = re.search('[<>]', fragment)
            if first_bracket.group(0) == '>':
                pos = stream.tell() - (
                    len(fragment)-first_bracket.end())
                raise ValueError('Unexpected ">" near char %s' % pos)

            # End of file?
            if not xml_block:
                raise ValueError('Unexpected end of file: tag not closed')

            # If not, then we must be in the middle of a <..tag..>.
            # If appropriate, backtrack to the most recent '<'
            # character.
            last_open_bracket = fragment.rfind('<')
            if last_open_bracket > 0:
                if self._VALID_XML_RE.match(fragment[:last_open_bracket]):
                    if isinstance(stream, SeekableUnicodeStreamReader):
                        stream.seek(startpos)
                        stream.char_seek_forward(last_open_bracket)
                    else:
                        stream.seek(-(len(fragment)-last_open_bracket), 1)
                    return fragment[:last_open_bracket]

            # Otherwise, read another block.  (i.e., return to the
            # top of the loop.)

    def read_block(self, stream, tagspec=None, elt_handler=None):
        """
        Read from C{stream} until we find at least one element that
        matches C{tagspec}, and return the result of applying
        C{elt_handler} to each element found.

        @param stream: The stream to read from.  Its current position
            must be one for which an XML context has been recorded
            (position 0, or a position at which an earlier call to
            this method stopped).
        @param tagspec: The tag specification to match against;
            defaults to the one given to the constructor.
        @param elt_handler: The function applied to each matching
            element; defaults to L{handle_elt()}.
        @return: A list of the values returned by C{elt_handler}; it
            is empty if the end of the file is reached without finding
            a matching element.
        @raise ValueError: If the XML is malformed (unmatched tags, or
            end of file inside an element).
        """
        if tagspec is None: tagspec = self._tagspec
        if elt_handler is None: elt_handler = self.handle_elt

        # Use a stack of strings to keep track of our context.  The
        # check has to come before the conversion to a list, since
        # list(None) would raise before the check could report anything.
        context = self._tag_context.get(stream.tell())
        assert context is not None, 'no XML context for this position'
        context = list(context)

        elts = []

        elt_start = None # where does the elt start
        elt_depth = None # what context depth
        elt_text = ''

        while elts==[] or elt_start is not None:
            if isinstance(stream, SeekableUnicodeStreamReader):
                startpos = stream.tell()
            xml_fragment = self._read_xml_fragment(stream)

            # End of file.
            if not xml_fragment:
                if elt_start is None: break
                else: raise ValueError('Unexpected end of file')

            # Process each <tag> in the xml fragment.
            for piece in self._XML_PIECE.finditer(xml_fragment):
                if self._DEBUG:
                    print('%25s %s' % ('/'.join(context)[-20:],
                                       piece.group()))

                if piece.group('START_TAG'):
                    name = self._XML_TAG_NAME.match(piece.group()).group(1)
                    # Keep context up-to-date.
                    context.append(name)
                    # Is this one of the elts we're looking for?
                    if elt_start is None:
                        if re.match(tagspec, '/'.join(context)):
                            elt_start = piece.start()
                            elt_depth = len(context)

                elif piece.group('END_TAG'):
                    name = self._XML_TAG_NAME.match(piece.group()).group(1)
                    # sanity checks:
                    if not context:
                        raise ValueError('Unmatched tag </%s>' % name)
                    if name != context[-1]:
                        raise ValueError('Unmatched tag <%s>...</%s>' %
                                         (context[-1], name))
                    # Is this the end of an element?
                    if elt_start is not None and elt_depth == len(context):
                        elt_text += xml_fragment[elt_start:piece.end()]
                        elts.append( (elt_text, '/'.join(context)) )
                        elt_start = elt_depth = None
                        elt_text = ''
                    # Keep context up-to-date
                    context.pop()

                elif piece.group('EMPTY_ELT_TAG'):
                    name = self._XML_TAG_NAME.match(piece.group()).group(1)
                    if elt_start is None:
                        # Build the path the same way as for a start
                        # tag, so that a top-level empty element gets
                        # the path 'name' rather than '/name'.
                        path = '/'.join(context + [name])
                        if re.match(tagspec, path):
                            elts.append((piece.group(), path))

            if elt_start is not None:
                # If we haven't found any elements yet, then keep
                # looping until we do.
                if elts == []:
                    elt_text += xml_fragment[elt_start:]
                    elt_start = 0

                # If we've found at least one element, then try
                # backtracking to the start of the element that we're
                # inside of.
                else:
                    # take back the last start-tag, and return what
                    # we've gotten so far (elts is non-empty).
                    if self._DEBUG:
                        print(' '*36+'(backtrack)')
                    if isinstance(stream, SeekableUnicodeStreamReader):
                        stream.seek(startpos)
                        stream.char_seek_forward(elt_start)
                    else:
                        stream.seek(-(len(xml_fragment)-elt_start), 1)
                    context = context[:elt_depth-1]
                    elt_start = elt_depth = None
                    elt_text = ''

        # Update the _tag_context dict.
        pos = stream.tell()
        if pos in self._tag_context:
            assert tuple(context) == self._tag_context[pos]
        else:
            self._tag_context[pos] = tuple(context)

        # Non-ascii characters are turned into character references,
        # so that the text handed to the parser is plain ascii.
        return [elt_handler(ElementTree.fromstring(
                                elt_text.encode('ascii', 'xmlcharrefreplace')),
                            elt_context)
                for (elt_text, elt_context) in elts]
362