nltk.corpus.reader.xmldocs

"""
A corpus view that selects out specified elements from an XML
file, and provides a flat list-like interface for accessing them.
(Note: C{XMLCorpusView} is not used by L{XMLCorpusReader} itself,
but may be used by subclasses of L{XMLCorpusReader}.)

Every XML corpus view has a X{tag specification}, indicating what
XML elements should be included in the view; and each (non-nested)
element that matches this specification corresponds to one item in
the view.  Tag specifications are regular expressions over tag
paths, where a tag path is a list of element tag names, separated
by '/', indicating the ancestry of the element.  Some examples:

  - C{'foo'}: A top-level element whose tag is C{foo}.
  - C{'foo/bar'}: An element whose tag is C{bar} and whose parent
    is a top-level element whose tag is C{foo}.
  - C{'.*/foo'}: An element whose tag is C{foo}, appearing anywhere
    in the xml tree.
  - C{'.*/(foo|bar)'}: An element whose tag is C{foo} or C{bar},
    appearing anywhere in the xml tree.

The view items are generated from the selected XML elements via
the method L{handle_elt()}.  By default, this method returns the
element as-is (i.e., as an ElementTree object); but it can be
overridden, either via subclassing or via the C{elt_handler}
constructor parameter.
"""

#: If true, then display debugging output to stdout when reading
#: blocks.
_DEBUG = False

#: The number of characters read at a time by this corpus reader.
_BLOCK_SIZE = 1024

def __init__(self, filename, tagspec, elt_handler=None):
    """
    Create a new corpus view based on a specified XML file.

    Note that the C{XMLCorpusView} constructor does not take an
    C{encoding} argument, because the unicode encoding is
    specified by the XML files themselves.

    @param filename: The XML file to read; either a C{PathPointer}
        or a path that can be passed to C{open()}.

    @type tagspec: C{str}
    @param tagspec: A tag specification, indicating what XML
        elements should be included in the view.  Each non-nested
        element that matches this specification corresponds to one
        item in the view.  The specification must match the
        I{entire} tag path of an element.

    @param elt_handler: A function used to transform each element
        to a value for the view.  If no handler is specified, then
        L{self.handle_elt()} is called, which returns the element
        as an ElementTree object.  The signature of elt_handler is::

            elt_handler(elt, context) -> value
    """
    if elt_handler is not None:
        self.handle_elt = elt_handler

    #: The tag specification for this corpus view.  The caller's
    #: expression is wrapped in a non-capturing group so that the
    #: end-of-string anchor applies to the whole specification;
    #: without the group, C{'foo|bar'} would become C{foo|bar\Z},
    #: and C{foo} would match any path that merely starts with it.
    self._tagspec = re.compile(r'(?:%s)\Z' % tagspec)

    #: A dictionary mapping from file positions (as returned by
    #: C{stream.tell()}) to XML contexts.  An XML context is a
    #: tuple of XML tag names, indicating which tags have not yet
    #: been closed.
    self._tag_context = {0: ()}

    encoding = self._detect_encoding(filename)
    StreamBackedCorpusView.__init__(self, filename, encoding=encoding)

133

def _detect_encoding(self, filename):
    """
    Guess the character encoding of an XML file from its first line.

    The first line is checked for a unicode byte-order mark; failing
    that, for an C{encoding="..."} (or C{encoding='...'}) attribute
    in an XML declaration.  If neither is found, C{'utf-8'} is
    returned.

    @param filename: Either a C{PathPointer} (opened with its own
        C{open()} method) or a path that is opened in binary mode.
    @rtype: C{str}
    @return: The name of the detected encoding.
    """
    if isinstance(filename, PathPointer):
        infile = filename.open()
    else:
        infile = open(filename, 'rb')
    # Only the first line is needed; release the handle right away
    # instead of leaving it for the garbage collector.
    # NOTE(review): assumes the object returned by PathPointer.open()
    # provides close(), like other file-like objects -- confirm.
    try:
        s = infile.readline()
    finally:
        infile.close()

    # The UTF-32 marks must be tested before the UTF-16 ones:
    # BOM_UTF32_LE starts with the same two bytes as BOM_UTF16_LE, so
    # testing UTF-16 first would misreport UTF-32-LE files.
    if s.startswith(codecs.BOM_UTF32_BE):
        return 'utf-32-be'
    if s.startswith(codecs.BOM_UTF32_LE):
        return 'utf-32-le'
    if s.startswith(codecs.BOM_UTF16_BE):
        return 'utf-16-be'
    if s.startswith(codecs.BOM_UTF16_LE):
        return 'utf-16-le'
    if s.startswith(codecs.BOM_UTF8):
        return 'utf-8'

    # Look for an XML declaration.  The '?' has to be escaped: a bare
    # '<?xml' means "an optional '<' followed by 'xml'", which never
    # matches a real '<?xml ...?>' declaration.
    m = re.match(r'\s*<\?xml\b.*\bencoding="([^"]+)"', s)
    if m:
        return m.group(1)
    m = re.match(r"\s*<\?xml\b.*\bencoding='([^']+)'", s)
    if m:
        return m.group(1)

    # No encoding found -- what should the default be?
    return 'utf-8'

155

def handle_elt(self, elt, context):
    """
    Turn a selected XML element into the value that the view will
    expose for it.  This default implementation is the identity: it
    hands C{elt} back unchanged.  Subclasses may override it, and
    the C{elt_handler} constructor argument replaces it.

    @type elt: C{ElementTree}
    @param elt: The element to convert.

    @type context: C{str}
    @param context: The element's tag path: the tags of its
        ancestors and of the element itself, joined by forward
        slashes.  For example, C{'foo/bar/baz'} denotes a C{baz}
        element inside a C{bar} element inside a top-level C{foo}
        element.

    @return: The view value corresponding to C{elt}.
    """
    return elt

#: A regular expression that matches XML fragments that do not
#: contain any un-closed tags.  Comments and CDATA sections are
#: matched as units, so that any '<' or '>' characters inside them
#: are not mistaken for markup.
_VALID_XML_RE = re.compile(r"""
    [^<]*
    (
      ((<!--.*?-->)                         |  # comment
       (<!\[CDATA\[.*?\]\]>)                |  # raw character data
       (<!DOCTYPE\s+[^\[]*(\[[^\]]*])?\s*>) |  # doctype decl
       (<[^>]*>))                              # tag or PI
      [^<]*)*
    \Z""",
    re.DOTALL|re.VERBOSE)

#: A regular expression used to extract the tag name from a start tag,
#: end tag, or empty-elt tag string.  The name stops at whitespace,
#: '>' or '/', so that C{<foo/>} yields C{foo} rather than C{foo/}.
_XML_TAG_NAME = re.compile(r'<\s*/?\s*([^\s>/]+)')

#: A regular expression used to find all start-tags, end-tags, and
#: empty-elt tags in an XML file.  This regexp is more lenient than
#: the XML spec -- e.g., it allows spaces in some places where the
#: spec does not.
_XML_PIECE = re.compile(r"""
    # Include these so we can skip them:
    (?P<COMMENT>        <!--.*?-->                          )|
    (?P<CDATA>          <!\[CDATA\[.*?\]\]>                 )|
    (?P<PI>             <\?.*?\?>                           )|
    (?P<DOCTYPE>        <!DOCTYPE\s+[^\[]*(\[[^\]]*])?\s*>  )|
    # These are the ones we actually care about:
    (?P<EMPTY_ELT_TAG>  <\s*[^>/\?!\s][^>]*/\s*>            )|
    (?P<START_TAG>      <\s*[^>/\?!\s][^>]*>                )|
    (?P<END_TAG>        <\s*/[^>/\?!\s][^>]*>               )""",
    re.DOTALL|re.VERBOSE)

def _read_xml_fragment(self, stream):
    """
    Read a string from the given stream that does not contain any
    un-closed tags.  In particular, this function first reads a
    block from the stream of size L{self._BLOCK_SIZE}.  It then
    checks if that block contains an un-closed tag.  If it does,
    then this function either backtracks to the last '<', or reads
    another block.

    @param stream: The stream to read from.  It is repositioned so
        that the next read starts right after the returned text.
    @rtype: C{str}
    @return: The fragment that was read; the empty string at end of
        file.
    @raise ValueError: If a '>' is seen before any '<' in a fragment
        that is not well-formed, or if the file ends inside a tag.
    """
    fragment = ''

    # Remember where the fragment starts.  This must be recorded once,
    # before the loop: the fragment may grow over several blocks, and
    # the backtracking offset below is relative to the start of the
    # whole fragment, not to the start of the most recent block.
    if isinstance(stream, SeekableUnicodeStreamReader):
        startpos = stream.tell()

    while True:
        # Read a block and add it to the fragment.
        xml_block = stream.read(self._BLOCK_SIZE)
        fragment += xml_block

        # Do we have a well-formed xml fragment?
        if self._VALID_XML_RE.match(fragment):
            return fragment

        # Do we have a fragment that will never be well-formed?
        # (A fragment without any '<' always passes the check above,
        # so the search below is guaranteed to find a bracket.)
        first_bracket = re.search('[<>]', fragment)
        if first_bracket.group(0) == '>':
            pos = stream.tell() - (len(fragment) - first_bracket.end())
            raise ValueError('Unexpected ">" near char %s' % pos)

        # End of file?
        if not xml_block:
            raise ValueError('Unexpected end of file: tag not closed')

        # If not, then we must be in the middle of a <..tag..>.
        # If appropriate, backtrack to the most recent '<'
        # character.
        last_open_bracket = fragment.rfind('<')
        if last_open_bracket > 0:
            if self._VALID_XML_RE.match(fragment[:last_open_bracket]):
                if isinstance(stream, SeekableUnicodeStreamReader):
                    stream.seek(startpos)
                    stream.char_seek_forward(last_open_bracket)
                else:
                    stream.seek(-(len(fragment)-last_open_bracket), 1)
                return fragment[:last_open_bracket]

        # Otherwise, read another block.  (i.e., return to the
        # top of the loop.)

def read_block(self, stream, tagspec=None, elt_handler=None):
    """
    Read from C{stream} until we find at least one element that
    matches C{tagspec}, and return the result of applying
    C{elt_handler} to each element found.

    @param stream: The stream to read from.  Its current position
        must be one for which an XML context has been recorded in
        C{self._tag_context}.
    @param tagspec: The tag specification to match tag paths
        against (anything accepted by C{re.match}).  Defaults to the
        view's own compiled specification.
    @param elt_handler: A function C{elt_handler(elt, context)}
        applied to each matching element.  Defaults to
        L{self.handle_elt}.
    @rtype: C{list}
    @return: One handler result per matching element; empty if the
        end of the file is reached without finding any.
    @raise ValueError: If an end tag does not match the innermost
        open tag, or if the file ends inside a matching element.
    """
    if tagspec is None: tagspec = self._tagspec
    if elt_handler is None: elt_handler = self.handle_elt

    # Use a stack of strings to keep track of our context.  The
    # lookup result is checked *before* it is copied into a list, so
    # that the check can actually fire.
    saved_context = self._tag_context.get(stream.tell())
    assert saved_context is not None # check this -- could it ever happen?
    context = list(saved_context)

    elts = []

    elt_start = None # where does the elt start
    elt_depth = None # what context depth
    elt_text = ''

    while not elts or elt_start is not None:
        if isinstance(stream, SeekableUnicodeStreamReader):
            startpos = stream.tell()
        xml_fragment = self._read_xml_fragment(stream)

        # End of file.
        if not xml_fragment:
            if elt_start is None: break
            else: raise ValueError('Unexpected end of file')

        # Process each <tag> in the xml fragment.
        for piece in self._XML_PIECE.finditer(xml_fragment):
            if self._DEBUG:
                print('%25s %s' % ('/'.join(context)[-20:], piece.group()))

            if piece.group('START_TAG'):
                name = self._XML_TAG_NAME.match(piece.group()).group(1)
                # Keep context up-to-date.
                context.append(name)
                # Is this one of the elts we're looking for?
                if elt_start is None:
                    if re.match(tagspec, '/'.join(context)):
                        elt_start = piece.start()
                        elt_depth = len(context)

            elif piece.group('END_TAG'):
                name = self._XML_TAG_NAME.match(piece.group()).group(1)
                # sanity checks:
                if not context:
                    raise ValueError('Unmatched tag </%s>' % name)
                if name != context[-1]:
                    raise ValueError('Unmatched tag <%s>...</%s>' %
                                     (context[-1], name))
                # Is this the end of an element?
                if elt_start is not None and elt_depth == len(context):
                    elt_text += xml_fragment[elt_start:piece.end()]
                    elts.append( (elt_text, '/'.join(context)) )
                    elt_start = elt_depth = None
                    elt_text = ''
                # Keep context up-to-date
                context.pop()

            elif piece.group('EMPTY_ELT_TAG'):
                name = self._XML_TAG_NAME.match(piece.group()).group(1)
                if elt_start is None:
                    # Join the name onto the context as a list, so
                    # that a top-level empty element gets the path
                    # 'name' rather than '/name'.
                    elt_path = '/'.join(context + [name])
                    if re.match(tagspec, elt_path):
                        elts.append((piece.group(), elt_path))

        if elt_start is not None:
            # If we haven't found any elements yet, then keep
            # looping until we do.  The part of the element seen so
            # far is saved, and the element continues at offset 0 of
            # the next fragment.
            if not elts:
                elt_text += xml_fragment[elt_start:]
                elt_start = 0

            # If we've found at least one element, then try
            # backtracking to the start of the element that we're
            # inside of.
            else:
                # take back the last start-tag, and return what
                # we've gotten so far (elts is non-empty).
                if self._DEBUG:
                    print(' '*36+'(backtrack)')
                if isinstance(stream, SeekableUnicodeStreamReader):
                    stream.seek(startpos)
                    stream.char_seek_forward(elt_start)
                else:
                    stream.seek(-(len(xml_fragment)-elt_start), 1)
                # Drop the unfinished element (and anything nested
                # in it) from the context.
                context = context[:elt_depth-1]
                elt_start = elt_depth = None
                elt_text = ''

    # Update the _tag_context dict.
    pos = stream.tell()
    if pos in self._tag_context:
        assert tuple(context) == self._tag_context[pos]
    else:
        self._tag_context[pos] = tuple(context)

    return [elt_handler(ElementTree.fromstring(
                            elt_xml.encode('ascii', 'xmlcharrefreplace')),
                        elt_context)
            for (elt_xml, elt_context) in elts]

Source Code for Module nltk.corpus.reader.xmldocs