1
2
3
4
5
6
7
8 """
9 Corpus reader for corpora whose documents are xml files.
10
11 (note -- not named 'xml' to avoid conflicting w/ standard xml package)
12 """
13
14 from nltk.corpus.reader.api import CorpusReader
15 from nltk.corpus.reader.util import *
16 from nltk.data import SeekableUnicodeStreamReader
17 from nltk.internals import deprecated, ElementWrapper
18 import codecs
19
20
21 try: from xml.etree import cElementTree as ElementTree
22 except ImportError: from nltk.etree import ElementTree
23
25 """
26 Corpus reader for corpora whose documents are xml files.
27
28 Note that the C{XMLCorpusReader} constructor does not take an
29 C{encoding} argument, because the unicode encoding is specified by
30 the XML files themselves. See the XML specs for more info.
31 """
32 - def __init__(self, root, files, wrap_etree=False):
35
def xml(self, fileid=None):
    """
    Parse a single XML document of this corpus and return its root
    element.

    @type fileid: C{str}
    @param fileid: The identifier of the file to parse.  It may be
        omitted (C{None}) only when the corpus consists of exactly
        one file, in which case that file is used.
    @return: The root element of the parsed document; it is wrapped
        in an L{ElementWrapper} when C{self._wrap_etree} is true.
    @raise TypeError: If C{fileid} is not a single string (for
        example a list of file identifiers, or C{None} when the
        corpus holds more than one file).
    """
    # Fall back to the only file in the corpus, if there is exactly one.
    if fileid is None and len(self._files) == 1:
        fileid = self._files[0]
    if not isinstance(fileid, basestring):
        raise TypeError('Expected a single file identifier string')

    # ElementTree.parse() does not close a stream that was handed to
    # it, so close it explicitly -- even if parsing fails.
    stream = self.abspath(fileid).open()
    try:
        elt = ElementTree.parse(stream).getroot()
    finally:
        stream.close()

    # Optionally wrap the element for backwards compatibility.
    if self._wrap_etree:
        elt = ElementWrapper(elt)

    return elt
49
50 - def raw(self, files=None):
54
55
56 @deprecated("Use .raw() or .xml() instead.")
57 - def read(self, items=None, format='xml'):
61
62
64 """
65 A corpus view that selects out specified elements from an XML
66 file, and provides a flat list-like interface for accessing them.
67 (Note: C{XMLCorpusView} is not used by L{XMLCorpusReader} itself,
68 but may be used by subclasses of L{XMLCorpusReader}.)
69
70 Every XML corpus view has a X{tag specification}, indicating what
71 XML elements should be included in the view; and each (non-nested)
72 element that matches this specification corresponds to one item in
73 the view. Tag specifications are regular expressions over tag
74 paths, where a tag path is a list of element tag names, separated
75 by '/', indicating the ancestry of the element. Some examples:
76
77 - C{'foo'}: A top-level element whose tag is C{foo}.
78 - C{'foo/bar'}: An element whose tag is C{bar} and whose parent
79 is a top-level element whose tag is C{foo}.
80 - C{'.*/foo'}: An element whose tag is C{foo}, appearing anywhere
81 in the xml tree.
82 - C{'.*/(foo|bar)'}: An element whose tag is C{foo} or C{bar},
83 appearing anywhere in the xml tree.
84
85 The view items are generated from the selected XML elements via
86 the method L{handle_elt()}. By default, this method returns the
87 element as-is (i.e., as an ElementTree object); but it can be
88 overridden, either via subclassing or via the C{elt_handler}
89 constructor parameter.
90 """
91
92
93
94 _DEBUG = False
95
96
97 _BLOCK_SIZE = 1024
98
def __init__(self, filename, tagspec, elt_handler=None):
    """
    Create a new corpus view based on a specified XML file.

    Note that the C{XMLCorpusView} constructor does not take an
    C{encoding} argument, because the unicode encoding is
    specified by the XML files themselves.

    @param filename: The XML file to view.  It is handed to
        L{_detect_encoding()} and to the
        C{StreamBackedCorpusView} constructor.

    @type tagspec: C{str}
    @param tagspec: A tag specification, indicating what XML
        elements should be included in the view.  Each non-nested
        element that matches this specification corresponds to one
        item in the view.

    @param elt_handler: A function used to transform each element
        to a value for the view.  If no handler is specified, then
        L{self.handle_elt()} is called, which returns the element
        as an ElementTree object.  The signature of elt_handler is::

            elt_handler(elt, tagspec) -> value
    """
    # A caller-supplied handler shadows the handle_elt() method on
    # this instance only.
    if elt_handler:
        self.handle_elt = elt_handler

    # The tag specification for this corpus view, anchored at the end
    # of the tag path.  NOTE: the appended \Z binds only to the last
    # branch of a top-level '|', so alternations should be
    # parenthesized (as in '.*/(foo|bar)').
    self._tagspec = re.compile(tagspec+r'\Z')

    # A dictionary mapping from file positions (as returned by
    # C{stream.tell()}) to XML contexts.  An XML context is a tuple
    # of XML tag names, indicating which tags have not yet been
    # closed.  Position 0 starts with no open tags.
    self._tag_context = {0: ()}

    # The encoding is sniffed from the file itself before the base
    # class is initialised with it.
    encoding = self._detect_encoding(filename)
    StreamBackedCorpusView.__init__(self, filename, encoding=encoding)
133
def _detect_encoding(self, filename):
    """
    Guess the character encoding of an XML file from its first line.

    A byte-order mark takes precedence; otherwise the C{encoding}
    pseudo-attribute of the XML declaration is used; if neither is
    present, C{'utf-8'} is returned.

    @param filename: Either a C{PathPointer} (opened via its
        C{open()} method) or a path that can be passed to the
        builtin C{open()}.
    @rtype: C{str}
    @return: The name of the detected encoding.
    """
    if isinstance(filename, PathPointer):
        stream = filename.open()
    else:
        stream = open(filename, 'rb')
    # Only the first line is needed; do not leak the file handle.
    try:
        s = stream.readline()
    finally:
        stream.close()

    # Byte-order marks.  UTF-32 must be tested before UTF-16, because
    # the UTF-32-LE mark begins with the bytes of the UTF-16-LE mark.
    bom_encodings = (
        (codecs.BOM_UTF32_BE, 'utf-32-be'),
        (codecs.BOM_UTF32_LE, 'utf-32-le'),
        (codecs.BOM_UTF16_BE, 'utf-16-be'),
        (codecs.BOM_UTF16_LE, 'utf-16-le'),
        (codecs.BOM_UTF8, 'utf-8'),
    )
    for bom, encoding in bom_encodings:
        if s.startswith(bom):
            return encoding

    # XML declaration: <?xml ... encoding="..." ?>.  The '?' has to be
    # escaped; unescaped it would merely make the '<' optional, and a
    # real declaration would never match.
    m = re.match(r'\s*<\?xml\b.*\bencoding="([^"]+)"', s)
    if m:
        return m.group(1)
    m = re.match(r"\s*<\?xml\b.*\bencoding='([^']+)'", s)
    if m:
        return m.group(1)

    # No encoding found -- what should the default be?
    return 'utf-8'
155
def handle_elt(self, elt, context):
    """
    Convert an element into an appropriate value for inclusion in
    the view.  Unless overridden by a subclass or by the
    C{elt_handler} constructor argument, this method simply
    returns C{elt}.

    @return: The view value corresponding to C{elt}.

    @type elt: C{ElementTree}
    @param elt: The element that should be converted.

    @type context: C{str}
    @param context: A string composed of element tags separated by
        forward slashes, indicating the XML context of the given
        element.  For example, the string C{'foo/bar/baz'}
        indicates that the element is a C{baz} element whose
        parent is a C{bar} element and whose grandparent is a
        top-level C{foo} element.
    """
    # Default behaviour: the element itself is the view item; the
    # context is ignored.
    return elt
177
178
179
180 _VALID_XML_RE = re.compile(r"""
181 [^<]*
182 (
183 ((<!--.*?-->) | # comment
184 (<![CDATA[.*?]]) | # raw character data
185 (<!DOCTYPE\s+[^\[]*(\[[^\]]*])?\s*>) | # doctype decl
186 (<[^>]*>)) # tag or PI
187 [^<]*)*
188 \Z""",
189 re.DOTALL|re.VERBOSE)
190
191
192
193 _XML_TAG_NAME = re.compile('<\s*/?\s*([^\s>]+)')
194
195
196
197
198
199 _XML_PIECE = re.compile(r"""
200 # Include these so we can skip them:
201 (?P<COMMENT> <!--.*?--> )|
202 (?P<CDATA> <![CDATA[.*?]]> )|
203 (?P<PI> <\?.*?\?> )|
204 (?P<DOCTYPE> <!DOCTYPE\s+[^\[]*(\[[^\]]*])?\s*> )|
205 # These are the ones we actually care about:
206 (?P<EMPTY_ELT_TAG> <\s*[^>/\?!\s][^>]*/\s*> )|
207 (?P<START_TAG> <\s*[^>/\?!\s][^>]*> )|
208 (?P<END_TAG> <\s*/[^>/\?!\s][^>]*> )""",
209 re.DOTALL|re.VERBOSE)
210
def _read_xml_fragment(self, stream):
    """
    Read a string from the given stream that does not contain any
    un-closed tags.  In particular, this function first reads a
    block from the stream of size L{self._BLOCK_SIZE}.  It then
    checks if that block contains an un-closed tag.  If it does,
    then this function either backtracks to the last '<', or reads
    another block.

    @param stream: The stream to read from; it must support
        C{read()}, C{tell()} and C{seek()}.
    @return: The fragment that was read; the empty string at end
        of file.
    @raise ValueError: If a C{'>'} is found before any C{'<'} in a
        fragment that contains an un-closed tag, or if the stream
        ends inside an un-closed tag.
    """
    fragment = ''

    # Remember where the fragment starts.  This has to happen once,
    # before the loop: `fragment` accumulates across iterations, and
    # the backtracking offset below is relative to its first character.
    if isinstance(stream, SeekableUnicodeStreamReader):
        startpos = stream.tell()

    while True:
        # Read a block and add it to the fragment.
        xml_block = stream.read(self._BLOCK_SIZE)
        fragment += xml_block

        # Do we have a well-formed xml fragment?
        if self._VALID_XML_RE.match(fragment):
            return fragment

        # Do we have a fragment that will never be well-formed?
        first_bracket = re.search('[<>]', fragment)
        if first_bracket.group(0) == '>':
            pos = stream.tell() - (len(fragment) - first_bracket.end())
            raise ValueError('Unexpected ">" near char %s' % pos)

        # End of file?
        if not xml_block:
            raise ValueError('Unexpected end of file: tag not closed')

        # If everything before the last '<' is well-formed, then
        # backtrack the stream to that '<' and return the prefix.
        last_open_bracket = fragment.rfind('<')
        if last_open_bracket > 0:
            if self._VALID_XML_RE.match(fragment[:last_open_bracket]):
                if isinstance(stream, SeekableUnicodeStreamReader):
                    stream.seek(startpos)
                    stream.char_seek_forward(last_open_bracket)
                else:
                    stream.seek(-(len(fragment)-last_open_bracket), 1)
                return fragment[:last_open_bracket]

        # Otherwise, read another block.  (i.e., return to the
        # top of the loop.)
255
256
257
258
def read_block(self, stream, tagspec=None, elt_handler=None):
    """
    Read from C{stream} until we find at least one element that
    matches C{tagspec}, and return the result of applying
    C{elt_handler} to each element found.

    @param stream: The stream to read from.  Its current position
        must be one for which an XML context has been recorded in
        C{self._tag_context}.
    @param tagspec: The tag specification to match tag paths
        against; defaults to C{self._tagspec}.
    @param elt_handler: A function C{(elt, context) -> value};
        defaults to C{self.handle_elt}.
    @rtype: C{list}
    @return: One handler result per matching element found; an
        empty list if end of file is reached first.
    @raise ValueError: If the file ends inside a matching element,
        or if an end tag does not match the innermost open tag.
    """
    if tagspec is None:
        tagspec = self._tagspec
    if elt_handler is None:
        elt_handler = self.handle_elt

    # Use a stack of strings to keep track of our context.  Check for
    # a missing entry *before* calling list(), which would otherwise
    # fail with an unrelated TypeError.
    saved_context = self._tag_context.get(stream.tell())
    assert saved_context is not None, (
        'no XML context recorded for this stream position')
    context = list(saved_context)

    elts = []

    elt_start = None  # where does the elt start
    elt_depth = None  # what context depth
    elt_text = ''

    while not elts or elt_start is not None:
        if isinstance(stream, SeekableUnicodeStreamReader):
            startpos = stream.tell()
        xml_fragment = self._read_xml_fragment(stream)

        # End of file.
        if not xml_fragment:
            if elt_start is None:
                break
            else:
                raise ValueError('Unexpected end of file')

        # Process each <tag> in the xml fragment.
        for piece in self._XML_PIECE.finditer(xml_fragment):
            if self._DEBUG:
                # Single parenthesized argument: prints identically
                # as a print statement and as a print() call.
                print('%25s %s' % ('/'.join(context)[-20:], piece.group()))

            if piece.group('START_TAG'):
                name = self._XML_TAG_NAME.match(piece.group()).group(1)
                # Keep context up-to-date.
                context.append(name)
                # Is this one of the elts we're looking for?
                if elt_start is None:
                    if re.match(tagspec, '/'.join(context)):
                        elt_start = piece.start()
                        elt_depth = len(context)

            elif piece.group('END_TAG'):
                name = self._XML_TAG_NAME.match(piece.group()).group(1)
                # Sanity checks:
                if not context:
                    raise ValueError('Unmatched tag </%s>' % name)
                if name != context[-1]:
                    raise ValueError('Unmatched tag <%s>...</%s>' %
                                     (context[-1], name))
                # Is this the end of an element?
                if elt_start is not None and elt_depth == len(context):
                    elt_text += xml_fragment[elt_start:piece.end()]
                    elts.append((elt_text, '/'.join(context)))
                    elt_start = elt_depth = None
                    elt_text = ''
                # Keep context up-to-date.
                context.pop()

            elif piece.group('EMPTY_ELT_TAG'):
                name = self._XML_TAG_NAME.match(piece.group()).group(1)
                if elt_start is None:
                    # Join through a list so that a root-level empty
                    # element gets the path 'name', not '/name'.
                    elt_path = '/'.join(context + [name])
                    if re.match(tagspec, elt_path):
                        elts.append((piece.group(), elt_path))

        if elt_start is not None:
            # If we haven't found any elements yet, then keep
            # looping until we do.
            if not elts:
                elt_text += xml_fragment[elt_start:]
                elt_start = 0

            # If we've found at least one element, then try
            # backtracking to the start of the element that we're
            # inside of.
            else:
                # Take back the last start-tag, and return what
                # we've gotten so far (elts is non-empty).
                if self._DEBUG:
                    print(' '*36+'(backtrack)')
                if isinstance(stream, SeekableUnicodeStreamReader):
                    stream.seek(startpos)
                    stream.char_seek_forward(elt_start)
                else:
                    stream.seek(-(len(xml_fragment)-elt_start), 1)
                context = context[:elt_depth-1]
                elt_start = elt_depth = None
                elt_text = ''

    # Update the _tag_context dict.
    pos = stream.tell()
    if pos in self._tag_context:
        assert tuple(context) == self._tag_context[pos]
    else:
        self._tag_context[pos] = tuple(context)

    # Non-ASCII characters are turned into character references so
    # that the element text can be parsed from an ASCII byte string.
    return [elt_handler(ElementTree.fromstring(
                            elt_str.encode('ascii', 'xmlcharrefreplace')),
                        elt_context)
            for (elt_str, elt_context) in elts]
362