1
2
3
4
5
6
7
8 """
9 Functions to find and load NLTK X{resource files}, such as corpora,
10 grammars, and saved processing objects. Resource files are identified
using URLs, such as "C{nltk:corpora/abc/rural.txt}" or
12 "C{http://nltk.org/sample/toy.cfg}". The following URL protocols are
13 supported:
14
15 - "C{file:I{path}}": Specifies the file whose path is C{I{path}}.
16 Both relative and absolute paths may be used.
17
18 - "C{http://I{host}/{path}}": Specifies the file stored on the web
19 server C{I{host}} at path C{I{path}}.
20
21 - "C{nltk:I{path}}": Specifies the file stored in the NLTK data
22 package at C{I{path}}. NLTK will search for these files in the
23 directories specified by L{nltk.data.path}.
24
25 If no protocol is specified, then the default protocol "C{nltk:}" will
26 be used.
27
This module provides two functions that can be used to access a
29 resource file, given its URL: L{load()} loads a given resource, and
30 adds it to a resource cache; and L{retrieve()} copies a given resource
31 to a local file.
32 """
33
34 import sys
35 import os, os.path
36 import textwrap
37 import weakref
38 import yaml
39 import re
40 import urllib2
41 import zipfile
42 import codecs
43 import gzip
44
45 try:
46 import cPickle as pickle
47 except:
48 import pickle
49
50 try:
51 from cStringIO import StringIO
52 except:
53 from StringIO import StringIO
54
55 from nltk import cfg, sem
56
57
58
59
60
# The list of directories where the NLTK data package might reside.
# These directories are checked in order when looking for a resource,
# so user-specified locations (added first) take precedence over the
# system-wide locations added below.
path = []

# Locations named by the user through environment variables.
path += [d for d in
         os.environ.get('NLTK_CORPORA', '').split(os.pathsep) +
         os.environ.get('NLTK_DATA', '').split(os.pathsep)
         if d]

# The user's home directory, when it can be expanded.
if os.path.expanduser('~/') != '~/':
    path.append(os.path.expanduser('~/nltk_data'))

# Common system-wide install locations, by platform.
if sys.platform.startswith('win'):
    path += [r'C:\nltk_data', r'D:\nltk_data', r'E:\nltk_data',
             os.path.join(sys.prefix, 'nltk_data'),
             os.path.join(sys.prefix, 'lib', 'nltk_data')]
else:
    path += ['/usr/share/nltk_data',
             '/usr/local/share/nltk_data',
             '/usr/lib/nltk_data',
             '/usr/local/lib/nltk_data']
87
88
89
90
92 """
93 An abstract base class for 'path pointers,' used by NLTK's data
94 package to identify specific paths. Two subclasses exist:
95 L{FileSystemPathPointer} identifies a file that can be accessed
96 directly via a given absolute path. L{ZipFilePathPointer}
97 identifies a file contained within a zipfile, that can be accessed
98 by reading that zipfile.
99 """
def open(self, encoding=None):
    """
    Return a seekable read-only stream that can be used to read
    the contents of the file identified by this path pointer.

    @raise IOError: If the path specified by this pointer does
        not contain a readable file.
    """
    # Abstract method: concrete path pointers must override this.
    raise NotImplementedError('abstract base class')
109
111 """
112 Return the size of the file pointed to by this path pointer,
113 in bytes.
114
115 @raise IOError: If the path specified by this pointer does
116 not contain a readable file.
117 """
118 raise NotImplementedError('abstract base class')
119
def join(self, fileid):
    """
    Return a new path pointer formed by starting at the path
    identified by this pointer, and then following the relative
    path given by C{fileid}.  The path components of C{fileid}
    should be separated by forward slashes (C{/}), regardless of
    the underlying file system's path separator character.
    """
    # Abstract method: concrete path pointers must override this.
    raise NotImplementedError('abstract base class')
129
130
132 """
133 A path pointer that identifies a file which can be accessed
134 directly via a given absolute path. C{FileSystemPathPointer} is a
135 subclass of C{str} for backwards compatibility purposes --
136 this allows old code that expected C{nltk.data.find()} to expect a
137 string to usually work (assuming the resource is not found in a
138 zipfile).
139 """
141 """
142 Create a new path pointer for the given absolute path.
143
144 @raise IOError: If the given path does not exist.
145 """
146 path = os.path.abspath(path)
147 if not os.path.exists(path):
148 raise IOError('No such file or directory: %r' % path)
149 self._path = path
150 str.__init__(self, path)
151
152 path = property(lambda self: self._path, doc="""
153 The absolute path identified by this path pointer.""")
154
155 - def open(self, encoding=None):
160
162 return os.stat(self._path).st_size
163
164 - def join(self, fileid):
167
169 return 'FileSystemPathPointer(%r)' % self._path
170
173
174
176 """
177 A subclass of C{FileSystemPathPointer} that identifies a gzip-compressed
178 file located at a given absolute path. C{GzipFileSystemPathPointer} is
179 appropriate for loading large gzip-compressed pickle objects efficiently.
180 """
181
# Block size, in bytes (2MB).  Presumably used when reading the
# compressed stream in chunks -- open()'s body is not shown here;
# TODO(review): confirm against the full implementation.
BLOCK_SIZE = 2 * 2**20
183
184 - def open(self, encoding=None):
196
197
199 """
200 A path pointer that identifies a file contained within a zipfile,
201 which can be accessed by reading that zipfile.
202 """
224
# Read-only accessors for the two components of this path pointer.
zipfile = property(lambda self: self._zipfile, doc="""
    The C{zipfile.ZipFile} object used to access the zip file
    containing the entry identified by this path pointer.""")
entry = property(lambda self: self._entry, doc="""
    The name of the file within C{zipfile} that this path
    pointer points to.""")
231
232 - def open(self, encoding=None):
238
241
242 - def join(self, fileid):
245
247 return 'ZipFilePathPointer(%r, %r)' % (
248 self._zipfile.filename, self._entry)
249
250
251
252
253
# Maps resource URLs to their loaded values, so that a resource is
# loaded at most once.  Weak values let a cached resource be
# garbage-collected (and later re-loaded) once no other code holds a
# reference to it.
_resource_cache = weakref.WeakValueDictionary()
257
def find(resource_name):
    """
    Find the given resource from the NLTK data package, and return a
    corresponding path name.  If the given resource is not found,
    raise a C{LookupError}, whose message gives a pointer to the
    installation instructions for the NLTK data package.

    @type resource_name: C{str}
    @param resource_name: The name of the resource to search for.
        Resource names are posix-style relative path names, such as
        C{'corpora/brown'}.  In particular, directory names should
        always be separated by the C{'/'} character, which will be
        automatically converted to a platform-appropriate path
        separator.
    @rtype: C{str}
    """
    # Split the name into a zipfile component and an entry inside that
    # zipfile.  (The trailing '|' makes the pattern match any name,
    # yielding (None, None) for ordinary, non-zipfile names.)
    zip_name, zip_entry = re.match(r'(.*\.zip)/?(.*)$|', resource_name).groups()

    # Check each item on the search path, in order.
    for item in path:
        # A search-path entry that is itself a zipfile: look for the
        # resource inside it.
        if item.endswith('.zip') and os.path.isfile(item):
            try:
                return ZipFilePathPointer(item, resource_name)
            except IOError:
                continue

        if not os.path.isdir(item):
            continue

        if zip_name is None:
            # An ordinary file under this directory.
            p = os.path.join(item, *resource_name.split('/'))
            if os.path.exists(p):
                if p.endswith('.gz'):
                    return GzipFileSystemPathPointer(p)
                else:
                    return FileSystemPathPointer(p)
        else:
            # A zipfile under this directory, containing the entry.
            p = os.path.join(item, *zip_name.split('/'))
            if os.path.exists(p):
                try:
                    return ZipFilePathPointer(p, zip_entry)
                except IOError:
                    continue

    # Nothing matched: raise an error that points the user at the
    # data-package installation instructions.
    msg = textwrap.fill(
        'Resource %r not found. For installation instructions, '
        'please see <http://nltk.org/index.php/Installation>.' %
        (resource_name,), initial_indent=' ', subsequent_indent=' ',
        width=66)
    msg += '\n Searched in:' + ''.join('\n - %r' % d for d in path)
    sep = '*' * 70
    raise LookupError('\n%s\n%s\n%s' % (sep, msg, sep))
311
def retrieve(resource_url, filename=None, verbose=True):
    """
    Copy the given resource to a local file.  If no filename is
    specified, then use the URL's filename.  If there is already a
    file named C{filename}, then raise a C{ValueError}.

    @type resource_url: C{str}
    @param resource_url: A URL specifying where the resource should be
        loaded from.  The default protocol is C{"nltk:"}, which searches
        for the file in the NLTK data package.
    @raise ValueError: If the destination file already exists.
    """
    if filename is None:
        if resource_url.startswith('file:'):
            # Bug fix: derive the default filename from the URL --
            # the old code split the still-unset `filename` variable.
            filename = os.path.split(resource_url)[-1]
        else:
            # Strip the protocol prefix and any leading directories.
            filename = re.sub(r'(^\w+:)?.*/', '', resource_url)
    if os.path.exists(filename):
        filename = os.path.abspath(filename)
        raise ValueError('File %r already exists!' % filename)

    if verbose:
        print('Retrieving %r, saving to %r' % (resource_url, filename))

    # Open the input & output streams.
    infile = _open(resource_url)
    outfile = open(filename, 'wb')
    try:
        # Copy infile -> outfile, in 64k blocks.
        while True:
            s = infile.read(1024*64)
            outfile.write(s)
            if not s: break
    finally:
        # Close both streams even if the copy fails part-way through.
        infile.close()
        outfile.close()
348
349
350
351
# Human-readable descriptions of each supported resource format,
# keyed by format name.  (Typo fix: "serialzied" -> "serialized".)
FORMATS = {
    'pickle': "A serialized python object, stored using the pickle module.",
    'yaml': "A serialized python object, stored using the yaml module.",
    'cfg': "A context free grammar, parsed by nltk.cfg.parse_cfg().",
    'pcfg': "A probabilistic CFG, parsed by nltk.cfg.parse_pcfg().",
    'fcfg': "A feature CFG, parsed by nltk.cfg.parse_fcfg().",
    'fol': "A list of first order logic expressions, parsed by "
           "nltk.sem.parse_fol().",
    'val': "A semantic valuation, parsed by nltk.sem.parse_valuation().",
    }
363
364
365
366
# Mapping from file extension to resource format, used by load() when
# format='auto'.  Every supported extension maps to the format of the
# same name.
AUTO_FORMATS = dict((ext, ext) for ext in
                    ['pickle', 'yaml', 'cfg', 'pcfg', 'fcfg', 'fol', 'val'])
375
def load(resource_url, format='auto', cache=True, verbose=False):
    """
    Load a given resource from the NLTK data package.  The following
    resource formats are currently supported:
      - C{'pickle'}
      - C{'yaml'}
      - C{'cfg'} (context free grammars)
      - C{'pcfg'} (probabilistic CFGs)
      - C{'fcfg'} (feature-based CFGs)
      - C{'fol'} (formulas of First Order Logic)
      - C{'val'} (valuation of First Order Logic model)
      - C{'raw'}

    If no format is specified, C{load()} will attempt to determine a
    format based on the resource name's file extension.  If that
    fails, C{load()} will raise a C{ValueError} exception.

    @type resource_url: C{str}
    @param resource_url: A URL specifying where the resource should be
        loaded from.  The default protocol is C{"nltk:"}, which searches
        for the file in the NLTK data package.
    @type cache: C{bool}
    @param cache: If true, add this resource to a cache.  If C{load}
        finds a resource in its cache, then it will return it from the
        cache rather than loading it.  The cache uses weak references,
        so a resource will automatically be expunged from the cache
        when no more objects are using it.
    @type verbose: C{bool}
    @param verbose: If true, print a message when loading a resource.
        Messages are not displayed when a resource is retrieved from
        the cache.
    """
    # Serve the resource out of the cache, when possible.
    if cache:
        cached = _resource_cache.get(resource_url)
        if cached is not None:
            if verbose:
                print('<<Using cached copy of %s>>' % (resource_url,))
            return cached

    if verbose:
        print('<<Loading %s>>' % (resource_url,))

    # Infer the format from the file extension; '.gz' compression is
    # transparent, so look at the extension before it.
    if format == 'auto':
        pieces = resource_url.split('.')
        extension = pieces[-1]
        if extension == 'gz':
            extension = pieces[-2]
        format = AUTO_FORMATS.get(extension)
        if format is None:
            raise ValueError('Could not determine format for %s based '
                             'on its file\nextension; use the "format" '
                             'argument to specify the format explicitly.'
                             % resource_url)

    # Load the resource, dispatching on its format.  pickle & yaml
    # consume the stream directly; the grammar/logic parsers and 'raw'
    # work from the file's full contents.
    if format == 'pickle':
        value = pickle.load(_open(resource_url))
    elif format == 'yaml':
        value = yaml.load(_open(resource_url))
    elif format == 'raw':
        value = _open(resource_url).read()
    else:
        parsers = {'cfg': cfg.parse_cfg, 'pcfg': cfg.parse_pcfg,
                   'fcfg': cfg.parse_fcfg, 'fol': sem.parse_fol,
                   'val': sem.parse_valuation}
        if format not in parsers:
            assert format not in FORMATS
            raise ValueError('Unknown format type!')
        value = parsers[format](_open(resource_url).read())

    # Cache the loaded value, when possible.  (Some types, such as
    # strings, cannot be weakly referenced.)
    if cache:
        try:
            _resource_cache[resource_url] = value
        except TypeError:
            pass

    return value
465
def show_cfg(resource_url, escape='##'):
    """
    Write out a grammar file, ignoring escaped and empty lines.

    @type resource_url: C{str}
    @param resource_url: A URL specifying where the resource should be
        loaded from.  The default protocol is C{"nltk:"}, which searches
        for the file in the NLTK data package.
    @type escape: C{str}
    @param escape: Prepended string that signals lines to be ignored
    """
    # Fetch the raw grammar text; no point caching it just to print it.
    text = load(resource_url, format='raw', cache=False)
    for line in text.splitlines():
        # Skip escaped lines and blank lines.
        if line.startswith(escape):
            continue
        if not line:
            continue
        print(line)
482
483
485 """
486 Remove all objects from the resource cache.
487 @see: L{load()}
488 """
489 _resource_cache.clear()
490
492 """
493 Helper function that returns an open file object for a resource,
given its resource URL.  If the given resource URL uses the 'nltk'
495 protocol, or uses no protocol, then use L{nltk.data.find} to find
496 its path, and open it with the given mode; if the resource URL
497 uses the 'file' protocol, then open the file with the given mode;
498 otherwise, delegate to C{urllib2.urlopen}.
499
500 @type resource_url: C{str}
501 @param resource_url: A URL specifying where the resource should be
502 loaded from. The default protocol is C{"nltk:"}, which searches
for the file in the NLTK data package.
504 """
505
506 protocol, path = re.match('(?:(\w+):)?(.*)', resource_url).groups()
507
508 if protocol is None or protocol.lower() == 'nltk':
509 return find(path).open()
510 elif protocol.lower() == 'file':
511
512 return open(path, 'rb')
513 else:
514 return urllib2.urlopen(resource_url)
515
516
517
518
519
523
525 resource = load(self.__path)
526
527
528
529 self.__dict__ = resource.__dict__
530 self.__class__ = resource.__class__
531
533 self.__load()
534
535
536 return getattr(self, attr)
537
539 self.__load()
540
541
542 return '%r' % self
543
544
545
546
547
549 """
550 A subclass of C{zipfile.ZipFile} that closes its file pointer
551 whenever it is not using it; and re-opens it when it needs to read
552 data from the zipfile. This is useful for reducing the number of
553 open file handles when many zip files are being accessed at once.
554 C{OpenOnDemandZipFile} must be constructed from a filename, not a
555 file-like object (to allow re-opening). C{OpenOnDemandZipFile} is
556 read-only (i.e., C{write} and C{writestr} are disabled.
557 """
564
565 - def read(self, name):
571
def write(self, *args, **kwargs):
    """
    Disabled: this zipfile wrapper is read-only.

    @raise NotImplementedError: OpenOnDemandZipfile is read-only
    """
    raise NotImplementedError('OpenOnDemandZipfile is read-only')
575
577 """@raise NotImplementedError: OpenOnDemandZipfile is read-only"""
578 raise NotImplementedError('OpenOnDemandZipfile is read-only')
579
581 return 'OpenOnDemandZipFile(%r)' % self.filename
582
583
584
585
586
588 """
589 A stream reader that automatically encodes the source byte stream
590 into unicode (like C{codecs.StreamReader}); but still supports the
591 C{seek()} and C{tell()} operations correctly. This is in contrast
to C{codecs.StreamReader}, which provides *broken* C{seek()} and
593 C{tell()} methods.
594
595 This class was motivated by L{StreamBackedCorpusView}, which
596 makes extensive use of C{seek()} and C{tell()}, and needs to be
597 able to handle unicode-encoded files.
598
599 Note: this class requires stateless decoders. To my knowledge,
600 this shouldn't cause a problem with any of python's builtin
601 unicode encodings.
602 """
# When True, tell() cross-checks its computed file position against
# the decoded stream contents with an assertion (slower, but catches
# position-tracking bugs).
DEBUG = True
604
def __init__(self, stream, encoding, errors='strict'):
    """
    Create a new reader that decodes C{stream} using C{encoding}.

    @param stream: The underlying byte stream.
    @param encoding: The name of the encoding that should be used
        to decode the underlying stream.
    @param errors: The error mode used when decoding data from the
        underlying stream: 'strict', 'ignore', or 'replace'.
    """
    # All positions are computed from the start of the stream.
    stream.seek(0)

    # The underlying stream.
    self.stream = stream

    # The name of the encoding used to decode the underlying stream.
    self.encoding = encoding

    # Error mode for decoding: 'strict', 'ignore', or 'replace'.
    self.errors = errors

    # The function used to decode byte strings into unicode strings.
    self.decode = codecs.getdecoder(encoding)

    # Bytes that have been read but not yet decoded -- i.e., the
    # trailing bytes of a read that did not form a complete
    # character encoding.
    self.bytebuffer = ''

    # Unicode strings (one per line) read ahead by readline() but
    # not yet returned by read() or readline().  The final element
    # may be a partial line.  A non-None linebuffer makes tell()
    # more complex, since it must backtrack to the buffer start to
    # find the correct byte position.
    self.linebuffer = None

    # The stream position at which the most recent read on the
    # underlying stream began; used together with _rewind_numchars
    # to backtrack to the beginning of linebuffer (for tell()).
    self._rewind_checkpoint = 0

    # The number of characters returned since the read that started
    # at _rewind_checkpoint.
    self._rewind_numchars = None

    # The length of the byte order marker at the beginning of the
    # stream (or None for no byte order marker).
    self._bom = self._check_bom()
def read(self, size=None):
    """
    Read up to C{size} bytes, decode them using this reader's
    encoding, and return the resulting unicode string.

    @param size: The maximum number of bytes to read.  If not
        specified, then read as many bytes as possible.
    @rtype: C{unicode}
    """
    new_chars = self._read(size)

    # Characters buffered by readline() logically precede anything
    # newly decoded, so prepend them and reset the line buffer.
    if self.linebuffer:
        new_chars = ''.join(self.linebuffer) + new_chars
        self.linebuffer = None
        self._rewind_numchars = None

    return new_chars
681
683 """
684 Read a line of text, decode it using this reader's encoding,
685 and return the resulting unicode string.
686
687 @param size: The maximum number of bytes to read. If no
688 newline is encountered before C{size} bytes have been
689 read, then the returned value may not be a complete line
690 of text.
691 """
692
693
694
695 if self.linebuffer and len(self.linebuffer) > 1:
696 line = self.linebuffer.pop(0)
697 self._rewind_numchars += len(line)
698 return line
699
700 readsize = size or 72
701 chars = ''
702
703
704 if self.linebuffer:
705 chars += self.linebuffer.pop()
706 self.linebuffer = None
707
708 while True:
709 startpos = self.stream.tell() - len(self.bytebuffer)
710 new_chars = self._read(readsize)
711
712
713
714 if new_chars and new_chars.endswith('\r'):
715 new_chars += self._read(1)
716
717 chars += new_chars
718 lines = chars.splitlines(True)
719 if len(lines) > 1:
720 line = lines[0]
721 self.linebuffer = lines[1:]
722 self._rewind_numchars = len(new_chars)-(len(chars)-len(line))
723 self._rewind_checkpoint = startpos
724 break
725 elif len(lines) == 1:
726 line0withend = lines[0]
727 line0withoutend = lines[0].splitlines(False)[0]
728 if line0withend != line0withoutend:
729 line = line0withend
730 break
731
732 if not new_chars or size is not None:
733 line = chars
734 break
735
736
737 if readsize < 8000:
738 readsize *= 2
739
740 return line
741
def readlines(self, sizehint=None, keepends=True):
    """
    Read this file's contents, decode them using this reader's
    encoding, and return them as a list of unicode lines.

    @rtype: C{list} of C{unicode}
    @param sizehint: Ignored.
    @param keepends: If false, then strip newlines.
    """
    contents = self.read()
    return contents.splitlines(keepends)
752
754 """Return the next decoded line from the underlying stream."""
755 line = self.readline()
756 if line: return line
757 else: raise StopIteration
758
760 """Return self"""
761 return self
762
764 """Return self"""
765 return self
766
767
768
769
770
# Standard file attributes, delegated to the underlying byte stream.
closed = property(lambda self: self.stream.closed,
                  doc="""True if the underlying stream is closed.""")

name = property(lambda self: self.stream.name,
                doc="""The name of the underlying stream.""")

mode = property(lambda self: self.stream.mode,
                doc="""The mode of the underlying stream.""")
779
781 """
782 Close the underlying stream.
783 """
784 self.stream.close()
785
786
787
788
789
def seek(self, offset, whence=0):
    """
    Move the stream to a new file position.  If the reader is
    maintaining any buffers, then they will be cleared.

    @param offset: A byte count offset.
    @param whence: If C{whence} is 0, then the offset is from the
        start of the file (offset should be positive).  If
        C{whence} is 1, then the offset is from the current
        position (offset may be positive or negative); and if 2,
        then the offset is from the end of the file (offset should
        typically be negative).
    """
    # Relative seeks can't be supported: the byte offset of the
    # current *character* position is not directly known.
    if whence == 1:
        raise ValueError('Relative seek is not supported for '
                         'SeekableUnicodeStreamReader -- consider '
                         'using char_seek_forward() instead.')
    self.stream.seek(offset, whence)
    # All buffers described the old position -- discard them.
    self.linebuffer = None
    self.bytebuffer = ''
    self._rewind_numchars = None
    self._rewind_checkpoint = self.stream.tell()
812
814 """
815 Move the read pointer forward by C{offset} characters.
816 """
817 if offset < 0:
818 raise ValueError('Negative offsets are not supported')
819
820 self.seek(self.tell())
821
822 self._char_seek_forward(offset)
823
825 """
826 Move the file position forward by C{offset} characters,
827 ignoring all buffers.
828
829 @param est_bytes: A hint, giving an estimate of the number of
    bytes that will be needed to move forward by C{offset} chars.
831 Defaults to C{offset}.
832 """
833 if est_bytes is None: est_bytes = offset
834 bytes = ''
835
836 while True:
837
838 newbytes = self.stream.read(est_bytes-len(bytes))
839 bytes += newbytes
840
841
842 chars, bytes_decoded = self._incr_decode(bytes)
843
844
845
846 if len(chars) == offset:
847 self.stream.seek(-len(bytes)+bytes_decoded, 1)
848 return
849
850
851
852 if len(chars) > offset:
853 while len(chars) > offset:
854
855 est_bytes += offset-len(chars)
856 chars, bytes_decoded = self._incr_decode(bytes[:est_bytes])
857 self.stream.seek(-len(bytes)+bytes_decoded, 1)
858 return
859
860
861 est_bytes += offset - len(chars)
862
864 """
865 Return the current file position on the underlying byte
866 stream. If this reader is maintaining any buffers, then the
867 returned file position will be the position of the beginning
868 of those buffers.
869 """
870
871 if self.linebuffer is None:
872 return self.stream.tell() - len(self.bytebuffer)
873
874
875
876
877
878 orig_filepos = self.stream.tell()
879
880
881 bytes_read = ( (orig_filepos-len(self.bytebuffer)) -
882 self._rewind_checkpoint )
883 buf_size = sum([len(line) for line in self.linebuffer])
884 est_bytes = (bytes_read * self._rewind_numchars /
885 (self._rewind_numchars + buf_size))
886
887 self.stream.seek(self._rewind_checkpoint)
888 self._char_seek_forward(self._rewind_numchars, est_bytes)
889 filepos = self.stream.tell()
890
891
892 if self.DEBUG:
893 self.stream.seek(filepos)
894 check1 = self._incr_decode(self.stream.read(50))[0]
895 check2 = ''.join(self.linebuffer)
896 assert check1.startswith(check2) or check2.startswith(check1)
897
898
899
900 self.stream.seek(orig_filepos)
901
902
903 return filepos
904
905
906
907
908
def _read(self, size=None):
    """
    Read up to C{size} bytes from the underlying stream, decode
    them using this reader's encoding, and return the resulting
    unicode string.  C{linebuffer} is *not* included in the
    result.
    """
    if size == 0:
        return u''

    # Consume the byte order marker, if we are at the very start.
    if self._bom and self.stream.tell() == 0:
        self.stream.read(self._bom)

    # Read the requested number of bytes, and prepend any bytes
    # left undecoded by the previous read.
    if size is None:
        new_bytes = self.stream.read()
    else:
        new_bytes = self.stream.read(size)
    data = self.bytebuffer + new_bytes

    chars, num_decoded = self._incr_decode(data)

    # If nothing decoded even though bytes were read, we may have
    # stopped in the middle of a multi-byte character; keep pulling
    # single bytes until it completes (or the stream runs dry).
    if (size is not None) and (not chars) and (len(new_bytes) > 0):
        while not chars:
            one_byte = self.stream.read(1)
            if not one_byte:
                break
            data += one_byte
            chars, num_decoded = self._incr_decode(data)

    # Whatever did not decode stays buffered for the next read.
    self.bytebuffer = data[num_decoded:]

    return chars
945
947 """
948 Decode the given byte string into a unicode string, using this
949 reader's encoding. If an exception is encountered that
950 appears to be caused by a truncation error, then just decode
951 the byte string without the bytes that cause the trunctaion
952 error.
953
954 @return: A tuple C{(chars, num_consumed)}, where C{chars} is
955 the decoded unicode string, and C{num_consumed} is the
956 number of bytes that were consumed.
957 """
958 while True:
959 try:
960 return self.decode(bytes, 'strict')
961 except UnicodeDecodeError, exc:
962
963
964 if exc.end == len(bytes):
965 return self.decode(bytes[:exc.start], self.errors)
966
967
968 elif self.errors == 'strict':
969 raise
970
971
972
973 else:
974 return self.decode(bytes, self.errors)
975
# Table of byte order markers, keyed by normalized encoding name.
# Each value is a list of (BOM, replacement_encoding) pairs; a
# replacement encoding of None means the reader's encoding needs no
# adjustment once the BOM has been recognized.  (For the ambiguous
# 'utf16'/'utf32' names, the BOM determines the actual byte order.)
_BOM_TABLE = {
    'utf8': [(codecs.BOM_UTF8, None)],
    'utf16': [(codecs.BOM_UTF16_LE, 'utf16-le'),
              (codecs.BOM_UTF16_BE, 'utf16-be')],
    'utf16le': [(codecs.BOM_UTF16_LE, None)],
    'utf16be': [(codecs.BOM_UTF16_BE, None)],
    'utf32': [(codecs.BOM_UTF32_LE, 'utf32-le'),
              (codecs.BOM_UTF32_BE, 'utf32-be')],
    'utf32le': [(codecs.BOM_UTF32_LE, None)],
    'utf32be': [(codecs.BOM_UTF32_BE, None)],
    }
987
989
990 enc = re.sub('[ -]', '', self.encoding.lower())
991
992
993 bom_info = self._BOM_TABLE.get(enc)
994
995 if bom_info:
996
997 bytes = self.stream.read(16)
998 self.stream.seek(0)
999
1000
1001 for (bom, new_encoding) in bom_info:
1002 if bytes.startswith(bom):
1003 if new_encoding: self.encoding = new_encoding
1004 return len(bom)
1005
1006 return None
1007