nltk.data

# Natural Language Toolkit: Utility functions
#
# Copyright (C) 2001-2008 NLTK Project
# Author: Edward Loper <[email protected]>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT

"""
Functions to find and load NLTK X{resource files}, such as corpora,
grammars, and saved processing objects.  Resource files are identified
using URLs, such as "C{nltk:corpora/abc/rural.txt}" or
"C{http://nltk.org/sample/toy.cfg}".  The following URL protocols are
supported:

  - "C{file:I{path}}": Specifies the file whose path is C{I{path}}.
    Both relative and absolute paths may be used.

  - "C{http://I{host}/I{path}}": Specifies the file stored on the web
    server C{I{host}} at path C{I{path}}.

  - "C{nltk:I{path}}": Specifies the file stored in the NLTK data
    package at C{I{path}}.  NLTK will search for these files in the
    directories specified by L{nltk.data.path}.

If no protocol is specified, then the default protocol "C{nltk:}" will
be used.

This module provides two functions that can be used to access a
resource file, given its URL: L{load()} loads a given resource, and
adds it to a resource cache; and L{retrieve()} copies a given resource
to a local file.
"""

import sys
import os, os.path
import textwrap
import weakref
import yaml
import re
import urllib2
import zipfile
import codecs
import gzip

# Prefer the C-accelerated implementations, falling back to the pure
# python ones only when the accelerated module is genuinely missing.
try:
    import cPickle as pickle
except ImportError:
    import pickle

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

from nltk import cfg, sem

######################################################################
# Search Path
######################################################################

path = []
"""A list of directories where the NLTK data package might reside.
   These directories will be checked in order when looking for a
   resource in the data package.  Note that this allows users to
   substitute in their own versions of resources, if they have them
   (e.g., in their home directory under ~/nltk_data)."""

# User-specified locations:
path += [d for d in os.environ.get('NLTK_CORPORA', '').split(os.pathsep) if d]
path += [d for d in os.environ.get('NLTK_DATA', '').split(os.pathsep) if d]
# expanduser() returns its argument unchanged when no home directory
# can be determined; only add the per-user directory when it resolved.
if os.path.expanduser('~/') != '~/':
    path += [os.path.expanduser('~/nltk_data')]

# Common locations on Windows:
if sys.platform.startswith('win'):
    path += [
        r'C:\nltk_data', r'D:\nltk_data', r'E:\nltk_data',
        os.path.join(sys.prefix, 'nltk_data'),
        os.path.join(sys.prefix, 'lib', 'nltk_data')]

# Common locations on UNIX & OS X:
else:
    path += [
        '/usr/share/nltk_data',
        '/usr/local/share/nltk_data',
        '/usr/lib/nltk_data',
        '/usr/local/lib/nltk_data']

######################################################################
# Path Pointers
######################################################################

class PathPointer(object):
    """
    Abstract interface for 'path pointers': objects that NLTK's data
    package uses to name one specific path.  Concrete subclasses
    include L{FileSystemPathPointer}, for a file reachable directly
    through an absolute path, and L{ZipFilePathPointer}, for a file
    stored inside a zipfile and reached by reading that zipfile.

    Every method here raises C{NotImplementedError}; subclasses are
    expected to override all of them.
    """

    def open(self, encoding=None):
        """
        Return a seekable, read-only stream over the contents of the
        file this pointer identifies.

        @param encoding: Subclasses in this module wrap the stream in
            a unicode-decoding reader when this is given.
        @raise IOError: If the path specified by this pointer does
            not contain a readable file.
        """
        raise NotImplementedError('abstract base class')

    def file_size(self):
        """
        Return the size, in bytes, of the file this pointer
        identifies.

        @raise IOError: If the path specified by this pointer does
            not contain a readable file.
        """
        raise NotImplementedError('abstract base class')

    def join(self, fileid):
        """
        Return a new path pointer obtained by starting at this
        pointer's path and following the relative path C{fileid}.

        @param fileid: A relative path whose components are separated
            by forward slashes (C{/}), regardless of the underlying
            file system's path separator character.
        """
        raise NotImplementedError('abstract base class')

129 130

class FileSystemPathPointer(PathPointer, str):
    """
    A path pointer that identifies a file which can be accessed
    directly via a given absolute path.  C{FileSystemPathPointer} is a
    subclass of C{str} for backwards compatibility purposes --
    this allows old code that expected C{nltk.data.find()} to return a
    string to usually work (assuming the resource is not found in a
    zipfile).
    """

    def __init__(self, path):
        """
        Create a new path pointer for the given absolute path.

        @param path: The path to point at; it is made absolute with
            C{os.path.abspath} before being stored.
        @raise IOError: If the given path does not exist.
        """
        path = os.path.abspath(path)
        if not os.path.exists(path):
            raise IOError('No such file or directory: %r' % path)
        self._path = path
        # str.__init__ is intentionally not called: str performs all of
        # its setup in __new__, so the call was a no-op that merely
        # forwarded an unwanted argument to object.__init__ (a
        # DeprecationWarning on Python 2.6 and a TypeError on 3.x).

    path = property(lambda self: self._path, doc="""
        The absolute path identified by this path pointer.""")

    def open(self, encoding=None):
        """
        Open the file in binary mode and return the stream.

        @param encoding: If not C{None}, the raw stream is wrapped in a
            L{SeekableUnicodeStreamReader} using this encoding.
        @raise IOError: If the file cannot be opened.
        """
        stream = open(self._path, 'rb')
        if encoding is not None:
            try:
                stream = SeekableUnicodeStreamReader(stream, encoding)
            except Exception:
                # Don't leak the file handle when the reader cannot be
                # built (e.g. an unknown encoding name).
                stream.close()
                raise
        return stream

    def file_size(self):
        """Return the size of the file in bytes, as given by C{os.stat}."""
        return os.stat(self._path).st_size

    def join(self, fileid):
        """
        Return a new C{FileSystemPathPointer} for C{fileid} relative
        to this path.

        @param fileid: A relative path using C{/} as its separator; it
            is split on C{/} and re-joined with the platform separator.
        @raise IOError: If the resulting path does not exist.
        """
        path = os.path.join(self._path, *fileid.split('/'))
        return FileSystemPathPointer(path)

    def __repr__(self):
        return 'FileSystemPathPointer(%r)' % self._path

    def __str__(self):
        return self._path

173 174

class GzipFileSystemPathPointer(FileSystemPathPointer):
    """
    A subclass of C{FileSystemPathPointer} that identifies a gzip-compressed
    file located at a given absolute path.  C{GzipFileSystemPathPointer} is
    appropriate for loading large gzip-compressed pickle objects efficiently.
    """
    # 2MB
    BLOCK_SIZE = 2 * 2**20

    def open(self, encoding=None):
        """
        Decompress the whole file into memory and return a stream over
        the decompressed bytes, positioned at the start.

        @param encoding: If true (non-empty), the in-memory stream is
            wrapped in a L{SeekableUnicodeStreamReader} using this
            encoding.
        @raise IOError: If the file cannot be opened or decompressed.
        """
        # Why do this?  The default blocksize used by gzip.open readline() is
        # too small which leads to poor performance loading large gzipped
        # pickle objects.
        gz = gzip.open(self._path, 'rb')
        try:
            # Read BLOCK_SIZE chunks until read() returns the empty
            # string, then join them once (a single copy of the data).
            data = ''.join(iter(lambda: gz.read(self.BLOCK_SIZE), ''))
        finally:
            # Always release the underlying file handle, even when
            # decompression fails part-way through.
            gz.close()
        stream = StringIO(data)
        if encoding:
            stream = SeekableUnicodeStreamReader(stream, encoding)
        return stream

196 197

class ZipFilePathPointer(PathPointer):
    """
    A path pointer that identifies a file contained within a zipfile,
    which can be accessed by reading that zipfile.
    """

    def __init__(self, zipfile, entry=''):
        """
        Create a new path pointer pointing at the specified entry
        in the given zipfile.

        @param zipfile: Either the filename of a zipfile (which is
            wrapped in an L{OpenOnDemandZipFile}), or an already
            constructed zipfile object.
        @param entry: The name of the entry inside the zipfile.  The
            empty string (the default) skips the existence check.
        @raise IOError: If the given zipfile does not exist, or if it
            does not contain the specified entry.
        """
        if isinstance(zipfile, basestring):
            zipfile = OpenOnDemandZipFile(os.path.abspath(zipfile))

        # Normalize the entry string: drop leading slashes and collapse
        # any run of slashes into a single one.
        entry = re.sub('(^|/)/+', r'\1', entry)

        # Check that the entry exists:
        if entry:
            try:
                zipfile.getinfo(entry)
            except Exception:
                # Any lookup failure is reported uniformly as IOError,
                # but KeyboardInterrupt/SystemExit are left alone.
                raise IOError('Zipfile %r does not contain %r' %
                              (zipfile.filename, entry))
        self._zipfile = zipfile
        self._entry = entry

    zipfile = property(lambda self: self._zipfile, doc="""
        The C{zipfile.ZipFile} object used to access the zip file
        containing the entry identified by this path pointer.""")
    entry = property(lambda self: self._entry, doc="""
        The name of the file within C{zipfile} that this path
        pointer points to.""")

    def open(self, encoding=None):
        """
        Read the whole entry into memory and return a stream over it.

        @param encoding: If not C{None}, the in-memory stream is
            wrapped in a L{SeekableUnicodeStreamReader} using this
            encoding.
        """
        data = self._zipfile.read(self._entry)
        stream = StringIO(data)
        if encoding is not None:
            stream = SeekableUnicodeStreamReader(stream, encoding)
        return stream

    def file_size(self):
        """Return the uncompressed size recorded for the entry, in bytes."""
        return self._zipfile.getinfo(self._entry).file_size

    def join(self, fileid):
        """
        Return a new C{ZipFilePathPointer} in the same zipfile whose
        entry is this pointer's entry followed by C{/} and C{fileid}.

        @raise IOError: If the zipfile does not contain that entry.
        """
        entry = '%s/%s' % (self._entry, fileid)
        return ZipFilePathPointer(self._zipfile, entry)

    def __repr__(self):
        return 'ZipFilePathPointer(%r, %r)' % (
            self._zipfile.filename, self._entry)

######################################################################
# Access Functions
######################################################################

# Module-wide cache keyed by resource URL: load() consults and fills it,
# and clear_cache() empties it.  Values are held by weak reference only.
_resource_cache = weakref.WeakValueDictionary()
"""A weakref dictionary used to cache resources so that they won't
   need to be loaded more than once."""


def find(resource_name):
    """
    Locate a resource in the NLTK data package and return a path
    pointer for it.  Each entry of L{nltk.data.path} is tried in
    order; the first entry that contains the resource wins.

    @type resource_name: C{str}
    @param resource_name: The name of the resource to search for.
        Resource names are posix-style relative path names, such as
        C{'corpora/brown'}.  Directory names are always separated by
        the C{'/'} character, which is converted to the
        platform-appropriate path separator.  A name may pass through
        a zipfile, e.g. C{'corpora/foo.zip/bar.txt'}.
    @rtype: C{str}
    @raise LookupError: If the resource is found nowhere on the
        search path.  The message lists the directories searched and
        points at the installation instructions.
    """
    # Split "<something>.zip/<entry>" names; both groups are None when
    # the name does not pass through a zipfile.
    zip_name, zip_entry = re.match(r'(.*\.zip)/?(.*)$|',
                                   resource_name).groups()

    for path_item in path:

        # Search-path entry that is itself a zipfile: the whole
        # resource name is looked up as an entry inside it.
        if os.path.isfile(path_item) and path_item.endswith('.zip'):
            try:
                return ZipFilePathPointer(path_item, resource_name)
            except IOError:
                continue  # resource not in zipfile

        # Anything else must be a directory to be of any use.
        if not os.path.isdir(path_item):
            continue

        if zip_name is None:
            # Plain file or directory below this search-path entry.
            candidate = os.path.join(path_item, *resource_name.split('/'))
            if not os.path.exists(candidate):
                continue
            if candidate.endswith('.gz'):
                return GzipFileSystemPathPointer(candidate)
            return FileSystemPathPointer(candidate)

        # Resource lives inside a zipfile below this search-path entry.
        archive = os.path.join(path_item, *zip_name.split('/'))
        if os.path.exists(archive):
            try:
                return ZipFilePathPointer(archive, zip_entry)
            except IOError:
                continue  # resource not in zipfile

    # Display a friendly error message if the resource wasn't found:
    explanation = (
        'Resource %r not found. For installation instructions, '
        'please see <http://nltk.org/index.php/Installation>.'
        % (resource_name,))
    msg = textwrap.fill(explanation, initial_indent=' ',
                        subsequent_indent=' ', width=66)
    searched = ''.join('\n - %r' % d for d in path)
    msg += '\n Searched in:' + searched
    sep = '*'*70
    raise LookupError('\n%s\n%s\n%s' % (sep, msg, sep))

311

def retrieve(resource_url, filename=None, verbose=True):
    """
    Copy the given resource to a local file.  If no filename is
    specified, then use the URL's filename.  If there is already a
    file named C{filename}, then raise a C{ValueError}.

    @type resource_url: C{str}
    @param resource_url: A URL specifying where the resource should be
        loaded from.  The default protocol is C{"nltk:"}, which searches
        for the file in the NLTK data package.
    @type filename: C{str}
    @param filename: The name of the local file to create.  If
        C{None}, the last path component of C{resource_url} is used.
    @type verbose: C{bool}
    @param verbose: If true, print a message naming the source and
        the destination before copying.
    @raise ValueError: If C{filename} already exists.
    """
    if filename is None:
        if resource_url.startswith('file:'):
            # Derive the name from the URL's path (previously this
            # split the still-None filename and always crashed).
            filename = os.path.split(resource_url[len('file:'):])[-1]
        else:
            filename = re.sub(r'(^\w+:)?.*/', '', resource_url)
    if os.path.exists(filename):
        filename = os.path.abspath(filename)
        raise ValueError("File %r already exists!" % filename)

    if verbose:
        print('Retrieving %r, saving to %r' % (resource_url, filename))

    # Open the input & output streams; the nested try/finally blocks
    # guarantee both are closed even if the copy fails part-way.
    infile = _open(resource_url)
    try:
        outfile = open(filename, 'wb')
        try:
            # Copy infile -> outfile, using 64k blocks.
            while True:
                s = infile.read(1024*64)  # 64k blocks.
                if not s:
                    break
                outfile.write(s)
        finally:
            outfile.close()
    finally:
        infile.close()


#: A dictionary describing the formats that are supported by NLTK's
#: L{load()} method.  Keys are format names, and values are format
#: descriptions.
FORMATS = {
    'pickle': "A serialized python object, stored using the pickle module.",
    'yaml': "A serialized python object, stored using the yaml module.",
    'cfg': "A context free grammar, parsed by nltk.cfg.parse_cfg().",
    'pcfg': "A probabilistic CFG, parsed by nltk.cfg.parse_pcfg().",
    'fcfg': "A feature CFG, parsed by nltk.cfg.parse_fcfg().",
    'fol': "A list of first order logic expressions, parsed by "
           "nltk.sem.parse_fol().",
    'val': "A semantic valuation, parsed by nltk.sem.parse_valuation().",
    'raw': "The raw (byte string) contents of a file.",
    }

#: A dictionary mapping from file extensions to format names, used
#: by L{load()} when C{format="auto"} to decide the format for a
#: given resource url.  The C{'raw'} format has no extension here, so
#: it is only ever selected explicitly.
AUTO_FORMATS = {
    'pickle': 'pickle',
    'yaml': 'yaml',
    'cfg': 'cfg',
    'pcfg': 'pcfg',
    'fcfg': 'fcfg',
    'fol': 'fol',
    'val': 'val'}


def load(resource_url, format='auto', cache=True, verbose=False):
    """
    Load a given resource from the NLTK data package.  The following
    resource formats are currently supported:
      - C{'pickle'}
      - C{'yaml'}
      - C{'cfg'} (context free grammars)
      - C{'pcfg'} (probabilistic CFGs)
      - C{'fcfg'} (feature-based CFGs)
      - C{'fol'} (formulas of First Order Logic)
      - C{'val'} (valuation of First Order Logic model)
      - C{'raw'}

    If no format is specified, C{load()} will attempt to determine a
    format based on the resource name's file extension (looking past
    a trailing C{.gz}).  If that fails, C{load()} will raise a
    C{ValueError} exception.

    @type resource_url: C{str}
    @param resource_url: A URL specifying where the resource should be
        loaded from.  The default protocol is C{"nltk:"}, which searches
        for the file in the NLTK data package.
    @type format: C{str}
    @param format: One of the format names above, or C{'auto'} to
        choose the format from the file extension.
    @type cache: C{bool}
    @param cache: If true, add this resource to a cache.  If C{load}
        finds a resource in its cache, then it will return it from the
        cache rather than loading it.  The cache uses weak references,
        so a resource will automatically be expunged from the cache
        when no more objects are using it.
    @type verbose: C{bool}
    @param verbose: If true, print a message when loading a resource.
        Messages are not displayed when a resource is retrieved from
        the cache.
    @raise ValueError: If the format cannot be determined from the
        extension, or if C{format} is not a known format name.
    """
    # If we've cached the resource, then just return it.
    if cache:
        resource_val = _resource_cache.get(resource_url)
        if resource_val is not None:
            if verbose:
                print('<<Using cached copy of %s>>' % (resource_url,))
            return resource_val

    # Let the user know what's going on.
    if verbose:
        print('<<Loading %s>>' % (resource_url,))

    # Determine the format of the resource.
    if format == 'auto':
        resource_url_parts = resource_url.split('.')
        ext = resource_url_parts[-1]
        # Look past a '.gz' suffix -- but only when something precedes
        # it, so that a bare name like 'gz' reports the documented
        # ValueError instead of raising IndexError.
        if ext == 'gz' and len(resource_url_parts) > 1:
            ext = resource_url_parts[-2]
        format = AUTO_FORMATS.get(ext)
        if format is None:
            raise ValueError('Could not determine format for %s based '
                             'on its file\nextension; use the "format" '
                             'argument to specify the format explicitly.'
                             % resource_url)

    # Reject unknown formats before anything is opened.
    if format not in FORMATS:
        raise ValueError('Unknown format type!')

    # Load the resource, making sure the stream is closed afterwards.
    # NOTE(security): 'pickle' and 'yaml' (yaml.load) can execute
    # arbitrary code while deserializing; only use them with resource
    # URLs whose content is trusted.
    stream = _open(resource_url)
    try:
        if format == 'pickle':
            resource_val = pickle.load(stream)
        elif format == 'yaml':
            resource_val = yaml.load(stream)
        elif format == 'cfg':
            resource_val = cfg.parse_cfg(stream.read())
        elif format == 'pcfg':
            resource_val = cfg.parse_pcfg(stream.read())
        elif format == 'fcfg':
            resource_val = cfg.parse_fcfg(stream.read())
        elif format == 'fol':
            resource_val = sem.parse_fol(stream.read())
        elif format == 'val':
            resource_val = sem.parse_valuation(stream.read())
        elif format == 'raw':
            resource_val = stream.read()
        else:
            # A name listed in FORMATS without a loader above.
            raise ValueError('Unknown format type!')
    finally:
        stream.close()

    # If requested, add it to the cache.
    if cache:
        try:
            _resource_cache[resource_url] = resource_val
        except TypeError:
            # We can't create weak references to some object types, like
            # strings and tuples.  For now, just don't cache them.
            pass

    return resource_val

465

def show_cfg(resource_url, escape='##'):
    """
    Print the contents of a grammar file, leaving out empty lines and
    lines that begin with the escape string.

    @type resource_url: C{str}
    @param resource_url: A URL specifying where the resource should be
        loaded from.  The default protocol is C{"nltk:"}, which searches
        for the file in the NLTK data package.
    @type escape: C{str}
    @param escape: Prefix marking the lines that should be skipped.
    """
    # The file is fetched uncached and unparsed, as a raw string.
    raw_text = load(resource_url, format='raw', cache=False)
    for line in raw_text.splitlines():
        # Skip blank lines and escaped lines; print everything else.
        if not line or line.startswith(escape):
            continue
        print(line)

482 483

def clear_cache():
    """
    Empty the resource cache, discarding every cached object.  After
    this call, L{load()} has to load each resource again.

    @see: L{load()}
    """
    _resource_cache.clear()

490

491 -def _open(resource_url):

492 """ 493 Helper function that returns an open file object for a resource, 494 given its resource URL. If the given resource URL uses the 'ntlk' 495 protocol, or uses no protocol, then use L{nltk.data.find} to find 496 its path, and open it with the given mode; if the resource URL 497 uses the 'file' protocol, then open the file with the given mode; 498 otherwise, delegate to C{urllib2.urlopen}. 499 500 @type resource_url: C{str} 501 @param resource_url: A URL specifying where the resource should be 502 loaded from. The default protocol is C{"nltk:"}, which searches 503 for the file in the the NLTK data package. 504 """ 505 # Divide the resource name into "<protocol>:<path>". 506 protocol, path = re.match('(?:(\w+):)?(.*)', resource_url).groups() 507 508 if protocol is None or protocol.lower() == 'nltk': 509 return find(path).open() 510 elif protocol.lower() == 'file': 511 # urllib might not use mode='rb', so handle this one ourselves: 512 return open(path, 'rb') 513 else: 514 return urllib2.urlopen(resource_url)

515 516 ###################################################################### 517 # Lazy Resource Loader 518 ###################################################################### 519

class LazyLoader(object):
    """
    A stand-in for a resource that is only loaded, via L{load()}, the
    first time it is actually needed.  Once loaded, the stand-in
    rebinds its own C{__dict__} and C{__class__} to those of the
    loaded resource.
    """

    def __init__(self, path):
        """
        @param path: The resource URL that will be passed to L{load()}
            when the resource is first needed.
        """
        # Name-mangled, so it is stored as _LazyLoader__path.
        self.__path = path

    def __load(self):
        """Load the resource and turn this object into it."""
        resource = load(self.__path)
        # This is where the magic happens!  Transform ourselves into
        # the object by modifying our own __dict__ and __class__ to
        # match that of `resource`.
        self.__dict__ = resource.__dict__
        self.__class__ = resource.__class__

    def __getattr__(self, attr):
        """
        Load the resource, then look C{attr} up again on the
        transformed object.  Python only calls C{__getattr__} when
        normal attribute lookup has failed.
        """
        self.__load()
        # This looks circular, but its not, since __load() changes our
        # __class__ to something new:
        return getattr(self, attr)

    def __repr__(self):
        """Load the resource, then return the loaded object's repr."""
        self.__load()
        # This looks circular, but its not, since __load() changes our
        # __class__ to something new:
        return '%r' % self

543 544 ###################################################################### 545 # Open-On-Demand ZipFile 546 ###################################################################### 547

class OpenOnDemandZipFile(zipfile.ZipFile):
    """
    A subclass of C{zipfile.ZipFile} that closes its file pointer
    whenever it is not using it; and re-opens it when it needs to read
    data from the zipfile.  This is useful for reducing the number of
    open file handles when many zip files are being accessed at once.
    C{OpenOnDemandZipFile} must be constructed from a filename, not a
    file-like object (to allow re-opening).  C{OpenOnDemandZipFile} is
    read-only (i.e., C{write} and C{writestr} are disabled).
    """

    def __init__(self, filename):
        """
        Read the zipfile's directory, then close the file again.

        @param filename: The name of the zipfile; must be a string.
        @raise TypeError: If C{filename} is not a string.
        """
        if not isinstance(filename, basestring):
            raise TypeError('OpenOnDemandZipFile filename must be a string')
        zipfile.ZipFile.__init__(self, filename)
        assert self.filename == filename
        self.close()

    def read(self, name):
        """
        Re-open the zipfile, return the contents of the entry
        C{name}, and close the zipfile again.

        @param name: The name of the entry to read.
        """
        assert self.fp is None
        self.fp = open(self.filename, 'rb')
        try:
            return zipfile.ZipFile.read(self, name)
        finally:
            # Close (and reset self.fp) even when the read fails;
            # otherwise the handle leaks and every later read() would
            # trip the assertion above.
            self.close()

    def write(self, *args, **kwargs):
        """@raise NotImplementedError: OpenOnDemandZipfile is read-only"""
        raise NotImplementedError('OpenOnDemandZipfile is read-only')

    def writestr(self, *args, **kwargs):
        """@raise NotImplementedError: OpenOnDemandZipfile is read-only"""
        raise NotImplementedError('OpenOnDemandZipfile is read-only')

    def __repr__(self):
        return 'OpenOnDemandZipFile(%r)' % self.filename

582 583 ###################################################################### 584 #{ Seekable Unicode Stream Reader 585 ###################################################################### 586

class SeekableUnicodeStreamReader(object):
    """
    A stream reader that automatically decodes the source byte stream
    into unicode (like C{codecs.StreamReader}); but still supports the
    C{seek()} and C{tell()} operations correctly.  This is in contrast
    to C{codecs.StreamReader}, which provides *broken* C{seek()} and
    C{tell()} methods.

    This class was motivated by L{StreamBackedCorpusView}, which
    makes extensive use of C{seek()} and C{tell()}, and needs to be
    able to handle unicode-encoded files.

    Note: this class requires stateless decoders.  To my knowledge,
    this shouldn't cause a problem with any of python's builtin
    unicode encodings.
    """
    DEBUG = True #: If true, then perform extra sanity checks.

    def __init__(self, stream, encoding, errors='strict'):
        """
        Wrap C{stream}, rewinding it to its beginning first.

        @param stream: The underlying byte stream; it must support
            C{seek()}, C{tell()} and C{read()}.
        @param encoding: The name of the encoding used to decode the
            bytes read from C{stream}.
        @param errors: The error mode used when decoding: 'strict',
            'ignore', or 'replace'.
        """
        # Rewind the stream to its beginning.
        stream.seek(0)

        self.stream = stream
        """The underlying stream."""

        self.encoding = encoding
        """The name of the encoding that should be used to encode the
           underlying stream."""

        self.errors = errors
        """The error mode that should be used when decoding data from
           the underlying stream.  Can be 'strict', 'ignore', or
           'replace'."""

        self.decode = codecs.getdecoder(encoding)
        """The function that is used to decode byte strings into
           unicode strings."""

        self.bytebuffer = ''
        """A buffer to use bytes that have been read but have not yet
           been decoded.  This is only used when the final bytes from
           a read do not form a complete encoding for a character."""

        self.linebuffer = None
        """A buffer used by L{readline()} to hold characters that have
           been read, but have not yet been returned by L{read()} or
           L{readline()}.  This buffer consists of a list of unicode
           strings, where each string corresponds to a single line.
           The final element of the list may or may not be a complete
           line.  Note that the existence of a linebuffer makes the
           L{tell()} operation more complex, because it must backtrack
           to the beginning of the buffer to determine the correct
           file position in the underlying byte stream."""

        self._rewind_checkpoint = 0
        """The file position at which the most recent read on the
           underlying stream began.  This is used, together with
           L{_rewind_numchars}, to backtrack to the beginning of
           L{linebuffer} (which is required by L{tell()})."""

        self._rewind_numchars = None
        """The number of characters that have been returned since the
           read that started at L{_rewind_checkpoint}.  This is used,
           together with L{_rewind_checkpoint}, to backtrack to the
           beginning of L{linebuffer} (which is required by
           L{tell()})."""

        self._bom = self._check_bom()
        """The length of the byte order marker at the beginning of
           the stream (or C{None} for no byte order marker)."""

    #/////////////////////////////////////////////////////////////////
    # Read methods
    #/////////////////////////////////////////////////////////////////

    def read(self, size=None):
        """
        Read up to C{size} bytes, decode them using this reader's
        encoding, and return the resulting unicode string.  Any
        characters still held in the line buffer are returned first,
        in front of the newly read ones.

        @param size: The maximum number of bytes to read.  If not
            specified, then read as many bytes as possible.

        @rtype: C{unicode}
        """
        chars = self._read(size)

        # If linebuffer is not empty, then include it in the result
        if self.linebuffer:
            chars = ''.join(self.linebuffer) + chars
            self.linebuffer = None
            self._rewind_numchars = None

        return chars

    def readline(self, size=None):
        """
        Read a line of text, decode it using this reader's encoding,
        and return the resulting unicode string.

        @param size: The maximum number of bytes to read.  If no
            newline is encountered before C{size} bytes have been
            read, then the returned value may not be a complete line
            of text.
        """
        # If we have a non-empty linebuffer, then return the first
        # line from it.  (Note that the last element of linebuffer may
        # not be a complete line; so let _read() deal with it.)
        if self.linebuffer and len(self.linebuffer) > 1:
            line = self.linebuffer.pop(0)
            self._rewind_numchars += len(line)
            return line

        # With no explicit size, start with 72-byte reads.
        readsize = size or 72
        chars = ''

        # If there's a remaining incomplete line in the buffer, add it.
        if self.linebuffer:
            chars += self.linebuffer.pop()
            self.linebuffer = None

        while True:
            # Position of the first byte that this pass will decode
            # (bytes waiting in bytebuffer were already read).
            startpos = self.stream.tell() - len(self.bytebuffer)
            new_chars = self._read(readsize)

            # If we're at a '\r', then read one extra character, since
            # it might be a '\n', to get the proper line ending.
            if new_chars and new_chars.endswith('\r'):
                new_chars += self._read(1)

            chars += new_chars
            lines = chars.splitlines(True)
            if len(lines) > 1:
                # More than one line was read: return the first and
                # buffer the rest, recording what tell() needs in
                # order to find the start of the buffer again.
                line = lines[0]
                self.linebuffer = lines[1:]
                self._rewind_numchars = len(new_chars)-(len(chars)-len(line))
                self._rewind_checkpoint = startpos
                break
            elif len(lines) == 1:
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # complete line
                    line = line0withend
                    break

            # End of file, or the caller limited the read size: return
            # whatever we have, even if it is not a complete line.
            if not new_chars or size is not None:
                line = chars
                break

            # Read successively larger blocks of text.
            if readsize < 8000:
                readsize *= 2

        return line

    def readlines(self, sizehint=None, keepends=True):
        """
        Read this file's contents, decode them using this reader's
        encoding, and return it as a list of unicode lines.

        @rtype: C{list} of C{unicode}
        @param sizehint: Ignored.
        @param keepends: If false, then strip newlines.
        """
        return self.read().splitlines(keepends)

    def next(self):
        """Return the next decoded line from the underlying stream.

        @raise StopIteration: If L{readline()} returns an empty string.
        """
        line = self.readline()
        if line: return line
        else: raise StopIteration

    def __iter__(self):
        """Return self"""
        return self

    def xreadlines(self):
        """Return self"""
        return self

    #/////////////////////////////////////////////////////////////////
    # Pass-through methods & properties
    #/////////////////////////////////////////////////////////////////

    closed = property(lambda self: self.stream.closed, doc="""
        True if the underlying stream is closed.""")

    name = property(lambda self: self.stream.name, doc="""
        The name of the underlying stream.""")

    mode = property(lambda self: self.stream.mode, doc="""
        The mode of the underlying stream.""")

    def close(self):
        """
        Close the underlying stream.
        """
        self.stream.close()

    #/////////////////////////////////////////////////////////////////
    # Seek and tell
    #/////////////////////////////////////////////////////////////////

    def seek(self, offset, whence=0):
        """
        Move the stream to a new file position.  If the reader is
        maintaining any buffers, then they will be cleared.

        @param offset: A byte count offset.
        @param whence: If C{whence} is 0, then the offset is from the
            start of the file (offset should be positive).  If
            C{whence} is 2, then the offset is from the end of the
            file (offset should typically be negative).  A C{whence}
            of 1 (relative to the current position) is rejected.
        @raise ValueError: If C{whence} is 1.
        """
        if whence == 1:
            raise ValueError('Relative seek is not supported for '
                             'SeekableUnicodeStreamReader -- consider '
                             'using char_seek_forward() instead.')
        self.stream.seek(offset, whence)
        self.linebuffer = None
        self.bytebuffer = ''
        self._rewind_numchars = None
        self._rewind_checkpoint = self.stream.tell()

    def char_seek_forward(self, offset):
        """
        Move the read pointer forward by C{offset} characters.

        @raise ValueError: If C{offset} is negative.
        """
        if offset < 0:
            raise ValueError('Negative offsets are not supported')
        # Clear all buffers.
        self.seek(self.tell())
        # Perform the seek operation.
        self._char_seek_forward(offset)

    def _char_seek_forward(self, offset, est_bytes=None):
        """
        Move the file position forward by C{offset} characters,
        ignoring all buffers.

        @param est_bytes: A hint, giving an estimate of the number of
            bytes that will be needed to move forward by C{offset} chars.
            Defaults to C{offset}.
        """
        if est_bytes is None: est_bytes = offset
        bytes = ''

        while True:
            # Read in a block of bytes.
            newbytes = self.stream.read(est_bytes-len(bytes))
            bytes += newbytes

            # Decode the bytes to characters.
            chars, bytes_decoded = self._incr_decode(bytes)

            # If we got the right number of characters, then seek
            # backwards over any truncated characters, and return.
            if len(chars) == offset:
                self.stream.seek(-len(bytes)+bytes_decoded, 1)
                return

            # If we went too far, then we can back-up until we get it
            # right, using the bytes we've already read.
            if len(chars) > offset:
                while len(chars) > offset:
                    # Assume at least one byte/char.
                    est_bytes += offset-len(chars)
                    chars, bytes_decoded = self._incr_decode(bytes[:est_bytes])
                self.stream.seek(-len(bytes)+bytes_decoded, 1)
                return

            # Otherwise, we haven't read enough bytes yet; loop again.
            est_bytes += offset - len(chars)

    def tell(self):
        """
        Return the current file position on the underlying byte
        stream.  If this reader is maintaining any buffers, then the
        returned file position will be the position of the beginning
        of those buffers.
        """
        # If nothing's buffered, then just return our current filepos:
        if self.linebuffer is None:
            return self.stream.tell() - len(self.bytebuffer)

        # Otherwise, we'll need to backtrack the filepos until we
        # reach the beginning of the buffer.

        # Store our original file position, so we can return here.
        orig_filepos = self.stream.tell()

        # Calculate an estimate of where we think the newline is:
        # scale the bytes read since the checkpoint by the fraction of
        # the decoded characters that have already been returned.
        bytes_read = ( (orig_filepos-len(self.bytebuffer)) -
                       self._rewind_checkpoint )
        buf_size = sum([len(line) for line in self.linebuffer])
        est_bytes = (bytes_read * self._rewind_numchars /
                     (self._rewind_numchars + buf_size))

        # Replay from the checkpoint, skipping exactly the characters
        # that were already returned; the estimate is only a hint.
        self.stream.seek(self._rewind_checkpoint)
        self._char_seek_forward(self._rewind_numchars, est_bytes)
        filepos = self.stream.tell()

        # Sanity check
        if self.DEBUG:
            self.stream.seek(filepos)
            check1 = self._incr_decode(self.stream.read(50))[0]
            check2 = ''.join(self.linebuffer)
            assert check1.startswith(check2) or check2.startswith(check1)

        # Return to our original filepos (so we don't have to throw
        # out our buffer.)
        self.stream.seek(orig_filepos)

        # Return the calculated filepos
        return filepos

    #/////////////////////////////////////////////////////////////////
    # Helper methods
    #/////////////////////////////////////////////////////////////////

    def _read(self, size=None):
        """
        Read up to C{size} bytes from the underlying stream, decode
        them using this reader's encoding, and return the resulting
        unicode string.  C{linebuffer} is *not* included in the
        result.
        """
        if size == 0: return u''

        # Skip past the byte order marker, if present.
        if self._bom and self.stream.tell() == 0:
            self.stream.read(self._bom)

        # Read the requested number of bytes.
        if size is None:
            new_bytes = self.stream.read()
        else:
            new_bytes = self.stream.read(size)
        # Prepend any undecoded bytes left over from the previous read.
        bytes = self.bytebuffer + new_bytes

        # Decode the bytes into unicode characters
        chars, bytes_decoded = self._incr_decode(bytes)

        # If we got bytes but couldn't decode any, then read further.
        if (size is not None) and (not chars) and (len(new_bytes) > 0):
            while not chars:
                new_bytes = self.stream.read(1)
                if not new_bytes: break # end of file.
                bytes += new_bytes
                chars, bytes_decoded = self._incr_decode(bytes)

        # Record any bytes we didn't consume.
        self.bytebuffer = bytes[bytes_decoded:]

        # Return the result
        return chars

    def _incr_decode(self, bytes):
        """
        Decode the given byte string into a unicode string, using this
        reader's encoding.  If an exception is encountered that
        appears to be caused by a truncation error, then just decode
        the byte string without the bytes that cause the truncation
        error.

        @return: A tuple C{(chars, num_consumed)}, where C{chars} is
            the decoded unicode string, and C{num_consumed} is the
            number of bytes that were consumed.
        """
        while True:
            try:
                return self.decode(bytes, 'strict')
            except UnicodeDecodeError, exc:
                # If the exception occurs at the end of the string,
                # then assume that it's a truncation error.
                if exc.end == len(bytes):
                    return self.decode(bytes[:exc.start], self.errors)

                # Otherwise, if we're being strict, then raise it.
                elif self.errors == 'strict':
                    raise

                # If we're not strict, then re-process it with our
                # errors setting.  This *may* raise an exception.
                else:
                    return self.decode(bytes, self.errors)

    # Maps a normalized encoding name (lowercase, with spaces and
    # hyphens removed) to a list of (byte order marker, replacement
    # encoding name or None) pairs checked by _check_bom().
    _BOM_TABLE = {
        'utf8': [(codecs.BOM_UTF8, None)],
        'utf16': [(codecs.BOM_UTF16_LE, 'utf16-le'),
                  (codecs.BOM_UTF16_BE, 'utf16-be')],
        'utf16le': [(codecs.BOM_UTF16_LE, None)],
        'utf16be': [(codecs.BOM_UTF16_BE, None)],
        'utf32': [(codecs.BOM_UTF32_LE, 'utf32-le'),
                  (codecs.BOM_UTF32_BE, 'utf32-be')],
        'utf32le': [(codecs.BOM_UTF32_LE, None)],
        'utf32be': [(codecs.BOM_UTF32_BE, None)],
        }

    def _check_bom(self):
        """
        Check whether the stream begins with a byte order marker for
        this reader's encoding.  The stream is left at position 0.

        @return: The length of the marker in bytes, or C{None} if the
            encoding is not in L{_BOM_TABLE} or no marker matched.
        """
        # Normalize our encoding name
        enc = re.sub('[ -]', '', self.encoding.lower())

        # Look up our encoding in the BOM table.
        bom_info = self._BOM_TABLE.get(enc)

        if bom_info:
            # Read a prefix, to check against the BOM(s)
            bytes = self.stream.read(16)
            self.stream.seek(0)

            # Check for each possible BOM.
            for (bom, new_encoding) in bom_info:
                if bytes.startswith(bom):
                    # NOTE(review): only self.encoding is updated here;
                    # self.decode was looked up in __init__ before this
                    # method ran and is not refreshed -- confirm
                    # whether the decoder should follow new_encoding.
                    if new_encoding: self.encoding = new_encoding
                    return len(bom)

        return None

1007

Source Code for Module nltk.data