Package nltk :: Package corpus :: Module util
[hide private]
[frames | no frames]

Source Code for Module nltk.corpus.util

 1  # Natural Language Toolkit: Corpus Reader Utility Functions 
 2  # 
 3  # Copyright (C) 2001-2008 NLTK Project 
 4  # Author: Edward Loper <[email protected]> 
 5  # URL: <http://nltk.org> 
 6  # For license information, see LICENSE.TXT 
 7   
 8  ###################################################################### 
 9  #{ Lazy Corpus Loader 
10  ###################################################################### 
11   
12  import re 
13  import nltk 
14   
# Module-level switch read by LazyCorpusLoader.__load(): it selects which
# location is looked up first -- the zipped corpus (true) or the unpacked
# corpus directory (false).
15  TRY_ZIPFILE_FIRST = False 
16   
class LazyCorpusLoader(object):
    """
    A proxy object which is used to stand in for a corpus object
    before the corpus is loaded.  This allows NLTK to create an object
    for each corpus, but defer the costs associated with loading those
    corpora until the first time that they're actually accessed.

    The first time this object is accessed in any way, it will load
    the corresponding corpus, and transform itself into that corpus
    (by modifying its own C{__class__} and C{__dict__} attributes).

    If the corpus can not be found, then accessing this object will
    raise an exception, displaying installation instructions for the
    NLTK data package.  Once they've properly installed the data
    package (or modified C{nltk.data.path} to point to its location),
    they can then use the corpus object without restarting python.
    """

    def __init__(self, name, reader_cls, *args, **kwargs):
        """
        Record everything needed to build the corpus reader later;
        nothing is looked up or loaded here.

        @param name: The corpus name; it is looked up beneath
            C{corpora/} when the corpus is first accessed.
        @param reader_cls: The C{CorpusReader} subclass that will be
            instantiated on first access.
        @param args: Extra positional arguments passed to
            C{reader_cls} after the corpus root.
        @param kwargs: Extra keyword arguments passed to C{reader_cls}.
        """
        # Imported locally rather than at module level.
        from nltk.corpus.reader.api import CorpusReader
        assert issubclass(reader_cls, CorpusReader)
        self.__name = name
        self.__reader_cls = reader_cls
        self.__args = args
        self.__kwargs = kwargs

    def __load(self):
        """
        Locate the corpus, build its reader, and turn this proxy into
        that reader.

        @raise LookupError: If the corpus is found neither as a
            directory nor inside a zipfile.  The error raised is the
            one produced by the location that was tried first.
        """
        # Find the corpus root directory.  'brown' is also searched for
        # as 'brown.zip/brown/'.  The first path component must be
        # non-empty ('+', not '*'): a pattern that can match the empty
        # string is substituted a second time at the end of the name on
        # Python 3.7+, which would corrupt the zip path.
        zip_name = re.sub(r'(([^/]+)(/.*)?)', r'\2.zip/\1/', self.__name)

        if TRY_ZIPFILE_FIRST:
            first_choice, second_choice = zip_name, self.__name
        else:
            first_choice, second_choice = self.__name, zip_name

        try:
            root = nltk.data.find('corpora/%s' % first_choice)
        except LookupError as e:
            # Fall back to the other location; if that fails as well,
            # report the error from the preferred location.
            try:
                root = nltk.data.find('corpora/%s' % second_choice)
            except LookupError:
                raise e

        # Load the corpus.
        corpus = self.__reader_cls(root, *self.__args, **self.__kwargs)

        # This is where the magic happens!  Transform ourselves into
        # the corpus by modifying our own __dict__ and __class__ to
        # match that of the corpus.
        self.__dict__ = corpus.__dict__
        self.__class__ = corpus.__class__

    def __getattr__(self, attr):
        """
        Load the corpus, then look C{attr} up again on the transformed
        object.
        """
        self.__load()
        # This looks circular, but it's not, since __load() changes our
        # __class__ to something new:
        return getattr(self, attr)

    def __repr__(self):
        """
        Load the corpus and return the representation of the loaded
        corpus object.
        """
        self.__load()
        # This looks circular, but it's not, since __load() changes our
        # __class__ to something new:
        return repr(self)
78