Code Coverage for nltk.corpus.util
Partially Tested Functions
import re
import nltk
# Read by LazyCorpusLoader when it resolves a corpus root directory: if true,
# the zipped corpus path ('corpora/<name>.zip/...') is looked up first;
# otherwise the plain 'corpora/<name>' path is looked up first.
TRY_ZIPFILE_FIRST = False
class LazyCorpusLoader(object):
    """
    A proxy object which is used to stand in for a corpus object
    before the corpus is loaded.  This allows NLTK to create an object
    for each corpus, but defer the costs associated with loading those
    corpora until the first time that they're actually accessed.

    The first time this object is accessed in any way, it will load
    the corresponding corpus, and transform itself into that corpus
    (by modifying its own C{__class__} and C{__dict__} attributes).

    If the corpus can not be found, then accessing this object will
    raise an exception, displaying installation instructions for the
    NLTK data package.  Once they've properly installed the data
    package (or modified C{nltk.data.path} to point to its location),
    they can then use the corpus object without restarting python.
    """

    def __init__(self, name, reader_cls, *args, **kwargs):
        """
        Record what is needed to build the corpus reader later; nothing
        is looked up or loaded here.

        @param name: Name of the corpus; it is resolved as
            C{'corpora/<name>'} (or the matching zip path) on first use.
        @param reader_cls: The class used to build the corpus object.
            Must be a subclass of C{CorpusReader}.
        @param args: Extra positional arguments passed to C{reader_cls}
            after the corpus root.
        @param kwargs: Extra keyword arguments passed to C{reader_cls}.
        """
        # Imported locally rather than at module level
        # (presumably to avoid a circular import -- confirm).
        from nltk.corpus.reader.api import CorpusReader
        assert issubclass(reader_cls, CorpusReader)
        self.__name = name
        self.__reader_cls = reader_cls
        self.__args = args
        self.__kwargs = kwargs

    def __load(self):
        """
        Locate the corpus root, build the real corpus reader, and turn
        this proxy into that reader.

        Two locations are tried, in the order selected by
        C{TRY_ZIPFILE_FIRST}: the plain path C{'corpora/<name>'} and the
        zipped path C{'corpora/<top>.zip/<name>/'}.  If both lookups
        fail, the C{LookupError} from the first attempt is raised.
        """
        # '[^/]+' rather than '[^/]*': the pattern must not be able to
        # match the empty string, because on Python 3.7+ re.sub also
        # substitutes the empty match left after the real one, which
        # would append a stray '.zip//' to the result.
        zip_name = re.sub(r'(([^/]+)(/.*)?)', r'\2.zip/\1/', self.__name)
        plain_path = 'corpora/%s' % self.__name
        zip_path = 'corpora/%s' % zip_name

        if TRY_ZIPFILE_FIRST:
            first_choice, second_choice = zip_path, plain_path
        else:
            first_choice, second_choice = plain_path, zip_path

        try:
            root = nltk.data.find(first_choice)
        except LookupError as first_error:
            # Fall back to the other location; if that fails as well,
            # report the failure of the preferred location.
            try:
                root = nltk.data.find(second_choice)
            except LookupError:
                raise first_error

        # Build the real corpus object and become it.
        corpus = self.__reader_cls(root, *self.__args, **self.__kwargs)
        self.__dict__ = corpus.__dict__
        self.__class__ = corpus.__class__

    def __getattr__(self, attr):
        """
        Load the corpus on the first access to a missing attribute, then
        look C{attr} up again on the loaded corpus object.
        """
        self.__load()
        # __load() has replaced our class, so this lookup no longer goes
        # through LazyCorpusLoader.
        return getattr(self, attr)

    def __repr__(self):
        """
        Load the corpus and return the C{repr} of the loaded corpus
        object.
        """
        self.__load()
        # Dispatches to the loaded corpus class's __repr__.
        return repr(self)