1
2
3
4
5
6
7
8
9
10
11
12 import re
13 import nltk
14
15 TRY_ZIPFILE_FIRST = False
16
18 """
19 A proxy object which is used to stand in for a corpus object
20 before the corpus is loaded. This allows NLTK to create an object
21 for each corpus, but defer the costs associated with loading those
22 corpora until the first time that they're actually accessed.
23
24 The first time this object is accessed in any way, it will load
25 the corresponding corpus, and transform itself into that corpus
26 (by modifying its own C{__class__} and C{__dict__} attributes).
27
If the corpus cannot be found, then accessing this object will
raise an exception, displaying installation instructions for the
NLTK data package. Once the user has properly installed the data
package (or modified C{nltk.data.path} to point to its location),
the corpus object can be used without restarting Python.
33 """
34 - def __init__(self, name, reader_cls, *args, **kwargs):
41
43
# Find the corpus root.  Two candidate resource names are looked up under
# 'corpora/': the corpus name itself, and a '.zip' form derived from it
# (for example 'a/b' becomes 'a.zip/a/b/').  TRY_ZIPFILE_FIRST only
# selects which of the two candidates is tried first.
#
# count=1 restricts the rewrite to the single leading match.  The pattern
# can also match the empty string at the end of the name, and Python 3.7+
# substitutes such a trailing empty match too, which would append a stray
# '.zip//' to the generated name.
zip_name = re.sub(r'(([^/]*)(/.*)?)', r'\2.zip/\1/', self.__name, count=1)
if TRY_ZIPFILE_FIRST:
    first_choice, second_choice = zip_name, self.__name
else:
    first_choice, second_choice = self.__name, zip_name

try:
    root = nltk.data.find('corpora/%s' % first_choice)
except LookupError as e:
    # The first candidate is missing: fall back to the other one.  If
    # that lookup fails as well, re-raise the error from the first
    # attempt, so the reported failure matches the preferred candidate.
    try:
        root = nltk.data.find('corpora/%s' % second_choice)
    except LookupError:
        raise e

# Build the real corpus object from the located root, forwarding the
# stored constructor arguments.
corpus = self.__reader_cls(root, *self.__args, **self.__kwargs)

# Turn this proxy into the corpus by adopting the corpus object's
# instance dictionary and class.  The order is kept as in the original:
# __dict__ first, then __class__.
self.__dict__ = corpus.__dict__
self.__class__ = corpus.__class__
66
68 self.__load()
69
70
71 return getattr(self, attr)
72
74 self.__load()
75
76
77 return '%r' % self
78