| Home | Trees | Indices | Help |
|
|---|
|
|
1 # Natural Language Toolkit: Corpus Reader Utility Functions
2 #
3 # Copyright (C) 2001-2008 NLTK Project
4 # Author: Edward Loper <[email protected]>
5 # URL: <http://nltk.org>
6 # For license information, see LICENSE.TXT
7
8 ######################################################################
9 #{ Lazy Corpus Loader
10 ######################################################################
11
12 import re
13 import nltk
14
# Lookup-order switch read by the corpus-loading code further down: when
# true, the zip-archive form of the corpus path is looked up first; when
# false, the plain 'corpora/<name>' path is looked up first.
15 TRY_ZIPFILE_FIRST = False
16
18 """
19 A proxy object which is used to stand in for a corpus object
20 before the corpus is loaded. This allows NLTK to create an object
21 for each corpus, but defer the costs associated with loading those
22 corpora until the first time that they're actually accessed.
23
24 The first time this object is accessed in any way, it will load
25 the corresponding corpus, and transform itself into that corpus
26 (by modifying its own C{__class__} and C{__dict__} attributes).
27
28 If the corpus cannot be found, then accessing this object will
29 raise an exception, displaying installation instructions for the
30 NLTK data package. Once the user has properly installed the data
31 package (or modified C{nltk.data.path} to point to its location),
32 the corpus object can then be used without restarting Python.
33 """
# NOTE(review): the `def` line for this body (listing line 34) is absent from
# this extract. Judging by the names used below it presumably receives
# name, reader_cls, *args and **kwargs -- confirm against the real source.
# The import is done here rather than at module level
# (presumably to avoid a circular import -- TODO confirm).
35 from nltk.corpus.reader.api import CorpusReader
# Sanity check that the reader class is a CorpusReader subclass.
# NOTE(review): assert statements are skipped under `python -O`.
36 assert issubclass(reader_cls, CorpusReader)
# Only record what is needed to build the reader later; nothing is
# located or loaded at this point.
37 self.__name = name
38 self.__reader_cls = reader_cls
39 self.__args = args
40 self.__kwargs = kwargs
41
# NOTE(review): the `def` line for this body (listing line 42) is absent from
# this extract and all indentation has been stripped, so the nesting below
# is inferred. Other code in this listing invokes it as self.__load().
43 # Find the corpus root directory.
# zip_name is the archive form of the corpus path:
# '<first path component>.zip/<full corpus name>/'.
# NOTE(review): `[^/]*` can also match the empty string; on Python 3.7+
# re.sub would substitute an extra empty match at the end of the name, so
# `[^/]+` would be safer if this code is ever ported -- confirm.
44 zip_name = re.sub(r'(([^/]*)(/.*)?)', r'\2.zip/\1/', self.__name)
45 if TRY_ZIPFILE_FIRST:
46 try:
47 root = nltk.data.find('corpora/%s' % zip_name)
48 except LookupError:
# NOTE(review): this bare `raise` re-raises the zip lookup failure at once.
# If the next line belongs to this handler it is unreachable, so the
# plain-path fallback never runs; the else-branch below shows the fallback
# shape that was probably intended -- confirm and fix in the real source.
49 raise
50 root = nltk.data.find('corpora/%s' % self.__name)
51 else:
52 try:
53 root = nltk.data.find('corpora/%s' % self.__name)
54 except LookupError, e:
# Fall back to the zip path; if that lookup fails too, re-raise the error
# from the first (plain-path) lookup rather than the second one.
55 try: root = nltk.data.find('corpora/%s' % zip_name)
56 except LookupError: raise e
57
58 # Load the corpus.
# The stored reader class is called with the located root followed by the
# stored positional and keyword arguments.
59 corpus = self.__reader_cls(root, *self.__args, **self.__kwargs)
60
61 # This is where the magic happens! Transform ourselves into
62 # the corpus by modifying our own __dict__ and __class__ to
63 # match that of the corpus.
64 self.__dict__ = corpus.__dict__
65 self.__class__ = corpus.__class__
66
# NOTE(review): the `def` line for this body (listing line 67) is absent from
# this extract; since it forwards a name called `attr`, it is presumably
# __getattr__(self, attr) -- confirm against the real source.
# Load the real corpus first, then repeat the attribute lookup.
68 self.__load()
69 # This looks circular, but it's not, since __load() changes our
70 # __class__ to something new:
71 return getattr(self, attr)
72
# NOTE(review): the `def` line for this body (listing line 73) is absent from
# this extract; since it returns a '%r' rendering of self, it is presumably
# __repr__(self) -- confirm against the real source.
# Load the real corpus first, then format self with '%r'.
74 self.__load()
75 # This looks circular, but it's not, since __load() changes our
76 # __class__ to something new:
77 return '%r' % self
78
| Home | Trees | Indices | Help |
|
|---|
| Generated by Epydoc 3.0beta1 on Wed Aug 27 15:09:16 2008 | http://epydoc.sourceforge.net |