1
2
3
4
5
6
7
8
9 """
10 API for corpus readers.
11 """
12
13 import os, re
14 from nltk import defaultdict
15 from nltk.internals import deprecated
16 import nltk.corpus.reader.util
17 from nltk.data import PathPointer, FileSystemPathPointer, ZipFilePathPointer
18
20 """
21 A base class for X{corpus reader} classes, each of which can be
22 used to read a specific corpus format. Each individual corpus
23 reader instance is used to read a specific corpus, consisting of
24 one or more files under a common root directory. Each file is
25 identified by its C{file identifier}, which is the relative path
26 to the file from the root directory.
27
28 A separate subclass is defined for each corpus format. These
29 subclasses define one or more methods that provide 'views' on the
30 corpus contents, such as C{words()} (for a list of words) and
31 C{parsed_sents()} (for a list of parsed sentences). Called with
32 no arguments, these methods will return the contents of the entire
33 corpus. For most corpora, these methods define one or more
34 selection arguments, such as C{files} or C{categories}, which can
35 be used to select which portion of the corpus should be returned.
36 """
def __init__(self, root, files, encoding=None, tag_mapping_function=None):
    """
    Set up a corpus reader over a collection of files that live
    beneath a common root.

    @type root: L{PathPointer} or C{str}
    @param root: A path pointer identifying the root directory for
        this corpus.  A string is converted to a L{PathPointer}
        automatically: a string with a C{.zip} component becomes a
        L{ZipFilePathPointer}, any other string becomes a
        L{FileSystemPathPointer}.
    @param files: The files that make up this corpus.  Either an
        explicit list of file identifiers (strings), or a single
        string holding a regular expression over file paths, which
        is expanded with C{find_corpus_files}.  The absolute path
        of each file is built by joining the reader's root to the
        file identifier.
    @param encoding: The default unicode encoding for the files
        that make up the corpus.  C{encoding} can be any of the
        following:

          - B{A string}: the encoding name used for all files.
          - B{A dictionary}: C{encoding[file_id]} is the encoding
            name for the file whose identifier is C{file_id}.
            Files missing from the dictionary are processed using
            non-unicode byte strings.
          - B{A list}: a list of C{(regexp, encoding)} tuples.  A
            file gets the C{encoding} of the first tuple whose
            C{regexp} matches its identifier; files matched by no
            tuple are processed using non-unicode byte strings.
            The list is resolved into a dictionary here, once.
          - C{None}: all files are processed using non-unicode
            byte strings.
    @param tag_mapping_function: A function for normalizing or
        simplifying the POS tags returned by the tagged_words()
        or tagged_sents() methods.  It is only stored here.
    @raise TypeError: If C{root} is neither a string nor a
        L{PathPointer}.
    """
    # Refuse unsupported root types before doing any other work.
    if not isinstance(root, (basestring, PathPointer)):
        raise TypeError('CorpusReader: expected a string or a PathPointer')

    if isinstance(root, basestring):
        # The empty alternative after '|' lets the pattern match any
        # string; both groups are None when there is no '.zip' part.
        zip_match = re.match('(.*\.zip)/?(.*)$|', root)
        archive = zip_match.group(1)
        entry = zip_match.group(2)
        if archive:
            root = ZipFilePathPointer(archive, entry)
        else:
            root = FileSystemPathPointer(root)

    # A single string is a regular expression selecting the files.
    if isinstance(files, basestring):
        files = nltk.corpus.reader.find_corpus_files(root, files)

    self._files = tuple(files)
    """The relative paths of the files that make up this corpus
       (stored as a tuple)."""

    self._root = root
    """The root directory for this corpus."""

    # Resolve a list of (regexp, encoding) pairs into a per-file
    # dictionary: the first matching regexp wins, and files that match
    # nothing are simply left out of the dictionary.
    if isinstance(encoding, list):
        resolved = {}
        for fileid in self._files:
            for (regexp, enc) in encoding:
                if re.match(regexp, fileid):
                    resolved[fileid] = enc
                    break
        encoding = resolved

    self._encoding = encoding
    """The default unicode encoding for the files that make up
       this corpus.  If C{encoding} is C{None}, then the file
       contents are processed using byte strings (C{str})."""
    self._tag_mapping_function = tag_mapping_function
111
118
120 """
121 Return a list of file identifiers for the files that make up
122 this corpus.
123 """
124 return self._files
125
127 """
128 Return the absolute path for the given file.
129
130 @type file: C{str}
131 @param file: The file identifier for the file whose path
132 should be returned.
133
134 @rtype: L{PathPointer}
135 """
136 return self._root.join(file)
137
def abspaths(self, files=None, include_encoding=False):
    """
    Return the absolute paths of the files in this corpus, or of
    the given files only, if specified.

    @type files: C{None} or C{str} or C{list}
    @param files: Selects the files whose paths are returned:
        C{None} for every file in the corpus, a list of file
        identifiers for a specific set of files, or a single file
        identifier for one file.  The result is a sequence of
        paths even when a single identifier is given.

    @param include_encoding: If true, pair each path with the
        encoding of its file, giving C{(path_pointer, encoding)}
        tuples instead of bare paths.

    @rtype: C{list} of L{PathPointer}
    """
    # Normalise the three accepted forms of C{files} to one iterable.
    if files is None:
        selected = self._files
    elif isinstance(files, basestring):
        selected = [files]
    else:
        selected = files

    pointers = [self._root.join(fileid) for fileid in selected]
    if not include_encoding:
        return pointers

    # Encodings are looked up in a second pass, after all the paths
    # have been built.
    encodings = [self.encoding(fileid) for fileid in selected]
    return zip(pointers, encodings)
167
def open(self, file):
    """
    Return an open stream for reading the given corpus file.

    The file's encoding, as reported by C{self.encoding(file)}, is
    passed to the C{open()} method of the file's path pointer.  If
    that encoding is not C{None}, then the stream will
    automatically decode the file's contents into unicode.

    @param file: The file identifier of the file to read.
    """
    # Look the encoding up first, then resolve the path and open it.
    file_encoding = self.encoding(file)
    pointer = self._root.join(file)
    return pointer.open(file_encoding)
178
180 """
181 Return the unicode encoding for the given corpus file, if known.
182 If the encoding is unknown, or if the given file should be
183 processed using byte strings (C{str}), then return C{None}.
184 """
185 if isinstance(self._encoding, dict):
186 return self._encoding.get(file)
187 else:
188 return self._encoding
189
191 root = property(_get_root, doc="""
192 The directory where this corpus is stored.
193
194 @type: L{PathPointer}""")
195
196
197 @deprecated("Use corpus.files() instead")
199 items = property(_get_items)
200
201 @deprecated("Use corpus.abspaths() instead")
203
204
205
206
207
208
210 """
211 A mixin class used to aid in the implementation of corpus readers
212 for categorized corpora. This class defines the method
213 L{categories()}, which returns a list of the categories for the
214 corpus or for a specified set of files; and overrides L{files()}
215 to take a C{categories} argument, restricting the set of files to
216 be returned.
217
218 Subclasses are expected to:
219
220 - Call L{__init__()} to set up the mapping.
221
222 - Override all view methods to accept a C{categories} parameter,
223 which can be used *instead* of the C{files} parameter, to
224 select which files should be included in the returned view.
225 """
226
228 """
229 Initialize this mapping based on keyword arguments, as
230 follows:
231
232 - cat_pattern: A regular expression pattern used to find the
233 category for each file identifier. The pattern will be
234 applied to each file identifier, and the first matching
235 group will be used as the category label for that file.
236
237 - cat_map: A dictionary, mapping from file identifiers to
238 category labels.
239
240 - cat_file: The name of a file that contains the mapping
241 from file identifiers to categories. The argument
242 C{cat_delimiter} can be used to specify a delimiter.
243
244 The corresponding argument will be deleted from C{kwargs}. If
245 more than one argument is specified, an exception will be
246 raised.
247 """
248 self._f2c = None
249 self._c2f = None
250
251 self._pattern = None
252 self._map = None
253 self._file = None
254 self._delimiter = None
255
256 if 'cat_pattern' in kwargs:
257 self._pattern = kwargs['cat_pattern']
258 del kwargs['cat_pattern']
259 elif 'cat_map' in kwargs:
260 self._map = kwargs['cat_map']
261 del kwargs['cat_map']
262 elif 'cat_file' in kwargs:
263 self._file = kwargs['cat_file']
264 del kwargs['cat_file']
265 if 'cat_delimiter' in kwargs:
266 self._delimiter = kwargs['cat_delimiter']
267 del kwargs['cat_delimiter']
268 else:
269 raise ValueError('Expected keyword argument cat_pattern or '
270 'cat_map or cat_file.')
271
272
273 if ('cat_pattern' in kwargs or 'cat_map' in kwargs or
274 'cat_file' in kwargs):
275 raise ValueError('Specify exactly one of: cat_pattern, '
276 'cat_map, cat_file.')
277
279 self._f2c = defaultdict(list)
280 self._c2f = defaultdict(list)
281
282 if self._pattern is not None:
283 for file_id in self._files:
284 category = re.match(self._pattern, file_id).group(1)
285 self._add(file_id, category)
286
287 elif self._map is not None:
288 for (file_id, categories) in self._map.items():
289 for category in categories:
290 self._add(file_id, category)
291
292 elif self._file is not None:
293 for line in self.open(self._file).readlines():
294 line = line.strip()
295 file_id, categories = line.split(self._delimiter, 1)
296 if file_id not in self.files():
297 raise ValueError('In category mapping file %s: %s '
298 'not found' % (catfile, file_id))
299 for category in categories.split(self._delimiter):
300 self._add(file_id, category)
301
302 - def _add(self, file_id, category):
303 self._f2c[file_id].append(category)
304 self._c2f[category].append(file_id)
305
307 """
308 Return a list of the categories that are defined for this corpus,
309 or for the given file(s), if specified.
310 """
311 if self._f2c is None: self._init()
312 if files is None:
313 return sorted(self._c2f)
314 if isinstance(files, basestring):
315 files = [files]
316 return sorted(sum((self._f2c[d] for d in files), []))
317
def files(self, categories=None):
    """
    Return a list of file identifiers for the files that make up
    this corpus, or that make up the given category(s) if specified.

    @type categories: C{None} or C{str} or C{list}
    @param categories: C{None} for every file in the corpus (as
        returned by the superclass C{files()}); a single category
        name; or a list of category names.
    @return: The sorted file identifiers of the selected
        category(s).  A file that belongs to several of the
        requested categories appears once per category.  A
        category with no files contributes nothing.
    """
    if categories is None:
        return super(CategorizedCorpusReader, self).files()

    # The category <-> file mapping is built lazily, on first use.
    if self._f2c is None:
        self._init()

    if isinstance(categories, basestring):
        categories = [categories]

    # Use get() rather than indexing: C{_c2f} is a defaultdict, so
    # indexing it with an unknown category would insert an empty
    # entry that categories() would then report as a real category.
    # Extending one list also avoids the quadratic sum(lists, []).
    selected = []
    for category in categories:
        selected.extend(self._c2f.get(category, ()))
    return sorted(selected)
331