nltk.corpus.reader.api

1 # Natural Language Toolkit: API for Corpus Readers 2 # 3 # Copyright (C) 2001-2008 NLTK Project 4 # Author: Steven Bird <[email protected]> 5 # Edward Loper <[email protected]> 6 # URL: <http://nltk.org> 7 # For license information, see LICENSE.TXT 8 9 """ 10 API for corpus readers. 11 """ 12 13 import os, re 14 from nltk import defaultdict 15 from nltk.internals import deprecated 16 import nltk.corpus.reader.util 17 from nltk.data import PathPointer, FileSystemPathPointer, ZipFilePathPointer 18

class CorpusReader(object):
    """
    A base class for X{corpus reader} classes, each of which can be
    used to read a specific corpus format.  Each individual corpus
    reader instance is used to read a specific corpus, consisting of
    one or more files under a common root directory.  Each file is
    identified by its C{file identifier}, which is the relative path
    to the file from the root directory.

    A separate subclass is defined for each corpus format.  These
    subclasses define one or more methods that provide 'views' on the
    corpus contents, such as C{words()} (for a list of words) and
    C{parsed_sents()} (for a list of parsed sentences).  Called with
    no arguments, these methods will return the contents of the entire
    corpus.  For most corpora, these methods define one or more
    selection arguments, such as C{files} or C{categories}, which can
    be used to select which portion of the corpus should be returned.
    """
    def __init__(self, root, files, encoding=None, tag_mapping_function=None):
        """
        @type root: L{PathPointer} or C{str}
        @param root: A path pointer identifying the root directory for
            this corpus.  If a string is specified, then it will be
            converted to a L{PathPointer} automatically.
        @param files: A list of the files that make up this corpus.
            This list can either be specified explicitly, as a list of
            strings; or implicitly, as a regular expression over file
            paths.  The absolute path for each file will be constructed
            by joining the reader's root to each file name.
        @param encoding: The default unicode encoding for the files
            that make up the corpus.  C{encoding}'s value can be any
            of the following:

              - B{A string}: C{encoding} is the encoding name for all
                files.
              - B{A dictionary}: C{encoding[file_id]} is the encoding
                name for the file whose identifier is C{file_id}.  If
                C{file_id} is not in C{encoding}, then the file
                contents will be processed using non-unicode byte
                strings.
              - B{A list}: C{encoding} should be a list of C{(regexp,
                encoding)} tuples.  The encoding for a file whose
                identifier is C{file_id} will be the C{encoding} value
                for the first tuple whose C{regexp} matches the
                C{file_id}.  If no tuple's C{regexp} matches the
                C{file_id}, the file contents will be processed using
                non-unicode byte strings.
              - C{None}: the file contents of all files will be
                processed using non-unicode byte strings.
        @param tag_mapping_function: A function for normalizing or
            simplifying the POS tags returned by the tagged_words()
            or tagged_sents() methods.
        @raise TypeError: If C{root} is neither a string nor a
            L{PathPointer}.
        """
        # Convert the root to a path pointer, if necessary.
        if isinstance(root, basestring):
            # The trailing empty alternative makes the pattern match any
            # string, so `m` is never None; both groups are None when
            # the path does not go through a ".zip" file.
            m = re.match(r'(.*\.zip)/?(.*)$|', root)
            zipfile, zipentry = m.groups()
            if zipfile:
                root = ZipFilePathPointer(zipfile, zipentry)
            else:
                root = FileSystemPathPointer(root)
        elif not isinstance(root, PathPointer):
            raise TypeError('CorpusReader: expected a string or a PathPointer')

        # If `files` is a regexp, then expand it.
        if isinstance(files, basestring):
            files = nltk.corpus.reader.find_corpus_files(root, files)

        #: A tuple of the relative paths for the files that make up
        #: this corpus.
        self._files = tuple(files)

        #: The root directory for this corpus.
        self._root = root

        # If encoding was specified as a list of regexps, then convert
        # it to a dictionary.  The first matching regexp wins; files
        # that match no regexp get no entry.
        if isinstance(encoding, list):
            encoding_dict = {}
            for fileid in self._files:
                for (regexp, enc) in encoding:
                    if re.match(regexp, fileid):
                        encoding_dict[fileid] = enc
                        break
            encoding = encoding_dict

        #: The default unicode encoding for the files that make up
        #: this corpus.  If C{encoding} is C{None}, then the file
        #: contents are processed using byte strings (C{str}).
        self._encoding = encoding
        self._tag_mapping_function = tag_mapping_function

    def __repr__(self):
        if isinstance(self._root, ZipFilePathPointer):
            path = '%s/%s' % (self._root.zipfile.filename, self._root.entry)
        else:
            path = '%s' % self._root.path
        return '<%s in %r>' % (self.__class__.__name__, path)

    def files(self):
        """
        Return a list of file identifiers for the files that make up
        this corpus.
        """
        return self._files

    def abspath(self, file):
        """
        Return the absolute path for the given file.

        @type file: C{str}
        @param file: The file identifier for the file whose path
            should be returned.

        @rtype: L{PathPointer}
        """
        return self._root.join(file)

    def abspaths(self, files=None, include_encoding=False):
        """
        Return a list of the absolute paths for all files in this corpus;
        or for the given list of files, if specified.

        @type files: C{None} or C{str} or C{list}
        @param files: Specifies the set of files for which paths should
            be returned.  Can be C{None}, for all files; a list of
            file identifiers, for a specified set of files; or a single
            file identifier, for a single file.  Note that the return
            value is always a list of paths, even if C{files} is a
            single file identifier.

        @param include_encoding: If true, then return a list of
            C{(path_pointer, encoding)} tuples.

        @rtype: C{list} of L{PathPointer}, or C{list} of
            C{(L{PathPointer}, encoding)} tuples if C{include_encoding}
            is true.
        """
        if files is None:
            files = self._files
        elif isinstance(files, basestring):
            files = [files]

        paths = [self._root.join(f) for f in files]

        if include_encoding:
            # Materialize the pairs so that a real list is returned even
            # where zip() is lazy.
            return list(zip(paths, [self.encoding(f) for f in files]))
        else:
            return paths

    def open(self, file):
        """
        Return an open stream that can be used to read the given file.
        If the file's encoding is not C{None}, then the stream will
        automatically decode the file's contents into unicode.

        @param file: The file identifier of the file to read.
        """
        encoding = self.encoding(file)
        return self._root.join(file).open(encoding)

    def encoding(self, file):
        """
        Return the unicode encoding for the given corpus file, if known.
        If the encoding is unknown, or if the given file should be
        processed using byte strings (C{str}), then return C{None}.
        """
        if isinstance(self._encoding, dict):
            return self._encoding.get(file)
        else:
            return self._encoding

    def _get_root(self): return self._root
    root = property(_get_root, doc="""
        The directory where this corpus is stored.

        @type: L{PathPointer}""")

    #{ Deprecated since 0.9.1
    @deprecated("Use corpus.files() instead")
    def _get_items(self): return self.files()
    items = property(_get_items)

    @deprecated("Use corpus.abspaths() instead")
    def filenames(self, items=None): return self.abspaths(items)
    #}

######################################################################
#{ Corpora containing categorized items
######################################################################

class CategorizedCorpusReader(object):
    """
    A mixin class used to aid in the implementation of corpus readers
    for categorized corpora.  This class defines the method
    L{categories()}, which returns a list of the categories for the
    corpus or for a specified set of files; and overrides L{files()}
    to take a C{categories} argument, restricting the set of files to
    be returned.

    Subclasses are expected to:

      - Call L{__init__()} to set up the mapping.

      - Override all view methods to accept a C{categories} parameter,
        which can be used *instead* of the C{files} parameter, to
        select which files should be included in the returned view.

    The mixin relies on the class it is combined with to provide
    C{self._files}, C{self.open()} and a base C{files()} method.
    """

    def __init__(self, kwargs):
        """
        Initialize this mapping based on keyword arguments, as
        follows:

          - cat_pattern: A regular expression pattern used to find the
            category for each file identifier.  The pattern will be
            applied to each file identifier, and the first matching
            group will be used as the category label for that file.

          - cat_map: A dictionary, mapping from file identifiers to
            category labels.

          - cat_file: The name of a file that contains the mapping
            from file identifiers to categories.  The argument
            C{cat_delimiter} can be used to specify a delimiter.

        The corresponding argument will be deleted from C{kwargs}.  If
        more than one argument is specified, an exception will be
        raised.

        @type kwargs: C{dict}
        @param kwargs: The keyword-argument dictionary of the calling
            constructor; it is modified in place.
        @raise ValueError: If none, or more than one, of
            C{cat_pattern}, C{cat_map} and C{cat_file} is given.
        """
        # The two mappings are built lazily by _init(), on first use.
        self._f2c = None #: file-to-category mapping
        self._c2f = None #: category-to-file mapping

        self._pattern = None #: regexp specifying the mapping
        self._map = None #: dict specifying the mapping
        self._file = None #: filename of file containing the mapping
        self._delimiter = None #: delimiter for L{self._file}

        if 'cat_pattern' in kwargs:
            self._pattern = kwargs['cat_pattern']
            del kwargs['cat_pattern']
        elif 'cat_map' in kwargs:
            self._map = kwargs['cat_map']
            del kwargs['cat_map']
        elif 'cat_file' in kwargs:
            self._file = kwargs['cat_file']
            del kwargs['cat_file']
            if 'cat_delimiter' in kwargs:
                self._delimiter = kwargs['cat_delimiter']
                del kwargs['cat_delimiter']
        else:
            raise ValueError('Expected keyword argument cat_pattern or '
                             'cat_map or cat_file.')

        # Exactly one argument was consumed above; any that is still
        # present means the caller specified more than one.
        if ('cat_pattern' in kwargs or 'cat_map' in kwargs or
            'cat_file' in kwargs):
            raise ValueError('Specify exactly one of: cat_pattern, '
                             'cat_map, cat_file.')

    def _init(self):
        """
        Build the file-to-category and category-to-file mappings from
        whichever source was given to the constructor.

        @raise ValueError: If the category mapping file mentions a
            file identifier that is not part of this corpus.
        """
        self._f2c = defaultdict(list)
        self._c2f = defaultdict(list)

        if self._pattern is not None:
            for file_id in self._files:
                category = re.match(self._pattern, file_id).group(1)
                self._add(file_id, category)

        elif self._map is not None:
            for (file_id, categories) in self._map.items():
                for category in categories:
                    self._add(file_id, category)

        elif self._file is not None:
            # Build the membership set once instead of rescanning the
            # file list for every line of the mapping file.
            known_files = set(self.files())
            for line in self.open(self._file).readlines():
                line = line.strip()
                if not line:
                    continue # ignore blank lines
                file_id, categories = line.split(self._delimiter, 1)
                if file_id not in known_files:
                    raise ValueError('In category mapping file %s: %s '
                                     'not found' % (self._file, file_id))
                for category in categories.split(self._delimiter):
                    self._add(file_id, category)

    def _add(self, file_id, category):
        """Record that C{file_id} belongs to C{category}."""
        self._f2c[file_id].append(category)
        self._c2f[category].append(file_id)

    def categories(self, files=None):
        """
        Return a list of the categories that are defined for this corpus,
        or for the file(s) if it is given.

        @type files: C{None} or C{str} or C{list}
        @param files: A file identifier or list of file identifiers;
            or C{None}, for all categories in the corpus.
        @rtype: C{list}
        @return: The category labels, sorted and without duplicates.
        """
        if self._f2c is None: self._init()
        if files is None:
            return sorted(self._c2f)
        if isinstance(files, basestring):
            files = [files]
        # Use get() so that asking about an unknown file does not add an
        # empty entry to the mapping; the set removes repeats when
        # several files share a category.
        found = set()
        for file_id in files:
            found.update(self._f2c.get(file_id, ()))
        return sorted(found)

    def files(self, categories=None):
        """
        Return a list of file identifiers for the files that make up
        this corpus, or that make up the given category(s) if specified.

        @type categories: C{None} or C{str} or C{list}
        @param categories: A category label or list of category labels;
            or C{None}, for all files in the corpus.
        @return: With C{categories=None}, whatever the base class's
            C{files()} returns; otherwise a sorted C{list} of file
            identifiers without duplicates.
        """
        if categories is None:
            return super(CategorizedCorpusReader, self).files()
        if self._f2c is None: self._init()
        if isinstance(categories, basestring):
            categories = [categories]
        # Use get() so that asking about an unknown category does not
        # register it as a category of the corpus; the set removes
        # repeats when a file belongs to several selected categories.
        found = set()
        for category in categories:
            found.update(self._c2f.get(category, ()))
        return sorted(found)

331

Source Code for Module nltk.corpus.reader.api