nltk.wordnet.dictionary

1 # Natural Language Toolkit: Wordnet Interface: Dictionary classes 2 # 3 # Copyright (C) 2001-2008 NLTK Project 4 # Author: Oliver Steele <[email protected]> 5 # David Ormiston Smith <[email protected]> 6 # Steven Bird <[email protected]> 7 # URL: <http://nltk.org> 8 # For license information, see LICENSE.TXT 9 10 # Dictionary classes, which allow users to access 11 # Wordnet data via a handy dict notation (see below). 12 13 import types 14 from cache import entityCache 15 16 import nltk.data 17 from nltk.internals import deprecated 18 19 from util import * 20

class Dictionary(object):
    """
    A Dictionary contains all the Words in a given part of speech. Four
    dictionaries, bound to N, V, ADJ, and ADV, are bound by default in
    __init__.py.

    Indexing a dictionary by a string retrieves the word named by that
    string, e.g. dict['dog'].  Indexing by an integer n retrieves the
    nth word, e.g. dict[0].  Access by an arbitrary integer is very
    slow except in the special case where the words are accessed
    sequentially; this is to support the use of dictionaries as the
    range of a for statement and as the sequence argument to map and filter.

    >>> N['dog']
    dog(n.)
    """

    def __init__(self, pos, filenameroot):
        """
        @type  pos: C{string}
        @param pos: This L{Dictionary}'s part of speech ('noun', 'verb' etc.)
        @type  filenameroot: C{string}
        @param filenameroot: filename of the relevant Wordnet dictionary file
        """
        self.pos = pos
        self._filenameroot = filenameroot
        # No file is opened here; load() opens the index and data files
        # the first time they are needed.
        self._loaded = False

    def load(self):
        """
        Open the index file and the data file for this dictionary.

        Does nothing if the files have already been opened.  Every public
        accessor calls this first, so callers never need to.
        """
        if not self._loaded:
            self.indexFile = IndexFile(self.pos, self._filenameroot)
            path = nltk.data.find('corpora/wordnet/data.%s' %
                                  self._filenameroot)
            self.dataFile = open(path, FILE_OPEN_MODE)
            self._loaded = True

    def __repr__(self):
        """
        @return: A string of the form
            C{<module.Dictionary instance for pos>}.
        """
        # load() is kept so that repr() has the same side effects as before.
        # The lookup in a freshly created (always empty) dict that used to
        # sit here could never succeed and has been removed.
        self.load()
        return "<%s.%s instance for %s>" % \
            (self.__module__, "Dictionary", self.pos)

    # Deprecated since 0.9.4
    @deprecated("Use Dictionary.word() instead.")
    def getWord(self, form, line=None):
        """
        Deprecated alias for L{Dictionary.word}.
        """
        # Must dispatch to the method: the module-level word() function
        # has the signature (form, pos) and rejects three arguments.
        return self.word(form, line)

    def word(self, form, line=None):
        """
        @type  form: C{string}
        @param form: word string e.g, 'dog'
        @type  line: C{string}
        @param line: appropriate line sourced from the index file (optional)
        @return: The L{Word} object with the supplied form, if present.
        @raise KeyError: if no entry for C{form} can be loaded.
        """
        self.load()
        # Index keys are lower case with underscores in place of spaces.
        key = form.lower().replace(' ', '_')
        pos = self.pos

        def loader(key=key, line=line, indexFile=self.indexFile):
            # Imported here rather than at module level, as in the
            # original code.
            from synset import Word
            line = line or indexFile.get(key)
            return line and Word(line)

        entry = entityCache.get((pos, key), loader)

        if entry:
            return entry
        raise KeyError("%s is not in the %s database" % (repr(form), repr(pos)))

    # Deprecated since 0.9.4
    @deprecated("Use Dictionary.synset() instead.")
    def getSynset(self, offset):
        """
        Deprecated alias for L{Dictionary.synset}.
        """
        # Must dispatch to the method: the module-level synset() function
        # expects (pos, offset), not (dictionary, offset).
        return self.synset(offset)

    def synset(self, offset):
        """
        @type  offset: C{int}
        @param offset: integer offset into a Wordnet file, at which the
            desired L{Synset} can be found.

        @return: The relevant L{Synset}, if present.
        """
        self.load()

        def loader(pos=self.pos, offset=offset, dataFile=self.dataFile):
            from synset import Synset
            dataFile.seek(offset)
            line = dataFile.readline()
            return Synset(pos, offset, line)

        return entityCache.get((self.pos, offset), loader)

    def _buildIndexCacheFile(self):
        """
        Delegate to the index file's own C{_buildIndexCacheFile}.
        """
        self.load()
        self.indexFile._buildIndexCacheFile()

    def __nonzero__(self):
        """
        A Dictionary is always true, even though it defines C{__len__}.

        >>> N and 'true'
        'true'
        """
        self.load()
        return True

    def __len__(self):
        """
        Return the number of index entries.

        >>> len(ADJ)
        21435
        """
        self.load()
        # The length is computed once and then remembered on the instance.
        if not hasattr(self, 'length'):
            self.length = len(self.indexFile)

        return self.length

    def __getslice__(self, a, b):
        """
        Return the list of Words at integer positions a <= i < b.

        @raise NotImplementedError: if both bounds are strings.
        @raise TypeError: for any other combination of bound types.
        """
        self.load()

        if isinstance(a, types.StringTypes) and isinstance(b, types.StringTypes):
            raise NotImplementedError()

        if isinstance(a, int) and isinstance(b, int):
            return [self[i] for i in range(a, b)]

        raise TypeError

    def __getitem__(self, index):
        """
        If index is a String, return the Word whose form is
        index.  If index is an integer n, return the Word
        indexed by the n'th Word in the Index file.

        >>> N['dog']
        dog(n.)
        >>> N[0]
        'hood(n.)

        @raise TypeError: if index is neither a string nor an integer.
        """
        self.load()
        if isinstance(index, types.StringTypes):
            return self.word(index)

        if isinstance(index, (types.IntType, types.LongType)):
            line = self.indexFile[index]
            # The form is the text before the first space of the index
            # line, with underscores turned back into spaces.
            form = line[:line.find(' ')].replace('_', ' ')
            return self.word(form, line)

        raise TypeError("%s is not a String or Int" % repr(index))

    def __iter__(self):
        """
        Iterate over the keys of this dictionary.
        """
        self.load()
        return iter(self.keys())

    def __contains__(self, item):
        """
        Support C{item in dictionary}; same result as L{has_key}.
        """
        self.load()
        return self.has_key(item)

    def get(self, key, default=None):
        """
        Return the Word whose form is key, or default.

        >>> N.get('dog')
        dog(n.)

        @type  key: C{string}
        @param key: the string form of a L{Word} e.g. 'dog'
        @type  default: L{Word}
        @param default: An optional L{Word} to return if no entry can be found
            with the supplied key.
        @return: The L{Word} whose form is given by 'key'
        """
        self.load()
        try:
            return self[key]

        except LookupError:
            return default

    def keys(self):
        """
        @return: A sorted list of strings that index words in this
            dictionary.
        """
        self.load()
        return self.indexFile.keys()

    def has_key(self, form):
        """
        Checks if the supplied argument is an index into this dictionary.

        >>> N.has_key('dog')
        1
        >>> N.has_key('inu')
        0

        @type  form: C{string}
        @param form: a word string e.g. 'dog'
        @return: true iff the argument indexes a word in this dictionary.
        """
        self.load()
        return self.indexFile.has_key(form)

    # Testing

    def _testKeys(self):
        """
        Verify that index lookup can find each word in the index file.

        Progress is written to standard output: the key of every 1000th
        entry, then "done.".
        """
        import sys

        self.load()
        sys.stdout.write("Testing:  %s\n" % (self,))
        counter = 0

        # Uses the same open mode as load().
        index = open(self.indexFile.file.name, FILE_OPEN_MODE)
        try:
            for line in index:
                # Lines that start with a space are skipped.
                if line[0] == ' ':
                    continue

                key = line[:line.find(' ')].replace('_', ' ')

                if (counter % 1000) == 0:
                    sys.stdout.write("%s... " % (key,))
                    sys.stdout.flush()

                counter = counter + 1
                # Raises if the key cannot be looked up.
                self[key]
        finally:
            # Close the handle even when a lookup fails.
            index.close()

        sys.stdout.write("done.\n")

# Dictionaries

# One Dictionary per part of speech.  The part-of-speech constant is passed
# both as the `pos` and as the `filenameroot` argument.
N = Dictionary(NOUN, NOUN)
V = Dictionary(VERB, VERB)
ADJ = Dictionary(ADJECTIVE, ADJECTIVE)
ADV = Dictionary(ADVERB, ADVERB)

# Registry used by dictionaryFor() to map a part of speech to its Dictionary.
Dictionaries = dict([
    (NOUN, N),
    (VERB, V),
    (ADJECTIVE, ADJ),
    (ADVERB, ADV),
])

def dictionaryFor(pos):
    """
    Return the dictionary for the supplied part of speech.

    @type  pos: C{string}
    @param pos: The part of speech of the desired dictionary.  It is passed
        through C{normalizePOS} before the lookup.

    @return: The desired dictionary.
    @raise RuntimeError: if no dictionary is registered for the normalized
        part of speech.
    """
    normalized = normalizePOS(pos)
    try:
        return Dictionaries[normalized]
    except KeyError:
        raise RuntimeError("The " + repr(normalized) + " dictionary has not been created")

# Lookup functions

# Deprecated since 0.9.4
@deprecated("Use dictionary.word() instead.")
def getWord(form, pos=NOUN):
    """
    Deprecated alias for L{word}; takes the same arguments and returns
    the same result.
    """
    result = word(form, pos)
    return result

294

def word(form, pos=NOUN):
    """
    Look up a word by lexical form in the dictionary for a part of speech.

    @type  form: C{string}
    @param form: the sought-after word string e.g. 'dog'

    @type  pos: C{string}
    @param pos: the desired part of speech. Defaults to 'noun'.

    @return: the L{Word} object corresponding to form and pos, if it exists.
    """
    dictionary = dictionaryFor(pos)
    return dictionary.word(form)

# Deprecated since 0.9.4
@deprecated("Use dictionary.sense() instead.")
def getSense(form, pos=NOUN, senseno=0):
    """
    Deprecated alias for L{sense}; takes the same arguments and returns
    the same result.
    """
    result = sense(form, pos, senseno)
    return result

313

def sense(form, pos=NOUN, senseno=0):
    """
    Lookup a sense by its sense number. Used by repr(sense).

    The word is looked up first and then indexed by C{senseno}.

    @type  form: C{string}
    @param form: the sought-after word string e.g. 'dog'
    @type  pos: C{string}
    @param pos: the desired part of speech. Defaults to 'noun'.
    @type  senseno: C{int}
    @param senseno: the id of the desired word sense. Defaults to 0.
    @return: the L{Synset} object corresponding to form, pos and senseno,
        if it exists.
    """
    entry = word(form, pos)
    return entry[senseno]

# Deprecated since 0.9.4
@deprecated("Use dictionary.synset() instead.")
def getSynset(pos, offset):
    """
    Deprecated alias for L{synset}; takes the same arguments and returns
    the same result.
    """
    result = synset(pos, offset)
    return result

# shadows module
def synset(pos, offset):
    """
    Lookup a synset by its offset.

    @type  pos: C{string}
    @param pos: the desired part of speech.
    @type  offset: C{int}
    @param offset: the offset into the relevant Wordnet dictionary file.
    @return: the L{Synset} object extracted from the Wordnet dictionary file.
    """
    dictionary = dictionaryFor(pos)
    return dictionary.synset(offset)

345

Source Code for Module nltk.wordnet.dictionary