
Source Code for Module nltk.wordnet.util

# Natural Language Toolkit: Wordnet Utilities
#
# Copyright (C) 2001-2008 NLTK Project
# Author: Oliver Steele <[email protected]>
#         Steven Bird <[email protected]>
#         David Ormiston Smith <[email protected]>
#         Jussi Salmela <[email protected]>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT

import os
import string
import types

import nltk.data

ANTONYM = 'antonym'
HYPERNYM = 'hypernym'
HYPONYM = 'hyponym'
ATTRIBUTE = 'attribute'
ALSO_SEE = 'also see'
ENTAILMENT = 'entailment'
CAUSE = 'cause'
VERB_GROUP = 'verb group'
MEMBER_MERONYM = 'member meronym'
SUBSTANCE_MERONYM = 'substance meronym'
PART_MERONYM = 'part meronym'
MEMBER_HOLONYM = 'member holonym'
SUBSTANCE_HOLONYM = 'substance holonym'
PART_HOLONYM = 'part holonym'
SIMILAR = 'similar'
PARTICIPLE_OF = 'participle of'
PERTAINYM = 'pertainym'
# New in wn 2.0:
FRAMES = 'frames'
CLASSIF_CATEGORY = 'domain category'
CLASSIF_USAGE = 'domain usage'
CLASSIF_REGIONAL = 'domain region'
CLASS_CATEGORY = 'class category'
CLASS_USAGE = 'class usage'
CLASS_REGIONAL = 'class region'
# New in wn 2.1:
INSTANCE_HYPERNYM = 'hypernym (instance)'
INSTANCE_HYPONYM = 'hyponym (instance)'

POINTER_TYPES = (
    ANTONYM,
    HYPERNYM,
    HYPONYM,
    ATTRIBUTE,
    ALSO_SEE,
    ENTAILMENT,
    CAUSE,
    VERB_GROUP,
    MEMBER_MERONYM,
    SUBSTANCE_MERONYM,
    PART_MERONYM,
    MEMBER_HOLONYM,
    SUBSTANCE_HOLONYM,
    PART_HOLONYM,
    SIMILAR,
    PARTICIPLE_OF,
    PERTAINYM,
    # New in wn 2.0:
    FRAMES,
    CLASSIF_CATEGORY,
    CLASSIF_USAGE,
    CLASSIF_REGIONAL,
    CLASS_CATEGORY,
    CLASS_USAGE,
    CLASS_REGIONAL,
    # New in wn 2.1:
    INSTANCE_HYPERNYM,
    INSTANCE_HYPONYM,
    )
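
# Illustrative example: the pointer names above are plain strings, so a
# relation type can be checked with a simple membership test.
#
#     >>> HYPERNYM in POINTER_TYPES
#     True
#     >>> INSTANCE_HYPONYM
#     'hyponym (instance)'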

ATTRIBUTIVE = 'attributive'
PREDICATIVE = 'predicative'
IMMEDIATE_POSTNOMINAL = 'immediate postnominal'
ADJECTIVE_POSITIONS = (ATTRIBUTIVE, PREDICATIVE, IMMEDIATE_POSTNOMINAL, None)

VERB_FRAME_STRINGS = (
    None,
    "Something %s",
    "Somebody %s",
    "It is %sing",
    "Something is %sing PP",
    "Something %s something Adjective/Noun",
    "Something %s Adjective/Noun",
    "Somebody %s Adjective",
    "Somebody %s something",
    "Somebody %s somebody",
    "Something %s somebody",
    "Something %s something",
    "Something %s to somebody",
    "Somebody %s on something",
    "Somebody %s somebody something",
    "Somebody %s something to somebody",
    "Somebody %s something from somebody",
    "Somebody %s somebody with something",
    "Somebody %s somebody of something",
    "Somebody %s something on somebody",
    "Somebody %s somebody PP",
    "Somebody %s something PP",
    "Somebody %s PP",
    "Somebody's (body part) %s",
    "Somebody %s somebody to INFINITIVE",
    "Somebody %s somebody INFINITIVE",
    "Somebody %s that CLAUSE",
    "Somebody %s to somebody",
    "Somebody %s to INFINITIVE",
    "Somebody %s whether INFINITIVE",
    "Somebody %s somebody into V-ing something",
    "Somebody %s something with something",
    "Somebody %s INFINITIVE",
    "Somebody %s VERB-ing",
    "It %s that CLAUSE",
    "Something %s INFINITIVE")
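
# Illustrative example: each frame string holds a single '%s' placeholder,
# so a verb form can be substituted in with the ordinary string-formatting
# operator.  Entry 0 is None so that WordNet's frame numbers, which start
# at 1, can be used directly as indices into this tuple.
#
#     >>> VERB_FRAME_STRINGS[8] % 'sees'
#     'Somebody sees something'
#     >>> VERB_FRAME_STRINGS[1] % 'exists'
#     'Something exists'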

############################################################
# Parts of Speech
############################################################

NOUN = 'noun'
VERB = 'verb'
ADJECTIVE = 'adj'
ADVERB = 'adv'

pos_abbrs = {NOUN: 'n.', VERB: 'v.', ADJECTIVE: 'adj.', ADVERB: 'adv.'}

_POSNormalizationTable = {}

for pos, abbreviations in (
    (NOUN, "noun n n."),
    (VERB, "verb v v."),
    (ADJECTIVE, "adjective adj adj. a s"),
    (ADVERB, "adverb adv adv. r")):
    tokens = abbreviations.split()

    for token in tokens:
        _POSNormalizationTable[token] = pos
        _POSNormalizationTable[token.upper()] = pos

def normalizePOS(pos):
    """
    Return the standard form of the supplied part of speech.

    @type  pos: C{string}
    @param pos: A (non-standard) part of speech string.
    @return: A standard form part of speech string.
    """
    try:
        norm = _POSNormalizationTable[pos]
    except KeyError:
        raise TypeError, `pos` + " is not a part of speech type"
    return norm
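
# Illustrative example: any abbreviation registered in the table above,
# in either case, normalizes to the canonical POS constant.
#
#     >>> normalizePOS('n')
#     'noun'
#     >>> normalizePOS('ADJ.')
#     'adj'
#     >>> normalizePOS('r')
#     'adv'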

############################################################
# File utilities
############################################################

# Work around a Windows Python bug
FILE_OPEN_MODE = os.name in ('dos', 'nt') and 'rb' or 'r'

def dataFilePathname(filenameroot):
    """
    @type  filenameroot: {string}
    @param filenameroot: base form of the data file's filename.
    @return: the full path to the data file.
    """


def binarySearchFile(file, key, cache={}, cacheDepth=-1):
    """
    Searches through a sorted file using the binary search algorithm.

    @type  file: file
    @param file: the file to be searched through.
    @type  key: {string}
    @param key: the identifier we are searching for.
    @return: The line from the file with first word key.
    """
    from stat import ST_SIZE

    key = key + ' '
    keylen = len(key)
    start, end = 0, os.stat(file.name)[ST_SIZE] - 1
    currentDepth = 0

    while start < end:
        lastState = start, end
        middle = (start + end) / 2

        if cache.get(middle):
            offset, line = cache[middle]

        else:
            line = ""
            while True:
                file.seek(max(0, middle - 1))
                if middle > 0:
                    file.readline()
                offset = file.tell()
                line = file.readline()
                if line != "": break
                # at EOF; try to find start of the last line
                middle = (start + middle)/2
                if middle == end - 1:
                    return None
            if currentDepth < cacheDepth:
                cache[middle] = (offset, line)

        if offset > end:
            assert end != middle - 1, "infinite loop"
            end = middle - 1
        elif line[:keylen] == key:
            return line
        elif line > key:
            assert end != middle - 1, "infinite loop"
            end = middle - 1
        elif line < key:
            start = offset + len(line) - 1

        currentDepth += 1
        thisState = start, end

        if lastState == thisState:
            # Detects the condition where we're searching past the end
            # of the file, which is otherwise difficult to detect
            return None

    return None
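
# Illustrative example (the filename 'example.idx' is hypothetical): the
# search works on any file whose lines are sorted and begin with a unique
# key followed by a space, returning the whole line, or None if no line
# starts with the key.
#
#     >>> f = open('example.idx', 'w')
#     >>> for word in sorted(['apple', 'banana', 'cherry']):
#     ...     f.write('%s some data\n' % word)
#     >>> f.close()
#     >>> binarySearchFile(open('example.idx', FILE_OPEN_MODE), 'banana')
#     'banana some data\n'
#     >>> print binarySearchFile(open('example.idx', FILE_OPEN_MODE), 'durian')
#     None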


# Low level IndexFile class and various file utilities,
# to do the lookups in the Wordnet database files.

class IndexFile(object):
    """
    An IndexFile is an implementation class that presents a
    Sequence and Dictionary interface to a sorted index file.
    """

    def __init__(self, pos, filenameroot):
        """
        @type  pos: {string}
        @param pos: The part of speech of this index file e.g. 'noun'
        @type  filenameroot: {string}
        @param filenameroot: The base filename of the index file.
        """
        self.pos = pos
        path = nltk.data.find('corpora/wordnet/index.%s' % filenameroot)
        self.file = open(path, FILE_OPEN_MODE)

        # Table of (pathname, offset) -> (line, nextOffset)
        self.offsetLineCache = {}

        self.rewind()

        # The following code gives errors on import. As far as I can
        # understand, this code checks to see if the required data already
        # exists as a serialised Python object. More investigation required.

        # self.shelfname = os.path.join(get_basedir(), "wordnet", pos + ".pyidx")

        # try:
        #     import shelve
        #     self.indexCache = shelve.open(self.shelfname, 'r')

        # except:
        #     pass

    def rewind(self):
        """
        Rewind to the beginning of the file. Place the file pointer at the
        beginning of the first line whose first character is not whitespace.
        """
        self.file.seek(0)

        while True:
            offset = self.file.tell()
            line = self.file.readline()
            if (line[0] != ' '):
                self.file.seek(offset)
                break
        self.nextIndex = 0
        self.nextOffset = offset

    def __nonzero__(self):
        return 1

    def __len__(self):
        if hasattr(self, 'indexCache'):
            return len(self.indexCache)
        self.rewind()
        lines = 0
        while True:
            line = self.file.readline()
            if line == "":
                break
            lines += 1
        return lines

    def __getitem__(self, index):
        if type(index) in types.StringTypes:
            if hasattr(self, 'indexCache'):
                return self.indexCache[index]

            return binarySearchFile(self.file, index, self.offsetLineCache, 8)

        elif type(index) == types.IntType:
            if hasattr(self, 'indexCache'):
                return self.get(self.keys()[index])
            if index < self.nextIndex:
                self.rewind()
            while self.nextIndex <= index:
                self.file.seek(self.nextOffset)
                line = self.file.readline()
                if line == "":
                    raise IndexError, "index out of range"
                self.nextIndex += 1
                self.nextOffset = self.file.tell()
            return line

        else: raise TypeError, "%s is not a String or Int" % `index`

    def get(self, key, default=None):
        """
        @type  key: {string}
        @param key: first word of a line from an index file.
        @param default: Return this if no entry exists for 'key'.
        """
        try:
            return self[key]
        except LookupError:
            return default

    def keys(self):
        """
        @return: a list of the keys of this index file.
        """

        if hasattr(self, 'indexCache'):
            keys = self.indexCache.keys()
            keys.sort()
            return keys
        else:
            keys = []
            self.rewind()
            while True:
                line = self.file.readline()
                if not line: break
                key = line.split(' ', 1)[0]
                keys.append(key.replace('_', ' '))
            return keys

    def has_key(self, key):
        """
        @type  key: {string}
        @param key: the first word of a line in this index file.
        @return: True/false if this key is a valid index into the file.
        """
        key = key.replace(' ', '_') # test case: V['haze over']

        if hasattr(self, 'indexCache'):
            return self.indexCache.has_key(key)

        return self.get(key) != None

    def _buildIndexCacheFile(self):

        import shelve
        import os

        print "Building %s:" % (self.shelfname,),
        tempname = self.shelfname + ".temp"

        try:
            indexCache = shelve.open(tempname)
            self.rewind()
            count = 0

            while True:
                offset, line = self.file.tell(), self.file.readline()
                if not line: break
                key = line[:string.find(line, ' ')]
                if (count % 1000) == 0:
                    print "%s..." % (key,),
                    import sys
                    sys.stdout.flush()
                indexCache[key] = line
                count += 1
            indexCache.close()
            os.rename(tempname, self.shelfname)

        finally:
            try: os.remove(tempname)
            except: pass

        print "done."
        self.indexCache = shelve.open(self.shelfname, 'r')
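
# Illustrative example (assumes the WordNet corpus is installed where
# nltk.data can find it): an IndexFile behaves like a read-only mapping
# from lemmas to raw index-file lines.
#
#     >>> idx = IndexFile(NOUN, 'noun')
#     >>> idx.has_key('dog')
#     True
#     >>> idx['dog'].split()[0]
#     'dog'
#     >>> idx.get('no_such_lemma') is None
#     True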

#
# utilities
#

GET_INDEX_SUBSTITUTIONS = ((' ', '-'), ('-', ' '), ('-', ''), (' ', ''), ('.', ''))

def getIndex(form, pos=NOUN):
    """Search for _form_ in the index file corresponding to
    _pos_. getIndex applies to _form_ an algorithm that replaces
    spaces with hyphens, hyphens with spaces, removes hyphens and
    spaces, and removes periods in an attempt to find a form of the
    string that is an exact match for an entry in the index file
    corresponding to _pos_. Each transformed string is looked up in
    the dictionary until a match is found or all the candidates have
    been tried. It returns a Word or None."""

    from dictionary import dictionaryFor

    def trySubstitutions(form, substitutions, lookup=True, dictionary=dictionaryFor(pos)):
        if lookup and form in dictionary:
            return dictionary[form]
        elif substitutions:
            (old, new) = substitutions[0]
            substitute = string.replace(form, old, new)
            if substitute and substitute != form and substitute in dictionary:
                return dictionary[substitute]
            return trySubstitutions(form, substitutions[1:], lookup=False) or \
                   (substitute and trySubstitutions(substitute, substitutions[1:]))
    return trySubstitutions(form, GET_INDEX_SUBSTITUTIONS)

if __name__ == "__main__":
    indexFile = IndexFile("noun", "noun")
    path = nltk.data.find('corpora/wordnet/data.noun')
    dataFile = open(path, FILE_OPEN_MODE)
    loaded = True
    print 'OK'
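
# Illustrative example: the substitution pairs are tried in the order
# listed, so getIndex probes spelling variants of a form (spaces, hyphens
# and periods) until one matches an entry in the index file.
#
#     >>> [string.replace('x-ray.', old, new) for (old, new) in GET_INDEX_SUBSTITUTIONS]
#     ['x-ray.', 'x ray.', 'xray.', 'x-ray.', 'x-ray']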