import os
import string
import types

import nltk.data

ANTONYM = 'antonym'
HYPERNYM = 'hypernym'
HYPONYM = 'hyponym'
ATTRIBUTE = 'attribute'
ALSO_SEE = 'also see'
ENTAILMENT = 'entailment'
CAUSE = 'cause'
VERB_GROUP = 'verb group'
MEMBER_MERONYM = 'member meronym'
SUBSTANCE_MERONYM = 'substance meronym'
PART_MERONYM = 'part meronym'
MEMBER_HOLONYM = 'member holonym'
SUBSTANCE_HOLONYM = 'substance holonym'
PART_HOLONYM = 'part holonym'
SIMILAR = 'similar'
PARTICIPLE_OF = 'participle of'
PERTAINYM = 'pertainym'

FRAMES = 'frames'
CLASSIF_CATEGORY = 'domain category'
CLASSIF_USAGE = 'domain usage'
CLASSIF_REGIONAL = 'domain region'
CLASS_CATEGORY = 'class category'
CLASS_USAGE = 'class usage'
CLASS_REGIONAL = 'class region'

INSTANCE_HYPERNYM = 'hypernym (instance)'
INSTANCE_HYPONYM = 'hyponym (instance)'

POINTER_TYPES = (
    ANTONYM,
    HYPERNYM,
    HYPONYM,
    ATTRIBUTE,
    ALSO_SEE,
    ENTAILMENT,
    CAUSE,
    VERB_GROUP,
    MEMBER_MERONYM,
    SUBSTANCE_MERONYM,
    PART_MERONYM,
    MEMBER_HOLONYM,
    SUBSTANCE_HOLONYM,
    PART_HOLONYM,
    SIMILAR,
    PARTICIPLE_OF,
    PERTAINYM,

    FRAMES,
    CLASSIF_CATEGORY,
    CLASSIF_USAGE,
    CLASSIF_REGIONAL,
    CLASS_CATEGORY,
    CLASS_USAGE,
    CLASS_REGIONAL,

    INSTANCE_HYPERNYM,
    INSTANCE_HYPONYM,
)

ATTRIBUTIVE = 'attributive'
PREDICATIVE = 'predicative'
IMMEDIATE_POSTNOMINAL = 'immediate postnominal'
ADJECTIVE_POSITIONS = (ATTRIBUTIVE, PREDICATIVE, IMMEDIATE_POSTNOMINAL, None)

VERB_FRAME_STRINGS = (
    None,
    "Something %s",
    "Somebody %s",
    "It is %sing",
    "Something is %sing PP",
    "Something %s something Adjective/Noun",
    "Something %s Adjective/Noun",
    "Somebody %s Adjective",
    "Somebody %s something",
    "Somebody %s somebody",
    "Something %s somebody",
    "Something %s something",
    "Something %s to somebody",
    "Somebody %s on something",
    "Somebody %s somebody something",
    "Somebody %s something to somebody",
    "Somebody %s something from somebody",
    "Somebody %s somebody with something",
    "Somebody %s somebody of something",
    "Somebody %s something on somebody",
    "Somebody %s somebody PP",
    "Somebody %s something PP",
    "Somebody %s PP",
    "Somebody's (body part) %s",
    "Somebody %s somebody to INFINITIVE",
    "Somebody %s somebody INFINITIVE",
    "Somebody %s that CLAUSE",
    "Somebody %s to somebody",
    "Somebody %s to INFINITIVE",
    "Somebody %s whether INFINITIVE",
    "Somebody %s somebody into V-ing something",
    "Somebody %s something with something",
    "Somebody %s INFINITIVE",
    "Somebody %s VERB-ing",
    "It %s that CLAUSE",
    "Something %s INFINITIVE")
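
# Each frame string is a template with a single %s slot for the verb lemma,
# so a frame can be rendered with ordinary string formatting.  Illustrative
# example (the lemma 'gives' is arbitrary):
#
#     VERB_FRAME_STRINGS[8] % 'gives'   # -> 'Somebody gives something'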


NOUN = 'noun'
VERB = 'verb'
ADJECTIVE = 'adj'
ADVERB = 'adv'

pos_abbrs = {NOUN: 'n.', VERB: 'v.', ADJECTIVE: 'adj.', ADVERB: 'adv.'}

_POSNormalizationTable = {}

for pos, abbreviations in (
    (NOUN, "noun n n."),
    (VERB, "verb v v."),
    (ADJECTIVE, "adjective adj adj. a s"),
    (ADVERB, "adverb adv adv. r")):
    tokens = abbreviations.split()

    for token in tokens:
        _POSNormalizationTable[token] = pos
        _POSNormalizationTable[token.upper()] = pos
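
# Note that both the lowercase and the uppercase form of every abbreviation
# are registered, so lookups such as 'n', 'N', 'noun' and 'NOUN' all resolve
# to the same canonical constant (NOUN in this example).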


def normalizePOS(pos):
    """
    Return the standard form of the supplied part of speech.

    @type pos: C{string}
    @param pos: A (non-standard) part of speech string.
    @return: A standard form part of speech string.
    """
    try:
        norm = _POSNormalizationTable[pos]
    except KeyError:
        raise TypeError, `pos` + " is not a part of speech type"
    return norm
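
# Example usage; the results follow directly from the table built above:
#
#     normalizePOS('adj')     # -> 'adj'
#     normalizePOS('ADVERB')  # -> 'adv'
#     normalizePOS('xyz')     # raises TypeError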


FILE_OPEN_MODE = os.name in ('dos', 'nt') and 'rb' or 'r'
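# The and/or chain above is the pre-ternary conditional idiom: it evaluates
# to 'rb' when os.name reports a DOS/Windows platform and to 'r' everywhere
# else, so the WordNet files are opened in binary mode only where newline
# translation would otherwise interfere with the byte offsets used by the
# binary search below.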


def dataFilePathname(filenameroot):
    """
    @type filenameroot: {string}
    @param filenameroot: base form of the data file's filename.
    @return: the full path to the data file.
    """
    # Minimal body, assuming the WordNet data files are installed as an NLTK
    # corpus; mirrors the nltk.data.find() pattern used for the index files
    # in IndexFile.__init__ and in the __main__ block below.
    return nltk.data.find('corpora/wordnet/data.%s' % filenameroot)


def binarySearchFile(file, key, cache={}, cacheDepth=-1):
    """
    Searches through a sorted file using the binary search algorithm.

    @type file: file
    @param file: the file to be searched through.
    @type key: {string}
    @param key: the identifier we are searching for.
    @return: The line from the file with first word key.
    """
    from stat import ST_SIZE

    key = key + ' '
    keylen = len(key)
    start, end = 0, os.stat(file.name)[ST_SIZE] - 1
    currentDepth = 0

    while start < end:
        lastState = start, end
        middle = (start + end) / 2

        if cache.get(middle):
            offset, line = cache[middle]

        else:
            line = ""
            while True:
                file.seek(max(0, middle - 1))
                if middle > 0:
                    file.readline()
                offset = file.tell()
                line = file.readline()
                if line != "": break
                # Read past the last line; back the midpoint up and retry.
                middle = (start + middle) / 2
                if middle == end - 1:
                    return None
            if currentDepth < cacheDepth:
                cache[middle] = (offset, line)

        if offset > end:
            assert end != middle - 1, "infinite loop"
            end = middle - 1
        elif line[:keylen] == key:
            return line
        elif line > key:
            assert end != middle - 1, "infinite loop"
            end = middle - 1
        elif line < key:
            start = offset + len(line) - 1

        currentDepth += 1
        thisState = start, end

        if lastState == thisState:
            # The search interval did not change, so the key cannot be in
            # the file; bail out rather than loop forever.
            return None

    return None
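
# Minimal usage sketch (assumes the WordNet corpus is installed for
# nltk.data, as the rest of this module already does; the index file is
# sorted on its first space-delimited field, which is what the binary
# search relies on):
#
#     f = open(nltk.data.find('corpora/wordnet/index.noun'), FILE_OPEN_MODE)
#     line = binarySearchFile(f, 'dog', {}, 8)   # index line for 'dog', or None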


class IndexFile(object):
    """
    An IndexFile is an implementation class that presents a
    Sequence and Dictionary interface to a sorted index file.
    """

    def __init__(self, pos, filenameroot):
        """
        @type pos: {string}
        @param pos: The part of speech of this index file e.g. 'noun'
        @type filenameroot: {string}
        @param filenameroot: The base filename of the index file.
        """
        self.pos = pos
        path = nltk.data.find('corpora/wordnet/index.%s' % filenameroot)
        self.file = open(path, FILE_OPEN_MODE)

        # Cache of (offset, line) pairs, keyed by search midpoint, shared
        # with binarySearchFile so repeated lookups reuse earlier reads.
        self.offsetLineCache = {}

        self.rewind()

    def rewind(self):
        """
        Rewind to the beginning of the file. Place the file pointer at the
        beginning of the first line whose first character is not whitespace.
        """
        self.file.seek(0)

        # Skip the license header lines, which begin with whitespace.
        while True:
            offset = self.file.tell()
            line = self.file.readline()
            if (line[0] != ' '):
                self.file.seek(offset)
                break
        self.nextIndex = 0
        self.nextOffset = offset

    def __len__(self):
        if hasattr(self, 'indexCache'):
            return len(self.indexCache)
        self.rewind()
        lines = 0
        while True:
            line = self.file.readline()
            if line == "":
                break
            lines += 1
        return lines

    def __getitem__(self, index):
        if type(index) in types.StringTypes:
            if hasattr(self, 'indexCache'):
                return self.indexCache[index]

            return binarySearchFile(self.file, index, self.offsetLineCache, 8)

        elif type(index) == types.IntType:
            if hasattr(self, 'indexCache'):
                return self.get(self.keys[index])
            if index < self.nextIndex:
                self.rewind()
            while self.nextIndex <= index:
                self.file.seek(self.nextOffset)
                line = self.file.readline()
                if line == "":
                    raise IndexError, "index out of range"
                self.nextIndex += 1
                self.nextOffset = self.file.tell()
            return line

        else: raise TypeError, "%s is not a String or Int" % `index`
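
    # An IndexFile can be indexed either by key (the first word of an index
    # line) or by integer position.  Sketch, with 'idx' as a hypothetical
    # instance:
    #
    #     idx = IndexFile('noun', 'noun')
    #     idx['dog']   # raw index line whose first field is 'dog', or None
    #     idx[0]       # first entry line of the index file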

    def get(self, key, default=None):
        """
        @type key: {string}
        @param key: first word of a line from an index file.
        @param default: Return this if no entry exists for 'key'.
        """
        try:
            return self[key]
        except LookupError:
            return default

    def has_key(self, key):
        """
        @type key: {string}
        @param key: the first word of a line in this index file.
        @return: True/false if this key is a valid index into the file.
        """
        key = key.replace(' ', '_')

        if hasattr(self, 'indexCache'):
            return self.indexCache.has_key(key)

        return self.get(key) != None

    def _buildIndexCacheFile(self):
        import shelve
        import os

        print "Building %s:" % (self.shelfname,),
        tempname = self.shelfname + ".temp"

        try:
            indexCache = shelve.open(tempname)
            self.rewind()
            count = 0

            while True:
                offset, line = self.file.tell(), self.file.readline()
                if not line: break
                key = line[:string.find(line, ' ')]
                if (count % 1000) == 0:
                    print "%s..." % (key,),
                    import sys
                    sys.stdout.flush()
                indexCache[key] = line
                count += 1
            indexCache.close()
            os.rename(tempname, self.shelfname)

        finally:
            try: os.remove(tempname)
            except: pass

        print "done."
        self.indexCache = shelve.open(self.shelfname, 'r')


GET_INDEX_SUBSTITUTIONS = ((' ', '-'), ('-', ' '), ('-', ''), (' ', ''), ('.', ''))

def getIndex(form, pos='noun'):
    """Search for _form_ in the index file corresponding to
    _pos_. getIndex applies to _form_ an algorithm that replaces
    spaces with hyphens, hyphens with spaces, removes hyphens and
    spaces, and removes periods in an attempt to find a form of the
    string that is an exact match for an entry in the index file
    corresponding to _pos_. The dictionary is looked up for each
    transformed string until a match is found or all the different
    strings have been tried. It returns a Word or None."""

    from dictionary import dictionaryFor

    def trySubstitutions(form, substitutions, lookup=True, dictionary=dictionaryFor(pos)):
        if lookup and form in dictionary:
            return dictionary[form]
        elif substitutions:
            (old, new) = substitutions[0]
            substitute = string.replace(form, old, new)
            if substitute and substitute != form and substitute in dictionary:
                return dictionary[substitute]
            return trySubstitutions(form, substitutions[1:], lookup=False) or \
                   (substitute and trySubstitutions(substitute, substitutions[1:]))
    return trySubstitutions(form, GET_INDEX_SUBSTITUTIONS)
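
# Illustrative behaviour: for a form such as 'hot dog', getIndex tries the
# string as given and then spelling variants like 'hot-dog' and 'hotdog',
# returning the first Word the dictionary recognises, or None if every
# variant misses.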

if __name__ == "__main__":
    indexFile = IndexFile("noun", "noun")
    path = nltk.data.find('corpora/wordnet/data.noun')
    dataFile = open(path, FILE_OPEN_MODE)
    loaded = True
    print 'OK'
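
    # A slightly richer smoke test could exercise the helpers defined above;
    # left as comments since it assumes specific entries in the installed
    # corpus:
    #
    #     print indexFile.get('dog') is not None   # True if 'dog' is indexed
    #     print normalizePOS('n')                  # -> 'noun'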