import math
import pickle
import string
import re

from nltk import defaultdict

from util import *
import dictionary
import similarity
from frequency import *
from lexname import Lexname


class Word(object):

    def __init__(self, line):
        """
        Extract a word from a line of a WordNet POS file.

        @type line: C{string}
        @param line: The appropriate line taken from the WordNet index files.
        """
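        # A line of a WordNet index file looks like (abridged; illustrative
        # of WordNet 3.0):
        #   dog n 7 5 @ ~ #m %p + 7 6 02084071 10114209 ...
        # tokens[3] is the pointer-symbol count, so the trailing integer
        # fields (sense count, tagged-sense count, then one offset per
        # synset) start at token int(tokens[3]) + 4.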
        tokens = line.split()
        ints = map(int, tokens[int(tokens[3]) + 4:])

        self.form = tokens[0].replace('_', ' ')
        self.pos = normalizePOS(tokens[1])
        self.taggedSenseCount = ints[1]
        self._synsetOffsets = ints[2:ints[0] + 2]
40 """
41 Get a sequence of the L{synsets}s of this word.
42
43 >>> from nltk.wordnet import *
44 >>> N['dog'].synsets()
45 [{noun: dog, domestic dog, Canis familiaris}, {noun: frump, dog}, {noun: dog}, {noun: cad, bounder, blackguard, dog, hound, heel}, {noun: frank, frankfurter, hotdog, hot dog, dog, wiener, wienerwurst, weenie}, {noun: pawl, detent, click, dog}, {noun: andiron, firedog, dog, dog-iron}]
46
47 @return: A list of this L{Word}'s L{Synset}s
48 """
49
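        # The synsets are built lazily on first access and cached; once
        # built, the raw offsets are no longer needed.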
        try:
            return self._synsets
        except AttributeError:
            self._synsets = [dictionary.synset(self.pos, offset)
                             for offset in self._synsetOffsets]
            del self._synsetOffsets
            return self._synsets

    def senses(self):
        """
        Return a list of L{WordSense} objects corresponding to this word's L{Synset}s.
        """
        return [s.wordSense(self.form) for s in self.synsets()]

    def senseCounts(self):
        """
        Return the frequencies of each sense of this word in a tagged concordance.
        """
        return [s.count() for s in self.senses()]

    def isTagged(self):
        """
        >>> from nltk.wordnet import *
        >>> N['dog'].isTagged()
        True

        @return: True if at least one of this L{Word}'s senses is tagged.
        """
        return self.taggedSenseCount > 0

    def __getitem__(self, idx):
        return self.synsets()[idx]

    def __iter__(self):
        return iter(self.synsets())

    def __contains__(self, item):
        return item in self.synsets()

    def __len__(self):
        return len(self.synsets())

    def __str__(self):
        return self.form + ' (' + self.pos + ')'

    def __hash__(self):
        return hash((self.form, self.pos))


class WordSense(object):
    """
    A single word-sense pairing, indicated in WordNet by a sense key of
    the form::

        lemma%ss_type:lex_filenum:lex_id:head_word:head_id
    """
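    # A sense key such as 'dog%1:05:00::' (illustrative) parses to
    # lemma='dog', ssType='n' (1), lexFilenum=5, lexId=0 and no head word.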

    _ssTypeMap = {'n': 1, 'v': 2, 'a': 3, 'r': 4, 's': 5}
    _ssTypeRevMap = dict((v, k) for k, v in _ssTypeMap.iteritems())

    def __init__(self, senseKey):
        self.senseKey = senseKey
        self.lemma, remainder = senseKey.split('%', 1)
        (ssType, lexFilenum, lexId,
         self.headWord, headId) = remainder.split(':')

        self.ssType = self._ssTypeRevMap[int(ssType)]
        self.lexFilenum = int(lexFilenum)
        self.lexId = int(lexId)
        try:
            self.headId = int(headId)
        except ValueError:
            # head_id is empty except for satellite adjectives
            self.headId = None

    def __str__(self):
        return self.senseKey

    def __hash__(self):
        return hash(self.senseKey)

    __repr__ = __str__

    @staticmethod
    def fromSynset(synset, form, lexId):
        # Build the sense key for `form` within `synset`; satellite
        # adjectives ('s') take their head word and head id from the
        # head synset.
        ssType = WordSense._ssTypeMap[synset.ssType]
        lexFilenum = Lexname.lexnames.index(synset.lexname)
        if synset.ssType == 's':
            head = synset.headSynset.wordSenses[0]
            return WordSense.fromKeyParams(form.lower(), ssType, lexFilenum,
                                           lexId, head.lemma, head.lexId)
        return WordSense.fromKeyParams(form.lower(), ssType, lexFilenum, lexId)

    @staticmethod
    def fromKeyParams(lemma, ss_type, lex_filenum, lex_id,
                      head_word='', head_id=''):
        if head_word:
            head_id = '%02d' % head_id
        return WordSense('%s%%%d:%02d:%02d:%s:%s'
                         % (lemma, ss_type, lex_filenum, lex_id, head_word, head_id))

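    # Illustrative round trip (assuming lex_filenum 5 and lex_id 0):
    #   WordSense.fromKeyParams('dog', 1, 5, 0).senseKey == 'dog%1:05:00::'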

class Synset(object):
    """
    A set of synonyms.

    Each synset contains one or more Senses, which represent a
    specific sense of a specific word. Senses can be retrieved via
    synset.senses() or through the index notations synset[0],
    synset[string], or synset[word]. Synsets participate in
    lexical relations, which can be accessed via synset.relations().

    >>> from nltk.wordnet import *
    >>> N['dog'][0]
    {noun: dog, domestic_dog, Canis_familiaris}
    >>> N['dog'][0][HYPERNYM]
    [{noun: canine, canid}, {noun: domestic_animal, domesticated_animal}]
    >>> V['think'][0].verbFrameStrings
    ['Something think something Adjective/Noun', 'Somebody think somebody']

    @type pos: C{string}
    @ivar pos: The part of speech -- one of NOUN, VERB, ADJECTIVE, ADVERB.

    @type offset: C{int}
    @ivar offset: An integer offset into the part-of-speech file. Together
        with pos, this can be used as a unique id.

    @type gloss: C{string}
    @ivar gloss: A gloss (dictionary definition) for the sense.

    @type verbFrames: C{list} of C{integer}
    @ivar verbFrames: A sequence of integers that index into
        VERB_FRAME_STRINGS. These list the verb frames that any
        Sense in this synset participates in. (See also
        Sense.verbFrames.) Defined only for verbs.
    """

250 """Initialize the synset from a line in a WordNet lexicographer file."""
251
252
253 self.pos = pos
254
255
256
257 self.offset = offset
258
259
260
261
262
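        # A line of a WordNet data file looks like (abridged; illustrative
        # of WordNet 3.0):
        #   02084071 05 n 03 dog 0 domestic_dog 0 Canis_familiaris 0 023 @ 02083346 n 0000 ... | a member of the genus Canis ...
        # i.e. offset, lexicographer file number, synset type, a hex word
        # count with word/lex_id pairs, pointer tuples, optional verb
        # frames, and the gloss after the '|' divider.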
        dividerIndex = line.index('|')
        tokens = line[:dividerIndex].split()
        self.ssType = tokens[2]
        self.gloss = line[dividerIndex + 1:].strip()
        self.lexname = Lexname.lexnames[int(tokens[1])]

        # w_cnt: the number of word/lex_id pairs in this synset, stored
        # as a two-digit hexadecimal field.
        w_cnt = int(tokens[3], 16)
        (senseTuples, remainder1) = _partition(tokens[4:], 2, w_cnt)
        self.words = [form for form, lex_id in senseTuples]

        # p_cnt pointer tuples of (symbol, offset, pos, source/target).
        (self._pointerTuples, remainder2) = _partition(remainder1[1:], 4, int(remainder1[0]))

        if self.ssType == 's':
            # Satellite adjectives record their head synset (reached via
            # the 'similar' pointer) so sense keys can name the head word.
            self.headSynset = self.relation('similar')[0]

        self.wordSenses = [WordSense.fromSynset(self, form, int(lex_id, 16))
                           for form, lex_id in senseTuples]

        if pos == VERB:
            (vfTuples, remainder3) = _partition(remainder2[1:], 3, int(remainder2[0]))

            # A frame whose word number is 0 applies to every word in the
            # synset; otherwise it applies only to the word at that
            # (1-based) index.
            def extractVerbFrames(index, vfTuples):
                return tuple(int(t[1]) for t in vfTuples
                             if int(t[2], 16) in (0, index))

            senseVerbFrames = []
            for index in range(1, len(self.words) + 1):
                senseVerbFrames.append(extractVerbFrames(index, vfTuples))
            self._senseVerbFrames = senseVerbFrames

            # Frames shared by the whole synset (word number 0 only).
            self.verbFrames = tuple(extractVerbFrames(None, vfTuples))
            self.verbFrameStrings = self.extractVerbFrameStrings(vfTuples)

    def wordSense(self, word):
        """
        Return the L{WordSense} object for the given word in this synset.
        """
        word = word.replace(' ', '_')
        try:
            index = self.words.index(word)
        except ValueError:
            # Some words are stored capitalized (e.g. proper names).
            try:
                index = self.words.index(word.title())
            except ValueError:
                raise ValueError(
                    "Could not find word '%s' for this synset." % word)

        return self.wordSenses[index]

    def extractVerbFrameStrings(self, vfTuples):
        """
        Return a list of verb frame strings for this synset.
        """
        # Frames with word number 0 apply to the synset as a whole.
        frame_indices = [int(t[1]) for t in vfTuples if int(t[2], 16) == 0]
        try:
            verbFrames = [VERB_FRAME_STRINGS[i] for i in frame_indices]
        except IndexError:
            return []

        form = self[0]
        verbFrameStrings = [vf % form for vf in verbFrames]
        return verbFrameStrings

    # Similarity measures: thin wrappers that assume the companion
    # `similarity` module exposes functions with these names.
    def path_similarity(self, other, verbose=False):
        return similarity.path_similarity(self, other, verbose)

    def lch_similarity(self, other, verbose=False):
        return similarity.lch_similarity(self, other, verbose)

    def wup_similarity(self, other, verbose=False):
        return similarity.wup_similarity(self, other, verbose)

386 """
387 >>> from nltk.wordnet import *
388 >>> N['dog'][0].isTagged()
389 True
390
391 >>> N['dog'][1].isTagged()
392 False
393
394 @return: True/false (1/0) if one of this L{Word}'s senses is tagged.
395 """
396 return len(filter(Word.isTagged, self.words)) > 0

    def __str__(self):
        """
        Return a human-readable representation.

        >>> from nltk.wordnet import *
        >>> str(N['dog'][0])
        '{noun: dog, domestic dog, Canis familiaris}'
        """
        return '{' + self.pos + ': ' + ', '.join(self.words) + '}'

    __repr__ = __str__

    def __eq__(self, other):
        return _compareInstances(self, other, ('pos', 'offset')) == 0

    def __hash__(self):
        return hash((self.pos, self.offset))

    def __ne__(self, other):
        return not (self == other)

    def __getitem__(self, idx):
        # Integer indices return words; anything else is treated as a
        # relation name, e.g. synset[HYPERNYM].
        try:
            return self.words[idx]
        except TypeError:
            return self.relation(idx)

    def __iter__(self):
        return iter(self.words)

    def __contains__(self, item):
        return item in self.words

439 """
440 >>> from nltk.wordnet import *
441 >>> len(N['dog'][0].synset)
442 3
443 """
444 return len(self.words)
445
447 """
448 @return: The length of the longest hypernym path from this synset to the root.
449 """
450
451 if self[HYPERNYM] == []:
452 return 0
453
454 deepest = 0
455 for hypernym in self[HYPERNYM]:
456 depth = hypernym.max_depth()
457 if depth > deepest:
458 deepest = depth
459 return deepest + 1

    def min_depth(self):
        """
        @return: The length of the shortest hypernym path from this synset to the root.
        """
        if self[HYPERNYM] == []:
            return 0

        shallowest = None
        for hypernym in self[HYPERNYM]:
            depth = hypernym.min_depth()
            if shallowest is None or depth < shallowest:
                shallowest = depth
        return shallowest + 1
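    # For example, a synset whose hypernym chains reach a root in 2 and 5
    # steps has min_depth() == 2 and max_depth() == 5; a synset with no
    # hypernyms has depth 0 on both measures.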

    def closure(self, rel, depth=-1):
        """
        Generate the transitive closure of this synset under the rel
        relationship, breadth-first (the synset itself is excluded).

        >>> dog = N['dog'][0]
        >>> list(dog.closure(HYPERNYM))
        [{noun: canine, canid}, {noun: carnivore}, {noun: placental, placental mammal, eutherian, eutherian mammal}, {noun: mammal, mammalian}, {noun: vertebrate, craniate}, {noun: chordate}, {noun: animal, animate being, beast, brute, creature, fauna}, {noun: organism, being}, {noun: living thing, animate thing}, {noun: object, physical object}, {noun: physical entity}, {noun: entity}]
        """
        from nltk.util import breadth_first
        synset_offsets = []
        for synset in breadth_first(self, lambda s: s[rel], depth):
            if synset.offset != self.offset and synset.offset not in synset_offsets:
                synset_offsets.append(synset.offset)
                yield synset


    def hypernym_paths(self):
        """
        Get the path(s) from this synset to the root, where each path is a
        list of the synset nodes traversed on the way to the root.

        @return: A list of lists, where each list gives the node sequence
            connecting the initial L{Synset} node and a root node.
        """
        paths = []

        hypernyms = self[HYPERNYM]
        if len(hypernyms) == 0:
            paths = [[self]]

        for hypernym in hypernyms:
            for ancestor_list in hypernym.hypernym_paths():
                ancestor_list.append(self)
                paths.append(ancestor_list)
        return paths

    def hypernym_distances(self, distance, verbose=False):
        """
        Get the path(s) from this synset to the root, counting the distance
        of each node from the initial node on the way. A set of
        (synset, distance) tuples is returned.

        @type distance: C{int}
        @param distance: the distance (number of edges) from this hypernym to
            the original hypernym L{Synset} on which this method was called.
        @return: A set of (L{Synset}, int) tuples where each L{Synset} is
            a hypernym of the first L{Synset}.
        """
        distances = set([(self, distance)])

        for hypernym in self[HYPERNYM]:
            distances |= hypernym.hypernym_distances(distance + 1, verbose=False)
        if verbose:
            print "> Hypernym Distances:", self, \
                  ', '.join('%s: %d' % (synset, dist) for synset, dist in distances)
        return distances

    def shortest_path_distance(self, other):
        """
        Return the distance of the shortest path linking the two synsets (if
        one exists). For each synset, all the ancestor nodes and their
        distances are recorded and compared. The ancestor node common to both
        synsets that can be reached with the minimum number of traversals is
        used. If no ancestor nodes are common, -1 is returned. If a node is
        compared with itself 0 is returned.

        @type other: L{Synset}
        @param other: The Synset to which the shortest path will be found.
        @return: The number of edges in the shortest path connecting the two
            nodes, or -1 if no path exists.
        """
        if self == other: return 0

        path_distance = -1

        dist_list1 = self.hypernym_distances(0)
        dist_dict1 = {}

        dist_list2 = other.hypernym_distances(0)
        dist_dict2 = {}

        # Transform each distance list into a dictionary. Where a node
        # appears more than once (because there are multiple paths to the
        # root), keep the shortest distance.
        for (l, d) in [(dist_list1, dist_dict1), (dist_list2, dist_dict2)]:
            for (key, value) in l:
                if key in d:
                    if value < d[key]:
                        d[key] = value
                else:
                    d[key] = value

        # For each ancestor common to both synsets, find the connecting
        # path length, and keep the shortest.
        for synset1 in dist_dict1.keys():
            for synset2 in dist_dict2.keys():
                if synset1 == synset2:
                    new_distance = dist_dict1[synset1] + dist_dict2[synset2]
                    if path_distance < 0 or new_distance < path_distance:
                        path_distance = new_distance

        return path_distance
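    # For example, two sister synsets sharing an immediate hypernym are at
    # distance 2: one edge up to the common ancestor and one edge down.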

    def tree(self, rel, depth=-1, cut_mark=None):
        """
        >>> dog = N['dog'][0]
        >>> from pprint import pprint
        >>> pprint(dog.tree(HYPERNYM))
        [{noun: dog, domestic dog, Canis familiaris},
         [{noun: canine, canid},
          [{noun: carnivore},
           [{noun: placental, placental mammal, eutherian, eutherian mammal},
            [{noun: mammal, mammalian},
             [{noun: vertebrate, craniate},
              [{noun: chordate},
               [{noun: animal, animate being, beast, brute, creature, fauna},
                [{noun: organism, being},
                 [{noun: living thing, animate thing},
                  [{noun: object, physical object},
                   [{noun: physical entity}, [{noun: entity}]]]]]]]]]]]]
        """
        tree = [self]
        if depth != 0:
            tree += [x.tree(rel, depth - 1, cut_mark) for x in self[rel]]
        elif cut_mark:
            tree += [cut_mark]
        return tree
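    # With a cutoff, tree(HYPERNYM, depth=1, cut_mark='...') returns
    # [self, [hypernym1, '...'], [hypernym2, '...'], ...] -- one truncated
    # subtree per direct hypernym.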

    def relations(self):
        """
        Return the mapping from relation name (e.g. HYPERNYM) to the list
        of synsets reached from this synset via that relation.
        """
        # Built lazily from the raw pointer tuples (symbol, offset, pos,
        # source/target) parsed out of the data file line.
        if not hasattr(self, '_relations'):
            self._relations = defaultdict(list)
            for (symbol, offset, pos, indices) in self._pointerTuples:
                self._relations[_RELATION_TABLE[symbol]].append(
                    dictionary.synset(normalizePOS(pos), int(offset)))
        return self._relations

    def relation(self, rel):
        return self.relations()[rel]

_RELATION_TABLE = {
    '!': ANTONYM, '@': HYPERNYM, '~': HYPONYM, '=': ATTRIBUTE,
    '^': ALSO_SEE, '*': ENTAILMENT, '>': CAUSE, '$': VERB_GROUP,
    '#m': MEMBER_MERONYM, '#s': SUBSTANCE_MERONYM, '#p': PART_MERONYM,
    '%m': MEMBER_HOLONYM, '%s': SUBSTANCE_HOLONYM, '%p': PART_HOLONYM,
    '&': SIMILAR, '<': PARTICIPLE_OF, '\\': PERTAINYM, '+': FRAMES,
    ';c': CLASSIF_CATEGORY, ';u': CLASSIF_USAGE, ';r': CLASSIF_REGIONAL,
    '-c': CLASS_CATEGORY, '-u': CLASS_USAGE, '-r': CLASS_REGIONAL,
    '@i': INSTANCE_HYPERNYM, '~i': INSTANCE_HYPONYM,
    }
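# For example, a pointer tuple ('@', '02083346', 'n', '0000') parsed from a
# data file line denotes a HYPERNYM link to the noun synset at offset 2083346.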

def _index(key, sequence, testfn=None, keyfn=None):
    """
    Return the index of key within sequence, using testfn for
    comparison and transforming items of sequence by keyfn first.

    >>> _index('e', 'hello')
    1
    >>> _index('E', 'hello', testfn=_equalsIgnoreCase)
    1
    >>> _index('x', 'hello')
    """
    index = 0
    for element in sequence:
        value = element
        if keyfn:
            value = keyfn(value)
        if (not testfn and value == key) or (testfn and testfn(value, key)):
            return index
        index = index + 1
    return None

def _partition(sequence, size, count):
    """
    Partition sequence into C{count} subsequences of
    length C{size}, and a remainder.

    Return C{(partitions, remainder)}, where C{partitions} is a sequence of
    C{count} subsequences of cardinality C{size}, and the concatenation of
    the partitions followed by C{remainder} equals C{sequence}.
    """
    partitions = []
    for index in range(0, size * count, size):
        partitions.append(sequence[index:index + size])
    return (partitions, sequence[size * count:])
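# Illustrative behaviour:
#   _partition([1, 2, 3, 4, 5], 2, 2) == ([[1, 2], [3, 4]], [5])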

def _compareInstances(a, b, fields):
    """
    Return -1, 0, or 1 according to a comparison first by type,
    then by class, and finally by each of fields. Used when comparing two
    WordNet objects (Synsets, Words, or Senses) to each other.
    """
    if not hasattr(b, '__class__'):
        return cmp(type(a), type(b))
    elif a.__class__ != b.__class__:
        return cmp(a.__class__, b.__class__)

    for field in fields:
        diff = cmp(getattr(a, field), getattr(b, field))
        if diff: return diff

    return 0

def _equalsIgnoreCase(a, b):
    """
    Return true iff a and b have the same lowercase representation.

    >>> _equalsIgnoreCase('dog', 'Dog')
    True
    >>> _equalsIgnoreCase('dOg', 'DOG')
    True
    """
    return a == b or a.lower() == b.lower()


def demo():
    from nltk import wordnet
    from pprint import pprint

    dog = wordnet.N['dog']
    cat = wordnet.N['cat']

    print "wordnet.N['dog']"
    print 'dog' in wordnet.N
    print dog
    print dog.pos, dog.form
    print dog.taggedSenseCount
    print dog.synsets()
    print dog.isTagged()

    print "Verb Frames:",
    print wordnet.V['think'][0].verbFrameStrings

    print "Relations:"
    print dog[0].relations()
    print dog[0][wordnet.HYPERNYM]

    print "Glosses:"
    print dog[0].gloss
    print dog[0].relation(wordnet.HYPERNYM)[0].gloss

    print
    print "Paths and Distances:"
    print

    print dog[0].hypernym_paths()
    print dog[0].hypernym_distances(0)
    print dog[0].shortest_path_distance(cat[0])

    print
    print "Closures and Trees:"
    print

    # closure() is a generator, so realize it before pretty-printing.
    pprint(list(wordnet.ADJ['red'][0].closure(wordnet.SIMILAR, depth=1)))
    pprint(list(wordnet.ADJ['red'][0].closure(wordnet.SIMILAR, depth=2)))
    pprint(dog[0].tree(wordnet.HYPERNYM))
    pprint(dog[0].tree(wordnet.HYPERNYM, depth=2, cut_mark='...'))

    entity = wordnet.N["entity"]
    print entity, entity[0]
    print entity[0][wordnet.HYPONYM]
    pprint(entity[0].tree(wordnet.HYPONYM, depth=1), indent=4)
    abstract_entity = wordnet.N["abstract entity"]
    print abstract_entity, abstract_entity[0]
    print abstract_entity[0][wordnet.HYPONYM]
    pprint(abstract_entity[0].tree(wordnet.HYPONYM, depth=1), indent=4)

    print "All the words in the hyponym synsets of dog[0]"
    print [word for synset in dog[0][wordnet.HYPONYM] for word in synset]

    print "Hyponyms of the first (and only) sense of 'animal' that are homophonous with verbs:"
    print [word for synset in wordnet.N['animal'][0].closure(wordnet.HYPONYM) for word in synset if word in wordnet.V]

    print "Senses of 'raise'(v.) and 'lower'(v.) that are antonyms:"
    print filter(lambda p: p[0] in p[1][wordnet.ANTONYM], [(r, l) for r in wordnet.V['raise'] for l in wordnet.V['lower']])

    print
    print "Similarity: dog~cat"
    print

    print "Path Distance Similarity:",
    print dog[0].path_similarity(cat[0])
    print "Leacock Chodorow Similarity:",
    print dog[0].lch_similarity(cat[0])
    print "Wu Palmer Similarity:",
    print dog[0].wup_similarity(cat[0])


if __name__ == '__main__':
    demo()