Package nltk :: Package wordnet :: Module stemmer
[hide private]
[frames] | no frames]

Source Code for Module nltk.wordnet.stemmer

  1  # Natural Language Toolkit: Wordnet Stemmer 
  2  # 
  3  # Copyright (C) 2001-2008 NLTK Project 
  4  # Author: Oliver Steele <[email protected]> 
  5  #         Steven Bird <[email protected]> 
  6  #         David Ormiston Smith <[email protected]> 
  7  #         Jussi Salmela <[email protected]> 
  8  # URL: <http://nltk.org> 
  9  # For license information, see LICENSE.TXT 
 10   
 11  from itertools import islice 
 12   
 13  import nltk.data 
 14  from nltk import defaultdict 
 15   
 16  from dictionary import dictionaryFor 
 17  from util import * 
 18   
 19  MORPHOLOGICAL_SUBSTITUTIONS = { 
 20      NOUN: 
 21        [('s', ''),      ('ses', 's'),   ('ves', 'f'), 
 22         ('xes', 'x'),   ('zes', 'z'),   ('ches', 'ch'), 
 23         ('shes', 'sh'), ('men', 'man'), ('ies', 'y')], 
 24      VERB: 
 25        [('s', ''),      ('ies', 'y'),   ('es', 'e'),    ('es', ''), 
 26         ('ed', 'e'),    ('ed', ''),     ('ing', 'e'),   ('ing', '')], 
 27      ADJECTIVE: 
 28        [('er', ''),     ('est', ''),    ('er', 'e'),    ('est', 'e')], 
 29      ADVERB: 
 30        []} 
 31   
def morphy(form, pos=NOUN):
    '''Return the first base form found for a word-form, or None.

    Candidates are produced by _morphy(): base forms recorded for the
    word in the exception list of the given part of speech come first,
    followed by forms obtained by rewriting suffixes (replacing an ending
    ``old`` with ``new``) according to the substitution list for that
    part of speech.  Only the first candidate is returned; None is
    returned when there is no candidate at all.

    >>> morphy('dogs')
    'dog'
    >>> morphy('churches')
    'church'
    >>> morphy('aardwolves')
    'aardwolf'
    >>> morphy('abaci')
    'abacus'
    >>> morphy('hardrock', ADVERB)
    '''

    for base_form in _morphy(form, pos):
        # The first candidate wins; the generator is not advanced further.
        return base_form
    return None
58
def _morphy(form, pos=NOUN):
    '''Generate candidate base forms of ``form`` for part of speech ``pos``.

    This is a generator, so nothing is looked up until it is first
    advanced.  Candidates are produced in this order:

    1. The base forms listed for ``form`` in the WordNet exception file
       ``corpora/wordnet/<pos>.exc``, if ``form`` has an entry there.
    2. ``form`` itself if the dictionary for ``pos`` contains it, then
       every dictionary entry reachable by applying the suffix rules in
       ``MORPHOLOGICAL_SUBSTITUTIONS[pos]``, each rule being used at most
       once along a chain of rewrites.

    For a noun ending in 'ful', that suffix is detached before step 2 and
    re-attached to every candidate step 2 produces.  Duplicates are not
    filtered out.

    ``pos`` is passed through ``normalizePOS`` first.  A KeyError is
    raised if the normalized value is not one of NOUN, VERB, ADJECTIVE or
    ADVERB.
    '''
    pos = normalizePOS(pos)
    # Indexing rejects an unsupported part of speech (KeyError) before any
    # file is opened.
    substitutions = MORPHOLOGICAL_SUBSTITUTIONS[pos]
    dictionary = dictionaryFor(pos)

    def trySubstitutions(form,            # reduced form
                         substitutions):  # remaining substitutions
        if dictionary.has_key(form):
            yield form
        for n, (old, new) in enumerate(substitutions):
            if form.endswith(old):
                new_form = form[:-len(old)] + new
                # Recurse without rule n: no rule is applied twice along
                # one chain, so the recursion always terminates.
                remaining = substitutions[:n] + substitutions[n+1:]
                for f in trySubstitutions(new_form, remaining):
                    yield f

    # The exception file is needed for this single lookup only.  Close it
    # immediately rather than keeping it open for as long as the generator
    # is alive: morphy() abandons the generator after its first item.
    excfile = open(nltk.data.find('corpora/wordnet/%s.exc' % pos))
    try:
        exceptions = binarySearchFile(excfile, form)
    finally:
        excfile.close()

    if exceptions:
        # The matched line starts with the inflected form; the remaining
        # whitespace-separated fields are its base forms.  split() copes
        # with the line terminator, so a final line lacking a newline no
        # longer loses its last character.
        for f in exceptions.split()[1:]:
            yield f

    if pos == NOUN and form.endswith('ful'):
        suffix = 'ful'
        form = form[:-3]
    else:
        suffix = ''
    for f in trySubstitutions(form, substitutions):
        yield f + suffix


# Demo

92 -def p(word):
93 word = word.lower() 94 print '\n====================' 95 print 'Word is', word 96 print '====================' 97 pos_forms = defaultdict(set) 98 # ['noun', 'verb', 'adj', 'adv'] 99 for pos in [NOUN, VERB, ADJECTIVE, ADVERB]: 100 for form in _morphy(word, pos=pos): 101 pos_forms[pos].add(form) 102 for pos in [NOUN, VERB, ADJECTIVE, ADVERB]: 103 if pos in pos_forms: 104 print '%s: ' % pos.capitalize(), 105 for f in pos_forms[pos]: 106 print f, 107 print 108 print '===================='
109
def demo():
    '''Report the base forms of a few sample words, then of each word
    typed at the prompt, stopping at the first empty input line.
    '''
    samples = ['dogs', 'churches', 'aardwolves', 'abaci', 'hardrock']
    for sample in samples:
        p(sample)

    def prompt():
        return raw_input('Enter a word: ')

    # iter(callable, sentinel) keeps prompting until '' is returned.
    for word in iter(prompt, ''):
        p(word)

if __name__ == '__main__':
    # Executed as a script: run the interactive demonstration.
    demo()

# Names exported by a star-import of this module.
__all__ = ['demo', 'morphy', '_morphy']