Package nltk :: Package stem :: Module regexp
[hide private]
[frames | no frames]

Source Code for Module nltk.stem.regexp

 1  # Natural Language Toolkit: Stemmers 
 2  # 
 3  # Copyright (C) 2001-2008 NLTK Project 
 4  # Author: Trevor Cohn <[email protected]> 
 5  #         Edward Loper <[email protected]> 
 6  #         Steven Bird <[email protected]> 
 7  # URL: <http://nltk.org> 
 8  # For license information, see LICENSE.TXT 
 9   
10  import re 
11   
12  from api import * 
13   
class RegexpStemmer(StemmerI):
    """
    A stemmer driven by a regular expression.  Every substring of a
    word that matches the expression is treated as a morphological
    affix and is deleted from the word.
    """

    def __init__(self, regexp, min=0):
        """
        Build a stemmer around C{regexp}.

        @type regexp: C{string} or C{regexp}
        @param regexp: The pattern that identifies the affixes to
            strip.  An object without a C{pattern} attribute is handed
            to C{re.compile}; an object that has one is stored as is.
        @type min: int
        @param min: Words shorter than this many characters are
            returned unchanged by C{stem}.
        """
        # Duck-typed test: anything exposing ``pattern`` is taken to be
        # an already-compiled expression and is not compiled again.
        already_compiled = hasattr(regexp, 'pattern')
        self._regexp = regexp if already_compiled else re.compile(regexp)
        self._min = min

    def stem(self, word):
        """
        Return C{word} with every match of the stemmer's expression
        removed, or C{word} itself when it is shorter than the
        configured minimum length.
        """
        if len(word) >= self._min:
            return self._regexp.sub('', word)
        return word

    def __repr__(self):
        """
        Return a string of the form C{<RegexpStemmer: 'pattern'>}.
        """
        pattern = self._regexp.pattern
        return '<RegexpStemmer: %r>' % pattern
44
def demo():
    """
    Print a small demonstration of C{RegexpStemmer}: build a stemmer
    that strips a trailing C{ing}, C{s} or C{e} from words of at least
    four characters, then show each token of a sample sentence next to
    the stemmer's output for it.
    """
    from nltk import stem

    # Create a simple regular expression based stemmer
    stemmer = stem.RegexpStemmer('ing$|s$|e$', min=4)
    text = "John was eating icecream"
    tokens = text.split()

    # Print the results.  Each call passes exactly one parenthesised
    # argument, so the output is the same whether ``print`` is the
    # Python 2 statement or the Python 3 function.
    print(stemmer)
    for word in tokens:
        print('%20s => %s' % (word, stemmer.stem(word)))
    # ``print('')`` rather than ``print()``: the latter would print
    # "()" under the Python 2 print statement.
    print('')
# Run the demonstration only when this module is executed as a script.
if __name__ == '__main__':
    demo()