nltk.stem.regexp

"""
A stemmer that uses regular expressions to identify morphological
affixes.  Any substrings that match the regular expressions will
be removed.
"""

def __init__(self, regexp, min=0):
    """
    Build a stemmer that strips whatever C{regexp} matches.

    @type regexp: C{string} or C{regexp}
    @param regexp: The regular expression identifying the
        morphological affixes to remove.  An object that already
        exposes a C{pattern} attribute is stored unchanged; anything
        else is passed through C{re.compile} first.
    @type min: int
    @param min: The minimum length of string to stem.
    """
    # Anything carrying a ``pattern`` attribute is taken to be compiled
    # already; everything else is compiled here, once, up front.
    compiled = regexp
    if not hasattr(compiled, 'pattern'):
        compiled = re.compile(compiled)

    self._regexp = compiled
    self._min = min

35

def stem(self, word):
    """
    Strip every match of the stemmer's regular expression from C{word}.

    @type word: C{string}
    @param word: The word to stem.
    @rtype: C{string}
    @return: C{word} unchanged when it is shorter than the minimum
        length given at construction; otherwise C{word} with each
        match of the regular expression replaced by the empty string.
    """
    # Words below the minimum length are passed through untouched.
    too_short = len(word) < self._min
    if too_short:
        return word

    stripped = self._regexp.sub('', word)
    return stripped

41

def __repr__(self):
    """
    Describe the stemmer by the source text of its pattern.

    @rtype: C{string}
    @return: The text C{<RegexpStemmer: P>}, where C{P} is the
        C{repr} of the stored regular expression's C{pattern}.
    """
    pattern_text = self._regexp.pattern
    return '<RegexpStemmer: %r>' % pattern_text

# Demonstration: stem each word of a short sentence with a RegexpStemmer
# and print the stemmer followed by one "word => stem" line per token.
from nltk import tokenize, stem

# Create a simple regular expression based stemmer
stemmer = stem.RegexpStemmer('ing$|s$|e$', min=4)
text = "John was eating icecream"
tokens = text.split()

# Print the results.  Single-argument print(...) calls produce the same
# output whether ``print`` is the Python 2 statement or the Python 3
# function, so this runs unchanged on both.
print(stemmer)
for word in tokens:
    print('%20s => %s' % (word, stemmer.stem(word)))
# Trailing blank line; print('') rather than print() because the latter
# would print "()" under the Python 2 print statement.
print('')

Source Code for Module nltk.stem.regexp