Package nltk :: Package stem :: Module lancaster
[hide private]
[frames] | [no frames]

Source Code for Module nltk.stem.lancaster

  1  # Natural Language Toolkit: Stemmers
 
  2  #
 
  3  # Copyright (C) 2001-2008 NLTK Project
 
  4  # Author: Steven Tomcavage <[email protected]>
 
  5  # URL: <http://nltk.org>
 
  6  # For license information, see LICENSE.TXT
 
  7  
 
  8  """
 
  9  A word stemmer based on the Lancaster stemming algorithm.
 
 10  Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.
 
 11  """ 
 12  
 
 13  import re 
 14  from api import * 
 15  
 
class LancasterStemmer(StemmerI):
    """Word stemmer implementing the Lancaster (Paice/Husk) algorithm.

    Each rule is a string of the form::

        <reversed ending>[*]<n>[<append>][> or .]

    - ``<reversed ending>``: the word ending the rule matches, spelled
      backwards (one or more lower-case letters).
    - ``*``: optional flag; the rule only fires if the word is still intact,
      i.e. equal to the (lower-cased) word originally passed to ``stem``.
    - ``<n>``: a single digit, the number of trailing characters to remove.
    - ``<append>``: optional lower-case letters appended after removal.
    - ``>`` means stemming continues with the new word; ``.`` means
      stemming stops once this rule has been applied.

    Rules are grouped by their first character (the last letter of the
    ending they match) and are tried in the order they were supplied.
    """

    # Single compiled pattern used both to validate rules and to split them
    # into (ending, intact flag, removal count, append string, continuation).
    # A raw string keeps "\*", "\d" and "\." from being read as (invalid)
    # string escapes.
    _rule_pattern = re.compile(r"^([a-z]+)(\*?)(\d)([a-z]*)([>\.]?)$")

    # The rule list is static since it doesn't change between instances
    rule_tuple = (
        "ai*2.",     # -ia > -     if intact
        "a*1.",      # -a > -      if intact
        "bb1.",      # -bb > -b
        "city3s.",   # -ytic > -ys
        "ci2>",      # -ic > -
        "cn1t>",     # -nc > -nt
        "dd1.",      # -dd > -d
        "dei3y>",    # -ied > -y
        "deec2ss.",  # -ceed > -cess
        "dee1.",     # -eed > -ee
        "de2>",      # -ed > -
        "dooh4>",    # -hood > -
        "e1>",       # -e > -
        "feil1v.",   # -lief > -liev
        "fi2>",      # -if > -
        "gni3>",     # -ing > -
        "gai3y.",    # -iag > -y
        "ga2>",      # -ag > -
        "gg1.",      # -gg > -g
        "ht*2.",     # -th > -     if intact
        "hsiug5ct.", # -guish > -ct
        "hsi3>",     # -ish > -
        "i*1.",      # -i > -      if intact
        "i1y>",      # -i > -y
        "ji1d.",     # -ij > -id   --  see nois4j> & vis3j>
        "juf1s.",    # -fuj > -fus
        "ju1d.",     # -uj > -ud
        "jo1d.",     # -oj > -od
        "jeh1r.",    # -hej > -her
        "jrev1t.",   # -verj > -vert
        "jsim2t.",   # -misj > -mit
        "jn1d.",     # -nj > -nd
        "j1s.",      # -j > -s
        "lbaifi6.",  # -ifiabl > -
        "lbai4y.",   # -iabl > -y
        "lba3>",     # -abl > -
        "lbi3.",     # -ibl > -
        "lib2l>",    # -bil > -bl
        "lc1.",      # -cl > c
        "lufi4y.",   # -iful > -y
        "luf3>",     # -ful > -
        "lu2.",      # -ul > -
        "lai3>",     # -ial > -
        "lau3>",     # -ual > -
        "la2>",      # -al > -
        "ll1.",      # -ll > -l
        "mui3.",     # -ium > -
        "mu*2.",     # -um > -     if intact
        "msi3>",     # -ism > -
        "mm1.",      # -mm > -m
        "nois4j>",   # -sion > -j
        "noix4ct.",  # -xion > -ct
        "noi3>",     # -ion > -
        "nai3>",     # -ian > -
        "na2>",      # -an > -
        "nee0.",     # protect  -een
        "ne2>",      # -en > -
        "nn1.",      # -nn > -n
        "pihs4>",    # -ship > -
        "pp1.",      # -pp > -p
        "re2>",      # -er > -
        "rae0.",     # protect  -ear
        "ra2.",      # -ar > -
        "ro2>",      # -or > -
        "ru2>",      # -ur > -
        "rr1.",      # -rr > -r
        "rt1>",      # -tr > -t
        "rei3y>",    # -ier > -y
        "sei3y>",    # -ies > -y
        "sis2.",     # -sis > -s
        "si2>",      # -is > -
        "ssen4>",    # -ness > -
        "ss0.",      # protect  -ss
        "suo3>",     # -ous > -
        "su*2.",     # -us > -     if intact
        "s*1>",      # -s > -      if intact
        "s0.",       # -s > -s
        "tacilp4y.", # -plicat > -ply
        "ta2>",      # -at > -
        "tnem4>",    # -ment > -
        "tne3>",     # -ent > -
        "tna3>",     # -ant > -
        "tpir2b.",   # -ript > -rib
        "tpro2b.",   # -orpt > -orb
        "tcud1.",    # -duct > -duc
        "tpmus2.",   # -sumpt > -sum
        "tpec2iv.",  # -cept > -ceiv
        "tulo2v.",   # -olut > -olv
        "tsis0.",    # protect  -sist
        "tsi3>",     # -ist > -
        "tt1.",      # -tt > -t
        "uqi3.",     # -iqu > -
        "ugo1.",     # -ogu > -og
        "vis3j>",    # -siv > -j
        "vie0.",     # protect  -eiv
        "vi2>",      # -iv > -
        "ylb1>",     # -bly > -bl
        "yli3y>",    # -ily > -y
        "ylp0.",     # protect  -ply
        "yl2>",      # -ly > -
        "ygo1.",     # -ogy > -og
        "yhp1.",     # -phy > -ph
        "ymo1.",     # -omy > -om
        "ypo1.",     # -opy > -op
        "yti3>",     # -ity > -
        "yte3>",     # -ety > -
        "ytl2.",     # -lty > -l
        "yrtsi5.",   # -istry > -
        "yra3>",     # -ary > -
        "yro3>",     # -ory > -
        "yfi3.",     # -ify > -
        "ycn2t>",    # -ncy > -nt
        "yca3>",     # -acy > -
        "zi2>",      # -iz > -
        "zy1s."      # -yz > -ys
    )

    def __init__(self):
        """Create an instance of the Lancaster stemmer.

        The rule dictionary starts out empty; ``stem`` loads the default
        ``rule_tuple`` on first use unless ``parseRules`` was called first.
        """
        # Maps a rule's first character to the list of rule strings
        # beginning with that character, in their original order.
        self.rule_dictionary = {}

    def parseRules(self, rule_tuple):
        """Validate a set of rules and install them as the active rule set.

        Any previously installed rules are discarded first.

        rule_tuple -- iterable of rule strings in the format described in
                      the class docstring.

        Raises ValueError on the first rule that does not match the rule
        format. Rules that preceded it have already been installed when
        the exception is raised.
        """
        # Empty any old rules from the rule set before adding new ones
        self.rule_dictionary = {}

        for rule in rule_tuple:
            if not self._rule_pattern.match(rule):
                raise ValueError("The rule %s is invalid" % rule)
            # Index each rule by its first character, which is the last
            # letter of the word ending it matches.
            self.rule_dictionary.setdefault(rule[0:1], []).append(rule)

    def stem(self, word):
        """Stem a word using the Lancaster stemmer.

        word -- the string to stem; it is lower-cased before stemming.

        Returns the stemmed, lower-cased word.
        """
        # Lower-case the word, since all the rules are lower-cased
        word = word.lower()

        # If the user hasn't supplied any rules, setup the default rules
        if not self.rule_dictionary:
            self.parseRules(LancasterStemmer.rule_tuple)

        # The second argument is the untouched copy that "intact" rules
        # compare against.
        return self.__doStemming(word, word)

    def __doStemming(self, word, intact_word):
        """Repeatedly apply matching rules to word and return the result.

        word        -- the lower-cased word to stem.
        intact_word -- the lower-cased word as originally supplied; rules
                       carrying the intact flag fire only while word still
                       equals this value.
        """
        proceed = True

        while proceed:
            # Find the position of the last letter of the word to be stemmed
            last_letter_position = self.__getLastLetter(word)

            # Only stem the word if it has a last letter and a rule
            # matching that last letter
            if (last_letter_position < 0 or
                    word[last_letter_position] not in self.rule_dictionary):
                break

            rule_was_applied = False

            # Go through each rule that matches the word's final letter
            for rule in self.rule_dictionary[word[last_letter_position]]:
                rule_match = self._rule_pattern.match(rule)
                if not rule_match:
                    continue

                (ending_string,
                 intact_flag,
                 remove_total,
                 append_string,
                 cont_flag) = rule_match.groups()

                # Rule endings are stored reversed; skip the rule unless
                # the word really ends with that ending.
                if not word.endswith(ending_string[::-1]):
                    continue

                # Rules flagged with "*" only apply to an unmodified word
                if intact_flag and word != intact_word:
                    continue

                # Convert the number of chars to remove when stemming
                # from a string to an integer
                remove_total = int(remove_total)

                if not self.__isAcceptable(word, remove_total):
                    continue

                word = self.__applyRule(word, remove_total, append_string)
                rule_was_applied = True
                # A "." terminator ends stemming once this rule has fired
                if cont_flag == '.':
                    proceed = False
                # Restart rule lookup using the new word's last letter
                break

            # If no rules apply, the word doesn't need any more stemming
            if not rule_was_applied:
                proceed = False

        return word

    def __getLastLetter(self, word):
        """Get the zero-based index of the last letter in the leading run
        of alphabetic characters of this string.

        Scanning stops at the first non-alphabetic character, so letters
        after it are ignored. Returns -1 if the string is empty or does not
        start with a letter.
        """
        last_letter = -1
        for position in range(len(word)):
            if not word[position].isalpha():
                break
            last_letter = position
        return last_letter

    def __isAcceptable(self, word, remove_total):
        """Determine if the word is acceptable for stemming.

        word         -- the current, non-empty word.
        remove_total -- number of trailing characters the rule would remove.

        Returns True if the removal is allowed, False otherwise.
        """
        remaining_length = len(word) - remove_total

        # If the word starts with a vowel, it must be at least 2
        # characters long to be stemmed
        if word[0] in "aeiouy":
            return remaining_length >= 2

        # If the word starts with a consonant, it must be at least 3
        # characters long (including one vowel) to be stemmed.  The vowel
        # is looked for in the second and third characters of the word.
        if remaining_length >= 3:
            return word[1] in "aeiouy" or word[2] in "aeiouy"

        return False

    def __applyRule(self, word, remove_total, append_string):
        """Apply the stemming rule to the word.

        Removes remove_total characters from the end of word, then appends
        append_string (which may be empty). Returns the new word.
        """
        # Remove letters from the end of the word
        new_word_length = len(word) - remove_total
        word = word[0:new_word_length]

        # And add new letters to the end of the truncated word
        if append_string:
            word += append_string
        return word

    def __repr__(self):
        """Return a fixed string identifying this stemmer."""
        return '<LancasterStemmer>'
278
def demo():
    """A demonstration of the Lancaster stemmer on samples described in
    Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.

    Prints a two-column table of each sample word and its stem.
    """
    from nltk import stem

    stemmer = stem.LancasterStemmer()

    # Two left-aligned, 20-character-wide columns.  print is called with a
    # single parenthesised string so the output is the same whether print
    # is a statement or a function.
    row_format = "%-20s%-20s"
    print(row_format % ("Original Word", "Stemmed Word"))
    print("*" * 40)

    for word in (
        'maximum',     # Remove "-um" when word is intact
        'presumably',  # Don't remove "-um" when word is not intact
        'multiply',    # No action taken if word ends with "-ply"
        'provision',   # Replace "-sion" with "-j" to trigger "j" set of rules
        'owed',        # Word starting with vowel must contain at least 2 letters
        'ear',         # ditto.
        'saying',      # Words starting with consonant must contain at least 3
        'crying',      #     letters and one of those letters must be a vowel
        'string',      # ditto.
        'meant',       # ditto.
        'cement'):     # ditto.
        stemmed_word = stemmer.stem(word)
        print(row_format % (word, stemmed_word))


# Run the demonstration when this module is executed as a script.
if __name__ == '__main__':
    demo()