Package nltk :: Package stem :: Module porter
[hide private]
[frames] | [no frames]

Source Code for Module nltk.stem.porter

  1  #!/usr/bin/env python 
  2  #  
  3  # Copyright (c) 2002 Vivake Gupta (vivakeATomniscia.org).  All rights reserved. 
  4  #  
  5  # This program is free software; you can redistribute it and/or 
  6  # modify it under the terms of the GNU General Public License as 
  7  # published by the Free Software Foundation; either version 2 of the 
  8  # License, or (at your option) any later version. 
  9  # 
 10  # This program is distributed in the hope that it will be useful, 
 11  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 12  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 13  # GNU General Public License for more details. 
 14  # 
 15  # You should have received a copy of the GNU General Public License 
 16  # along with this program; if not, write to the Free Software 
 17  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 
 18  # USA 
 19  # 
 20  # This software is maintained by Vivake (vivakeATomniscia.org) and is available at: 
 21  #     http://www.omniscia.org/~vivake/python/PorterStemmer.py 
 22  # 
 23  # Additional modifications were made to incorporate this module into 
 24  # nltk.  All such modifications are marked with "--NLTK--".  The nltk 
 25  # version of this module is maintained by the NLTK development staff, 
 26  # and is available from the NLTK webpage: 
 27  #     <http://nltk.sourceforge.net> 
 28   
 29  """Porter Stemming Algorithm 
 30   
 31  This is the Porter stemming algorithm, ported to Python from the 
 32  version coded up in ANSI C by the author. It follows the algorithm 
 33  presented in 
 34   
 35  Porter, M. "An algorithm for suffix stripping." Program 14.3 (1980): 130-137. 
 36   
 37  only differing from it at the points marked --DEPARTURE-- and --NEW-- 
 38  below. 
 39   
 40  For a more faithful version of the Porter algorithm, see 
 41   
 42      http://www.tartarus.org/~martin/PorterStemmer/ 
 43   
 44  Later additions: 
 45   
 46     June 2000 
 47      
 48     The 'l' of the 'logi' -> 'log' rule is put with the stem, so that 
 49     short stems like 'geo' 'theo' etc work like 'archaeo' 'philo' etc. 
 50   
 51     This follows a suggestion of Barry Wilkins, research student at 
 52     Birmingham. 
 53   
 54   
 55     February 2000 
 56   
 57     the cvc test for not dropping final -e now looks after vc at the 
 58     beginning of a word, so are, eve, ice, ore, use keep final -e. In this 
 59     test c is any consonant, including w, x and y. This extension was 
 60     suggested by Chris Emerson. 
 61   
 62     -fully    -> -ful   treated like  -fulness -> -ful, and 
 63     -tionally -> -tion  treated like  -tional  -> -tion 
 64   
 65     both in Step 2. These were suggested by Hiranmay Ghosh, of New Delhi. 
 66   
 67     Invariants proceed, succeed, exceed. Also suggested by Hiranmay Ghosh. 
 68   
 69  Additional modifications were made to incorporate this module into 
 70  nltk.  All such modifications are marked with \"--NLTK--\".  The nltk 
 71  version of this module is maintained by the NLTK developers, and is 
 72  available from <http://nltk.sourceforge.net> 
 73  """ 
 74   
 75  ## --NLTK-- 
 76  ## Declare this module's documentation format. 
 77  __docformat__ = 'plaintext' 
 78   
 79  import sys 
 80  import re 
 81   
 82  ## --NLTK-- 
 83  ## Import the nltk.stemmer module, which defines the stemmer interface 
 84  from api import * 
 85   
class PorterStemmer(StemmerI):

    ## --NLTK--
    ## Add a module docstring
    """
    A word stemmer based on the Porter stemming algorithm.

    Porter, M. "An algorithm for suffix stripping."
    Program 14.3 (1980): 130-137.

    A few minor modifications have been made to Porter's basic
    algorithm.  See the source code of this module for more
    information.

    The Porter Stemmer requires that all tokens have string types.
    """

    def __init__(self):
        """The main part of the stemming algorithm starts here.
        b is a buffer holding a word to be stemmed. The letters are in b[k0],
        b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is
        readjusted downwards as the stemming progresses. Zero termination is
        not in fact used in the algorithm.

        Note that only lower case sequences are stemmed. Forcing to lower case
        should be done before stem(...) is called.
        """

        self.b = ""   # buffer for word to be stemmed
        self.k = 0    # offset of the last letter of the current stem in b
        self.k0 = 0   # offset of the first letter of the current stem in b
        self.j = 0    # j is a general offset into the string

        ## --NEW--
        ## This is a table of irregular forms. It is quite short, but still
        ## reflects the errors actually drawn to Martin Porter's attention over
        ## a 20 year period!
        ##
        ## Extend it as necessary.
        ##
        ## The form of the table is:
        ##  {
        ##  "p1" : ["s11","s12","s13", ... ],
        ##  "p2" : ["s21","s22","s23", ... ],
        ##  ...
        ##  "pn" : ["sn1","sn2","sn3", ... ]
        ##  }
        ##
        ## String sij is mapped to paradigm form pi, and the main stemming
        ## process is then bypassed.

        irregular_forms = {
            "sky":     ["sky", "skies"],
            "die":     ["dying"],
            "lie":     ["lying"],
            "tie":     ["tying"],
            "news":    ["news"],
            "inning":  ["innings", "inning"],
            "outing":  ["outings", "outing"],
            "canning": ["cannings", "canning"],
            "howe":    ["howe"],

            # --NEW--
            "proceed": ["proceed"],
            "exceed":  ["exceed"],
            "succeed": ["succeed"],  # Hiranmay Ghosh
            }

        # Invert the table: self.pool maps each irregular surface form to
        # its paradigm form, so stem_word() can bypass the main algorithm.
        # (Was `for key in irregular_forms.keys()`; iterating the dict
        # directly is the idiomatic equivalent.)
        self.pool = {}
        for key in irregular_forms:
            for val in irregular_forms[key]:
                self.pool[val] = key

    def cons(self, i):
        """cons(i) is TRUE <=> b[i] is a consonant."""
        if self.b[i] == 'a' or self.b[i] == 'e' or self.b[i] == 'i' \
                or self.b[i] == 'o' or self.b[i] == 'u':
            return 0
        if self.b[i] == 'y':
            # 'y' at the start of the stem counts as a consonant; elsewhere
            # it is a consonant exactly when the preceding letter is not.
            if i == self.k0:
                return 1
            else:
                return (not self.cons(i - 1))
        return 1

    def m(self):
        """m() measures the number of consonant sequences between k0 and j.
        if c is a consonant sequence and v a vowel sequence, and <..>
        indicates arbitrary presence,

           <c><v>       gives 0
           <c>vc<v>     gives 1
           <c>vcvc<v>   gives 2
           <c>vcvcvc<v> gives 3
           ....
        """
        n = 0
        i = self.k0
        # Skip an optional leading consonant sequence.
        while 1:
            if i > self.j:
                return n
            if not self.cons(i):
                break
            i = i + 1
        i = i + 1
        # Count each subsequent vowel-sequence / consonant-sequence pair.
        while 1:
            while 1:
                if i > self.j:
                    return n
                if self.cons(i):
                    break
                i = i + 1
            i = i + 1
            n = n + 1
            while 1:
                if i > self.j:
                    return n
                if not self.cons(i):
                    break
                i = i + 1
            i = i + 1

    def vowelinstem(self):
        """vowelinstem() is TRUE <=> k0,...j contains a vowel"""
        for i in range(self.k0, self.j + 1):
            if not self.cons(i):
                return 1
        return 0

    def doublec(self, j):
        """doublec(j) is TRUE <=> j,(j-1) contain a double consonant."""
        if j < (self.k0 + 1):
            return 0
        if (self.b[j] != self.b[j-1]):
            return 0
        return self.cons(j)

    def cvc(self, i):
        """cvc(i) is TRUE <=>

        a) ( --NEW--) i == 1, and p[0] p[1] is vowel consonant, or

        b) p[i - 2], p[i - 1], p[i] has the form consonant -
           vowel - consonant and also if the second c is not w, x or y. this
           is used when trying to restore an e at the end of a short word.
           e.g.

               cav(e), lov(e), hop(e), crim(e), but
               snow, box, tray.
        """
        if i == 0: return 0  # i == 0 never happens perhaps
        if i == 1: return (not self.cons(0) and self.cons(1))
        if not self.cons(i) or self.cons(i-1) or not self.cons(i-2): return 0

        ch = self.b[i]
        if ch == 'w' or ch == 'x' or ch == 'y':
            return 0

        return 1

    def ends(self, s):
        """ends(s) is TRUE <=> k0,...k ends with the string s.

        Side effect: on success, j is set to point just before the suffix.
        """
        length = len(s)
        if s[length - 1] != self.b[self.k]:  # tiny speed-up
            return 0
        if length > (self.k - self.k0 + 1):
            return 0
        if self.b[self.k-length+1:self.k+1] != s:
            return 0
        self.j = self.k - length
        return 1

    def setto(self, s):
        """setto(s) sets (j+1),...k to the characters in the string s,
        readjusting k."""
        length = len(s)
        self.b = self.b[:self.j+1] + s + self.b[self.j+length+1:]
        self.k = self.j + length

    def r(self, s):
        """r(s) replaces the matched suffix with s, but only when the
        preceding stem has at least one consonant sequence (m() > 0)."""
        if self.m() > 0:
            self.setto(s)

    def step1ab(self):
        """step1ab() gets rid of plurals and -ed or -ing. e.g.

           caresses  ->  caress
           ponies    ->  poni
           sties     ->  sti
           tie       ->  tie        (--NEW--: see below)
           caress    ->  caress
           cats      ->  cat

           feed      ->  feed
           agreed    ->  agree
           disabled  ->  disable

           matting   ->  mat
           mating    ->  mate
           meeting   ->  meet
           milling   ->  mill
           messing   ->  mess

           meetings  ->  meet
        """
        if self.b[self.k] == 's':
            if self.ends("sses"):
                self.k = self.k - 2
            elif self.ends("ies"):
                if self.j == 0:
                    self.k = self.k - 1
                # this line extends the original algorithm, so that
                # 'flies'->'fli' but 'dies'->'die' etc
                else:
                    self.k = self.k - 2
            elif self.b[self.k - 1] != 's':
                self.k = self.k - 1

        if self.ends("ied"):
            if self.j == 0:
                self.k = self.k - 1
            else:
                self.k = self.k - 2
            # this line extends the original algorithm, so that
            # 'spied'->'spi' but 'died'->'die' etc

        elif self.ends("eed"):
            if self.m() > 0:
                self.k = self.k - 1
        elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem():
            self.k = self.j
            if self.ends("at"):   self.setto("ate")
            elif self.ends("bl"): self.setto("ble")
            elif self.ends("iz"): self.setto("ize")
            elif self.doublec(self.k):
                self.k = self.k - 1
                ch = self.b[self.k]
                if ch == 'l' or ch == 's' or ch == 'z':
                    self.k = self.k + 1
            elif (self.m() == 1 and self.cvc(self.k)):
                self.setto("e")

    def step1c(self):
        """step1c() turns terminal y to i when there is another vowel in
        the stem.
        --NEW--: This has been modified from the original Porter algorithm
        so that y->i is only done when y is preceded by a consonant, but not
        if the stem is only a single consonant, i.e.

           (*c and not c) Y -> I

        So 'happy' -> 'happi', but
           'enjoy' -> 'enjoy'  etc

        This is a much better rule. Formerly 'enjoy'->'enjoi' and
        'enjoyment'->'enjoy'. Step 1c is perhaps done too soon; but with
        this modification that no longer really matters.

        Also, the removal of the vowelinstem(z) condition means that 'spy',
        'fly', 'try' ... stem to 'spi', 'fli', 'tri' and conflate with
        'spied', 'tried', 'flies' ...
        """
        if self.ends("y") and self.j > 0 and self.cons(self.k - 1):
            self.b = self.b[:self.k] + 'i' + self.b[self.k+1:]

    def step2(self):
        """step2() maps double suffices to single ones.
        so -ization ( = -ize plus -ation) maps to -ize etc. note that the
        string before the suffix must give m() > 0.
        """
        if self.b[self.k - 1] == 'a':
            if self.ends("ational"):   self.r("ate")
            elif self.ends("tional"):  self.r("tion")
        elif self.b[self.k - 1] == 'c':
            if self.ends("enci"):      self.r("ence")
            elif self.ends("anci"):    self.r("ance")
        elif self.b[self.k - 1] == 'e':
            if self.ends("izer"):      self.r("ize")
        elif self.b[self.k - 1] == 'l':
            if self.ends("bli"):       self.r("ble")  # --DEPARTURE--
            # To match the published algorithm, replace this phrase with
            #   if self.ends("abli"):      self.r("able")
            elif self.ends("alli"):
                if self.m() > 0:                      # --NEW--
                    self.setto("al")
                    self.step2()
            elif self.ends("fulli"):   self.r("ful")  # --NEW--
            elif self.ends("entli"):   self.r("ent")
            elif self.ends("eli"):     self.r("e")
            elif self.ends("ousli"):   self.r("ous")
        elif self.b[self.k - 1] == 'o':
            if self.ends("ization"):   self.r("ize")
            elif self.ends("ation"):   self.r("ate")
            elif self.ends("ator"):    self.r("ate")
        elif self.b[self.k - 1] == 's':
            if self.ends("alism"):     self.r("al")
            elif self.ends("iveness"): self.r("ive")
            elif self.ends("fulness"): self.r("ful")
            elif self.ends("ousness"): self.r("ous")
        elif self.b[self.k - 1] == 't':
            if self.ends("aliti"):     self.r("al")
            elif self.ends("iviti"):   self.r("ive")
            elif self.ends("biliti"):  self.r("ble")
        elif self.b[self.k - 1] == 'g':  # --DEPARTURE--
            if self.ends("logi"):
                self.j = self.j + 1      # --NEW-- (Barry Wilkins)
                self.r("og")
        # To match the published algorithm, delete the 'g' phrase above

    def step3(self):
        """step3() deals with -ic-, -full, -ness etc. similar strategy
        to step2."""
        if self.b[self.k] == 'e':
            if self.ends("icate"):   self.r("ic")
            elif self.ends("ative"): self.r("")
            elif self.ends("alize"): self.r("al")
        elif self.b[self.k] == 'i':
            if self.ends("iciti"):   self.r("ic")
        elif self.b[self.k] == 'l':
            if self.ends("ical"):    self.r("ic")
            elif self.ends("ful"):   self.r("")
        elif self.b[self.k] == 's':
            if self.ends("ness"):    self.r("")

    def step4(self):
        """step4() takes off -ant, -ence etc., in context <c>vcvc<v>."""
        if self.b[self.k - 1] == 'a':
            if self.ends("al"): pass
            else: return
        elif self.b[self.k - 1] == 'c':
            if self.ends("ance"): pass
            elif self.ends("ence"): pass
            else: return
        elif self.b[self.k - 1] == 'e':
            if self.ends("er"): pass
            else: return
        elif self.b[self.k - 1] == 'i':
            if self.ends("ic"): pass
            else: return
        elif self.b[self.k - 1] == 'l':
            if self.ends("able"): pass
            elif self.ends("ible"): pass
            else: return
        elif self.b[self.k - 1] == 'n':
            if self.ends("ant"): pass
            elif self.ends("ement"): pass
            elif self.ends("ment"): pass
            elif self.ends("ent"): pass
            else: return
        elif self.b[self.k - 1] == 'o':
            if self.ends("ion") and (self.b[self.j] == 's' or
                                     self.b[self.j] == 't'): pass
            elif self.ends("ou"): pass
            # takes care of -ous
            else: return
        elif self.b[self.k - 1] == 's':
            if self.ends("ism"): pass
            else: return
        elif self.b[self.k - 1] == 't':
            if self.ends("ate"): pass
            elif self.ends("iti"): pass
            else: return
        elif self.b[self.k - 1] == 'u':
            if self.ends("ous"): pass
            else: return
        elif self.b[self.k - 1] == 'v':
            if self.ends("ive"): pass
            else: return
        elif self.b[self.k - 1] == 'z':
            if self.ends("ize"): pass
            else: return
        else:
            return
        if self.m() > 1:
            self.k = self.j

    def step5(self):
        """step5() removes a final -e if m() > 1, and changes -ll to -l if
        m() > 1.
        """
        self.j = self.k
        if self.b[self.k] == 'e':
            a = self.m()
            if a > 1 or (a == 1 and not self.cvc(self.k-1)):
                self.k = self.k - 1
        if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1:
            self.k = self.k - 1

    def stem_word(self, p, i=0, j=None):
        """In stem(p,i,j), p is a char pointer, and the string to be stemmed
        is from p[i] to p[j] inclusive. Typically i is zero and j is the
        offset to the last character of a string, (p[j+1] == '\0'). The
        stemmer adjusts the characters p[i] ... p[j] and returns the new
        end-point of the string, k. Stemming never increases word length, so
        i <= k <= j. To turn the stemmer into a module, declare 'stem' as
        extern, and delete the remainder of this file.
        """
        ## --NLTK--
        ## Don't print results as we go (commented out the next line)
        #print p[i:j+1]
        if j is None:  # was `j == None`; identity test is correct for None
            j = len(p) - 1

        # copy the parameters into statics
        self.b = p
        self.k = j
        self.k0 = i

        # Irregular forms bypass the algorithm entirely.
        # (Was `self.pool.has_key(...)`; has_key() was removed in Python 3,
        # the `in` operator is the portable equivalent.)
        if self.b[self.k0:self.k+1] in self.pool:
            return self.pool[self.b[self.k0:self.k+1]]

        if self.k <= self.k0 + 1:
            return self.b  # --DEPARTURE--

        # With this line, strings of length 1 or 2 don't go through the
        # stemming process, although no mention is made of this in the
        # published algorithm. Remove the line to match the published
        # algorithm.

        self.step1ab()
        self.step1c()
        self.step2()
        self.step3()
        self.step4()
        self.step5()
        return self.b[self.k0:self.k+1]

    def adjust_case(self, word, stem):
        """Copy the original word's casing onto the (lowercase) stem,
        character by character, and return the recased stem."""
        lower = word.lower()

        ret = ""
        # (Was xrange; range works identically here and survives Python 3.)
        for x in range(len(stem)):
            if lower[x] == stem[x]:
                ret += word[x]
            else:
                ret += stem[x]

        return ret

    ## --NLTK--
    ## The original document-level stem(text) procedure has been removed;
    ## we want to work with individual tokens instead.

    ## --NLTK--
    ## Define a stem() method that implements the StemmerI interface.
    def stem(self, word):
        """Stem a single word and return the stem with the original
        word's casing preserved."""
        stem = self.stem_word(word.lower(), 0, len(word) - 1)
        return self.adjust_case(word, stem)

    ## --NLTK--
    ## Add a string representation function
    def __repr__(self):
        return '<PorterStemmer>'
550 551 ## --NLTK-- 552 ## This test procedure isn't applicable. 553 #if __name__ == '__main__': 554 # p = PorterStemmer() 555 # if len(sys.argv) > 1: 556 # for f in sys.argv[1:]: 557 # infile = open(f, 'r') 558 # while 1: 559 # w = infile.readline() 560 # if w == '': 561 # break 562 # w = w[:-1] 563 # print p.stem(w) 564 565 ##--NLTK-- 566 ## Added a demo() function 567
568 -def demo():
569 """ 570 A demonstration of the porter stemmer on a sample from 571 the Penn Treebank corpus. 572 """ 573 574 from nltk.corpus import treebank 575 from nltk import stem 576 577 stemmer = stem.PorterStemmer() 578 579 orig = [] 580 stemmed = [] 581 for item in treebank.files()[:3]: 582 for (word, tag) in treebank.tagged_words(item): 583 orig.append(word) 584 stemmed.append(stemmer.stem(word)) 585 586 # Convert the results to a string, and word-wrap them. 587 results = ' '.join(stemmed) 588 results = re.sub(r"(.{,70})\s", r'\1\n', results+' ').rstrip() 589 590 # Convert the original to a string, and word wrap it. 591 original = ' '.join(orig) 592 original = re.sub(r"(.{,70})\s", r'\1\n', original+' ').rstrip() 593 594 # Print the results. 595 print '-Original-'.center(70).replace(' ', '*').replace('-', ' ') 596 print original 597 print '-Results-'.center(70).replace(' ', '*').replace('-', ' ') 598 print results 599 print '*'*70
600 601 ##--NLTK-- 602 ## Call demo() if we're invoked directly. 603 if __name__ == '__main__': demo() 604