1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29 """Porter Stemming Algorithm
30
31 This is the Porter stemming algorithm, ported to Python from the
32 version coded up in ANSI C by the author. It follows the algorithm
33 presented in
34
35 Porter, M. "An algorithm for suffix stripping." Program 14.3 (1980): 130-137.
36
37 only differing from it at the points maked --DEPARTURE-- and --NEW--
38 below.
39
40 For a more faithful version of the Porter algorithm, see
41
42 http://www.tartarus.org/~martin/PorterStemmer/
43
44 Later additions:
45
46 June 2000
47
48 The 'l' of the 'logi' -> 'log' rule is put with the stem, so that
49 short stems like 'geo' 'theo' etc work like 'archaeo' 'philo' etc.
50
51 This follows a suggestion of Barry Wilkins, reasearch student at
52 Birmingham.
53
54
55 February 2000
56
57 the cvc test for not dropping final -e now looks after vc at the
58 beginning of a word, so are, eve, ice, ore, use keep final -e. In this
59 test c is any consonant, including w, x and y. This extension was
60 suggested by Chris Emerson.
61
62 -fully -> -ful treated like -fulness -> -ful, and
63 -tionally -> -tion treated like -tional -> -tion
64
65 both in Step 2. These were suggested by Hiranmay Ghosh, of New Delhi.
66
67 Invariants proceed, succeed, exceed. Also suggested by Hiranmay Ghosh.
68
69 Additional modifications were made to incorperate this module into
70 nltk. All such modifications are marked with \"--NLTK--\". The nltk
71 version of this module is maintained by the NLTK developers, and is
72 available from <http://nltk.sourceforge.net>
73 """
74
75
76
77 __docformat__ = 'plaintext'
78
79 import sys
80 import re
81
82
83
84 from api import *
85
87
88
89
90 """
91 A word stemmer based on the Porter stemming algorithm.
92
93 Porter, M. \"An algorithm for suffix stripping.\"
94 Program 14.3 (1980): 130-137.
95
96 A few minor modifications have been made to Porter's basic
97 algorithm. See the source code of this module for more
98 information.
99
100 The Porter Stemmer requires that all tokens have string types.
101 """
102
104 """The main part of the stemming algorithm starts here.
105 b is a buffer holding a word to be stemmed. The letters are in b[k0],
106 b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is
107 readjusted downwards as the stemming progresses. Zero termination is
108 not in fact used in the algorithm.
109
110 Note that only lower case sequences are stemmed. Forcing to lower case
111 should be done before stem(...) is called.
112 """
113
114 self.b = ""
115 self.k = 0
116 self.k0 = 0
117 self.j = 0
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137 irregular_forms = {
138 "sky" : ["sky", "skies"],
139 "die" : ["dying"],
140 "lie" : ["lying"],
141 "tie" : ["tying"],
142 "news" : ["news"],
143 "inning" : ["innings", "inning"],
144 "outing" : ["outings", "outing"],
145 "canning" : ["cannings", "canning"],
146 "howe" : ["howe"],
147
148
149 "proceed" : ["proceed"],
150 "exceed" : ["exceed"],
151 "succeed" : ["succeed"],
152 }
153
154 self.pool = {}
155 for key in irregular_forms.keys():
156 for val in irregular_forms[key]:
157 self.pool[val] = key
158
160 """cons(i) is TRUE <=> b[i] is a consonant."""
161 if self.b[i] == 'a' or self.b[i] == 'e' or self.b[i] == 'i' or self.b[i] == 'o' or self.b[i] == 'u':
162 return 0
163 if self.b[i] == 'y':
164 if i == self.k0:
165 return 1
166 else:
167 return (not self.cons(i - 1))
168 return 1
169
171 """m() measures the number of consonant sequences between k0 and j.
172 if c is a consonant sequence and v a vowel sequence, and <..>
173 indicates arbitrary presence,
174
175 <c><v> gives 0
176 <c>vc<v> gives 1
177 <c>vcvc<v> gives 2
178 <c>vcvcvc<v> gives 3
179 ....
180 """
181 n = 0
182 i = self.k0
183 while 1:
184 if i > self.j:
185 return n
186 if not self.cons(i):
187 break
188 i = i + 1
189 i = i + 1
190 while 1:
191 while 1:
192 if i > self.j:
193 return n
194 if self.cons(i):
195 break
196 i = i + 1
197 i = i + 1
198 n = n + 1
199 while 1:
200 if i > self.j:
201 return n
202 if not self.cons(i):
203 break
204 i = i + 1
205 i = i + 1
206
208 """vowelinstem() is TRUE <=> k0,...j contains a vowel"""
209 for i in range(self.k0, self.j + 1):
210 if not self.cons(i):
211 return 1
212 return 0
213
215 """doublec(j) is TRUE <=> j,(j-1) contain a double consonant."""
216 if j < (self.k0 + 1):
217 return 0
218 if (self.b[j] != self.b[j-1]):
219 return 0
220 return self.cons(j)
221
223 """cvc(i) is TRUE <=>
224
225 a) ( --NEW--) i == 1, and p[0] p[1] is vowel consonant, or
226
227 b) p[i - 2], p[i - 1], p[i] has the form consonant -
228 vowel - consonant and also if the second c is not w, x or y. this
229 is used when trying to restore an e at the end of a short word.
230 e.g.
231
232 cav(e), lov(e), hop(e), crim(e), but
233 snow, box, tray.
234 """
235 if i == 0: return 0
236 if i == 1: return (not self.cons(0) and self.cons(1))
237 if not self.cons(i) or self.cons(i-1) or not self.cons(i-2): return 0
238
239 ch = self.b[i]
240 if ch == 'w' or ch == 'x' or ch == 'y':
241 return 0
242
243 return 1
244
246 """ends(s) is TRUE <=> k0,...k ends with the string s."""
247 length = len(s)
248 if s[length - 1] != self.b[self.k]:
249 return 0
250 if length > (self.k - self.k0 + 1):
251 return 0
252 if self.b[self.k-length+1:self.k+1] != s:
253 return 0
254 self.j = self.k - length
255 return 1
256
258 """setto(s) sets (j+1),...k to the characters in the string s, readjusting k."""
259 length = len(s)
260 self.b = self.b[:self.j+1] + s + self.b[self.j+length+1:]
261 self.k = self.j + length
262
264 """r(s) is used further down."""
265 if self.m() > 0:
266 self.setto(s)
267
269 """step1ab() gets rid of plurals and -ed or -ing. e.g.
270
271 caresses -> caress
272 ponies -> poni
273 sties -> sti
274 tie -> tie (--NEW--: see below)
275 caress -> caress
276 cats -> cat
277
278 feed -> feed
279 agreed -> agree
280 disabled -> disable
281
282 matting -> mat
283 mating -> mate
284 meeting -> meet
285 milling -> mill
286 messing -> mess
287
288 meetings -> meet
289 """
290 if self.b[self.k] == 's':
291 if self.ends("sses"):
292 self.k = self.k - 2
293 elif self.ends("ies"):
294 if self.j == 0:
295 self.k = self.k - 1
296
297
298 else:
299 self.k = self.k - 2
300 elif self.b[self.k - 1] != 's':
301 self.k = self.k - 1
302
303 if self.ends("ied"):
304 if self.j == 0:
305 self.k = self.k - 1
306 else:
307 self.k = self.k - 2
308
309
310
311 elif self.ends("eed"):
312 if self.m() > 0:
313 self.k = self.k - 1
314 elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem():
315 self.k = self.j
316 if self.ends("at"): self.setto("ate")
317 elif self.ends("bl"): self.setto("ble")
318 elif self.ends("iz"): self.setto("ize")
319 elif self.doublec(self.k):
320 self.k = self.k - 1
321 ch = self.b[self.k]
322 if ch == 'l' or ch == 's' or ch == 'z':
323 self.k = self.k + 1
324 elif (self.m() == 1 and self.cvc(self.k)):
325 self.setto("e")
326
328 """step1c() turns terminal y to i when there is another vowel in the stem.
329 --NEW--: This has been modified from the original Porter algorithm so that y->i
330 is only done when y is preceded by a consonant, but not if the stem
331 is only a single consonant, i.e.
332
333 (*c and not c) Y -> I
334
335 So 'happy' -> 'happi', but
336 'enjoy' -> 'enjoy' etc
337
338 This is a much better rule. Formerly 'enjoy'->'enjoi' and 'enjoyment'->
339 'enjoy'. Step 1c is perhaps done too soon; but with this modification that
340 no longer really matters.
341
342 Also, the removal of the vowelinstem(z) condition means that 'spy', 'fly',
343 'try' ... stem to 'spi', 'fli', 'tri' and conflate with 'spied', 'tried',
344 'flies' ...
345 """
346 if self.ends("y") and self.j > 0 and self.cons(self.k - 1):
347 self.b = self.b[:self.k] + 'i' + self.b[self.k+1:]
348
350 """step2() maps double suffices to single ones.
351 so -ization ( = -ize plus -ation) maps to -ize etc. note that the
352 string before the suffix must give m() > 0.
353 """
354 if self.b[self.k - 1] == 'a':
355 if self.ends("ational"): self.r("ate")
356 elif self.ends("tional"): self.r("tion")
357 elif self.b[self.k - 1] == 'c':
358 if self.ends("enci"): self.r("ence")
359 elif self.ends("anci"): self.r("ance")
360 elif self.b[self.k - 1] == 'e':
361 if self.ends("izer"): self.r("ize")
362 elif self.b[self.k - 1] == 'l':
363 if self.ends("bli"): self.r("ble")
364
365
366 elif self.ends("alli"):
367 if self.m() > 0:
368 self.setto("al")
369 self.step2()
370 elif self.ends("fulli"): self.r("ful")
371 elif self.ends("entli"): self.r("ent")
372 elif self.ends("eli"): self.r("e")
373 elif self.ends("ousli"): self.r("ous")
374 elif self.b[self.k - 1] == 'o':
375 if self.ends("ization"): self.r("ize")
376 elif self.ends("ation"): self.r("ate")
377 elif self.ends("ator"): self.r("ate")
378 elif self.b[self.k - 1] == 's':
379 if self.ends("alism"): self.r("al")
380 elif self.ends("iveness"): self.r("ive")
381 elif self.ends("fulness"): self.r("ful")
382 elif self.ends("ousness"): self.r("ous")
383 elif self.b[self.k - 1] == 't':
384 if self.ends("aliti"): self.r("al")
385 elif self.ends("iviti"): self.r("ive")
386 elif self.ends("biliti"): self.r("ble")
387 elif self.b[self.k - 1] == 'g':
388 if self.ends("logi"):
389 self.j = self.j + 1
390 self.r("og")
391
392
394 """step3() dels with -ic-, -full, -ness etc. similar strategy to step2."""
395 if self.b[self.k] == 'e':
396 if self.ends("icate"): self.r("ic")
397 elif self.ends("ative"): self.r("")
398 elif self.ends("alize"): self.r("al")
399 elif self.b[self.k] == 'i':
400 if self.ends("iciti"): self.r("ic")
401 elif self.b[self.k] == 'l':
402 if self.ends("ical"): self.r("ic")
403 elif self.ends("ful"): self.r("")
404 elif self.b[self.k] == 's':
405 if self.ends("ness"): self.r("")
406
408 """step4() takes off -ant, -ence etc., in context <c>vcvc<v>."""
409 if self.b[self.k - 1] == 'a':
410 if self.ends("al"): pass
411 else: return
412 elif self.b[self.k - 1] == 'c':
413 if self.ends("ance"): pass
414 elif self.ends("ence"): pass
415 else: return
416 elif self.b[self.k - 1] == 'e':
417 if self.ends("er"): pass
418 else: return
419 elif self.b[self.k - 1] == 'i':
420 if self.ends("ic"): pass
421 else: return
422 elif self.b[self.k - 1] == 'l':
423 if self.ends("able"): pass
424 elif self.ends("ible"): pass
425 else: return
426 elif self.b[self.k - 1] == 'n':
427 if self.ends("ant"): pass
428 elif self.ends("ement"): pass
429 elif self.ends("ment"): pass
430 elif self.ends("ent"): pass
431 else: return
432 elif self.b[self.k - 1] == 'o':
433 if self.ends("ion") and (self.b[self.j] == 's' or self.b[self.j] == 't'): pass
434 elif self.ends("ou"): pass
435
436 else: return
437 elif self.b[self.k - 1] == 's':
438 if self.ends("ism"): pass
439 else: return
440 elif self.b[self.k - 1] == 't':
441 if self.ends("ate"): pass
442 elif self.ends("iti"): pass
443 else: return
444 elif self.b[self.k - 1] == 'u':
445 if self.ends("ous"): pass
446 else: return
447 elif self.b[self.k - 1] == 'v':
448 if self.ends("ive"): pass
449 else: return
450 elif self.b[self.k - 1] == 'z':
451 if self.ends("ize"): pass
452 else: return
453 else:
454 return
455 if self.m() > 1:
456 self.k = self.j
457
459 """step5() removes a final -e if m() > 1, and changes -ll to -l if
460 m() > 1.
461 """
462 self.j = self.k
463 if self.b[self.k] == 'e':
464 a = self.m()
465 if a > 1 or (a == 1 and not self.cvc(self.k-1)):
466 self.k = self.k - 1
467 if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1:
468 self.k = self.k -1
469
471 """In stem(p,i,j), p is a char pointer, and the string to be stemmed
472 is from p[i] to p[j] inclusive. Typically i is zero and j is the
473 offset to the last character of a string, (p[j+1] == '\0'). The
474 stemmer adjusts the characters p[i] ... p[j] and returns the new
475 end-point of the string, k. Stemming never increases word length, so
476 i <= k <= j. To turn the stemmer into a module, declare 'stem' as
477 extern, and delete the remainder of this file.
478 """
479
480
481
482 if j == None:
483 j = len(p) - 1
484
485
486 self.b = p
487 self.k = j
488 self.k0 = i
489
490 if self.pool.has_key(self.b[self.k0:self.k+1]):
491 return self.pool[self.b[self.k0:self.k+1]]
492
493 if self.k <= self.k0 + 1:
494 return self.b
495
496
497
498
499
500
501 self.step1ab()
502 self.step1c()
503 self.step2()
504 self.step3()
505 self.step4()
506 self.step5()
507 return self.b[self.k0:self.k+1]
508
510 lower = word.lower()
511
512 ret = ""
513 for x in xrange(len(stem)):
514 if lower[x] == stem[x]:
515 ret += word[x]
516 else:
517 ret += stem[x]
518
519 return ret
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542 - def stem(self, word):
545
546
547
549 return '<PorterStemmer>'
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
569 """
570 A demonstration of the porter stemmer on a sample from
571 the Penn Treebank corpus.
572 """
573
574 from nltk.corpus import treebank
575 from nltk import stem
576
577 stemmer = stem.PorterStemmer()
578
579 orig = []
580 stemmed = []
581 for item in treebank.files()[:3]:
582 for (word, tag) in treebank.tagged_words(item):
583 orig.append(word)
584 stemmed.append(stemmer.stem(word))
585
586
587 results = ' '.join(stemmed)
588 results = re.sub(r"(.{,70})\s", r'\1\n', results+' ').rstrip()
589
590
591 original = ' '.join(orig)
592 original = re.sub(r"(.{,70})\s", r'\1\n', original+' ').rstrip()
593
594
595 print '-Original-'.center(70).replace(' ', '*').replace('-', ' ')
596 print original
597 print '-Results-'.center(70).replace(' ', '*').replace('-', ' ')
598 print results
599 print '*'*70
600
601
602
603 if __name__ == '__main__': demo()
604