1
2
3
4
5
6
7
8 """
9 A word stemmer based on the Lancaster stemming algorithm.
10 Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.
11 """
12
13 import re
14 from api import *
15
17
18
# Stemming rules, one string per rule, laid out here one line per leading
# letter.  Each rule reads:
#     <ending, reversed> ["*"] <digit> [<letters to append>] [">" or "."]
# - the leading letters are the word ending the rule targets, spelled
#   backwards (e.g. "gni3>" targets words ending in "ing");
# - an optional "*" limits the rule to words no earlier rule has changed;
# - the digit is the number of trailing characters to remove;
# - letters after the digit are appended to what remains;
# - a final "." ends stemming once the rule has been applied, otherwise
#   stemming carries on with the rewritten word.
# Order matters: rules that share a first character are tried in the order
# listed and the first applicable one is used.
rule_tuple = (
    "ai*2.", "a*1.",
    "bb1.",
    "city3s.", "ci2>", "cn1t>",
    "dd1.", "dei3y>", "deec2ss.", "dee1.", "de2>", "dooh4>",
    "e1>",
    "feil1v.", "fi2>",
    "gni3>", "gai3y.", "ga2>", "gg1.",
    "ht*2.", "hsiug5ct.", "hsi3>",
    "i*1.", "i1y>",
    "ji1d.", "juf1s.", "ju1d.", "jo1d.", "jeh1r.", "jrev1t.", "jsim2t.",
    "jn1d.", "j1s.",
    "lbaifi6.", "lbai4y.", "lba3>", "lbi3.", "lib2l>", "lc1.", "lufi4y.",
    "luf3>", "lu2.", "lai3>", "lau3>", "la2>", "ll1.",
    "mui3.", "mu*2.", "msi3>", "mm1.",
    "nois4j>", "noix4ct.", "noi3>", "nai3>", "na2>", "nee0.", "ne2>", "nn1.",
    "pihs4>", "pp1.",
    "re2>", "rae0.", "ra2.", "ro2>", "ru2>", "rr1.", "rt1>", "rei3y>",
    "sei3y>", "sis2.", "si2>", "ssen4>", "ss0.", "suo3>", "su*2.", "s*1>",
    "s0.",
    "tacilp4y.", "ta2>", "tnem4>", "tne3>", "tna3>", "tpir2b.", "tpro2b.",
    "tcud1.", "tpmus2.", "tpec2iv.", "tulo2v.", "tsis0.", "tsi3>", "tt1.",
    "uqi3.", "ugo1.",
    "vis3j>", "vie0.", "vi2>",
    "ylb1>", "yli3y>", "ylp0.", "yl2>", "ygo1.", "yhp1.", "ymo1.", "ypo1.",
    "yti3>", "yte3>", "ytl2.", "yrtsi5.", "yra3>", "yro3>", "yfi3.",
    "ycn2t>", "yca3>",
    "zi2>", "zy1s.",
)
136
137
# NOTE(review): the `def` line for this body is missing from the listing; an
# `__init__(self)` header is assumed from the docstring -- confirm.
def __init__(self):
    """Create an instance of the Lancaster stemmer.

    The new instance starts with an empty ``rule_dictionary``.
    """
    self.rule_dictionary = dict()
143
# NOTE(review): the `def` line for this body is missing from the listing; the
# name `parseRules` and the `rule_tuple` parameter are assumed (the body reads
# `self` and iterates `rule_tuple`) -- confirm against the callers.
def parseRules(self, rule_tuple):
    """Validate the set of rules used in this stemmer and index them.

    ``self.rule_dictionary`` is rebuilt from scratch: every rule is stored,
    in its original order, in a list keyed by the rule's first character.

    :param rule_tuple: iterable of rule strings to validate and index.
    :raises ValueError: if a rule does not match the expected rule syntax.
        Rules indexed before the offending one stay in the dictionary.
    """
    # Raw string so that \*, \d and \. reach the regex engine unchanged.
    valid_rule = re.compile(r"^[a-z]+\*?\d[a-z]*[>\.]?$")

    # Discard any previously parsed rules before adding the new ones.
    self.rule_dictionary = {}

    for rule in rule_tuple:
        if not valid_rule.match(rule):
            raise ValueError("The rule %s is invalid" % rule)
        self.rule_dictionary.setdefault(rule[0:1], []).append(rule)
159
160 - def stem(self, word):
174
# NOTE(review): the `def` line for this body is missing from the listing; the
# name and parameter order are assumed (the body reads `self`, `word` and
# `intact_word`) -- confirm against the caller.
def __doStemming(self, word, intact_word):
    """Perform the actual word stemming.

    On each pass, the rules stored under the word's last letter are tried in
    order and the first applicable one rewrites the word.  Stemming ends when
    no rule applies, when the applied rule ends in ``.``, or when the word
    has no last letter with rules stored for it.

    :param word: the word to stem.
    :param intact_word: value compared against ``word`` for rules flagged
        with ``*``; such rules only apply while the two are equal.
    :return: the stemmed word.
    """
    # Raw string so that \*, \d and \. reach the regex engine unchanged.
    valid_rule = re.compile(r"^([a-z]+)(\*?)(\d)([a-z]*)([>\.]?)$")

    proceed = True
    while proceed:
        last_letter_position = self.__getLastLetter(word)

        # Stop when there is no last letter or no rules are stored for it.
        if (last_letter_position < 0 or
                word[last_letter_position] not in self.rule_dictionary):
            break

        rule_was_applied = False

        for rule in self.rule_dictionary[word[last_letter_position]]:
            rule_match = valid_rule.match(rule)
            if not rule_match:
                continue

            (ending_string,
             intact_flag,
             remove_total,
             append_string,
             cont_flag) = rule_match.groups()

            # The regex captures the removal count as text.
            remove_total = int(remove_total)

            # Rule endings are stored reversed; flip before comparing.
            if not word.endswith(ending_string[::-1]):
                continue
            # "*" rules apply only while the word equals intact_word.
            if intact_flag and word != intact_word:
                continue
            if not self.__isAcceptable(word, remove_total):
                continue

            word = self.__applyRule(word, remove_total, append_string)
            rule_was_applied = True
            if cont_flag == '.':
                proceed = False
            # Only the first applicable rule is used on each pass.
            break

        # No rule applied on this pass: nothing more to strip.
        if not rule_was_applied:
            proceed = False
    return word
233
# NOTE(review): the `def` line for this body is missing from the listing; it
# is restored here from the call `self.__getLastLetter(word)`.
def __getLastLetter(self, word):
    """Get the zero-based index of the last letter in the leading run of
    alphabetic characters of ``word``.

    Scanning stops at the first non-alphabetic character, so letters after
    it are ignored (``"ab1cd"`` gives 1).

    :param word: the string to scan.
    :return: the index found, or -1 if ``word`` is empty or does not start
        with an alphabetic character.
    """
    last_letter = -1
    for position, character in enumerate(word):
        if not character.isalpha():
            break
        last_letter = position
    return last_letter
244
# NOTE(review): the `def` line for this body is missing from the listing; it
# is restored here from the call `self.__isAcceptable(word, remove_total)`.
def __isAcceptable(self, word, remove_total):
    """Determine if the word is acceptable for stemming.

    A word whose first character is one of ``aeiouy`` is acceptable when at
    least two characters would remain after removal.  Any other word needs
    at least three remaining characters and one of ``aeiouy`` as its second
    or third character.

    :param word: the word about to be stemmed.
    :param remove_total: number of trailing characters the rule removes.
    :return: True if the rule may be applied, False otherwise.
    """
    vowels = "aeiouy"

    if word[0] in vowels:
        return len(word) - remove_total >= 2

    return (len(word) - remove_total >= 3
            and (word[1] in vowels or word[2] in vowels))
262
263
def __applyRule(self, word, remove_total, append_string):
    """Apply the stemming rule to the word.

    :param word: the word to rewrite.
    :param remove_total: number of trailing characters to drop.
    :param append_string: text to add after the cut; nothing is added when
        it is empty.
    :return: the rewritten word.
    """
    # Keep everything except the last ``remove_total`` characters.
    stem = word[:len(word) - remove_total]

    if not append_string:
        return stem
    return stem + append_string
275
# NOTE(review): the `def` line for this body is missing from the listing; a
# `__repr__(self)` header is assumed -- confirm.
def __repr__(self):
    """Return a fixed textual tag identifying the stemmer."""
    return '<LancasterStemmer>'
278
# NOTE(review): the `def` line for this body is missing from the listing; it
# is restored here from the `demo()` call in the `__main__` guard below.
def demo():
    """A demonstration of the Lancaster stemmer on sample words described in
    Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.

    Prints a two-column table pairing each sample word with its stem.
    """
    from nltk import stem

    stemmer = stem.LancasterStemmer()

    # Single-argument print(...) gives the same output whether `print` is a
    # statement or a function.
    print("%-20s%-20s" % ("Original Word", "Stemmed Word"))
    print("*" * 40)

    for word in (
            'maximum',
            'presumably',
            'multiply',
            'provision',
            'owed',
            'ear',
            'saying',
            'crying',
            'string',
            'meant',
            'cement'):
        stemmed_word = stemmer.stem(word)
        print("%-20s%-20s" % (word, stemmed_word))


# Run the demonstration only when this file is executed as a script.
if __name__ == '__main__':
    demo()
308