1
2
3
4
5
6
7
8
9 """
10 Utility functions and classes for classifiers.
11 """
12 import math
13
14
15 import nltk.classify.util
16 from nltk.util import LazyMappedList
17
18
19
20
21
22
23
24
25
27 """
28 Use the L{LazyMappedList} class to construct a lazy list-like
29 object that is analagous to C{map(feature_func, toks)}. In
30 particular, if C{labeled=False}, then the returned list-like
31 object's values are equal to::
32
33 [feature_func(tok) for tok in toks]
34
35 If C{labeled=True}, then the returned list-like object's values
36 are equal to::
37
38 [(feature_func(tok), label) for (tok, label) in toks]
39
40 The primary purpose of this function is to avoid the memory
41 overhead involved in storing all the featuresets for every token
42 in a corpus. Instead, these featuresets are constructed lazily,
43 as-needed. The reduction in memory overhead can be especially
44 significant when the underlying list of tokens is itself lazy (as
45 is the case with many corpus readers).
46
47 @param feature_func: The function that will be applied to each
48 token. It should return a featureset -- i.e., a C{dict}
49 mapping feature names to feature values.
50 @param toks: The list of tokens to which C{feature_func} should be
51 applied. If C{labeled=True}, then the list elements will be
52 passed directly to C{feature_func()}. If C{labeled=False},
53 then the list elements should be tuples C{(tok,label)}, and
54 C{tok} will be passed to C{feature_func()}.
55 @param labeled: If true, then C{toks} contains labeled tokens --
56 i.e., tuples of the form C{(tok, label)}. (Default:
57 auto-detect based on types.)
58 """
59 if labeled is None:
60 labeled = tokens and isinstance(tokens[0], (tuple, list))
61 if labeled:
62 def lazy_func(labeled_token):
63 return (feature_func(labeled_token[0]), labeled_token[1])
64 return LazyMappedList(toks, lazy_func)
65 else:
66 return LazyMappedList(toks, feature_func)
67
69 """
70 @return: A list of all labels that are attested in the given list
71 of tokens.
72 @rtype: C{list} of (immutable)
73 @param tokens: The list of classified tokens from which to extract
74 labels. A classified token has the form C{(token, label)}.
75 @type tokens: C{list}
76 """
77 return tuple(set([label for (tok,label) in tokens]))
78
83
88
90 """
91 A helper class that implements cutoff checks based on number of
92 iterations and log likelihood.
93
94 Accuracy cutoffs are also implemented, but they're almost never
95 a good idea to use.
96 """
98 self.cutoffs = cutoffs.copy()
99 if 'min_ll' in cutoffs:
100 cutoffs['min_ll'] = -abs(cutoffs['min_ll'])
101 if 'min_lldelta' in cutoffs:
102 cutoffs['min_lldelta'] = abs(cutoffs['min_lldelta'])
103 self.ll = None
104 self.acc = None
105 self.iter = 1
106
107 - def check(self, classifier, train_toks):
108 cutoffs = self.cutoffs
109 self.iter += 1
110 if 'max_iter' in cutoffs and self.iter >= cutoffs['max_iter']:
111 return True
112
113 if 'min_ll' in cutoffs or 'min_lldelta' in cutoffs:
114 new_ll = nltk.classify.util.log_likelihood(classifier, train_toks)
115 if 'min_ll' in cutoffs and new_ll >= cutoffs['min_ll']:
116 return True
117 if ('min_lldelta' in cutoffs and self.ll and
118 ((new_ll - self.ll) <= abs(cutoffs['min_lldelta']))):
119 return True
120 self.ll = new_ll
121
122 if 'max_acc' in cutoffs or 'min_accdelta' in cutoffs:
123 new_acc = nltk.classify.util.log_likelihood(
124 classifier, train_toks)
125 if 'max_acc' in cutoffs and new_acc >= cutoffs['max_acc']:
126 return True
127 if ('min_accdelta' in cutoffs and self.acc and
128 ((new_acc - self.acc) <= abs(cutoffs['min_accdelta']))):
129 return True
130 self.acc = new_acc
131
132 return False
133
134
135
136
137
def names_demo_features(name):
    """
    Return a basic featureset for the given name: an always-on
    feature, the (lowercased) first and last letters, and per-letter
    count and presence features.

    @param name: The name string to extract features from.
    @return: A C{dict} mapping feature names to feature values.
    """
    features = {}
    features['alwayson'] = True
    # Lowercase once instead of on every loop iteration (the original
    # called name.lower() ~52 times per name).
    lowered = name.lower()
    features['startswith'] = lowered[0]
    features['endswith'] = lowered[-1]
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features['count(%s)' % letter] = lowered.count(letter)
        features['has(%s)' % letter] = letter in lowered
    return features
147
def binary_names_demo_features(name):
    """
    Return a featureset for the given name using vowel start/end
    flags, per-letter counts and presence, and per-letter first/last
    letter indicator features.

    @param name: The name string to extract features from.
    @return: A C{dict} mapping feature names to feature values.
    """
    features = {}
    features['alwayson'] = True
    # Lowercase once instead of on every loop iteration (the original
    # called name.lower() ~106 times per name).
    lowered = name.lower()
    features['startswith(vowel)'] = lowered[0] in 'aeiouy'
    features['endswith(vowel)'] = lowered[-1] in 'aeiouy'
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features['count(%s)' % letter] = lowered.count(letter)
        features['has(%s)' % letter] = letter in lowered
        features['startswith(%s)' % letter] = (letter == lowered[0])
        features['endswith(%s)' % letter] = (letter == lowered[-1])
    return features
159
161 from nltk.corpus import names
162 import random
163
164
165 namelist = ([(name, 'male') for name in names.words('male.txt')] +
166 [(name, 'female') for name in names.words('female.txt')])
167
168
169 random.seed(123456)
170 random.shuffle(namelist)
171 train = namelist[:5000]
172 test = namelist[5000:5500]
173
174
175 print 'Training classifier...'
176 classifier = trainer( [(features(n), g) for (n,g) in train] )
177
178
179 print 'Testing classifier...'
180 acc = accuracy(classifier, [(features(n),g) for (n,g) in test])
181 print 'Accuracy: %6.4f' % acc
182
183
184
185 try:
186 test_featuresets = [features(n) for (n,g) in test]
187 pdists = classifier.batch_prob_classify(test_featuresets)
188 ll = [pdist.logprob(gold)
189 for ((name, gold), pdist) in zip(test, pdists)]
190 print 'Avg. log likelihood: %6.4f' % (sum(ll)/len(test))
191 print
192 print 'Unseen Names P(Male) P(Female)\n'+'-'*40
193 for ((name, gender), pdist) in zip(test, pdists)[:5]:
194 if gender == 'male':
195 fmt = ' %-15s *%6.4f %6.4f'
196 else:
197 fmt = ' %-15s %6.4f *%6.4f'
198 print fmt % (name, pdist.prob('male'), pdist.prob('female'))
199 except NotImplementedError:
200 pass
201
202
203 return classifier
204
# Cache mapping a senseval word to its (instance, sense) list, so that
# repeated wsd_demo() calls for the same word don't re-read the corpus.
_inst_cache = {}
206 -def wsd_demo(trainer, word, features, n=1000):
207 from nltk.corpus import senseval
208 import random
209
210
211 print 'Reading data...'
212 global _inst_cache
213 if word not in _inst_cache:
214 _inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
215 instances = _inst_cache[word][:]
216 if n> len(instances): n = len(instances)
217 senses = list(set(l for (i,l) in instances))
218 print ' Senses: ' + ' '.join(senses)
219
220
221 print 'Splitting into test & train...'
222 random.seed(123456)
223 random.shuffle(instances)
224 train = instances[:int(.8*n)]
225 test = instances[int(.8*n):n]
226
227
228 print 'Training classifier...'
229 classifier = trainer( [(features(i), l) for (i,l) in train] )
230
231
232 print 'Testing classifier...'
233 acc = accuracy(classifier, [(features(i),l) for (i,l) in test])
234 print 'Accuracy: %6.4f' % acc
235
236
237
238 try:
239 test_featuresets = [features(i) for (i,n) in test]
240 pdists = classifier.batch_prob_classify(test_featuresets)
241 ll = [pdist.logprob(gold)
242 for ((name, gold), pdist) in zip(test, pdists)]
243 print 'Avg. log likelihood: %6.4f' % (sum(ll)/len(test))
244 except NotImplementedError:
245 pass
246
247
248 return classifier
249