1
2
3
4
5
6
7
8
9 """
10 Utility functions and classes for classifiers.
11 """
12 import math
13
14
15 import nltk.classify.util
16 from nltk.util import LazyMappedList
17
18
19
20
21
22
23
24
25
27 """
28 Use the L{LazyMappedList} class to construct a lazy list-like
29 object that is analagous to C{map(feature_func, toks)}. In
30 particular, if C{labeled=False}, then the returned list-like
31 object's values are equal to::
32
33 [feature_func(tok) for tok in toks]
34
35 If C{labeled=True}, then the returned list-like object's values
36 are equal to::
37
38 [(feature_func(tok), label) for (tok, label) in toks]
39
40 The primary purpose of this function is to avoid the memory
41 overhead involved in storing all the featuresets for every token
42 in a corpus. Instead, these featuresets are constructed lazily,
43 as-needed. The reduction in memory overhead can be especially
44 significant when the underlying list of tokens is itself lazy (as
45 is the case with many corpus readers).
46
47 @param feature_func: The function that will be applied to each
48 token. It should return a featureset -- i.e., a C{dict}
49 mapping feature names to feature values.
50 @param toks: The list of tokens to which C{feature_func} should be
51 applied. If C{labeled=True}, then the list elements will be
52 passed directly to C{feature_func()}. If C{labeled=False},
53 then the list elements should be tuples C{(tok,label)}, and
54 C{tok} will be passed to C{feature_func()}.
55 @param labeled: If true, then C{toks} contains labeled tokens --
56 i.e., tuples of the form C{(tok, label)}. (Default:
57 auto-detect based on types.)
58 """
59 if labeled is None:
60 labeled = tokens and isinstance(tokens[0], (tuple, list))
61 if labeled:
62 def lazy_func(labeled_token):
63 return (feature_func(labeled_token[0]), labeled_token[1])
64 return LazyMappedList(toks, lazy_func)
65 else:
66 return LazyMappedList(toks, feature_func)
67
69 """
70 @return: A list of all labels that are attested in the given list
71 of tokens.
72 @rtype: C{list} of (immutable)
73 @param tokens: The list of classified tokens from which to extract
74 labels. A classified token has the form C{(token, label)}.
75 @type tokens: C{list}
76 """
77 return tuple(set([label for (tok,label) in tokens]))
78
83
88
90 """
91 A helper class that implements cutoff checks based on number of
92 iterations and log likelihood.
93
94 Accuracy cutoffs are also implemented, but they're almost never
95 a good idea to use.
96 """
98 self.cutoffs = cutoffs.copy()
99 if 'min_ll' in cutoffs:
100 cutoffs['min_ll'] = -abs(cutoffs['min_ll'])
101 if 'min_lldelta' in cutoffs:
102 cutoffs['min_lldelta'] = abs(cutoffs['min_lldelta'])
103 self.ll = None
104 self.acc = None
105 self.iter = 1
106
107 - def check(self, classifier, train_toks):
108 cutoffs = self.cutoffs
109 self.iter += 1
110 if 'max_iter' in cutoffs and self.iter >= cutoffs['max_iter']:
111 return True
112
113 if 'min_ll' in cutoffs or 'min_lldelta' in cutoffs:
114 new_ll = nltk.classify.util.log_likelihood(classifier, train_toks)
115 if 'min_ll' in cutoffs and new_ll >= cutoffs['min_ll']:
116 return True
117 if ('min_lldelta' in cutoffs and self.ll and
118 ((new_ll - self.ll) <= abs(cutoffs['min_lldelta']))):
119 return True
120 self.ll = new_ll
121
122 if 'max_acc' in cutoffs or 'min_accdelta' in cutoffs:
123 new_acc = nltk.classify.util.log_likelihood(
124 classifier, train_toks)
125 if 'max_acc' in cutoffs and new_acc >= cutoffs['max_acc']:
126 return True
127 if ('min_accdelta' in cutoffs and self.acc and
128 ((new_acc - self.acc) <= abs(cutoffs['min_accdelta']))):
129 return True
130 self.acc = new_acc
131
132 return False
133
134
135
136
137
def names_demo_features(name):
    """
    Return a basic featureset for the given name: an always-on
    feature, the (lowercased) first and last letters, and per-letter
    count and presence features.

    @param name: The name string to extract features from.
    @return: A C{dict} mapping feature names to feature values.
    """
    features = {}
    features['alwayson'] = True
    # Lowercase once instead of on every loop iteration (the original
    # called name.lower() ~52 times per name).
    lowered = name.lower()
    features['startswith'] = lowered[0]
    features['endswith'] = lowered[-1]
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features['count(%s)' % letter] = lowered.count(letter)
        features['has(%s)' % letter] = letter in lowered
    return features
147
def binary_names_demo_features(name):
    """
    Return a featureset for the given name using vowel start/end
    flags, per-letter counts and presence, and per-letter first/last
    letter indicator features.

    @param name: The name string to extract features from.
    @return: A C{dict} mapping feature names to feature values.
    """
    features = {}
    features['alwayson'] = True
    # Lowercase once instead of on every loop iteration (the original
    # called name.lower() ~106 times per name).
    lowered = name.lower()
    features['startswith(vowel)'] = lowered[0] in 'aeiouy'
    features['endswith(vowel)'] = lowered[-1] in 'aeiouy'
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features['count(%s)' % letter] = lowered.count(letter)
        features['has(%s)' % letter] = letter in lowered
        features['startswith(%s)' % letter] = (letter == lowered[0])
        features['endswith(%s)' % letter] = (letter == lowered[-1])
    return features
159
161 from nltk.corpus import names
162 import random
163
164
165 namelist = ([(name, 'male') for name in names.words('male.txt')] +
166 [(name, 'female') for name in names.words('female.txt')])
167
168
169 random.seed(123456)
170 random.shuffle(namelist)
171 train = namelist[:5000]
172 test = namelist[5000:5500]
173
174
175 print 'Training classifier...'
176 classifier = trainer( [(features(n), g) for (n,g) in train] )
177
178
179 print 'Testing classifier...'
180 acc = accuracy(classifier, [(features(n),g) for (n,g) in test])
181 print 'Accuracy: %6.4f' % acc
182
183
184
185 try:
186 test_featuresets = [features(n) for (n,g) in test]
187 pdists = classifier.batch_prob_classify(test_featuresets)
188 ll = [pdist.logprob(gold)
189 for ((name, gold), pdist) in zip(test, pdists)]
190 print 'Avg. log likelihood: %6.4f' % (sum(ll)/len(test))
191 print
192 print 'Unseen Names P(Male) P(Female)\n'+'-'*40
193 for ((name, gender), pdist) in zip(test, pdists)[:5]:
194 if gender == 'male':
195 fmt = ' %-15s *%6.4f %6.4f'
196 else:
197 fmt = ' %-15s %6.4f *%6.4f'
198 print fmt % (name, pdist.prob('male'), pdist.prob('female'))
199 except NotImplementedError:
200 pass
201
202
203 return classifier
204
# Cache mapping a senseval word to its (instance, sense) list, so that
# repeated wsd_demo() calls for the same word don't re-read the corpus.
_inst_cache = {}
206 -def wsd_demo(trainer, word, features, n=1000):
207 from nltk.corpus import senseval
208 import random
209
210
211 print 'Reading data...'
212 global _inst_cache
213 if word not in _inst_cache:
214 _inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
215 instances = _inst_cache[word][:]
216 if n> len(instances): n = len(instances)
217 senses = list(set(l for (i,l) in instances))
218 print ' Senses: ' + ' '.join(senses)
219
220
221 print 'Splitting into test & train...'
222 random.seed(123456)
223 random.shuffle(instances)
224 train = instances[:int(.8*n)]
225 test = instances[int(.8*n):n]
226
227
228 print 'Training classifier...'
229 classifier = trainer( [(features(i), l) for (i,l) in train] )
230
231
232 print 'Testing classifier...'
233 acc = accuracy(classifier, [(features(i),l) for (i,l) in test])
234 print 'Accuracy: %6.4f' % acc
235
236
237
238 try:
239 test_featuresets = [features(i) for (i,n) in test]
240 pdists = classifier.batch_prob_classify(test_featuresets)
241 ll = [pdist.logprob(gold)
242 for ((name, gold), pdist) in zip(test, pdists)]
243 print 'Avg. log likelihood: %6.4f' % (sum(ll)/len(test))
244 except NotImplementedError:
245 pass
246
247
248 return classifier
249