
Source Code for Module nltk.classify.naivebayes

# Natural Language Toolkit: Naive Bayes Classifiers
#
# Copyright (C) 2001-2008 NLTK Project
# Author: Edward Loper <[email protected]>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
#
# $Id: naivebayes.py 2063 2004-07-17 21:02:24Z edloper $

 10  """ 
 11  A classifier based on the Naive Bayes algorithm.  In order to find the 
 12  probability for a label, this algorithm first uses the Bayes rule to 
 13  express P(label|features) in terms of P(label) and P(features|label):: 
 14   
 15                        P(label) * P(features|label) 
 16   P(label|features) = ------------------------------ 
 17                               P(features) 
 18   
 19  The algorithm then makes the 'naive' assumption that all features are 
 20  independent, given the label:: 
 21                                
 22                        P(label) * P(f1|label) * ... * P(fn|label) 
 23   P(label|features) = -------------------------------------------- 
 24                                          P(features) 
 25   
 26  Rather than computing P(featues) explicitly, the algorithm just 
 27  calculates the denominator for each label, and normalizes them so they 
 28  sum to one:: 
 29                                
 30                        P(label) * P(f1|label) * ... * P(fn|label) 
 31   P(label|features) = -------------------------------------------- 
 32                         SUM[l]( P(l) * P(f1|l) * ... * P(fn|l) ) 
 33  """ 
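
To make the final formula concrete, here is a small hand-worked sketch of
the normalization step, with two labels and a single feature (the
probabilities are invented for illustration)::

    >>> joint = {'pos': 0.7 * 0.2, 'neg': 0.3 * 0.4}  # P(label) * P(f1|label)
    >>> total = sum(joint.values())                   # SUM[l]( P(l) * P(f1|l) )
    >>> for label in sorted(joint):
    ...     print label, round(joint[label] / total, 3)
    neg 0.462
    pos 0.538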

from nltk import defaultdict
from nltk.probability import *

from api import *

##//////////////////////////////////////////////////////
##  Naive Bayes Classifier
##//////////////////////////////////////////////////////

class NaiveBayesClassifier(ClassifierI):
    """
    A Naive Bayes classifier.  Naive Bayes classifiers are
    parameterized by two probability distributions:

      - P(label) gives the probability that an input will receive each
        label, given no information about the input's features.

      - P(fname=fval|label) gives the probability that a given feature
        (fname) will receive a given value (fval), given the label
        (label).

    If the classifier encounters an input with a feature that has
    never been seen with any label, then rather than assigning a
    probability of 0 to all labels, it will ignore that feature.

    The feature value 'None' is reserved for unseen feature values;
    you generally should not use 'None' as a feature value for one of
    your own features.
    """
    def __init__(self, label_probdist, feature_probdist):
        """
        @param label_probdist: P(label), the probability distribution
            over labels.  It is expressed as a L{ProbDistI} whose
            samples are labels.  I.e., P(label) =
            C{label_probdist.prob(label)}.

        @param feature_probdist: P(fname=fval|label), the probability
            distribution for feature values, given labels.  It is
            expressed as a dictionary whose keys are C{(label,fname)}
            pairs and whose values are L{ProbDistI}s over feature
            values.  I.e., P(fname=fval|label) =
            C{feature_probdist[label,fname].prob(fval)}.  If a given
            C{(label,fname)} is not a key in C{feature_probdist}, then
            it is assumed that the corresponding P(fname=fval|label)
            is 0 for all values of C{fval}.
        """
        self._label_probdist = label_probdist
        self._feature_probdist = feature_probdist
        self._labels = label_probdist.samples()
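
Since both parameters are plain probability distributions, a classifier
can also be assembled by hand.  A minimal sketch (the feature name and
numbers are invented for illustration, and NaiveBayesClassifier is
assumed to be imported)::

    >>> from nltk.probability import DictionaryProbDist
    >>> label_probdist = DictionaryProbDist({'spam': 0.6, 'ham': 0.4})
    >>> feature_probdist = {
    ...     ('spam', 'contains(cash)'): DictionaryProbDist({True: 0.8, False: 0.2}),
    ...     ('ham', 'contains(cash)'): DictionaryProbDist({True: 0.1, False: 0.9}),
    ... }
    >>> classifier = NaiveBayesClassifier(label_probdist, feature_probdist)
    >>> classifier.classify({'contains(cash)': True})
    'spam'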

    def labels(self):
        return self._labels

    def classify(self, featureset):
        return self.prob_classify(featureset).max()

    def prob_classify(self, featureset):
        # Discard any feature names that we've never seen before.
        # Otherwise, we'll just assign a probability of 0 to
        # everything.
        featureset = featureset.copy()
        for fname in featureset.keys():
            for label in self._labels:
                if (label, fname) in self._feature_probdist:
                    break
            else:
                #print 'Ignoring unseen feature %s' % fname
                del featureset[fname]

        # Find the log probability of each label, given the features.
        # Start with the log probability of the label itself.
        logprob = {}
        for label in self._labels:
            logprob[label] = self._label_probdist.logprob(label)

        # Then add in the log probability of features given labels.
        for label in self._labels:
            for (fname, fval) in featureset.items():
                if (label, fname) in self._feature_probdist:
                    feature_probs = self._feature_probdist[label,fname]
                    logprob[label] += feature_probs.logprob(fval)
                else:
                    # nb: This case will never come up if the
                    # classifier was created by
                    # NaiveBayesClassifier.train().
                    logprob[label] += sum_logs([]) # = -INF

        return DictionaryProbDist(logprob, normalize=True, log=True)

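The returned distribution can be queried for the probability of each
label.  Continuing the hand-built sketch from above (the exact numbers
follow from the invented probabilities there)::

    >>> dist = classifier.prob_classify({'contains(cash)': True})
    >>> for label in sorted(dist.samples()):
    ...     print label, round(dist.prob(label), 3)
    ham 0.077
    spam 0.923
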
    def show_most_informative_features(self, n=10):
        # Determine the most relevant features, and display them.
        cpdist = self._feature_probdist
        print '\nMost Informative Features'

        for (fname, fval) in self.most_informative_features(n):
            def labelprob(l):
                return cpdist[l,fname].prob(fval)
            labels = sorted([l for l in self._labels
                             if fval in cpdist[l,fname].samples()],
                            key=labelprob)
            if len(labels) == 1: continue
            l0 = labels[0]
            l1 = labels[-1]
            if cpdist[l0,fname].prob(fval) == 0:
                ratio = 'INF'
            else:
                ratio = '%8.1f' % (cpdist[l1,fname].prob(fval) /
                                   cpdist[l0,fname].prob(fval))
            print ('%24s = %-16r %5s : %-5s = %s : 1.0' %
                   (fname, fval, l1[:5], l0[:5], ratio))

    def most_informative_features(self, n=100):
        """
        Return a list of the 'most informative' features used by this
        classifier.  For the purpose of this function, the
        informativeness of a feature C{(fname,fval)} is equal to the
        highest value of P(fname=fval|label), for any label, divided by
        the lowest value of P(fname=fval|label), for any label::

          max[ P(fname=fval|label1) / P(fname=fval|label2) ]
        """
        # The set of (fname, fval) pairs used by this classifier.
        features = set()
        # The max & min probability associated w/ each (fname, fval)
        # pair.  Maps (fname,fval) -> float.
        maxprob = defaultdict(lambda: 0.0)
        minprob = defaultdict(lambda: 1.0)

        for (label, fname), probdist in self._feature_probdist.items():
            for fval in probdist.samples():
                feature = (fname, fval)
                features.add(feature)
                p = probdist.prob(fval)
                maxprob[feature] = max(p, maxprob[feature])
                minprob[feature] = min(p, minprob[feature])
                if minprob[feature] == 0:
                    features.discard(feature)

        # Convert features to a list, & sort it by how informative
        # features are.
        features = sorted(features,
                          key=lambda feature: minprob[feature]/maxprob[feature])
        return features[:n]
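
For the hand-built sketch above, the feature value True is the more
informative one, since its probability varies most across the two
labels (0.8 vs. 0.1, against 0.9 vs. 0.2 for False)::

    >>> classifier.most_informative_features(1)
    [('contains(cash)', True)]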

    @staticmethod
    def train(labeled_featuresets, estimator=ELEProbDist):
        """
        @param labeled_featuresets: A list of classified featuresets,
            i.e., a list of tuples C{(featureset, label)}.
        """
        label_freqdist = FreqDist()
        feature_freqdist = defaultdict(FreqDist)
        feature_values = defaultdict(set)
        fnames = set()

        # Count up how many times each feature value occurred, given
        # the label and featurename.
        for featureset, label in labeled_featuresets:
            label_freqdist.inc(label)
            for fname, fval in featureset.items():
                # Increment freq(fval|label, fname)
                feature_freqdist[label, fname].inc(fval)
                # Record that fname can take the value fval.
                feature_values[fname].add(fval)
                # Keep a list of all feature names.
                fnames.add(fname)

        # If a feature didn't have a value given for an instance, then
        # we assume that it gets the implicit value 'None'.  This loop
        # counts up the number of 'missing' feature values for each
        # (label,fname) pair, and increments the count of the fval
        # 'None' by that amount.
        for label in label_freqdist:
            num_samples = label_freqdist[label]
            for fname in fnames:
                count = feature_freqdist[label, fname].N()
                feature_freqdist[label, fname].inc(None, num_samples-count)
                feature_values[fname].add(None)

        # Create the P(label) distribution
        label_probdist = estimator(label_freqdist)

        # Create the P(fval|label, fname) distribution
        feature_probdist = {}
        for ((label, fname), freqdist) in feature_freqdist.items():
            probdist = estimator(freqdist, bins=len(feature_values[fname]))
            feature_probdist[label, fname] = probdist

        return NaiveBayesClassifier(label_probdist, feature_probdist)
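
In practice a classifier is usually built with train() rather than by
hand.  A minimal sketch (the toy featuresets and labels are invented
for illustration)::

    >>> train_set = [
    ...     ({'last_letter': 'a'}, 'female'),
    ...     ({'last_letter': 'k'}, 'male'),
    ...     ({'last_letter': 'a'}, 'female'),
    ... ]
    >>> classifier = NaiveBayesClassifier.train(train_set)
    >>> classifier.classify({'last_letter': 'a'})
    'female'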

##//////////////////////////////////////////////////////
##  Demo
##//////////////////////////////////////////////////////

def demo():
    from nltk.classify.util import names_demo
    classifier = names_demo(NaiveBayesClassifier.train)
    classifier.show_most_informative_features()

if __name__ == '__main__':
    demo()