"""
A classifier based on the Naive Bayes algorithm. In order to find the
probability for a label, this algorithm first uses the Bayes rule to
express P(label|features) in terms of P(label) and P(features|label)::
P(label) * P(features|label)
P(label|features) = ------------------------------
P(features)
The algorithm then makes the 'naive' assumption that all features are
independent, given the label::
P(label) * P(f1|label) * ... * P(fn|label)
P(label|features) = --------------------------------------------
P(features)
Rather than computing P(featues) explicitly, the algorithm just
calculates the denominator for each label, and normalizes them so they
sum to one::
P(label) * P(f1|label) * ... * P(fn|label)
P(label|features) = --------------------------------------------
SUM[l]( P(l) * P(f1|l) * ... * P(fn|l) )
"""
from nltk import defaultdict
from nltk.probability import FreqDist, DictionaryProbDist, \
     ELEProbDist, sum_logs
from nltk.classify.api import ClassifierI

class NaiveBayesClassifier(ClassifierI):
    """
    A Naive Bayes classifier.  Naive Bayes classifiers are
    parameterized by two probability distributions:

      - P(label) gives the probability that an input will receive each
        label, given no information about the input's features.

      - P(fname=fval|label) gives the probability that a given feature
        (fname) will receive a given value (fval), given the label
        (label).

    If the classifier encounters an input with a feature that has
    never been seen with any label, then rather than assigning a
    probability of 0 to all labels, it will ignore that feature.

    The feature value 'None' is reserved for unseen feature values;
    you generally should not use 'None' as a feature value for one of
    your own features.
    """
    def __init__(self, label_probdist, feature_probdist):
        """
        @param label_probdist: P(label), the probability distribution
            over labels.  It is expressed as a L{ProbDistI} whose
            samples are labels.  I.e., P(label) =
            C{label_probdist.prob(label)}.

        @param feature_probdist: P(fname=fval|label), the probability
            distribution for feature values, given labels.  It is
            expressed as a dictionary whose keys are C{(label,fname)}
            pairs and whose values are L{ProbDistI}s over feature
            values.  I.e., P(fname=fval|label) =
            C{feature_probdist[label,fname].prob(fval)}.  If a given
            C{(label,fname)} is not a key in C{feature_probdist}, then
            it is assumed that the corresponding P(fname=fval|label)
            is 0 for all values of C{fval}.
        """
        self._label_probdist = label_probdist
        self._feature_probdist = feature_probdist
        self._labels = label_probdist.samples()
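
    # For illustration, a small classifier can be built by hand from two
    # DictionaryProbDist objects (a sketch; the 'spam'/'ham' labels and
    # the 'contains_offer' feature name are invented for this example):
    #
    #     label_probdist = DictionaryProbDist({'spam': 0.5, 'ham': 0.5})
    #     feature_probdist = {
    #         ('spam', 'contains_offer'): DictionaryProbDist({True: 0.8,
    #                                                         False: 0.2}),
    #         ('ham', 'contains_offer'): DictionaryProbDist({True: 0.1,
    #                                                        False: 0.9}),
    #     }
    #     classifier = NaiveBayesClassifier(label_probdist,
    #                                       feature_probdist)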
    def labels(self):
        return self._labels

    def classify(self, featureset):
        return self.prob_classify(featureset).max()
    def prob_classify(self, featureset):
        # Discard any feature names that we've never seen with any
        # label; otherwise they would drive every label's probability
        # to 0.
        featureset = featureset.copy()
        for fname in featureset.keys():
            for label in self._labels:
                if (label, fname) in self._feature_probdist:
                    break
            else:
                del featureset[fname]

        # Start with the log probability of each label itself.
        logprob = {}
        for label in self._labels:
            logprob[label] = self._label_probdist.logprob(label)

        # Then add in the log probability of each feature value, given
        # the label.
        for label in self._labels:
            for (fname, fval) in featureset.items():
                if (label, fname) in self._feature_probdist:
                    feature_probs = self._feature_probdist[label, fname]
                    logprob[label] += feature_probs.logprob(fval)
                else:
                    # This case never arises for classifiers built by
                    # NaiveBayesClassifier.train(); sum_logs([]) = -INF.
                    logprob[label] += sum_logs([])

        return DictionaryProbDist(logprob, normalize=True, log=True)
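
    # A usage sketch (assuming the hand-built 'classifier' from the
    # comment above): prob_classify returns a DictionaryProbDist, so
    # the posterior for each label can be read off with prob():
    #
    #     dist = classifier.prob_classify({'contains_offer': True})
    #     for label in dist.samples():
    #         print '%s: %.3f' % (label, dist.prob(label))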
    def show_most_informative_features(self, n=10):
        cpdist = self._feature_probdist
        print '\nMost Informative Features'
        for (fname, fval) in self.most_informative_features(n):
            def labelprob(l):
                return cpdist[l, fname].prob(fval)
            # Sort the labels by P(fname=fval|label), so the least and
            # most likely labels come first and last.
            labels = sorted([l for l in self._labels
                             if fval in cpdist[l, fname].samples()],
                            key=labelprob)
            if len(labels) == 1:
                continue
            l0 = labels[0]
            l1 = labels[-1]
            if cpdist[l0, fname].prob(fval) == 0:
                ratio = 'INF'
            else:
                ratio = '%8.1f' % (cpdist[l1, fname].prob(fval) /
                                   cpdist[l0, fname].prob(fval))
            print ('%24s = %-16r %5s : %-5s = %s : 1.0' %
                   (fname, fval, l1[:5], l0[:5], ratio))
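
    # Given the format string above, each printed line pairs the most
    # likely label against the least likely one, e.g. (a hypothetical
    # line from the names demo below, shown only to illustrate the
    # layout; the ratio is made up):
    #
    #          last_letter = 'a'            femal : male  =    35.6 : 1.0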
    def most_informative_features(self, n=100):
        """
        Return a list of the 'most informative' features used by this
        classifier.  For the purpose of this function, the
        informativeness of a feature C{(fname,fval)} is equal to the
        highest value of P(fname=fval|label), for any label, divided by
        the lowest value of P(fname=fval|label), for any label::

          max[ P(fname=fval|label1) / P(fname=fval|label2) ]
        """
        # Track the highest and lowest P(fname=fval|label) seen for
        # each feature, across all labels.
        features = set()
        maxprob = defaultdict(lambda: 0.0)
        minprob = defaultdict(lambda: 1.0)
        for (label, fname), probdist in self._feature_probdist.items():
            for fval in probdist.samples():
                feature = (fname, fval)
                features.add(feature)
                p = probdist.prob(fval)
                maxprob[feature] = max(p, maxprob[feature])
                minprob[feature] = min(p, minprob[feature])
                if minprob[feature] == 0:
                    features.discard(feature)

        # Convert each min/max pair to a ratio, and sort by it.
        features = sorted(features,
                          key=lambda feature: minprob[feature] /
                                              maxprob[feature])
        return features[:n]
    @staticmethod
    def train(labeled_featuresets, estimator=ELEProbDist):
        """
        @param labeled_featuresets: A list of classified featuresets,
            i.e., a list of tuples C{(featureset, label)}.
        """
        label_freqdist = FreqDist()
        feature_freqdist = defaultdict(FreqDist)
        feature_values = defaultdict(set)
        fnames = set()

        # Count how often each label occurs, and how often each feature
        # value occurs with each (label, fname) pair.
        for featureset, label in labeled_featuresets:
            label_freqdist.inc(label)
            for fname, fval in featureset.items():
                feature_freqdist[label, fname].inc(fval)
                feature_values[fname].add(fval)
                fnames.add(fname)

        # If a feature didn't have a value for some instance, assume it
        # took the implicit value 'None', and count those occurrences.
        for label in label_freqdist:
            num_samples = label_freqdist[label]
            for fname in fnames:
                count = feature_freqdist[label, fname].N()
                feature_freqdist[label, fname].inc(None,
                                                   num_samples - count)
                feature_values[fname].add(None)

        # Create the P(label) distribution.
        label_probdist = estimator(label_freqdist)

        # Create the P(fval|label, fname) distributions.
        feature_probdist = {}
        for ((label, fname), freqdist) in feature_freqdist.items():
            probdist = estimator(freqdist,
                                 bins=len(feature_values[fname]))
            feature_probdist[label, fname] = probdist

        return NaiveBayesClassifier(label_probdist, feature_probdist)
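
# Training sketch: any estimator with the same (freqdist, bins)
# signature can be substituted for ELEProbDist, e.g. LaplaceProbDist
# from nltk.probability (the tiny gender dataset here is invented):
#
#     from nltk.probability import LaplaceProbDist
#     train_set = [({'last_letter': 'a'}, 'female'),
#                  ({'last_letter': 'k'}, 'male'),
#                  ({'last_letter': 'a'}, 'female')]
#     nb = NaiveBayesClassifier.train(train_set,
#                                     estimator=LaplaceProbDist)
#     print nb.classify({'last_letter': 'a'})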
def demo():
    from nltk.classify.util import names_demo
    classifier = names_demo(NaiveBayesClassifier.train)
    classifier.show_most_informative_features()

if __name__ == '__main__':
    demo()