"""
A classifier based on the Naive Bayes algorithm.  In order to find the
probability for a label, this algorithm first uses the Bayes rule to
express P(label|features) in terms of P(label) and P(features|label)::

                      P(label) * P(features|label)
 P(label|features) = ------------------------------
                             P(features)

The algorithm then makes the 'naive' assumption that all features are
independent, given the label::

                      P(label) * P(f1|label) * ... * P(fn|label)
 P(label|features) = --------------------------------------------
                                    P(features)

Rather than computing P(features) explicitly, the algorithm just
calculates the numerator for each label, and normalizes them so they
sum to one::

                      P(label) * P(f1|label) * ... * P(fn|label)
 P(label|features) = --------------------------------------------
                       SUM[l]( P(l) * P(f1|l) * ... * P(fn|l) )
"""
34
from nltk import defaultdict
from nltk.probability import *

from api import *
39
class NaiveBayesClassifier(ClassifierI):
    """
    A Naive Bayes classifier.  Naive Bayes classifiers are
    parameterized by two probability distributions:

      - P(label) gives the probability that an input will receive each
        label, given no information about the input's features.

      - P(fname=fval|label) gives the probability that a given feature
        (fname) will receive a given value (fval), given the label.

    If the classifier encounters an input with a feature that has
    never been seen with any label, then rather than assigning a
    probability of 0 to all labels, it will ignore that feature.

    The feature value 'None' is reserved for unseen feature values;
    you generally should not use 'None' as a feature value for one of
    your own features.
    """
    def __init__(self, label_probdist, feature_probdist):
        """
        @param label_probdist: P(label), the probability distribution
            over labels.  It is expressed as a L{ProbDistI} whose
            samples are labels.  I.e., P(label) =
            C{label_probdist.prob(label)}.

        @param feature_probdist: P(fname=fval|label), the probability
            distribution for feature values, given labels.  It is
            expressed as a dictionary whose keys are C{(label,fname)}
            pairs and whose values are L{ProbDistI}s over feature
            values.  I.e., P(fname=fval|label) =
            C{feature_probdist[label,fname].prob(fval)}.  If a given
            C{(label,fname)} is not a key in C{feature_probdist}, then
            it is assumed that the corresponding P(fname=fval|label)
            is 0 for all values of C{fval}.
        """
        self._label_probdist = label_probdist
        self._feature_probdist = feature_probdist
        self._labels = label_probdist.samples()

    def labels(self):
        """Return the list of labels this classifier can assign."""
        return self._labels

    def classify(self, featureset):
        """Return the most probable label for C{featureset}."""
        return self.prob_classify(featureset).max()

    def prob_classify(self, featureset):
        """
        Return a probability distribution over labels for the given
        featureset, as a L{DictionaryProbDist}.
        """
        # Discard any feature names that we've never seen with any
        # label.  Otherwise, we'd assign a probability of 0 to every
        # label for inputs containing an unseen feature.
        # NOTE: iterate over a snapshot of the keys, since we delete
        # entries from the dict inside the loop.
        featureset = featureset.copy()
        for fname in list(featureset.keys()):
            for label in self._labels:
                if (label, fname) in self._feature_probdist:
                    break
            else:
                # Unseen feature: ignore it.
                del featureset[fname]

        # Find the log probability of each label, given the features.
        # Start with the log probability of the label itself.
        logprob = {}
        for label in self._labels:
            logprob[label] = self._label_probdist.logprob(label)

        # Then add in the log probability of features given labels.
        for label in self._labels:
            for (fname, fval) in featureset.items():
                if (label, fname) in self._feature_probdist:
                    feature_probs = self._feature_probdist[label, fname]
                    logprob[label] += feature_probs.logprob(fval)
                else:
                    # nb: This case will never come up if the
                    # classifier was created by
                    # NaiveBayesClassifier.train().
                    logprob[label] += sum_logs([])  # = -INF.

        return DictionaryProbDist(logprob, normalize=True, log=True)

    def show_most_informative_features(self, n=10):
        """
        Print a table of the C{n} most informative features, giving
        for each one the ratio between the largest and smallest value
        of P(fname=fval|label) across labels.
        """
        # Determine the most relevant features, and display them.
        cpdist = self._feature_probdist
        print ('Most Informative Features')
        for (fname, fval) in self.most_informative_features(n):
            def labelprob(l):
                return cpdist[l, fname].prob(fval)
            # Labels that have actually seen this (fname, fval),
            # sorted from least to most probable.
            labels = sorted([l for l in self._labels
                             if fval in cpdist[l, fname].samples()],
                            key=labelprob)
            if len(labels) == 1:
                continue
            l0 = labels[0]
            l1 = labels[-1]
            if cpdist[l0, fname].prob(fval) == 0:
                ratio = 'INF'
            else:
                ratio = '%8.1f' % (cpdist[l1, fname].prob(fval) /
                                   cpdist[l0, fname].prob(fval))
            print ('%24s = %-16r %5s : %-5s = %s : 1.0' %
                   (fname, fval, l1[:5], l0[:5], ratio))

    def most_informative_features(self, n=100):
        """
        Return a list of the 'most informative' features used by this
        classifier.  For the purpose of this function, the
        informativeness of a feature C{(fname,fval)} is equal to the
        highest value of P(fname=fval|label), for any label, divided
        by the lowest value of P(fname=fval|label), for any label.
        """
        # The set of (fname, fval) pairs used by this classifier.
        features = set()
        # The max & min probability associated with each (fname, fval)
        # pair.  Maps (fname, fval) -> float.
        maxprob = defaultdict(lambda: 0.0)
        minprob = defaultdict(lambda: 1.0)

        for (label, fname), probdist in self._feature_probdist.items():
            for fval in probdist.samples():
                feature = (fname, fval)
                features.add(feature)
                p = probdist.prob(fval)
                maxprob[feature] = max(p, maxprob[feature])
                minprob[feature] = min(p, minprob[feature])
                # A zero minimum would make the ratio infinite for
                # every such feature; drop it instead.
                if minprob[feature] == 0:
                    features.discard(feature)

        # Convert features to a list, and sort it by informativeness
        # (most informative first).
        features = sorted(features,
                          key=lambda f: minprob[f] / maxprob[f])
        return features[:n]

    @staticmethod
    def train(labeled_featuresets, estimator=ELEProbDist):
        """
        Build a naive Bayes classifier from training data.

        @param labeled_featuresets: A list of classified featuresets,
            i.e., a list of tuples C{(featureset, label)}.
        @param estimator: A function mapping a L{FreqDist} (and an
            optional C{bins} count) to a L{ProbDistI}; used to smooth
            the empirical counts.
        """
        label_freqdist = FreqDist()
        feature_freqdist = defaultdict(FreqDist)
        feature_values = defaultdict(set)
        fnames = set()

        # Count up how many times each feature value occurred, given
        # the label and feature name.
        for featureset, label in labeled_featuresets:
            label_freqdist.inc(label)
            for fname, fval in featureset.items():
                # Increment freq(fval|label, fname)
                feature_freqdist[label, fname].inc(fval)
                # Record that fname can take the value fval.
                feature_values[fname].add(fval)
                # Keep a list of all feature names.
                fnames.add(fname)

        # If a feature didn't have a value given for an instance, then
        # we assume that it gets the implicit value 'None'.  This loop
        # counts up the number of 'missing' feature values for each
        # (label, fname) pair, and increments the count of the value
        # 'None' by that amount.
        for label in label_freqdist:
            num_samples = label_freqdist[label]
            for fname in fnames:
                count = feature_freqdist[label, fname].N()
                feature_freqdist[label, fname].inc(None, num_samples - count)
                feature_values[fname].add(None)

        # Create the P(label) distribution.
        label_probdist = estimator(label_freqdist)

        # Create the P(fval|label, fname) distributions.
        feature_probdist = {}
        for ((label, fname), freqdist) in feature_freqdist.items():
            probdist = estimator(freqdist, bins=len(feature_values[fname]))
            feature_probdist[label, fname] = probdist

        return NaiveBayesClassifier(label_probdist, feature_probdist)
def demo():
    """
    Train a naive Bayes classifier on the NLTK names demo corpus and
    display its most informative features.

    NOTE(review): the body of this function was lost in the damaged
    extraction; reconstructed from the C{__main__} guard below, which
    calls C{demo()} — confirm against the original file.
    """
    from nltk.classify.util import names_demo
    classifier = names_demo(NaiveBayesClassifier.train)
    classifier.show_most_informative_features()
# Run the demonstration when executed as a script.
if __name__ == '__main__':
    demo()