1
2
3
4
5
6
7
8
9
10 """
11 A classifier model that decides which label to assign to a token on
12 the basis of a tree structure, where branches correspond to conditions
13 on feature values, and leaves correspond to label assignments.
14 """
15
# NOTE(review): wildcard imports kept for backward compatibility with the
# rest of this module (FreqDist, MLEProbDist, entropy, ClassifierI, ...).
from nltk.probability import *
from nltk import defaultdict

from api import *
20
def __init__(self, label, feature_name=None, decisions=None):
    """
    Create a decision tree node.

    :param label: The most likely label for tokens that reach this
        node.  If ``feature_name`` is None, this node is a leaf and
        this label is assigned to every token that reaches it.
    :param feature_name: The name of the feature this node tests,
        or None for a leaf node.
    :param decisions: A dictionary mapping feature values to child
        decision trees (only used when ``feature_name`` is given).
    """
    self._label = label
    self._fname = feature_name
    self._decisions = decisions
26
def classify(self, featureset):
    """
    Return the label that this decision tree assigns to ``featureset``,
    by following decision branches until a leaf node (or an unknown
    feature value) is reached.
    """
    # Leaf node: no decision feature, so just return our label.
    if self._fname is None:
        return self._label

    # Decision node: branch on this node's feature value.  If the
    # value has no branch, fall back on this node's own label.
    fval = featureset[self._fname]
    if fval in self._decisions:
        return self._decisions[fval].classify(featureset)
    else:
        return self._label
45
def error(self, labeled_featuresets):
    """
    Return the fraction of ``labeled_featuresets`` that this decision
    tree misclassifies.

    :param labeled_featuresets: A list of ``(featureset, label)`` pairs.
    """
    errors = 0
    for featureset, label in labeled_featuresets:
        if self.classify(featureset) != label:
            errors += 1
    # float() keeps true division under Python 2, where this module
    # originates.
    return float(errors) / len(labeled_featuresets)
52
def pp(self, width=70, prefix='', depth=4):
    """
    Return a pretty-printed string representation of this decision
    tree, with each line padded with dots so that the labels line up
    in a column near ``width``.

    :param width: Target line width for the output.
    :param prefix: String prepended to each line (used as indentation
        during recursion).
    :param depth: Maximum number of child levels to display.
    """
    # Leaf node: a single dotted line ending in the label.
    if self._fname is None:
        n = width - len(prefix) - 15
        return '%s%s %s\n' % (prefix, '.'*n, self._label)
    # Decision node: one line per branch, recursing into non-leaf
    # children up to ``depth`` levels deep.
    s = ''
    for fval, result in sorted(self._decisions.items()):
        hdr = '%s%s=%s? ' % (prefix, self._fname, fval)
        n = width - 15 - len(hdr)
        s += '%s%s %s\n' % (hdr, '.'*(n), result._label)
        if result._fname is not None and depth > 1:
            s += result.pp(width, prefix+' ', depth-1)
    return s
65
68
@staticmethod
def train(labeled_featuresets, entropy_cutoff=0.05, depth_cutoff=100,
          support_cutoff=10):
    """
    Build a decision tree classifier from ``labeled_featuresets``.

    :param labeled_featuresets: A list of ``(featureset, label)``
        training pairs.
    :param entropy_cutoff: Stop refining a branch once the label
        entropy of its training subset is at or below this value.
    :param depth_cutoff: Maximum depth of the generated tree.
    :param support_cutoff: Stop refining a branch once it is supported
        by this many (or fewer) training instances.
    """
    # Collect the set of feature names that occur in the training data.
    feature_names = set()
    for featureset, label in labeled_featuresets:
        feature_names.update(featureset)

    # Start with the best single-feature tree (a decision stump), then
    # recursively refine its leaves; the stump consumes one depth level.
    tree = DecisionTreeClassifier.best_stump(
        feature_names, labeled_featuresets)
    tree.refine(labeled_featuresets, entropy_cutoff, depth_cutoff-1,
                support_cutoff)
    return tree
87
88 @staticmethod
89 - def leaf(labeled_featuresets):
93
94 @staticmethod
95 - def stump(feature_name, labeled_featuresets):
108
def refine(self, labeled_featuresets, entropy_cutoff, depth_cutoff,
           support_cutoff):
    """
    Recursively grow subtrees under this node's decision branches,
    replacing leaf decisions whose training subsets are still impure.

    :param labeled_featuresets: The ``(featureset, label)`` pairs that
        reach this node.
    :param entropy_cutoff: Only grow a branch whose label entropy
        exceeds this value.
    :param depth_cutoff: Remaining depth budget; refinement stops at 0.
    :param support_cutoff: Minimum number of supporting instances
        required to keep refining.
    """
    # Stop conditions: too little support, already a leaf, or no depth left.
    if len(labeled_featuresets) <= support_cutoff: return
    if self._fname is None: return
    if depth_cutoff <= 0: return
    for fval in self._decisions:
        # The training instances that are routed down this branch.
        fval_featuresets = [(featureset, label) for (featureset, label)
                            in labeled_featuresets
                            if featureset[self._fname] == fval]

        label_freqs = FreqDist([label for (featureset, label)
                                in fval_featuresets])
        # Only grow a subtree where the labels are still impure.
        if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
            # BUGFIX: propagate support_cutoff to the recursive call;
            # previously it was dropped, so deeper levels silently
            # reverted to the default value.
            self._decisions[fval] = DecisionTreeClassifier.train(
                fval_featuresets, entropy_cutoff, depth_cutoff,
                support_cutoff)
124
125 @staticmethod
126 - def best_stump(feature_names, labeled_featuresets):
138
139
140
141
142
148
# Run the demonstration when this module is executed as a script.
if __name__ == '__main__':
    demo()
151