
Source Code for Module nltk.classify.decisiontree

# Natural Language Toolkit: Decision Tree Classifiers
#
# Copyright (C) 2001-2008 NLTK Project
# Author: Edward Loper <[email protected]>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
#
# $Id: naivebayes.py 2063 2004-07-17 21:02:24Z edloper $

"""
A classifier model that decides which label to assign to a token on
the basis of a tree structure, where branches correspond to conditions
on feature values, and leaves correspond to label assignments.
"""

from nltk.probability import *
from nltk import defaultdict

from api import *

class DecisionTreeClassifier(ClassifierI):
    def __init__(self, label, feature_name=None, decisions=None):
        # A leaf stores only a label; an internal node also stores a
        # feature name and a dict mapping feature values to subtrees.
        self._label = label
        self._fname = feature_name
        self._decisions = decisions
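
    # A hedged construction sketch (the feature name, values, and labels
    # below are illustrative assumptions, not part of this module): a leaf
    # carries only a label, while an internal node adds a feature name and
    # a dict mapping each observed feature value to a subtree.
    #
    #     >>> leaves = {'a': DecisionTreeClassifier('female'),
    #     ...           'k': DecisionTreeClassifier('male')}
    #     >>> tree = DecisionTreeClassifier('female', 'last_letter', leaves)
    #     >>> sorted(tree.labels())
    #     ['female', 'male']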

    def labels(self):
        labels = [self._label]
        if self._decisions is not None:
            for dt in self._decisions.values():
                labels.extend(dt.labels())
        return list(set(labels))

    def classify(self, featureset):
        # Decision leaf:
        if self._fname is None:
            return self._label

        # Decision tree:
        fval = featureset[self._fname]
        if fval in self._decisions:
            return self._decisions[fval].classify(featureset)
        else:
            return self._label
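
    # Continuing the sketch above: classify() walks the tree, and a
    # feature value with no matching branch falls back to the node's
    # own label.
    #
    #     >>> tree.classify({'last_letter': 'k'})
    #     'male'
    #     >>> tree.classify({'last_letter': 'z'})  # unseen value
    #     'female'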

    def error(self, labeled_featuresets):
        errors = 0
        for featureset, label in labeled_featuresets:
            if self.classify(featureset) != label:
                errors += 1
        return float(errors)/len(labeled_featuresets)
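
    # error() is the fraction of misclassified examples.  On assumed toy
    # data (continuing the sketch above):
    #
    #     >>> test = [({'last_letter': 'a'}, 'female'),
    #     ...         ({'last_letter': 'k'}, 'female')]
    #     >>> tree.error(test)
    #     0.5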

    def pp(self, width=70, prefix='', depth=4):
        if self._fname is None:
            n = width-len(prefix)-15
            return '%s%s %s\n' % (prefix, '.'*n, self._label)
        s = ''
        for i, (fval, result) in enumerate(sorted(self._decisions.items())):
            hdr = '%s%s=%s? ' % (prefix, self._fname, fval)
            n = width-15-len(hdr)
            s += '%s%s %s\n' % (hdr, '.'*(n), result._label)
            if result._fname is not None and depth>1:
                s += result.pp(width, prefix+'  ', depth-1)
        return s
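
    # pp() renders one "feature=value? .... label" line per branch; the
    # dot padding depends on `width` and is abbreviated in this sketch:
    #
    #     >>> print tree.pp(),
    #     last_letter=a? ... female
    #     last_letter=k? ... male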

    def __str__(self):
        return self.pp()

    @staticmethod
    def train(labeled_featuresets, entropy_cutoff=0.05, depth_cutoff=100,
              support_cutoff=10):
        # Collect a list of all feature names.
        feature_names = set()
        for featureset, label in labeled_featuresets:
            for fname in featureset:
                feature_names.add(fname)

        # Start with a stump.
        tree = DecisionTreeClassifier.best_stump(
            feature_names, labeled_featuresets)

        # Refine the stump recursively.
        tree.refine(labeled_featuresets, entropy_cutoff, depth_cutoff-1,
                    support_cutoff)

        # Return it.
        return tree
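
    # train() is the usual entry point.  A hedged sketch on assumed toy
    # data (note that best_stump() prints one diagnostic line per call):
    #
    #     >>> data = [({'last_letter': n[-1]}, g) for (n, g) in
    #     ...         [('jack', 'male'), ('emma', 'female'),
    #     ...          ('mark', 'male'), ('anna', 'female')]]
    #     >>> classifier = DecisionTreeClassifier.train(data, support_cutoff=0)
    #     >>> classifier.classify({'last_letter': 'k'})
    #     'male'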

    @staticmethod
    def leaf(labeled_featuresets):
        # A leaf predicts the most frequent label in the training data.
        label = FreqDist([label for (featureset, label)
                          in labeled_featuresets]).max()
        return DecisionTreeClassifier(label)

    @staticmethod
    def stump(feature_name, labeled_featuresets):
        label = FreqDist([label for (featureset, label)
                          in labeled_featuresets]).max()

        # Find the best label for each value.  (The loop variable must
        # not shadow `label`, or the stump's default label would silently
        # become the label of the last training example.)
        freqs = defaultdict(FreqDist) # freq(label|value)
        for featureset, example_label in labeled_featuresets:
            feature_value = featureset[feature_name]
            freqs[feature_value].inc(example_label)

        decisions = dict([(val, DecisionTreeClassifier(freqs[val].max()))
                          for val in freqs])
        return DecisionTreeClassifier(label, feature_name, decisions)
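
    # stump() builds a depth-1 tree: the root keeps the overall majority
    # label, and each observed feature value gets a leaf holding the
    # majority label among the examples with that value.  Reusing the
    # assumed `data` from the train() sketch above:
    #
    #     >>> s = DecisionTreeClassifier.stump('last_letter', data)
    #     >>> s.classify({'last_letter': 'a'})
    #     'female'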

    def refine(self, labeled_featuresets, entropy_cutoff, depth_cutoff,
               support_cutoff):
        if len(labeled_featuresets) <= support_cutoff: return
        if self._fname is None: return
        if depth_cutoff <= 0: return
        for fval in self._decisions:
            fval_featuresets = [(featureset, label) for (featureset, label)
                                in labeled_featuresets
                                if featureset[self._fname] == fval]

            label_freqs = FreqDist([label for (featureset, label)
                                    in fval_featuresets])
            if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
                # Pass support_cutoff through; omitting it here would
                # silently reset it to train()'s default.
                self._decisions[fval] = DecisionTreeClassifier.train(
                    fval_featuresets, entropy_cutoff, depth_cutoff,
                    support_cutoff)

    @staticmethod
    def best_stump(feature_names, labeled_featuresets):
        best_stump = DecisionTreeClassifier.leaf(labeled_featuresets)
        best_error = best_stump.error(labeled_featuresets)
        for fname in feature_names:
            stump = DecisionTreeClassifier.stump(fname, labeled_featuresets)
            stump_error = stump.error(labeled_featuresets)
            if stump_error < best_error:
                best_error = stump_error
                best_stump = stump
        print ('best stump for %4d toks uses %20s err=%6.4f' %
               (len(labeled_featuresets), best_stump._fname, best_error))
        return best_stump

##//////////////////////////////////////////////////////
##  Demo
##//////////////////////////////////////////////////////

def demo():
    from nltk.classify.util import names_demo, binary_names_demo_features
    classifier = names_demo(DecisionTreeClassifier.train,
                            binary_names_demo_features)
    print classifier.pp(depth=7)

if __name__ == '__main__':
    demo()
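
# To try the module end-to-end, the demo can be run directly (the path
# below is an assumption about the checkout layout):
#
#     $ python nltk/classify/decisiontree.py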