nltk.classify.api

1 # Natural Language Toolkit: Classifier Interface 2 # 3 # Copyright (C) 2001-2008 NLTK Project 4 # Author: Edward Loper <[email protected]> 5 # Steven Bird <[email protected]> (minor additions) 6 # URL: <http://nltk.org> 7 # For license information, see LICENSE.TXT 8 9 """ 10 Interfaces for labeling tokens with category labels (or X{class 11 labels}). 12 13 L{ClassifierI} is a standard interface for X{single-category 14 classification}, in which: 15 16 - The set of categories is known. 17 - The number of categories is finite. 18 - Each text belongs to exactly one category. 19 20 L{MultiClassifierI} is a standard interface for C{multi-category 21 classification}, in which: 22 23 - The set of categories is known. 24 - The number of categories is finite. 25 - Each text belongs to zero or more categories. 26 """ 27 from nltk.internals import deprecated, overridden 28 29 ##////////////////////////////////////////////////////// 30 #{ Classification Interfaces 31 ##////////////////////////////////////////////////////// 32

class ClassifierI(object):
    """
    A processing interface for labeling tokens with a single category
    label (or X{class}).  Labels are typically C{string}s or
    C{integer}s, but can be any immutable type.  The set of labels
    that the classifier chooses from must be fixed and finite.

    Subclasses must define:
      - L{labels()}
      - either L{classify()} or L{batch_classify()} (or both)

    Subclasses may define:
      - either L{prob_classify()} or L{batch_prob_classify()} (or both)

    The single-item and batch methods are defined in terms of each
    other: each batch method maps its single-item counterpart over the
    input, and each single-item method delegates to its batch
    counterpart when C{overridden()} reports that the batch method has
    been overridden.  A subclass therefore only needs to supply one
    method of each pair.
    """
    def labels(self):
        """
        @return: the list of category labels used by this classifier.
        @rtype: C{list} of (immutable)
        @raise NotImplementedError: always, in this base interface.
        """
        raise NotImplementedError()

    def classify(self, featureset):
        """
        @return: the most appropriate label for the given featureset.
        @rtype: label
        @raise NotImplementedError: if L{batch_classify()} has not been
            overridden, so there is nothing to delegate to.
        """
        if overridden(self.batch_classify):
            # Wrap the single featureset in a one-element batch and
            # unwrap the single result.
            return self.batch_classify([featureset])[0]
        raise NotImplementedError()

    def prob_classify(self, featureset):
        """
        @return: a probability distribution over labels for the given
            featureset.
        @rtype: L{ProbDistI <nltk.probability.ProbDistI>}
        @raise NotImplementedError: if L{batch_prob_classify()} has not
            been overridden, so there is nothing to delegate to.
        """
        if overridden(self.batch_prob_classify):
            # Wrap the single featureset in a one-element batch and
            # unwrap the single result.
            return self.batch_prob_classify([featureset])[0]
        raise NotImplementedError()

    def batch_classify(self, featuresets):
        """
        Apply L{self.classify()} to each element of C{featuresets}.
        This is equivalent to::

            [self.classify(fs) for fs in featuresets]

        @rtype: C{list} of I{label}
        """
        return [self.classify(fs) for fs in featuresets]

    def batch_prob_classify(self, featuresets):
        """
        Apply L{self.prob_classify()} to each element of C{featuresets}.
        This is equivalent to::

            [self.prob_classify(fs) for fs in featuresets]

        @rtype: C{list} of L{ProbDistI <nltk.probability.ProbDistI>}
        """
        return [self.prob_classify(fs) for fs in featuresets]

    #{ Deprecated
    # Old name kept for backward compatibility; forwards unchanged to
    # batch_prob_classify().
    @deprecated("Use .batch_prob_classify() instead.")
    def batch_probdist(self, featuresets):
        return self.batch_prob_classify(featuresets)

    # Old name kept for backward compatibility; forwards unchanged to
    # prob_classify().
    @deprecated("Use .prob_classify() instead.")
    def probdist(self, featureset):
        return self.prob_classify(featureset)
    #}

class MultiClassifierI(object):
    """
    A processing interface for labeling tokens with zero or more
    category labels (or X{labels}).  Labels are typically C{string}s
    or C{integer}s, but can be any immutable type.  The set of labels
    that the multi-classifier chooses from must be fixed and finite.

    Subclasses must define:
      - L{labels()}
      - either L{classify()} or L{batch_classify()} (or both)

    Subclasses may define:
      - either L{prob_classify()} or L{batch_prob_classify()} (or both)

    The single-item and batch methods are defined in terms of each
    other: each batch method maps its single-item counterpart over the
    input, and each single-item method delegates to its batch
    counterpart when C{overridden()} reports that the batch method has
    been overridden.  A subclass therefore only needs to supply one
    method of each pair.
    """
    def labels(self):
        """
        @return: the list of category labels used by this classifier.
        @rtype: C{list} of (immutable)
        @raise NotImplementedError: always, in this base interface.
        """
        raise NotImplementedError()

    def classify(self, featureset):
        """
        @return: the most appropriate set of labels for the given
            featureset.
        @rtype: C{set} of I{label}
        @raise NotImplementedError: if L{batch_classify()} has not been
            overridden, so there is nothing to delegate to.
        """
        if overridden(self.batch_classify):
            # Wrap the single featureset in a one-element batch and
            # unwrap the single result.
            return self.batch_classify([featureset])[0]
        raise NotImplementedError()

    def prob_classify(self, featureset):
        """
        @return: a probability distribution over sets of labels for the
            given featureset.
        @rtype: L{ProbDistI <nltk.probability.ProbDistI>}
        @raise NotImplementedError: if L{batch_prob_classify()} has not
            been overridden, so there is nothing to delegate to.
        """
        if overridden(self.batch_prob_classify):
            # Wrap the single featureset in a one-element batch and
            # unwrap the single result.
            return self.batch_prob_classify([featureset])[0]
        raise NotImplementedError()

    def batch_classify(self, featuresets):
        """
        Apply L{self.classify()} to each element of C{featuresets}.
        This is equivalent to::

            [self.classify(fs) for fs in featuresets]

        @rtype: C{list} of (C{set} of I{label})
        """
        return [self.classify(fs) for fs in featuresets]

    def batch_prob_classify(self, featuresets):
        """
        Apply L{self.prob_classify()} to each element of C{featuresets}.
        This is equivalent to::

            [self.prob_classify(fs) for fs in featuresets]

        @rtype: C{list} of L{ProbDistI <nltk.probability.ProbDistI>}
        """
        return [self.prob_classify(fs) for fs in featuresets]

    #{ Deprecated
    # Old name kept for backward compatibility; forwards unchanged to
    # batch_prob_classify().
    @deprecated("Use .batch_prob_classify() instead.")
    def batch_probdist(self, featuresets):
        return self.batch_prob_classify(featuresets)

    # Old name kept for backward compatibility; forwards unchanged to
    # prob_classify().
    @deprecated("Use .prob_classify() instead.")
    def probdist(self, featureset):
        return self.prob_classify(featureset)

    #}

# # [XX] IN PROGRESS:
# class SequenceClassifierI(object):
#     """
#     A processing interface for labeling sequences of tokens with a
#     single category label (or X{class}).  Labels are typically
#     C{string}s or C{integer}s, but can be any immutable type.  The set
#     of labels that the classifier chooses from must be fixed and
#     finite.
#     """
#     def labels(self):
#         """
#         @return: the list of category labels used by this classifier.
#         @rtype: C{list} of (immutable)
#         """
#         raise NotImplementedError()

#     def prob_classify(self, featureset):
#         """
#         Return a probability distribution over labels for the given
#         featureset.

#         If C{featureset} is a list of featuresets, then return a
#         corresponding list containing the probability distribution
#         over labels for each of the given featuresets, where the
#         M{i}th element of this list is the probability distribution
#         for the M{i}th element of C{featuresets}.
#         """
#         raise NotImplementedError()

#     def classify(self, featureset):
#         """
#         Return the most appropriate label for the given featureset.

#         If C{featureset} is a list of featuresets, then return a
#         corresponding list containing the most appropriate label for
#         each of the given featuresets, where the M{i}th element of
#         this list is the most appropriate label for the M{i}th element
#         of C{featuresets}.
#         """
#         raise NotImplementedError()

Source Code for Module nltk.classify.api