
Source Code for Module nltk.classify.maxent

# Natural Language Toolkit: Maximum Entropy Classifiers
#
# Copyright (C) 2001-2008 NLTK Project
# Author: Edward Loper <[email protected]>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
#
# $Id: naivebayes.py 2063 2004-07-17 21:02:24Z edloper $

"""
A classifier model based on the maximum entropy modeling framework.  This
framework considers all of the probability distributions that are
empirically consistent with the training data; and chooses the
distribution with the highest entropy.  A probability distribution is
X{empirically consistent} with a set of training data if its estimated
frequency with which a class and a feature vector value co-occur is
equal to the actual frequency in the data.

Terminology: 'feature'
======================
The term I{feature} is usually used to refer to some property of an
unlabeled token.  For example, when performing word sense
disambiguation, we might define a C{'prevword'} feature whose value is
the word preceding the target word.  However, in the context of
maxent modeling, the term I{feature} is typically used to refer to a
property of a X{labeled} token.  In order to prevent confusion, we
will introduce two distinct terms to disambiguate these two different
concepts:

  - An X{input-feature} is a property of an unlabeled token.
  - A X{joint-feature} is a property of a labeled token.

In the rest of the C{nltk.classify} module, the term X{features} is
used to refer to what we will call X{input-features} in this module.

In literature that describes and discusses maximum entropy models,
input-features are typically called X{contexts}, and joint-features
are simply referred to as X{features}.

Converting Input-Features to Joint-Features
-------------------------------------------
In maximum entropy models, joint-features are required to have numeric
values.  Typically, each input-feature C{input_feat} is mapped to a
set of joint-features of the form::

    joint_feat(token, label) = { 1 if input_feat(token) == feat_val
                               {      and label == some_label
                               {
                               { 0 otherwise

for all values of C{feat_val} and C{some_label}.  This mapping is
performed by classes that implement the L{MaxentFeatureEncodingI}
interface.
"""
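
# A minimal sketch (not part of the original module) of the input-feature
# to joint-feature mapping described in the docstring above.  The
# input-feature name 'prevword', the value 'the', and the label 'NOUN'
# are made up for illustration.
def _example_joint_feature(token, label):
    # Fires (value 1) only for one particular (feat_val, some_label) pair.
    if token.get('prevword') == 'the' and label == 'NOUN':
        return 1
    return 0
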
__docformat__ = 'epytext en'

import numpy
import time
import tempfile
import os

from nltk import defaultdict
from nltk.probability import *

import nltk.classify.util # for accuracy & log_likelihood
from api import *
from util import attested_labels, CutoffChecker
from megam import call_megam, write_megam_file, parse_megam_weights


######################################################################
#{ Classifier Model
######################################################################

class MaxentClassifier(ClassifierI):
    """
    A maximum entropy classifier (also known as a X{conditional
    exponential classifier}).  This classifier is parameterized by a
    set of X{weights}, which are used to combine the joint-features
    that are generated from a featureset by an X{encoding}.  In
    particular, the encoding maps each C{(featureset, label)} pair to
    a vector.  The probability of each label is then computed using
    the following equation::

                              dotprod(weights, encode(fs,label))
      prob(label|fs) = ---------------------------------------------------
                       sum(dotprod(weights, encode(fs,l)) for l in labels)

    Where C{dotprod} is the dot product::

      dotprod(a,b) = sum(x*y for (x,y) in zip(a,b))
    """
    def __init__(self, encoding, weights, logarithmic=True):
        """
        Construct a new maxent classifier model.  Typically, new
        classifier models are created using the L{train()} method.

        @type encoding: L{MaxentFeatureEncodingI}
        @param encoding: An encoding that is used to convert the
            featuresets that are given to the C{classify} method into
            joint-feature vectors, which are used by the maxent
            classifier model.

        @type weights: C{list} of C{float}
        @param weights: The feature weight vector for this classifier.

        @type logarithmic: C{bool}
        @param logarithmic: If false, then use non-logarithmic weights.
        """
        self._encoding = encoding
        self._weights = weights
        self._logarithmic = logarithmic
        #self._logarithmic = False
        assert encoding.length() == len(weights)

    def labels(self):
        return self._encoding.labels()

    def set_weights(self, new_weights):
        """
        Set the feature weight vector for this classifier.
        @param new_weights: The new feature weight vector.
        @type new_weights: C{list} of C{float}
        """
        self._weights = new_weights
        assert (self._encoding.length() == len(new_weights))

    def weights(self):
        """
        @return: The feature weight vector for this classifier.
        @rtype: C{list} of C{float}
        """
        return self._weights

    def classify(self, featureset):
        return self.prob_classify(featureset).max()

    def prob_classify(self, featureset):
        prob_dict = {}
        for label in self._encoding.labels():
            feature_vector = self._encoding.encode(featureset, label)

            if self._logarithmic:
                total = 0.0
                for (f_id, f_val) in feature_vector:
                    total += self._weights[f_id] * f_val
                prob_dict[label] = total

            else:
                prod = 1.0
                for (f_id, f_val) in feature_vector:
                    prod *= self._weights[f_id] ** f_val
                prob_dict[label] = prod

        # Normalize the dictionary to give a probability distribution
        return DictionaryProbDist(prob_dict, log=self._logarithmic,
                                  normalize=True)

    def explain(self, featureset, columns=4):
        """
        Print a table showing the effect of each of the features in
        the given feature set, and how they combine to determine the
        probabilities of each label for that featureset.
        """
        descr_width = 50
        TEMPLATE = '  %-'+str(descr_width-2)+'s%s%8.3f'

        pdist = self.prob_classify(featureset)
        labels = sorted(pdist.samples(), key=pdist.prob, reverse=True)
        labels = labels[:columns]
        print '  Feature'.ljust(descr_width)+''.join(
            '%8s' % l[:7] for l in labels)
        print '  '+'-'*(descr_width-2+8*len(labels))
        sums = defaultdict(int)
        for i, label in enumerate(labels):
            feature_vector = self._encoding.encode(featureset, label)
            feature_vector.sort(key=lambda (fid,_): abs(self._weights[fid]),
                                reverse=True)
            for (f_id, f_val) in feature_vector:
                if self._logarithmic: score = self._weights[f_id] * f_val
                else: score = self._weights[f_id] ** f_val
                descr = self._encoding.describe(f_id)
                descr = descr.split(' and label is ')[0] # hack
                if len(descr) > 47: descr = descr[:44]+'...'
                print TEMPLATE % (descr, i*8*' ', score)
                sums[label] += score
        print '  '+'-'*(descr_width-1+8*len(labels))
        print '  TOTAL:'.ljust(descr_width)+''.join(
            '%8.3f' % sums[l] for l in labels)
        print '  PROBS:'.ljust(descr_width)+''.join(
            '%8.3f' % pdist.prob(l) for l in labels)

    def show_most_informative_features(self, n=10, show='all'):
        """
        @param show: all, neg, or pos (for negative-only or positive-only)
        """
        fids = sorted(range(len(self._weights)),
                      key=lambda fid: abs(self._weights[fid]),
                      reverse=True)
        if show == 'pos':
            fids = [fid for fid in fids if self._weights[fid]>0]
        elif show == 'neg':
            fids = [fid for fid in fids if self._weights[fid]<0]
        for fid in fids[:n]:
            print '%8.3f %s' % (self._weights[fid],
                                self._encoding.describe(fid))

    def __repr__(self):
        return ('<ConditionalExponentialClassifier: %d labels, %d features>' %
                (len(self._encoding.labels()), self._encoding.length()))

    #: A list of the algorithm names that are accepted for the
    #: L{train()} method's C{algorithm} parameter.
    ALGORITHMS = ['GIS', 'IIS', 'CG', 'BFGS', 'Powell', 'LBFGSB',
                  'Nelder-Mead', 'MEGAM']

    @classmethod
    def train(cls, train_toks, algorithm=None, trace=3, encoding=None,
              labels=None, sparse=True, gaussian_prior_sigma=0, **cutoffs):
        """
        Train a new maxent classifier based on the given corpus of
        training samples.  This classifier will have its weights
        chosen to maximize entropy while remaining empirically
        consistent with the training corpus.

        @rtype: L{MaxentClassifier}
        @return: The new maxent classifier

        @type train_toks: C{list}
        @param train_toks: Training data, represented as a list of
            pairs, the first member of which is a featureset,
            and the second of which is a classification label.

        @type algorithm: C{str}
        @param algorithm: A case-insensitive string, specifying which
            algorithm should be used to train the classifier.  The
            following algorithms are currently available.

            - Iterative Scaling Methods
              - C{'GIS'}: Generalized Iterative Scaling
              - C{'IIS'}: Improved Iterative Scaling

            - Optimization Methods (require C{scipy})
              - C{'CG'}: Conjugate gradient
              - C{'BFGS'}: Broyden-Fletcher-Goldfarb-Shanno algorithm
              - C{'Powell'}: Powell algorithm
              - C{'LBFGSB'}: A limited-memory variant of the BFGS algorithm
              - C{'Nelder-Mead'}: The Nelder-Mead algorithm

            - External Libraries
              - C{'megam'}: LM-BFGS algorithm, with training performed
                by the external U{megam <http://www.cs.utah.edu/~hal/megam/>}
                package.  (Requires that C{megam} be installed.)

            The default algorithm is C{'CG'} if C{'scipy'} is
            installed; and C{'iis'} otherwise.

        @type trace: C{int}
        @param trace: The level of diagnostic tracing output to produce.
            Higher values produce more verbose output.

        @type encoding: L{MaxentFeatureEncodingI}
        @param encoding: A feature encoding, used to convert featuresets
            into feature vectors.  If none is specified, then a
            L{BinaryMaxentFeatureEncoding} will be built based on the
            features that are attested in the training corpus.

        @type labels: C{list} of C{str}
        @param labels: The set of possible labels.  If none is given, then
            the set of all labels attested in the training data will be
            used instead.

        @param sparse: If true, then use sparse matrices instead of
            dense matrices.  Currently, this is only supported by
            the scipy (optimization method) algorithms.  For other
            algorithms, its value is ignored.

        @param gaussian_prior_sigma: The sigma value for a gaussian
            prior on model weights.  Currently, this is supported by
            the scipy (optimization method) algorithms and C{megam}.
            For other algorithms, its value is ignored.

        @param cutoffs: Arguments specifying various conditions under
            which the training should be halted.  (Some of the cutoff
            conditions are not supported by some algorithms.)

            - C{max_iter=v}: Terminate after C{v} iterations.
            - C{min_ll=v}: Terminate after the negative average
              log-likelihood drops under C{v}.
            - C{min_lldelta=v}: Terminate if a single iteration improves
              log likelihood by less than C{v}.
            - C{tolerance=v}: Terminate a scipy optimization method when
              improvement drops below a tolerance level C{v}.  The
              exact meaning of this tolerance depends on the scipy
              algorithm used.  See C{scipy} documentation for more
              info.  Default values: 1e-3 for CG, 1e-5 for LBFGSB,
              and 1e-4 for other algorithms.  I{(C{scipy} only)}
        """
        if algorithm is None:
            try:
                import scipy
                algorithm = 'cg'
            except ImportError:
                algorithm = 'iis'
        for key in cutoffs:
            if key not in ('max_iter', 'min_ll', 'min_lldelta',
                           'tolerance', 'max_acc', 'min_accdelta'):
                raise TypeError('Unexpected keyword arg %r' % key)
        algorithm = algorithm.lower()
        if algorithm == 'iis':
            return train_maxent_classifier_with_iis(
                train_toks, trace, encoding, labels, **cutoffs)
        elif algorithm == 'gis':
            return train_maxent_classifier_with_gis(
                train_toks, trace, encoding, labels, **cutoffs)
        elif algorithm in cls._SCIPY_ALGS:
            return train_maxent_classifier_with_scipy(
                train_toks, trace, encoding, labels,
                cls._SCIPY_ALGS[algorithm], sparse,
                gaussian_prior_sigma, **cutoffs)
        elif algorithm == 'megam':
            return train_maxent_classifier_with_megam(
                train_toks, trace, encoding, labels,
                gaussian_prior_sigma, **cutoffs)
        else:
            raise ValueError('Unknown algorithm %s' % algorithm)

    _SCIPY_ALGS = {'cg':'CG', 'bfgs':'BFGS', 'powell':'Powell',
                   'lbfgsb':'LBFGSB', 'nelder-mead':'Nelder-Mead'}
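
# A minimal usage sketch (not part of the original module): training and
# querying a MaxentClassifier on a toy corpus.  The feature names and
# labels below are made up; any algorithm accepted by train() could be
# substituted for 'iis'.
def _example_train_and_classify():
    train_toks = [({'last_letter': 'a'}, 'female'),
                  ({'last_letter': 'k'}, 'male'),
                  ({'last_letter': 'e'}, 'female'),
                  ({'last_letter': 'o'}, 'male')]
    # Train with IIS (no external dependencies); trace=0 suppresses output.
    classifier = MaxentClassifier.train(train_toks, algorithm='iis',
                                        trace=0, max_iter=10)
    # The most likely label for a new featureset, and its probability.
    label = classifier.classify({'last_letter': 'a'})
    prob = classifier.prob_classify({'last_letter': 'a'}).prob(label)
    return label, prob
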

#: Alias for MaxentClassifier.
ConditionalExponentialClassifier = MaxentClassifier

@deprecated('Use MaxentClassifier.train() instead')
def train_maxent_classifier(*args, **kwargs):
    return MaxentClassifier.train(*args, **kwargs)

######################################################################
#{ Feature Encodings
######################################################################

class MaxentFeatureEncodingI(object):
    """
    A mapping that converts a set of input-feature values to a vector
    of joint-feature values, given a label.  This conversion is
    necessary to translate featuresets into a format that can be used
    by maximum entropy models.

    The set of joint-features used by a given encoding is fixed, and
    each index in the generated joint-feature vectors corresponds to a
    single joint-feature.  The length of the generated joint-feature
    vectors is therefore constant (for a given encoding).

    Because the joint-feature vectors generated by
    C{MaxentFeatureEncodingI} are typically very sparse, they are
    represented as a list of C{(index, value)} tuples, specifying the
    value of each non-zero joint-feature.

    Feature encodings are generally created using the L{train()}
    method, which generates an appropriate encoding based on the
    input-feature values and labels that are present in a given
    corpus.
    """
    def encode(self, featureset, label):
        """
        Given a (featureset, label) pair, return the corresponding
        vector of joint-feature values.  This vector is represented as
        a list of C{(index, value)} tuples, specifying the value of
        each non-zero joint-feature.

        @type featureset: C{dict}
        @rtype: C{list} of C{(int, number)}
        """
        raise AssertionError('Not implemented')

    def length(self):
        """
        @return: The size of the fixed-length joint-feature vectors
            that are generated by this encoding.
        @rtype: C{int}
        """
        raise AssertionError('Not implemented')

    def labels(self):
        """
        @return: A list of the "known labels" -- i.e., all labels
            C{l} such that C{self.encode(fs,l)} can be a nonzero
            joint-feature vector for some value of C{fs}.
        @rtype: C{list}
        """
        raise AssertionError('Not implemented')

    def describe(self, fid):
        """
        @return: A string describing the value of the joint-feature
            whose index in the generated feature vectors is C{fid}.
        @rtype: C{str}
        """
        raise AssertionError('Not implemented')

    def train(cls, train_toks):
        """
        Construct and return a new feature encoding, based on a given
        training corpus C{train_toks}.

        @type train_toks: C{list} of C{tuples} of (C{dict}, C{str})
        @param train_toks: Training data, represented as a list of
            pairs, the first member of which is a feature dictionary,
            and the second of which is a classification label.
        """
        raise AssertionError('Not implemented')

class FunctionBackedMaxentFeatureEncoding(MaxentFeatureEncodingI):
    """
    A feature encoding that calls a user-supplied function to map a
    given featureset/label pair to a sparse joint-feature vector.
    """
    def __init__(self, func, length, labels):
        """
        Construct a new feature encoding based on the given function.

        @type func: (callable)
        @param func: A function that takes two arguments, a featureset
            and a label, and returns the sparse joint feature vector
            that encodes them:

            >>> func(featureset, label) -> feature_vector

            This sparse joint feature vector (C{feature_vector}) is a
            list of C{(index, value)} tuples.

        @type length: C{int}
        @param length: The size of the fixed-length joint-feature
            vectors that are generated by this encoding.

        @type labels: C{list}
        @param labels: A list of the "known labels" for this
            encoding -- i.e., all labels C{l} such that
            C{self.encode(fs,l)} can be a nonzero joint-feature vector
            for some value of C{fs}.
        """
        self._length = length
        self._func = func
        self._labels = labels

    def encode(self, featureset, label):
        return self._func(featureset, label)

    def length(self):
        return self._length

    def labels(self):
        return self._labels

    def describe(self, fid):
        return 'no description available'
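
# A minimal sketch (not from the original module) of how a user-supplied
# function can back an encoding.  The single joint-feature and the labels
# used here are made up for illustration.
def _example_function_backed_encoding():
    labels = ['pos', 'neg']
    def func(featureset, label):
        # One joint-feature: fires (with value 1) when the input-feature
        # 'contains(good)' is true and the label is 'pos'.
        if featureset.get('contains(good)') and label == 'pos':
            return [(0, 1)]
        return []
    encoding = FunctionBackedMaxentFeatureEncoding(func, 1, labels)
    return encoding.encode({'contains(good)': True}, 'pos')   # -> [(0, 1)]
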

class BinaryMaxentFeatureEncoding(MaxentFeatureEncodingI):
    """
    A feature encoding that generates vectors containing binary
    joint-features of the form::

      joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label)
                          {
                          { 0 otherwise

    Where C{fname} is the name of an input-feature, C{fval} is a value
    for that input-feature, and C{label} is a label.

    Typically, these features are constructed based on a training
    corpus, using the L{train()} method.  This method will create one
    feature for each combination of C{fname}, C{fval}, and C{label}
    that occurs at least once in the training corpus.

    The C{unseen_features} parameter can be used to add X{unseen-value
    features}, which are used whenever an input feature has a value
    that was not encountered in the training corpus.  These features
    have the form::

      joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname])
                          {      and l == label
                          {
                          { 0 otherwise

    Where C{is_unseen(fname, fval)} is true if the encoding does not
    contain any joint features that are true when C{fs[fname]==fval}.

    The C{alwayson_features} parameter can be used to add X{always-on
    features}, which have the form::

      joint_feat(fs, l) = { 1 if (l == label)
                          {
                          { 0 otherwise

    These always-on features allow the maxent model to directly model
    the prior probabilities of each label.
    """
    def __init__(self, labels, mapping, unseen_features=False,
                 alwayson_features=False):
        """
        @param labels: A list of the "known labels" for this encoding.

        @param mapping: A dictionary mapping from C{(fname,fval,label)}
            tuples to corresponding joint-feature indexes.  These
            indexes must be the set of integers from 0...len(mapping).
            If C{mapping[fname,fval,label]=id}, then
            C{self.encode({..., fname:fval, ...}, label)[id]} is 1;
            otherwise, it is 0.

        @param unseen_features: If true, then include unseen value
            features in the generated joint-feature vectors.

        @param alwayson_features: If true, then include always-on
            features in the generated joint-feature vectors.
        """
        if set(mapping.values()) != set(range(len(mapping))):
            raise ValueError('Mapping values must be exactly the '
                             'set of integers from 0...len(mapping)')

        self._labels = list(labels)
        """A list of attested labels."""

        self._mapping = mapping
        """dict mapping from (fname,fval,label) -> fid"""

        self._length = len(mapping)
        """The length of generated joint feature vectors."""

        self._alwayson = None
        """dict mapping from label -> fid"""

        self._unseen = None
        """dict mapping from fname -> fid"""

        if alwayson_features:
            self._alwayson = dict([(label,i+self._length)
                                   for (i,label) in enumerate(labels)])
            self._length += len(self._alwayson)

        if unseen_features:
            fnames = set(fname for (fname, fval, label) in mapping)
            self._unseen = dict([(fname, i+self._length)
                                 for (i, fname) in enumerate(fnames)])
            self._length += len(fnames)

    def encode(self, featureset, label):
        # Inherit docs.
        encoding = []

        # Convert input-features to joint-features:
        for fname, fval in featureset.items():
            # Known feature name & value:
            if (fname, fval, label) in self._mapping:
                encoding.append((self._mapping[fname, fval, label], 1))

            # Otherwise, we might want to fire an "unseen-value feature".
            elif self._unseen:
                # Have we seen this fname/fval combination with any label?
                for label2 in self._labels:
                    if (fname, fval, label2) in self._mapping:
                        break # we've seen this fname/fval combo
                # We haven't -- fire the unseen-value feature
                else:
                    if fname in self._unseen:
                        encoding.append((self._unseen[fname], 1))

        # Add always-on features:
        if self._alwayson and label in self._alwayson:
            encoding.append((self._alwayson[label], 1))

        return encoding

    def describe(self, f_id):
        # Inherit docs.
        if not isinstance(f_id, (int, long)):
            raise TypeError('describe() expected an int')
        try:
            self._inv_mapping
        except AttributeError:
            self._inv_mapping = [-1]*len(self._mapping)
            for (info, i) in self._mapping.items():
                self._inv_mapping[i] = info

        if f_id < len(self._mapping):
            (fname, fval, label) = self._inv_mapping[f_id]
            return '%s==%r and label is %r' % (fname, fval, label)
        elif self._alwayson and f_id in self._alwayson.values():
            for (label, f_id2) in self._alwayson.items():
                if f_id==f_id2: return 'label is %r' % label
        elif self._unseen and f_id in self._unseen.values():
            for (fname, f_id2) in self._unseen.items():
                if f_id==f_id2: return '%s is unseen' % fname
        else:
            raise ValueError('Bad feature id')

    def labels(self):
        # Inherit docs.
        return self._labels

    def length(self):
        # Inherit docs.
        return self._length

    @classmethod
    def train(cls, train_toks, count_cutoff=0, labels=None, **options):
        """
        Construct and return a new feature encoding, based on a given
        training corpus C{train_toks}.  See the L{class description
        <BinaryMaxentFeatureEncoding>} for a description of the
        joint-features that will be included in this encoding.

        @type train_toks: C{list} of C{tuples} of (C{dict}, C{str})
        @param train_toks: Training data, represented as a list of
            pairs, the first member of which is a feature dictionary,
            and the second of which is a classification label.

        @type count_cutoff: C{int}
        @param count_cutoff: A cutoff value that is used to discard
            rare joint-features.  If a joint-feature takes the value 1
            fewer than C{count_cutoff} times in the training corpus,
            then that joint-feature is not included in the generated
            encoding.

        @type labels: C{list}
        @param labels: A list of labels that should be used by the
            classifier.  If not specified, then the set of labels
            attested in C{train_toks} will be used.

        @param options: Extra parameters for the constructor, such as
            C{unseen_features} and C{alwayson_features}.
        """
        mapping = {}              # maps (fname, fval, label) -> fid
        seen_labels = set()       # The set of labels we've encountered
        count = defaultdict(int)  # maps (fname, fval) -> count

        for (tok, label) in train_toks:
            if labels and label not in labels:
                raise ValueError('Unexpected label %s' % label)
            seen_labels.add(label)

            # Record each of the features.
            for (fname, fval) in tok.items():

                # If a count cutoff is given, then only add a joint
                # feature once the corresponding (fname, fval, label)
                # tuple exceeds that cutoff.
                count[fname,fval] += 1
                if count[fname,fval] >= count_cutoff:
                    if (fname, fval, label) not in mapping:
                        mapping[fname, fval, label] = len(mapping)

        if labels is None: labels = seen_labels
        return cls(labels, mapping, **options)
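
# A minimal sketch (not from the original module) showing how an encoding
# trained from a toy corpus maps a (featureset, label) pair to a sparse
# joint-feature vector.  The feature names and labels are made up.
def _example_binary_encoding():
    train_toks = [({'outlook': 'sunny'}, 'play'),
                  ({'outlook': 'rainy'}, 'stay')]
    encoding = BinaryMaxentFeatureEncoding.train(train_toks)
    # One joint-feature per (fname, fval, label) combination seen above.
    vector = encoding.encode({'outlook': 'sunny'}, 'play')
    descriptions = [encoding.describe(fid) for (fid, val) in vector]
    return vector, descriptions
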

class GISEncoding(BinaryMaxentFeatureEncoding):
    """
    A binary feature encoding which adds one new joint-feature to the
    joint-features defined by L{BinaryMaxentFeatureEncoding}: a
    correction feature, whose value is chosen to ensure that the
    sparse vector always sums to a constant non-negative number.  This
    new feature is used to ensure two preconditions for the GIS
    training algorithm:
      - At least one feature vector index must be nonzero for every
        token.
      - The feature vector must sum to a constant non-negative number
        for every token.
    """
    def __init__(self, labels, mapping, unseen_features=False,
                 alwayson_features=False, C=None):
        """
        @param C: The correction constant.  The value of the correction
            feature is based on this value.  In particular, its value is
            C{C - sum([v for (f,v) in encoding])}.
        @seealso: L{BinaryMaxentFeatureEncoding.__init__}
        """
        BinaryMaxentFeatureEncoding.__init__(
            self, labels, mapping, unseen_features, alwayson_features)
        if C is None:
            C = len(set([fname for (fname,fval,label) in mapping]))+1

        self._C = C
        """The non-negative constant that all encoded feature vectors
           will sum to."""

    C = property(lambda self: self._C, doc="""
        The non-negative constant that all encoded feature vectors
        will sum to.""")

    def encode(self, featureset, label):
        # Get the basic encoding.
        encoding = BinaryMaxentFeatureEncoding.encode(self, featureset, label)
        base_length = BinaryMaxentFeatureEncoding.length(self)

        # Add a correction feature.
        total = sum([v for (f,v) in encoding])
        if total >= self._C:
            raise ValueError('Correction feature is not high enough!')
        encoding.append( (base_length, self._C-total) )

        # Return the result
        return encoding

    def length(self):
        return BinaryMaxentFeatureEncoding.length(self) + 1

    def describe(self, f_id):
        if f_id == BinaryMaxentFeatureEncoding.length(self):
            return 'Correction feature (%s)' % self._C
        else:
            return BinaryMaxentFeatureEncoding.describe(self, f_id)
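
# A minimal sketch (not from the original module) of the correction
# feature: every vector produced by a GISEncoding sums to the constant C.
# The toy corpus below is made up.
def _example_gis_correction_feature():
    train_toks = [({'outlook': 'sunny'}, 'play'),
                  ({'outlook': 'rainy'}, 'stay')]
    encoding = GISEncoding.train(train_toks)
    vector = encoding.encode({'outlook': 'sunny'}, 'play')
    # The last (index, value) pair is the correction feature; the values
    # in the vector always add up to encoding.C.
    assert sum(v for (f, v) in vector) == encoding.C
    return vector, encoding.C
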

######################################################################
#{ Classifier Trainer: Generalized Iterative Scaling
######################################################################

def train_maxent_classifier_with_gis(train_toks, trace=3, encoding=None,
                                     labels=None, **cutoffs):
    """
    Train a new C{ConditionalExponentialClassifier}, using the given
    training samples, using the Generalized Iterative Scaling
    algorithm.  This C{ConditionalExponentialClassifier} will encode
    the model that maximizes entropy from all the models that are
    empirically consistent with C{train_toks}.

    @see: L{train_maxent_classifier()} for parameter descriptions.
    """
    cutoffs.setdefault('max_iter', 100)
    cutoffchecker = CutoffChecker(cutoffs)

    # Construct an encoding from the training data.
    if encoding is None:
        encoding = GISEncoding.train(train_toks, labels=labels)

    if not hasattr(encoding, 'C'):
        raise TypeError('The GIS algorithm requires an encoding that '
                        'defines C (e.g., GISEncoding).')

    # Cinv is the inverse of the sum of each joint feature vector.
    # This controls the learning rate: higher Cinv (or lower C) gives
    # faster learning.
    Cinv = 1.0/encoding.C

    # Count how many times each feature occurs in the training data.
    empirical_fcount = calculate_empirical_fcount(train_toks, encoding)

    # Check for any features that are not attested in train_toks.
    unattested = set(numpy.nonzero(empirical_fcount==0)[0])

    # Build the classifier.  Start with weight=0 for each attested
    # feature, and weight=-infinity for each unattested feature.
    weights = numpy.zeros(len(empirical_fcount), 'd')
    for fid in unattested: weights[fid] = numpy.NINF
    classifier = ConditionalExponentialClassifier(encoding, weights)

    # Take the log of the empirical fcount.
    log_empirical_fcount = numpy.log2(empirical_fcount)
    del empirical_fcount

    # Old log-likelihood and accuracy; used to check if the change
    # in log-likelihood or accuracy is sufficient to indicate convergence.
    ll_old = None
    acc_old = None

    if trace > 0: print '  ==> Training (%d iterations)' % cutoffs['max_iter']
    if trace > 2:
        print
        print '      Iteration    Log Likelihood    Accuracy'
        print '      ---------------------------------------'

    # Train the classifier.
    try:
        while True:
            if trace > 2:
                ll = cutoffchecker.ll or nltk.classify.util.log_likelihood(
                    classifier, train_toks)
                acc = cutoffchecker.acc or nltk.classify.util.accuracy(
                    classifier, train_toks)
                iternum = cutoffchecker.iter
                print '     %9d    %14.5f    %9.3f' % (iternum, ll, acc)

            # Use the model to estimate the number of times each
            # feature should occur in the training data.
            estimated_fcount = calculate_estimated_fcount(
                classifier, train_toks, encoding)

            # Take the log of estimated fcount (avoid taking log(0).)
            for fid in unattested: estimated_fcount[fid] += 1
            log_estimated_fcount = numpy.log2(estimated_fcount)
            del estimated_fcount

            # Update the classifier weights
            weights = classifier.weights()
            weights += (log_empirical_fcount - log_estimated_fcount) * Cinv
            classifier.set_weights(weights)

            # Check the log-likelihood & accuracy cutoffs.
            if cutoffchecker.check(classifier, train_toks):
                break

    except KeyboardInterrupt:
        print '      Training stopped: keyboard interrupt'
    except:
        raise

    if trace > 2:
        ll = nltk.classify.util.log_likelihood(classifier, train_toks)
        acc = nltk.classify.util.accuracy(classifier, train_toks)
        print '         Final    %14.5f    %9.3f' % (ll, acc)

    # Return the classifier.
    return classifier
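
# A minimal sketch (not from the original module) of a single GIS weight
# update on made-up counts, mirroring the loop above: each weight moves by
# the log-ratio of empirical to estimated feature counts, scaled by 1/C.
def _example_gis_update_step():
    empirical_fcount = numpy.array([4.0, 2.0, 6.0])  # counts observed in data
    estimated_fcount = numpy.array([3.0, 3.0, 6.0])  # counts the model predicts
    weights = numpy.zeros(3, 'd')
    Cinv = 1.0 / 4.0                                 # 1/C for a hypothetical C=4
    weights += (numpy.log2(empirical_fcount) -
                numpy.log2(estimated_fcount)) * Cinv
    return weights
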

def calculate_empirical_fcount(train_toks, encoding):
    fcount = numpy.zeros(encoding.length(), 'd')

    for tok, label in train_toks:
        for (index, val) in encoding.encode(tok, label):
            fcount[index] += val

    return fcount

def calculate_estimated_fcount(classifier, train_toks, encoding):
    fcount = numpy.zeros(encoding.length(), 'd')

    for tok, label in train_toks:
        pdist = classifier.prob_classify(tok)
        for label in pdist.samples():
            prob = pdist.prob(label)
            for (fid, fval) in encoding.encode(tok, label):
                fcount[fid] += prob*fval

    return fcount


######################################################################
#{ Classifier Trainer: Improved Iterative Scaling
######################################################################

def train_maxent_classifier_with_iis(train_toks, trace=3, encoding=None,
                                     labels=None, **cutoffs):
    """
    Train a new C{ConditionalExponentialClassifier}, using the given
    training samples, using the Improved Iterative Scaling algorithm.
    This C{ConditionalExponentialClassifier} will encode the model
    that maximizes entropy from all the models that are empirically
    consistent with C{train_toks}.

    @see: L{train_maxent_classifier()} for parameter descriptions.
    """
    cutoffs.setdefault('max_iter', 100)
    cutoffchecker = CutoffChecker(cutoffs)

    # Construct an encoding from the training data.
    if encoding is None:
        encoding = BinaryMaxentFeatureEncoding.train(train_toks, labels=labels)

    # Count how many times each feature occurs in the training data.
    empirical_ffreq = (calculate_empirical_fcount(train_toks, encoding) /
                       len(train_toks))

    # Find the nf map, and related variables nfarray and nfident.
    # nf is the sum of the features for a given labeled text.
    # nfmap compresses this sparse set of values to a dense list.
    # nfarray performs the reverse operation.  nfident is
    # nfarray multiplied by an identity matrix.
    nfmap = calculate_nfmap(train_toks, encoding)
    nfarray = numpy.array(sorted(nfmap, key=nfmap.__getitem__), 'd')
    nftranspose = numpy.reshape(nfarray, (len(nfarray), 1))

    # Check for any features that are not attested in train_toks.
    unattested = set(numpy.nonzero(empirical_ffreq==0)[0])

    # Build the classifier.  Start with weight=0 for each attested
    # feature, and weight=-infinity for each unattested feature.
    weights = numpy.zeros(len(empirical_ffreq), 'd')
    for fid in unattested: weights[fid] = numpy.NINF
    classifier = ConditionalExponentialClassifier(encoding, weights)

    if trace > 0: print '  ==> Training (%d iterations)' % cutoffs['max_iter']
    if trace > 2:
        print
        print '      Iteration    Log Likelihood    Accuracy'
        print '      ---------------------------------------'

    # Old log-likelihood and accuracy; used to check if the change
    # in log-likelihood or accuracy is sufficient to indicate convergence.
    ll_old = None
    acc_old = None

    # Train the classifier.
    try:
        while True:
            if trace > 2:
                ll = cutoffchecker.ll or nltk.classify.util.log_likelihood(
                    classifier, train_toks)
                acc = cutoffchecker.acc or nltk.classify.util.accuracy(
                    classifier, train_toks)
                iternum = cutoffchecker.iter
                print '     %9d    %14.5f    %9.3f' % (iternum, ll, acc)

            # Calculate the deltas for this iteration, using Newton's method.
            deltas = calculate_deltas(
                train_toks, classifier, unattested, empirical_ffreq,
                nfmap, nfarray, nftranspose, encoding)

            # Use the deltas to update our weights.
            weights = classifier.weights()
            weights += deltas
            classifier.set_weights(weights)

            # Check the log-likelihood & accuracy cutoffs.
            if cutoffchecker.check(classifier, train_toks):
                break

    except KeyboardInterrupt:
        print '      Training stopped: keyboard interrupt'
    except:
        raise


    if trace > 2:
        ll = nltk.classify.util.log_likelihood(classifier, train_toks)
        acc = nltk.classify.util.accuracy(classifier, train_toks)
        print '         Final    %14.5f    %9.3f' % (ll, acc)

    # Return the classifier.
    return classifier

def calculate_nfmap(train_toks, encoding):
    """
    Construct a map that can be used to compress C{nf} (which is
    typically sparse).

    M{nf(feature_vector)} is the sum of the feature values for
    M{feature_vector}.

    This represents the number of features that are active for a
    given labeled text.  This method finds all values of M{nf(t)}
    that are attested for at least one token in the given list of
    training tokens; and constructs a dictionary mapping these
    attested values to a continuous range M{0...N}.  For example,
    if the only values of M{nf()} that were attested were 3, 5,
    and 7, then C{_nfmap} might return the dictionary {3:0, 5:1,
    7:2}.

    @return: A map that can be used to compress C{nf} to a dense
        vector.
    @rtype: C{dictionary} from C{int} to C{int}
    """
    # Map from nf to indices.  This allows us to use smaller arrays.
    nfset = set()
    for tok, _ in train_toks:
        for label in encoding.labels():
            nfset.add(sum([val for (id,val) in encoding.encode(tok,label)]))
    return dict([(nf, i) for (i, nf) in enumerate(nfset)])

def calculate_deltas(train_toks, classifier, unattested, ffreq_empirical,
                     nfmap, nfarray, nftranspose, encoding):
    """
    Calculate the update values for the classifier weights for
    this iteration of IIS.  These update weights are the value of
    C{delta} that solves the equation::

      ffreq_empirical[i]
             =
      SUM[fs,l] (classifier.prob_classify(fs).prob(l) *
                 feature_vector(fs,l)[i] *
                 exp(delta[i] * nf(feature_vector(fs,l))))

    Where:
      - M{(fs,l)} is a (featureset, label) tuple from C{train_toks}
      - M{feature_vector(fs,l)} = C{encoding.encode(fs,l)}
      - M{nf(vector)} = C{sum([val for (id,val) in vector])}

    This method uses Newton's method to solve this equation for
    M{delta[i]}.  In particular, it starts with a guess of
    C{delta[i]}=1; and iteratively updates C{delta} with::

      delta[i] -= (ffreq_empirical[i] - sum1[i])/(-sum2[i])

    until convergence, where M{sum1} and M{sum2} are defined as::

      sum1[i](delta) = SUM[fs,l] f[i](fs,l,delta)

      sum2[i](delta) = SUM[fs,l] (f[i](fs,l,delta) *
                                  nf(feature_vector(fs,l)))

      f[i](fs,l,delta) = (classifier.prob_classify(fs).prob(l) *
                          feature_vector(fs,l)[i] *
                          exp(delta[i] * nf(feature_vector(fs,l))))

    Note that M{sum1} and M{sum2} depend on C{delta}; so they need
    to be re-computed each iteration.

    The variables C{nfmap}, C{nfarray}, and C{nftranspose} are
    used to generate a dense encoding for M{nf(ltext)}.  This
    allows C{_deltas} to calculate M{sum1} and M{sum2} using
    matrices, which yields a significant performance improvement.

    @param train_toks: The set of training tokens.
    @type train_toks: C{list} of C{tuples} of (C{dict}, C{str})
    @param classifier: The current classifier.
    @type classifier: C{ClassifierI}
    @param ffreq_empirical: An array containing the empirical
        frequency for each feature.  The M{i}th element of this
        array is the empirical frequency for feature M{i}.
    @type ffreq_empirical: C{sequence} of C{float}
    @param unattested: An array that is 1 for features that are
        not attested in the training data; and 0 for features that
        are attested.  In other words, C{unattested[i]==1} iff
        C{ffreq_empirical[i]==0}.
    @type unattested: C{sequence} of C{int}
    @param nfmap: A map that can be used to compress C{nf} to a dense
        vector.
    @type nfmap: C{dictionary} from C{int} to C{int}
    @param nfarray: An array that can be used to uncompress C{nf}
        from a dense vector.
    @type nfarray: C{array} of C{float}
    @param nftranspose: The transpose of C{nfarray}
    @type nftranspose: C{array} of C{float}
    """
    # These parameters control when we decide that we've
    # converged.  It probably should be possible to set these
    # manually, via keyword arguments to train.
    NEWTON_CONVERGE = 1e-12
    MAX_NEWTON = 300

    deltas = numpy.ones(encoding.length(), 'd')

    # Precompute the A matrix:
    #  A[nf][id] = sum ( p(fs) * p(label|fs) * f(fs,label) )
    #              over all label,fs s.t. num_features[label,fs]=nf
    A = numpy.zeros((len(nfmap), encoding.length()), 'd')

    for tok, label in train_toks:
        dist = classifier.prob_classify(tok)

        for label in encoding.labels():
            # Generate the feature vector
            feature_vector = encoding.encode(tok,label)
            # Find the number of active features
            nf = sum([val for (id, val) in feature_vector])
            # Update the A matrix
            for (id, val) in feature_vector:
                A[nfmap[nf], id] += dist.prob(label) * val
    A /= len(train_toks)

    # Iteratively solve for delta.  Use the following variables:
    #   - nf_delta[x][y] = nfarray[x] * delta[y]
    #   - exp_nf_delta[x][y] = exp(nf[x] * delta[y])
    #   - nf_exp_nf_delta[x][y] = nf[x] * exp(nf[x] * delta[y])
    #   - sum1[i][nf] = sum p(fs)p(label|fs)f[i](label,fs)
    #                       exp(delta[i]nf)
    #   - sum2[i][nf] = sum p(fs)p(label|fs)f[i](label,fs)
    #                       nf exp(delta[i]nf)
    for rangenum in range(MAX_NEWTON):
        nf_delta = numpy.outer(nfarray, deltas)
        exp_nf_delta = 2 ** nf_delta
        nf_exp_nf_delta = nftranspose * exp_nf_delta
        sum1 = numpy.sum(exp_nf_delta * A, axis=0)
        sum2 = numpy.sum(nf_exp_nf_delta * A, axis=0)

        # Avoid division by zero.
        for fid in unattested: sum2[fid] += 1

        # Update the deltas.
        deltas -= (ffreq_empirical - sum1) / -sum2

        # We can stop once we converge.
        n_error = (numpy.sum(abs((ffreq_empirical-sum1)))/
                   numpy.sum(abs(deltas)))
        if n_error < NEWTON_CONVERGE:
            return deltas

    return deltas


######################################################################
#{ Classifier Trainer: scipy algorithms (CG, LBFGSB, etc.)
######################################################################

# [xx] n.b.: it's possible to supply custom trace functions, which
# could be used to make trace output consistent with iis/gis.
def train_maxent_classifier_with_scipy(train_toks, trace=3, encoding=None,
                                       labels=None, algorithm='CG',
                                       sparse=True, gaussian_prior_sigma=0,
                                       **cutoffs):
    """
    Train a new C{ConditionalExponentialClassifier}, using the given
    training samples, using the specified C{scipy} optimization
    algorithm.  This C{ConditionalExponentialClassifier} will encode
    the model that maximizes entropy from all the models that are
    empirically consistent with C{train_toks}.

    @see: L{train_maxent_classifier()} for parameter descriptions.
    @require: The C{scipy} package must be installed.
    """
    try:
        import scipy.sparse, scipy.maxentropy
    except ImportError, e:
        raise ValueError('The maxent training algorithm %r requires '
                         'that the scipy package be installed.  See '
                         'http://www.scipy.org/' % algorithm)

    # Construct an encoding from the training data.
    if encoding is None:
        encoding = BinaryMaxentFeatureEncoding.train(train_toks, labels=labels)
    elif labels is not None:
        raise ValueError('Specify encoding or labels, not both')

    labels = encoding.labels()
    labelnum = dict([(label, i) for (i, label) in enumerate(labels)])
    num_features = encoding.length()
    num_toks = len(train_toks)
    num_labels = len(labels)

    # Decide whether to use a sparse matrix or a dense one.  Very
    # limited testing has shown that the lil matrix format
    # (list-of-lists) performs better than csr and csc formats.
    # Limited testing also suggests that the sparse matrix format
    # doesn't save much memory over the dense format in practice
    # (in terms of max memory usage).
    if sparse: zeros = scipy.sparse.lil_matrix
    else: zeros = numpy.zeros

    # Construct the 'F' matrix, which lists the feature values for
    # each training instance.  F[i, j*len(labels)+k] is equal to the
    # value of the i'th feature for the feature vector corresponding
    # to (tok[j], label[k]).
    F = zeros((num_features, num_toks*num_labels))

    # Construct the 'N' matrix, which specifies the correct label for
    # each training instance.  N[0, j*len(labels)+k] is equal to one
    # iff label[k] is the correct label for tok[j].
    N = zeros((1, num_toks*num_labels))

    # Fill in the 'F' and 'N' matrices (just make one pass through the
    # training tokens.)
    for toknum, (featureset, label) in enumerate(train_toks):
        N[0, toknum*len(labels) + labelnum[label]] += 1
        for label2 in labels:
            for (fid, fval) in encoding.encode(featureset, label2):
                F[fid, toknum*len(labels) + labelnum[label2]] = fval

    # Set up the scipy model, based on the matrices F and N.
    model = scipy.maxentropy.conditionalmodel(F, N, num_toks)
    # note -- model.setsmooth() is buggy.
    if gaussian_prior_sigma:
        model.sigma2 = gaussian_prior_sigma**2
    if algorithm == 'LBFGSB':
        model.log = None
    if trace >= 3:
        model.verbose = True
    if 'max_iter' in cutoffs:
        model.maxiter = cutoffs['max_iter']
    if 'tolerance' in cutoffs:
        if algorithm == 'CG': model.avegtol = cutoffs['tolerance']
        elif algorithm == 'LBFGSB': model.maxgtol = cutoffs['tolerance']
        else: model.tol = cutoffs['tolerance']

    # Train the model.
    model.fit(algorithm=algorithm)

    # Convert the model's weights from base-e to base-2 weights.
    weights = model.params * numpy.log2(numpy.e)

    # Build the classifier
    return MaxentClassifier(encoding, weights)
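
# A minimal sketch (not from the original module) of the F and N matrix
# layout described above, built with plain numpy for a made-up corpus of
# two tokens and two labels (so each matrix has 2*2 = 4 columns).
def _example_scipy_matrix_layout():
    train_toks = [({'outlook': 'sunny'}, 'play'),
                  ({'outlook': 'rainy'}, 'stay')]
    encoding = BinaryMaxentFeatureEncoding.train(train_toks)
    labels = encoding.labels()
    labelnum = dict([(label, i) for (i, label) in enumerate(labels)])
    F = numpy.zeros((encoding.length(), len(train_toks)*len(labels)))
    N = numpy.zeros((1, len(train_toks)*len(labels)))
    for toknum, (featureset, label) in enumerate(train_toks):
        # Column toknum*len(labels)+k corresponds to (tok[toknum], labels[k]).
        N[0, toknum*len(labels) + labelnum[label]] += 1
        for label2 in labels:
            for (fid, fval) in encoding.encode(featureset, label2):
                F[fid, toknum*len(labels) + labelnum[label2]] = fval
    return F, N
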


######################################################################
#{ Classifier Trainer: megam
######################################################################

# [xx] possible extension: add support for using implicit file format;
# this would need to put requirements on what encoding is used.  But
# we may need this for other maxent classifier trainers that require
# implicit formats anyway.
def train_maxent_classifier_with_megam(train_toks, trace=3, encoding=None,
                                       labels=None, gaussian_prior_sigma=0,
                                       **cutoffs):
    """
    Train a new C{ConditionalExponentialClassifier}, using the given
    training samples, using the external C{megam} library.  This
    C{ConditionalExponentialClassifier} will encode the model that
    maximizes entropy from all the models that are empirically
    consistent with C{train_toks}.

    @see: L{train_maxent_classifier()} for parameter descriptions.
    @see: L{nltk.classify.megam}
    """
    explicit = True

    # Construct an encoding from the training data.
    if encoding is None:
        encoding = BinaryMaxentFeatureEncoding.train(train_toks,
            labels=labels, alwayson_features=True)
    elif labels is not None:
        raise ValueError('Specify encoding or labels, not both')

    # Write a training file for megam.
    try:
        fd, trainfile_name = tempfile.mkstemp(prefix='nltk-')
        trainfile = os.fdopen(fd, 'wb')
        write_megam_file(train_toks, encoding, trainfile, explicit=explicit)
        trainfile.close()
    except (OSError, IOError, ValueError), e:
        raise ValueError('Error while creating megam training file: %s' % e)

    # Run megam on the training file.
    options = ['-nobias']
    if explicit: options += ['-explicit']
    if gaussian_prior_sigma:
        # [XX] *why* is this the right formula to convert sigma to
        # lambda?  how is lambda defined??  The 7.01 is pretty random
        # seeming -- it was picked by experimentation.
        options += ['-lambda', '%s' % (7.01/(gaussian_prior_sigma**2))]
    else:
        options += ['-lambda', '0']
    if trace < 3:
        options += ['-quiet']
    if 'max_iter' in cutoffs:
        options += ['-maxi', '%s' % cutoffs['max_iter']]
    if 'll_delta' in cutoffs:
        # [xx] this is actually a perplexity delta, not a log
        # likelihood delta
        options += ['-dpp', '%s' % abs(cutoffs['ll_delta'])]
    stdout = call_megam(options+['multiclass', trainfile_name])

    # Delete the training file
    try: os.remove(trainfile_name)
    except (OSError, IOError), e:
        print 'Warning: unable to delete %s: %s' % (trainfile_name, e)

    # Parse the generated weight vector.
    weights = parse_megam_weights(stdout, explicit)

    # Convert from base-e to base-2 weights.
    weights *= numpy.log2(numpy.e)

    # Build the classifier
    return MaxentClassifier(encoding, weights)


######################################################################
#{ Demo
######################################################################
def demo():
    from nltk.classify.util import names_demo
    classifier = names_demo(MaxentClassifier.train)

if __name__ == '__main__':
    demo()