"""
A classifier model based on the maximum entropy modeling framework.  This
framework considers all of the probability distributions that are
empirically consistent with the training data, and chooses the
distribution with the highest entropy.  A probability distribution is
X{empirically consistent} with a set of training data if the frequency
with which it predicts that a class and a feature-vector value co-occur
is equal to the actual frequency observed in the data.
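
For example, if the joint-feature M{f[i](fs, label)} has an average
value of 0.3 over the labeled training data, then every empirically
consistent distribution M{p} must (informally) satisfy::

    (1/N) * SUM[fs] SUM[l] p(l|fs) * f[i](fs, l)   ==   0.3

i.e., the expected value of each joint-feature under the model must
match that joint-feature's observed average value in the training
data.  (The number 0.3 here is purely illustrative.)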

Terminology: 'feature'
======================
The term I{feature} is usually used to refer to some property of an
unlabeled token.  For example, when performing word sense
disambiguation, we might define a C{'prevword'} feature whose value is
the word preceding the target word.  However, in the context of
maxent modeling, the term I{feature} is typically used to refer to a
property of a X{labeled} token.  In order to prevent confusion, we
will introduce two distinct terms to disambiguate these two different
concepts:

  - An X{input-feature} is a property of an unlabeled token.
  - A X{joint-feature} is a property of a labeled token.

In the rest of the C{nltk.classify} module, the term X{features} is
used to refer to what we will call X{input-features} in this module.

In the literature that describes and discusses maximum entropy models,
input-features are typically called X{contexts}, and joint-features
are simply referred to as X{features}.

Converting Input-Features to Joint-Features
-------------------------------------------
In maximum entropy models, joint-features are required to have numeric
values.  Typically, each input-feature C{input_feat} is mapped to a
set of joint-features of the form::

    joint_feat(token, label) = { 1 if input_feat(token) == feat_val
                                {      and label == some_label
                                {
                                { 0 otherwise

for all values of C{feat_val} and C{some_label}.  This mapping is
performed by classes that implement the L{MaxentFeatureEncodingI}
interface.
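
For example, a hand-written mapping for a single C{'prevword'}
input-feature might look like the following sketch, where
C{prevword_value_ids} is a hypothetical dictionary mapping each
C{(prevword value, label)} pair to a fixed joint-feature index, and
C{prevword()} is likewise a stand-in for whatever function extracts
the input-feature's value::

    def prevword_joint_features(token, label):
        # Return a sparse list of (joint-feature index, value) pairs.
        fid = prevword_value_ids.get((prevword(token), label))
        if fid is None:
            return []
        return [(fid, 1)]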
"""
__docformat__ = 'epytext en'

import numpy
import time
import tempfile
import os

from nltk import defaultdict
from nltk.internals import deprecated
from nltk.probability import *

import nltk.classify.util
from api import *
from util import attested_labels, CutoffChecker
from megam import call_megam, write_megam_file, parse_megam_weights


class MaxentClassifier(ClassifierI):
    """
    A maximum entropy classifier (also known as a X{conditional
    exponential classifier}).  This classifier is parameterized by a
    set of X{weights}, which are used to combine the joint-features
    that are generated from a featureset by an X{encoding}.  In
    particular, the encoding maps each C{(featureset, label)} pair to
    a vector.  The probability of each label is then computed using
    the following equation::

                             exp(dotprod(weights, encode(fs,label)))
      prob(label|fs) = --------------------------------------------------------
                       sum(exp(dotprod(weights, encode(fs,l))) for l in labels)

    Where C{dotprod} is the dot product::

      dotprod(a,b) = sum(x*y for (x,y) in zip(a,b))
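
    For example, for a hypothetical featureset C{fs} and two labels
    C{'A'} and C{'B'}, the scores and probabilities would be computed
    roughly as follows (a sketch of the equation above, not actual
    API calls)::

      score_A = dotprod(weights, encode(fs,'A'))
      score_B = dotprod(weights, encode(fs,'B'))
      prob_A  = exp(score_A) / (exp(score_A) + exp(score_B))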
    """
    def __init__(self, encoding, weights, logarithmic=True):
        """
        Construct a new maxent classifier model.  Typically, new
        classifier models are created using the L{train()} method.

        @type encoding: L{MaxentFeatureEncodingI}
        @param encoding: An encoding that is used to convert the
            featuresets that are given to the C{classify} method into
            joint-feature vectors, which are used by the maxent
            classifier model.

        @type weights: C{list} of C{float}
        @param weights: The feature weight vector for this classifier.

        @type logarithmic: C{bool}
        @param logarithmic: If false, then use non-logarithmic weights.
        """
        self._encoding = encoding
        self._weights = weights
        self._logarithmic = logarithmic
        assert encoding.length() == len(weights)

    def labels(self):
        return self._encoding.labels()

    def set_weights(self, new_weights):
        """
        Set the feature weight vector for this classifier.
        @param new_weights: The new feature weight vector.
        @type new_weights: C{list} of C{float}
        """
        self._weights = new_weights
        assert (self._encoding.length() == len(new_weights))

    def weights(self):
        """
        @return: The feature weight vector for this classifier.
        @rtype: C{list} of C{float}
        """
        return self._weights

    def classify(self, featureset):
        return self.prob_classify(featureset).max()

    def prob_classify(self, featureset):
        prob_dict = {}
        for label in self._encoding.labels():
            feature_vector = self._encoding.encode(featureset, label)

            if self._logarithmic:
                # Sum the (log) weights of the active joint-features.
                total = 0.0
                for (f_id, f_val) in feature_vector:
                    total += self._weights[f_id] * f_val
                prob_dict[label] = total
            else:
                # Multiply the weights of the active joint-features.
                prod = 1.0
                for (f_id, f_val) in feature_vector:
                    prod *= self._weights[f_id] ** f_val
                prob_dict[label] = prod

        # Normalize the dictionary to give a probability distribution.
        return DictionaryProbDist(prob_dict, log=self._logarithmic,
                                  normalize=True)

    def explain(self, featureset, columns=4):
        """
        Print a table showing the effect of each of the features in
        the given feature set, and how they combine to determine the
        probabilities of each label for that featureset.
        """
        descr_width = 50
        TEMPLATE = '  %-'+str(descr_width-2)+'s%s%8.3f'

        pdist = self.prob_classify(featureset)
        labels = sorted(pdist.samples(), key=pdist.prob, reverse=True)
        labels = labels[:columns]
        print '  Feature'.ljust(descr_width)+''.join(
            '%8s' % l[:7] for l in labels)
        print '  '+'-'*(descr_width-2+8*len(labels))
        sums = defaultdict(int)
        for i, label in enumerate(labels):
            feature_vector = self._encoding.encode(featureset, label)
            feature_vector.sort(key=lambda (fid,_): abs(self._weights[fid]),
                                reverse=True)
            for (f_id, f_val) in feature_vector:
                if self._logarithmic: score = self._weights[f_id] * f_val
                else: score = self._weights[f_id] ** f_val
                descr = self._encoding.describe(f_id)
                descr = descr.split(' and label is ')[0]
                if len(descr) > 47: descr = descr[:44]+'...'
                print TEMPLATE % (descr, i*8*' ', score)
                sums[label] += score
        print '  '+'-'*(descr_width-2+8*len(labels))
        print '  TOTAL:'.ljust(descr_width)+''.join(
            '%8.3f' % sums[l] for l in labels)
        print '  PROBS:'.ljust(descr_width)+''.join(
            '%8.3f' % pdist.prob(l) for l in labels)

    def __repr__(self):
        return ('<ConditionalExponentialClassifier: %d labels, %d features>' %
                (len(self._encoding.labels()), self._encoding.length()))

    # Algorithm names accepted by the train() method's 'algorithm' argument.
    ALGORITHMS = ['GIS', 'IIS', 'CG', 'BFGS', 'Powell', 'LBFGSB',
                  'Nelder-Mead', 'MEGAM']

    @classmethod
    def train(cls, train_toks, algorithm=None, trace=3, encoding=None,
              labels=None, sparse=True, gaussian_prior_sigma=0, **cutoffs):
        """
        Train a new maxent classifier based on the given corpus of
        training samples.  This classifier will have its weights
        chosen to maximize entropy while remaining empirically
        consistent with the training corpus.
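
        For example, a classifier could be trained with a sketch like
        the following (the tiny C{train_toks} here is made up, purely
        for illustration)::

            train_toks = [({'prevword': 'the'}, 'NOUN'),
                          ({'prevword': 'to'}, 'VERB')]
            classifier = MaxentClassifier.train(train_toks,
                algorithm='iis', trace=0, max_iter=10)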

        @rtype: L{MaxentClassifier}
        @return: The new maxent classifier

        @type train_toks: C{list}
        @param train_toks: Training data, represented as a list of
            pairs, the first member of which is a featureset,
            and the second of which is a classification label.

        @type algorithm: C{str}
        @param algorithm: A case-insensitive string, specifying which
            algorithm should be used to train the classifier.  The
            following algorithms are currently available.

            - Iterative Scaling Methods
              - C{'GIS'}: Generalized Iterative Scaling
              - C{'IIS'}: Improved Iterative Scaling

            - Optimization Methods (require C{scipy})
              - C{'CG'}: Conjugate gradient
              - C{'BFGS'}: Broyden-Fletcher-Goldfarb-Shanno algorithm
              - C{'Powell'}: Powell algorithm
              - C{'LBFGSB'}: A limited-memory variant of the BFGS algorithm
              - C{'Nelder-Mead'}: The Nelder-Mead algorithm

            - External Libraries
              - C{'megam'}: LM-BFGS algorithm, with training performed
                by the external U{megam <http://www.cs.utah.edu/~hal/megam/>}
                package.  (Requires that C{megam} be installed.)

            The default algorithm is C{'CG'} if C{scipy} is
            installed, and C{'IIS'} otherwise.

        @type trace: C{int}
        @param trace: The level of diagnostic tracing output to produce.
            Higher values produce more verbose output.

        @type encoding: L{MaxentFeatureEncodingI}
        @param encoding: A feature encoding, used to convert featuresets
            into feature vectors.  If none is specified, then a
            L{BinaryMaxentFeatureEncoding} will be built based on the
            features that are attested in the training corpus.

        @type labels: C{list} of C{str}
        @param labels: The set of possible labels.  If none is given, then
            the set of all labels attested in the training data will be
            used instead.

        @param sparse: If true, then use sparse matrices instead of
            dense matrices.  Currently, this is only supported by
            the scipy (optimization method) algorithms.  For other
            algorithms, its value is ignored.

        @param gaussian_prior_sigma: The sigma value for a gaussian
            prior on model weights.  Currently, this is supported by
            the scipy (optimization method) algorithms and C{megam}.
            For other algorithms, its value is ignored.

        @param cutoffs: Arguments specifying various conditions under
            which the training should be halted.  (Some of the cutoff
            conditions are not supported by some algorithms.)

            - C{max_iter=v}: Terminate after C{v} iterations.
            - C{min_ll=v}: Terminate after the negative average
              log-likelihood drops below C{v}.
            - C{min_lldelta=v}: Terminate if a single iteration improves
              log likelihood by less than C{v}.
            - C{tolerance=v}: Terminate a scipy optimization method when
              improvement drops below a tolerance level C{v}.  The
              exact meaning of this tolerance depends on the scipy
              algorithm used.  See C{scipy} documentation for more
              info.  Default values: 1e-3 for CG, 1e-5 for LBFGSB,
              and 1e-4 for other algorithms.  I{(C{scipy} only)}
        """
        if algorithm is None:
            try:
                import scipy
                algorithm = 'cg'
            except ImportError:
                algorithm = 'iis'
        for key in cutoffs:
            if key not in ('max_iter', 'min_ll', 'min_lldelta',
                           'tolerance', 'max_acc', 'min_accdelta'):
                raise TypeError('Unexpected keyword arg %r' % key)
        algorithm = algorithm.lower()
        if algorithm == 'iis':
            return train_maxent_classifier_with_iis(
                train_toks, trace, encoding, labels, **cutoffs)
        elif algorithm == 'gis':
            return train_maxent_classifier_with_gis(
                train_toks, trace, encoding, labels, **cutoffs)
        elif algorithm in cls._SCIPY_ALGS:
            return train_maxent_classifier_with_scipy(
                train_toks, trace, encoding, labels,
                cls._SCIPY_ALGS[algorithm], sparse,
                gaussian_prior_sigma, **cutoffs)
        elif algorithm == 'megam':
            return train_maxent_classifier_with_megam(
                train_toks, trace, encoding, labels,
                gaussian_prior_sigma, **cutoffs)
        else:
            raise ValueError('Unknown algorithm %s' % algorithm)

    _SCIPY_ALGS = {'cg':'CG', 'bfgs':'BFGS', 'powell':'Powell',
                   'lbfgsb':'LBFGSB', 'nelder-mead':'Nelder-Mead'}


ConditionalExponentialClassifier = MaxentClassifier


@deprecated('Use MaxentClassifier.train() instead')
def train_maxent_classifier(*args, **kwargs):
    return MaxentClassifier.train(*args, **kwargs)


class MaxentFeatureEncodingI(object):
    """
    A mapping that converts a set of input-feature values to a vector
    of joint-feature values, given a label.  This conversion is
    necessary to translate featuresets into a format that can be used
    by maximum entropy models.

    The set of joint-features used by a given encoding is fixed, and
    each index in the generated joint-feature vectors corresponds to a
    single joint-feature.  The length of the generated joint-feature
    vectors is therefore constant (for a given encoding).

    Because the joint-feature vectors generated by
    C{MaxentFeatureEncodingI} are typically very sparse, they are
    represented as a list of C{(index, value)} tuples, specifying the
    value of each non-zero joint-feature.

    Feature encodings are generally created using the L{train()}
    method, which generates an appropriate encoding based on the
    input-feature values and labels that are present in a given
    corpus.
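
    For example, rather than returning a mostly-zero vector of length
    1000, an encoding with 1000 joint-features would typically return
    a short list such as C{[(2,1), (17,1), (956,1)]} for a given
    C{(featureset, label)} pair (the indices here are hypothetical).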
    """
    def encode(self, featureset, label):
        """
        Given a (featureset, label) pair, return the corresponding
        vector of joint-feature values.  This vector is represented as
        a list of C{(index, value)} tuples, specifying the value of
        each non-zero joint-feature.

        @type featureset: C{dict}
        @rtype: C{list} of C{(int, number)}
        """
        raise AssertionError('Not implemented')

    def length(self):
        """
        @return: The size of the fixed-length joint-feature vectors
            that are generated by this encoding.
        @rtype: C{int}
        """
        raise AssertionError('Not implemented')

    def labels(self):
        """
        @return: A list of the "known labels" -- i.e., all labels
            C{l} such that C{self.encode(fs,l)} can be a nonzero
            joint-feature vector for some value of C{fs}.
        @rtype: C{list}
        """
        raise AssertionError('Not implemented')

    def describe(self, fid):
        """
        @return: A string describing the value of the joint-feature
            whose index in the generated feature vectors is C{fid}.
        @rtype: C{str}
        """
        raise AssertionError('Not implemented')

    def train(cls, train_toks):
        """
        Construct and return new feature encoding, based on a given
        training corpus C{train_toks}.

        @type train_toks: C{list} of C{tuples} of (C{dict}, C{str})
        @param train_toks: Training data, represented as a list of
            pairs, the first member of which is a feature dictionary,
            and the second of which is a classification label.
        """
        raise AssertionError('Not implemented')

class FunctionBackedMaxentFeatureEncoding(MaxentFeatureEncodingI):
    """
    A feature encoding that calls a user-supplied function to map a
    given featureset/label pair to a sparse joint-feature vector.
    """
    def __init__(self, func, length, labels):
        """
        Construct a new feature encoding based on the given function.

        @type func: (callable)
        @param func: A function that takes two arguments, a featureset
            and a label, and returns the sparse joint-feature vector
            that encodes them::

                func(featureset, label) -> feature_vector

            This sparse joint-feature vector (C{feature_vector}) is a
            list of C{(index,value)} tuples.
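
            For example, a minimal (purely illustrative) function that
            defines one always-on joint-feature per label, together
            with the corresponding encoding, might be::

                def func(featureset, label):
                    # Joint-feature 0 fires for label 'pos'; 1 for 'neg'.
                    if label == 'pos': return [(0, 1)]
                    else: return [(1, 1)]

                encoding = FunctionBackedMaxentFeatureEncoding(
                    func, length=2, labels=['pos', 'neg'])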

        @type length: C{int}
        @param length: The size of the fixed-length joint-feature
            vectors that are generated by this encoding.

        @type labels: C{list}
        @param labels: A list of the "known labels" for this
            encoding -- i.e., all labels C{l} such that
            C{self.encode(fs,l)} can be a nonzero joint-feature vector
            for some value of C{fs}.
        """
        self._length = length
        self._func = func
        self._labels = labels

    def encode(self, featureset, label):
        return self._func(featureset, label)

    def length(self):
        return self._length

    def labels(self):
        return self._labels

    def describe(self, fid):
        return 'no description available'

class BinaryMaxentFeatureEncoding(MaxentFeatureEncodingI):
    """
    A feature encoding that generates vectors containing binary
    joint-features of the form::

      joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label)
                          {
                          { 0 otherwise

    Where C{fname} is the name of an input-feature, C{fval} is a value
    for that input-feature, and C{label} is a label.

    Typically, these features are constructed based on a training
    corpus, using the L{train()} method.  This method will create one
    feature for each combination of C{fname}, C{fval}, and C{label}
    that occurs at least once in the training corpus.

    The C{unseen_features} parameter can be used to add X{unseen-value
    features}, which are used whenever an input feature has a value
    that was not encountered in the training corpus.  These features
    have the form::

      joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname])
                          {      and l == label
                          {
                          { 0 otherwise

    Where C{is_unseen(fname, fval)} is true if the encoding does not
    contain any joint features that are true when C{fs[fname]==fval}.

    The C{alwayson_features} parameter can be used to add X{always-on
    features}, which have the form::

      joint_feat(fs, l) = { 1 if (l == label)
                          {
                          { 0 otherwise

    These always-on features allow the maxent model to directly model
    the prior probabilities of each label.
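
    For example, if a training corpus contains the featureset
    C{{'prevword': 'the'}} with label C{'NOUN'}, then the trained
    encoding would map that C{(featureset, label)} pair to a vector
    such as C{[(0, 1)]}, where joint-feature 0 would be described as
    C{"prevword=='the' and label is 'NOUN'"}.  (The index here is
    hypothetical; it depends on the rest of the corpus.)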
    """
    def __init__(self, labels, mapping, unseen_features=False,
                 alwayson_features=False):
        """
        @param labels: A list of the "known labels" for this encoding.

        @param mapping: A dictionary mapping from C{(fname,fval,label)}
            tuples to corresponding joint-feature indexes.  These
            indexes must be the set of integers from 0...len(mapping).
            If C{mapping[fname,fval,label]=id}, then
            C{self.encode({..., fname:fval, ...}, label)[id]} is 1;
            otherwise, it is 0.

        @param unseen_features: If true, then include unseen-value
            features in the generated joint-feature vectors.

        @param alwayson_features: If true, then include always-on
            features in the generated joint-feature vectors.
        """
        if set(mapping.values()) != set(range(len(mapping))):
            raise ValueError('Mapping values must be exactly the '
                             'set of integers from 0...len(mapping)')

        self._labels = list(labels)
        """A list of attested labels."""

        self._mapping = mapping
        """dict mapping from (fname,fval,label) -> fid"""

        self._length = len(mapping)
        """The length of generated joint feature vectors."""

        self._alwayson = None
        """dict mapping from label -> fid"""

        self._unseen = None
        """dict mapping from fname -> fid"""

        if alwayson_features:
            self._alwayson = dict([(label, i+self._length)
                                   for (i, label) in enumerate(labels)])
            self._length += len(self._alwayson)

        if unseen_features:
            fnames = set(fname for (fname, fval, label) in mapping)
            self._unseen = dict([(fname, i+self._length)
                                 for (i, fname) in enumerate(fnames)])
            self._length += len(fnames)
    def encode(self, featureset, label):
        encoding = []

        # Convert input-features to joint-features.
        for fname, fval in featureset.items():

            # Known feature name & value:
            if (fname, fval, label) in self._mapping:
                encoding.append((self._mapping[fname, fval, label], 1))

            # Otherwise, we might want to fire the unseen-value feature.
            elif self._unseen:

                # Have we seen this fname/fval combination with any label?
                for label2 in self._labels:
                    if (fname, fval, label2) in self._mapping:
                        break
                # We haven't -- fire the unseen-value feature.
                else:
                    if fname in self._unseen:
                        encoding.append((self._unseen[fname], 1))

        # Add always-on features.
        if self._alwayson and label in self._alwayson:
            encoding.append((self._alwayson[label], 1))

        return encoding

    def describe(self, f_id):
        if not isinstance(f_id, (int, long)):
            raise TypeError('describe() expected an int')
        try:
            self._inv_mapping
        except AttributeError:
            self._inv_mapping = [-1]*len(self._mapping)
            for (info, i) in self._mapping.items():
                self._inv_mapping[i] = info

        if f_id < len(self._mapping):
            (fname, fval, label) = self._inv_mapping[f_id]
            return '%s==%r and label is %r' % (fname, fval, label)
        elif self._alwayson and f_id in self._alwayson.values():
            for (label, f_id2) in self._alwayson.items():
                if f_id == f_id2: return 'label is %r' % label
        elif self._unseen and f_id in self._unseen.values():
            for (fname, f_id2) in self._unseen.items():
                if f_id == f_id2: return '%s is unseen' % fname
        else:
            raise ValueError('Bad feature id')

    def labels(self):
        return self._labels

    def length(self):
        return self._length

    @classmethod
    def train(cls, train_toks, count_cutoff=0, labels=None, **options):
        """
        Construct and return new feature encoding, based on a given
        training corpus C{train_toks}.  See the L{class description
        <BinaryMaxentFeatureEncoding>} for a description of the
        joint-features that will be included in this encoding.

        @type train_toks: C{list} of C{tuples} of (C{dict}, C{str})
        @param train_toks: Training data, represented as a list of
            pairs, the first member of which is a feature dictionary,
            and the second of which is a classification label.

        @type count_cutoff: C{int}
        @param count_cutoff: A cutoff value that is used to discard
            rare joint-features.  If a joint-feature's value is 1 for
            fewer than C{count_cutoff} training tokens, then that
            joint-feature is not included in the generated encoding.

        @type labels: C{list}
        @param labels: A list of labels that should be used by the
            classifier.  If not specified, then the set of labels
            attested in C{train_toks} will be used.

        @param options: Extra parameters for the constructor, such as
            C{unseen_features} and C{alwayson_features}.
        """
        mapping = {}              # maps (fname, fval, label) -> fid
        seen_labels = set()       # the set of labels we've encountered
        count = defaultdict(int)  # maps (fname, fval) -> count

        for (tok, label) in train_toks:
            if labels and label not in labels:
                raise ValueError('Unexpected label %s' % label)
            seen_labels.add(label)

            # Record each of the features.
            for (fname, fval) in tok.items():

                # If a count cutoff is given, then only add a joint
                # feature once the corresponding (fname, fval) pair
                # has been seen at least that many times.
                count[fname, fval] += 1
                if count[fname, fval] >= count_cutoff:
                    if (fname, fval, label) not in mapping:
                        mapping[fname, fval, label] = len(mapping)

        if labels is None: labels = seen_labels
        return cls(labels, mapping, **options)

class GISEncoding(BinaryMaxentFeatureEncoding):
    """
    A binary feature encoding which adds one new joint-feature to the
    joint-features defined by L{BinaryMaxentFeatureEncoding}: a
    correction feature, whose value is chosen to ensure that the
    sparse vector always sums to a constant non-negative number.  This
    new feature is used to ensure two preconditions for the GIS
    training algorithm:
      - At least one feature vector index must be nonzero for every
        token.
      - The feature vector must sum to a constant non-negative number
        for every token.
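
    For example, if C{C} is 5 and the basic encoding of a given
    C{(featureset, label)} pair is C{[(0,1), (3,1)]} (which sums to
    2), then a correction feature with value C{5 - 2 = 3} is appended
    so that the full vector sums to C{C}.  (The numbers here are
    purely illustrative.)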
    """
    def __init__(self, labels, mapping, unseen_features=False,
                 alwayson_features=False, C=None):
        """
        @param C: The correction constant.  The value of the correction
            feature is based on this value.  In particular, its value is
            C{C - sum([v for (f,v) in encoding])}.
        @seealso: L{BinaryMaxentFeatureEncoding.__init__}
        """
        BinaryMaxentFeatureEncoding.__init__(
            self, labels, mapping, unseen_features, alwayson_features)
        if C is None:
            C = len(set([fname for (fname, fval, label) in mapping]))+1

        self._C = C
        """The non-negative constant that all encoded feature vectors
        will sum to."""

    C = property(lambda self: self._C, doc="""
        The non-negative constant that all encoded feature vectors
        will sum to.""")

    def encode(self, featureset, label):
        # Get the basic encoding.
        encoding = BinaryMaxentFeatureEncoding.encode(self, featureset, label)
        base_length = BinaryMaxentFeatureEncoding.length(self)

        # Add a correction feature whose value brings the vector's
        # sum up to the constant C.
        total = sum([v for (f, v) in encoding])
        if total >= self._C:
            raise ValueError('Correction feature is not high enough!')
        encoding.append((base_length, self._C - total))

        # Return the result.
        return encoding

    def length(self):
        # The correction feature adds one joint-feature to the encoding.
        return BinaryMaxentFeatureEncoding.length(self) + 1

    def describe(self, f_id):
        if f_id == BinaryMaxentFeatureEncoding.length(self):
            return 'Correction feature (%s)' % self._C
        else:
            return BinaryMaxentFeatureEncoding.describe(self, f_id)



def train_maxent_classifier_with_gis(train_toks, trace=3, encoding=None,
                                     labels=None, **cutoffs):
    """
    Train a new C{ConditionalExponentialClassifier}, using the given
    training samples and the Generalized Iterative Scaling algorithm.
    This C{ConditionalExponentialClassifier} will encode the model
    that maximizes entropy from all the models that are empirically
    consistent with C{train_toks}.
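
    Each iteration of GIS nudges every weight by comparing the
    empirical count of its joint-feature with the count estimated by
    the current model.  A rough sketch of the update applied below
    (the module keeps its weights as base-2 logarithms)::

      weights[i] += (log2(empirical_count[i]) -
                     log2(estimated_count[i])) / C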

    @see: L{train_maxent_classifier()} for parameter descriptions.
    """
    cutoffs.setdefault('max_iter', 100)
    cutoffchecker = CutoffChecker(cutoffs)

    # Construct an encoding from the training data.
    if encoding is None:
        encoding = GISEncoding.train(train_toks, labels=labels)

    if not hasattr(encoding, 'C'):
        raise TypeError('The GIS algorithm requires an encoding that '
                        'defines C (e.g., GISEncoding).')

    # Cinv is the inverse of the encoding's correction constant C; it
    # scales each GIS weight update.
    Cinv = 1.0/encoding.C

    # Count how many times each feature occurs in the training data.
    empirical_fcount = calculate_empirical_fcount(train_toks, encoding)

    # Check for any features that are not attested in train_toks.
    unattested = set(numpy.nonzero(empirical_fcount==0)[0])

    # Build the classifier.  Start with weight=0 for each attested
    # feature, and weight=-infinity for each unattested feature.
    weights = numpy.zeros(len(empirical_fcount), 'd')
    for fid in unattested: weights[fid] = numpy.NINF
    classifier = ConditionalExponentialClassifier(encoding, weights)

    # Take the log of the empirical fcount.
    log_empirical_fcount = numpy.log2(empirical_fcount)
    del empirical_fcount

    ll_old = None
    acc_old = None

    if trace > 0: print '  ==> Training (%d iterations)' % cutoffs['max_iter']
    if trace > 2:
        print
        print '      Iteration    Log Likelihood    Accuracy'
        print '      ---------------------------------------'

    # Train the classifier.
    try:
        while True:
            if trace > 2:
                ll = cutoffchecker.ll or nltk.classify.util.log_likelihood(
                    classifier, train_toks)
                acc = cutoffchecker.acc or nltk.classify.util.accuracy(
                    classifier, train_toks)
                iternum = cutoffchecker.iter
                print '     %9d    %14.5f    %9.3f' % (iternum, ll, acc)

            # Use the model to estimate the number of times each
            # feature should occur in the training data.
            estimated_fcount = calculate_estimated_fcount(
                classifier, train_toks, encoding)

            # Take the log of the estimated fcount (avoid taking log(0)).
            for fid in unattested: estimated_fcount[fid] += 1
            log_estimated_fcount = numpy.log2(estimated_fcount)
            del estimated_fcount

            # Update the classifier weights.
            weights = classifier.weights()
            weights += (log_empirical_fcount - log_estimated_fcount) * Cinv
            classifier.set_weights(weights)

            # Check the log-likelihood & accuracy cutoffs.
            if cutoffchecker.check(classifier, train_toks):
                break

    except KeyboardInterrupt:
        print '      Training stopped: keyboard interrupt'
    except:
        raise

    if trace > 2:
        ll = nltk.classify.util.log_likelihood(classifier, train_toks)
        acc = nltk.classify.util.accuracy(classifier, train_toks)
        print '         Final    %14.5f    %9.3f' % (ll, acc)

    # Return the classifier.
    return classifier


def calculate_empirical_fcount(train_toks, encoding):
    fcount = numpy.zeros(encoding.length(), 'd')

    # Count how many times each joint-feature fires for the correct
    # label of each training token.
    for tok, label in train_toks:
        for (index, val) in encoding.encode(tok, label):
            fcount[index] += val

    return fcount


def calculate_estimated_fcount(classifier, train_toks, encoding):
    fcount = numpy.zeros(encoding.length(), 'd')

    # For each training token, add each joint-feature's value weighted
    # by the probability the current model assigns to its label.
    for tok, label in train_toks:
        pdist = classifier.prob_classify(tok)
        for label in pdist.samples():
            prob = pdist.prob(label)
            for (fid, fval) in encoding.encode(tok, label):
                fcount[fid] += prob*fval

    return fcount



def train_maxent_classifier_with_iis(train_toks, trace=3, encoding=None,
                                     labels=None, **cutoffs):
    """
    Train a new C{ConditionalExponentialClassifier}, using the given
    training samples and the Improved Iterative Scaling algorithm.
    This C{ConditionalExponentialClassifier} will encode the model
    that maximizes entropy from all the models that are empirically
    consistent with C{train_toks}.

    @see: L{train_maxent_classifier()} for parameter descriptions.
    """
    cutoffs.setdefault('max_iter', 100)
    cutoffchecker = CutoffChecker(cutoffs)

    # Construct an encoding from the training data.
    if encoding is None:
        encoding = BinaryMaxentFeatureEncoding.train(train_toks, labels=labels)

    # Count how often each feature occurs in the training data.
    empirical_ffreq = (calculate_empirical_fcount(train_toks, encoding) /
                       len(train_toks))

    # nf(feature_vector) is the sum of the feature values for a
    # labeled token.  nfmap compresses the attested nf values to a
    # dense range; nfarray uncompresses them; nftranspose is its
    # transpose.
    nfmap = calculate_nfmap(train_toks, encoding)
    nfarray = numpy.array(sorted(nfmap, key=nfmap.__getitem__), 'd')
    nftranspose = numpy.reshape(nfarray, (len(nfarray), 1))

    # Check for any features that are not attested in train_toks.
    unattested = set(numpy.nonzero(empirical_ffreq==0)[0])

    # Build the classifier.  Start with weight=0 for each attested
    # feature, and weight=-infinity for each unattested feature.
    weights = numpy.zeros(len(empirical_ffreq), 'd')
    for fid in unattested: weights[fid] = numpy.NINF
    classifier = ConditionalExponentialClassifier(encoding, weights)

    if trace > 0: print '  ==> Training (%d iterations)' % cutoffs['max_iter']
    if trace > 2:
        print
        print '      Iteration    Log Likelihood    Accuracy'
        print '      ---------------------------------------'

    ll_old = None
    acc_old = None

    # Train the classifier.
    try:
        while True:
            if trace > 2:
                ll = cutoffchecker.ll or nltk.classify.util.log_likelihood(
                    classifier, train_toks)
                acc = cutoffchecker.acc or nltk.classify.util.accuracy(
                    classifier, train_toks)
                iternum = cutoffchecker.iter
                print '     %9d    %14.5f    %9.3f' % (iternum, ll, acc)

            # Calculate the deltas for this iteration, using Newton's method.
            deltas = calculate_deltas(
                train_toks, classifier, unattested, empirical_ffreq,
                nfmap, nfarray, nftranspose, encoding)

            # Use the deltas to update our weights.
            weights = classifier.weights()
            weights += deltas
            classifier.set_weights(weights)

            # Check the log-likelihood & accuracy cutoffs.
            if cutoffchecker.check(classifier, train_toks):
                break

    except KeyboardInterrupt:
        print '      Training stopped: keyboard interrupt'
    except:
        raise

    if trace > 2:
        ll = nltk.classify.util.log_likelihood(classifier, train_toks)
        acc = nltk.classify.util.accuracy(classifier, train_toks)
        print '         Final    %14.5f    %9.3f' % (ll, acc)

    # Return the classifier.
    return classifier

def calculate_nfmap(train_toks, encoding):
    """
    Construct a map that can be used to compress C{nf} (which is
    typically sparse).

    M{nf(feature_vector)} is the sum of the feature values for
    M{feature_vector}.

    This represents the number of features that are active for a
    given labeled text.  This method finds all values of M{nf(t)}
    that are attested for at least one token in the given list of
    training tokens; and constructs a dictionary mapping these
    attested values to a continuous range M{0...N}.  For example,
    if the only values of M{nf()} that were attested were 3, 5,
    and 7, then this function might return the dictionary
    C{{3:0, 5:1, 7:2}}.

    @return: A map that can be used to compress C{nf} to a dense
        vector.
    @rtype: C{dictionary} from C{int} to C{int}
    """
    # Map from nf to indices.  This allows us to use smaller arrays.
    nfset = set()
    for tok, _ in train_toks:
        for label in encoding.labels():
            nfset.add(sum([val for (id, val) in encoding.encode(tok, label)]))
    return dict([(nf, i) for (i, nf) in enumerate(nfset)])

def calculate_deltas(train_toks, classifier, unattested, ffreq_empirical,
                     nfmap, nfarray, nftranspose, encoding):
    """
    Calculate the update values for the classifier weights for
    this iteration of IIS.  These update weights are the value of
    C{delta} that solves the equation::

      ffreq_empirical[i]
             =
      SUM[fs,l] (classifier.prob_classify(fs).prob(l) *
                 feature_vector(fs,l)[i] *
                 exp(delta[i] * nf(feature_vector(fs,l))))

    Where:
      - M{(fs,l)} is a (featureset, label) tuple from C{train_toks}
      - M{feature_vector(fs,l)} = C{encoding.encode(fs,l)}
      - M{nf(vector)} = C{sum([val for (id,val) in vector])}

    This method uses Newton's method to solve this equation for
    M{delta[i]}.  In particular, it starts with a guess of
    C{delta[i]}=1; and iteratively updates C{delta} with::

      delta[i] -= (ffreq_empirical[i] - sum1[i])/(-sum2[i])

    until convergence, where M{sum1} and M{sum2} are defined as::

      sum1[i](delta) = SUM[fs,l] f[i](fs,l,delta)

      sum2[i](delta) = SUM[fs,l] (f[i](fs,l,delta) *
                                  nf(feature_vector(fs,l)))

      f[i](fs,l,delta) = (classifier.prob_classify(fs).prob(l) *
                          feature_vector(fs,l)[i] *
                          exp(delta[i] * nf(feature_vector(fs,l))))

    Note that M{sum1} and M{sum2} depend on C{delta}; so they need
    to be re-computed each iteration.

    The variables C{nfmap}, C{nfarray}, and C{nftranspose} are
    used to generate a dense encoding for M{nf(ltext)}.  This
    allows C{calculate_deltas} to calculate M{sum1} and M{sum2} using
    matrices, which yields a significant performance improvement.

    @param train_toks: The set of training tokens.
    @type train_toks: C{list} of C{tuples} of (C{dict}, C{str})
    @param classifier: The current classifier.
    @type classifier: C{ClassifierI}
    @param ffreq_empirical: An array containing the empirical
        frequency for each feature.  The M{i}th element of this
        array is the empirical frequency for feature M{i}.
    @type ffreq_empirical: C{sequence} of C{float}
    @param unattested: The set of indices of features that are not
        attested in the training data; in other words, C{i} is in
        C{unattested} iff C{ffreq_empirical[i]==0}.
    @type unattested: C{set} of C{int}
    @param nfmap: A map that can be used to compress C{nf} to a dense
        vector.
    @type nfmap: C{dictionary} from C{int} to C{int}
    @param nfarray: An array that can be used to uncompress C{nf}
        from a dense vector.
    @type nfarray: C{array} of C{float}
    @param nftranspose: The transpose of C{nfarray}.
    @type nftranspose: C{array} of C{float}
    """
    # These parameters control when we decide that we have converged.
    NEWTON_CONVERGE = 1e-12
    MAX_NEWTON = 300

    deltas = numpy.ones(encoding.length(), 'd')

    # Precompute the matrix A, where
    #   A[nfmap[nf], id] = sum of (p(label|tok) * value of feature id)
    # over all (tok, label) pairs whose feature vector sums to nf,
    # averaged over the training tokens.
    A = numpy.zeros((len(nfmap), encoding.length()), 'd')

    for tok, label in train_toks:
        dist = classifier.prob_classify(tok)

        for label in encoding.labels():
            # Generate the feature vector.
            feature_vector = encoding.encode(tok, label)
            # Find the number of active features.
            nf = sum([val for (id, val) in feature_vector])
            # Update the A matrix.
            for (id, val) in feature_vector:
                A[nfmap[nf], id] += dist.prob(label) * val
    A /= len(train_toks)

    # Solve for delta with Newton's method.  At each step:
    #   nf_delta[x][y]        = nfarray[x] * deltas[y]
    #   exp_nf_delta[x][y]    = 2 ** nf_delta[x][y]
    #   nf_exp_nf_delta[x][y] = nfarray[x] * (2 ** nf_delta[x][y])
    #   sum1[y] = SUM[x] exp_nf_delta[x][y] * A[x][y]
    #   sum2[y] = SUM[x] nf_exp_nf_delta[x][y] * A[x][y]
    for rangenum in range(MAX_NEWTON):
        nf_delta = numpy.outer(nfarray, deltas)
        exp_nf_delta = 2 ** nf_delta
        nf_exp_nf_delta = nftranspose * exp_nf_delta
        sum1 = numpy.sum(exp_nf_delta * A, axis=0)
        sum2 = numpy.sum(nf_exp_nf_delta * A, axis=0)

        # Avoid division by zero for unattested features.
        for fid in unattested: sum2[fid] += 1

        # Update the deltas.
        deltas -= (ffreq_empirical - sum1) / -sum2

        # We can stop once we converge.
        n_error = (numpy.sum(abs((ffreq_empirical-sum1)))/
                   numpy.sum(abs(deltas)))
        if n_error < NEWTON_CONVERGE:
            return deltas

    return deltas



def train_maxent_classifier_with_scipy(train_toks, trace=3, encoding=None,
                                       labels=None, algorithm='CG',
                                       sparse=True, gaussian_prior_sigma=0,
                                       **cutoffs):
    """
    Train a new C{ConditionalExponentialClassifier}, using the given
    training samples and the specified C{scipy} optimization
    algorithm.  This C{ConditionalExponentialClassifier} will encode
    the model that maximizes entropy from all the models that are
    empirically consistent with C{train_toks}.

    @see: L{train_maxent_classifier()} for parameter descriptions.
    @require: The C{scipy} package must be installed.
    """
    try:
        import scipy.sparse, scipy.maxentropy
    except ImportError, e:
        raise ValueError('The maxent training algorithm %r requires '
                         'that the scipy package be installed.  See '
                         'http://www.scipy.org/' % algorithm)

    # Construct an encoding from the training data.
    if encoding is None:
        encoding = BinaryMaxentFeatureEncoding.train(train_toks, labels=labels)
    elif labels is not None:
        raise ValueError('Specify encoding or labels, not both')

    labels = encoding.labels()
    labelnum = dict([(label, i) for (i, label) in enumerate(labels)])
    num_features = encoding.length()
    num_toks = len(train_toks)
    num_labels = len(labels)

    # Decide whether to use a sparse matrix (scipy's lil format) or a
    # dense numpy array for the joint-feature matrices.
    if sparse: zeros = scipy.sparse.lil_matrix
    else: zeros = numpy.zeros

    # Construct the 'F' matrix, which lists the feature values for
    # each training instance.  F[i, j*len(labels)+k] is the value of
    # the i'th feature for the feature vector corresponding to
    # (tok[j], label[k]).
    F = zeros((num_features, num_toks*num_labels))

    # Construct the 'N' matrix, which specifies the correct label for
    # each training instance.  N[0, j*len(labels)+k] is one iff
    # label[k] is the correct label for tok[j].
    N = zeros((1, num_toks*num_labels))

    # Fill in the 'F' and 'N' matrices (one pass through the training
    # tokens).
    for toknum, (featureset, label) in enumerate(train_toks):
        N[0, toknum*len(labels) + labelnum[label]] += 1
        for label2 in labels:
            for (fid, fval) in encoding.encode(featureset, label2):
                F[fid, toknum*len(labels) + labelnum[label2]] = fval

    # Build the new maxent model.
    model = scipy.maxentropy.conditionalmodel(F, N, num_toks)

    if gaussian_prior_sigma:
        model.sigma2 = gaussian_prior_sigma**2
    if algorithm == 'LBFGSB':
        model.log = None
    if trace >= 3:
        model.verbose = True
    if 'max_iter' in cutoffs:
        model.maxiter = cutoffs['max_iter']
    if 'tolerance' in cutoffs:
        if algorithm == 'CG': model.avegtol = cutoffs['tolerance']
        elif algorithm == 'LBFGSB': model.maxgtol = cutoffs['tolerance']
        else: model.tol = cutoffs['tolerance']

    # Train the model.
    model.fit(algorithm=algorithm)

    # Convert the model's weights from base-e to base-2 logarithms.
    weights = model.params * numpy.log2(numpy.e)

    # Build and return the classifier.
    return MaxentClassifier(encoding, weights)


def train_maxent_classifier_with_megam(train_toks, trace=3, encoding=None,
                                       labels=None, gaussian_prior_sigma=0,
                                       **cutoffs):
    """
    Train a new C{ConditionalExponentialClassifier}, using the given
    training samples and the external C{megam} library.  This
    C{ConditionalExponentialClassifier} will encode the model that
    maximizes entropy from all the models that are empirically
    consistent with C{train_toks}.

    @see: L{train_maxent_classifier()} for parameter descriptions.
    @see: L{nltk.classify.megam}
    """
    explicit = True

    # Construct an encoding from the training data.
    if encoding is None:
        encoding = BinaryMaxentFeatureEncoding.train(train_toks,
            labels=labels, alwayson_features=True)
    elif labels is not None:
        raise ValueError('Specify encoding or labels, not both')

    # Write a training file for megam.
    try:
        fd, trainfile_name = tempfile.mkstemp(prefix='nltk-')
        trainfile = os.fdopen(fd, 'wb')
        write_megam_file(train_toks, encoding, trainfile, explicit=explicit)
        trainfile.close()
    except (OSError, IOError, ValueError), e:
        raise ValueError('Error while creating megam training file: %s' % e)

    # Run megam on the training file.
    options = ['-nobias']
    if explicit: options += ['-explicit']
    if gaussian_prior_sigma:
        # Convert the Gaussian prior's sigma into megam's -lambda
        # regularization strength.
        options += ['-lambda', '%s' % (7.01/(gaussian_prior_sigma**2))]
    else:
        options += ['-lambda', '0']
    if trace < 3:
        options += ['-quiet']
    if 'max_iter' in cutoffs:
        options += ['-maxi', '%s' % cutoffs['max_iter']]
    if 'll_delta' in cutoffs:
        # Pass the log-likelihood-delta cutoff through to megam.
        options += ['-dpp', '%s' % abs(cutoffs['ll_delta'])]
    stdout = call_megam(options+['multiclass', trainfile_name])

    # Delete the temporary training file.
    try: os.remove(trainfile_name)
    except (OSError, IOError), e:
        print 'Warning: unable to delete %s: %s' % (trainfile_name, e)

    # Parse the generated weight vector.
    weights = parse_megam_weights(stdout, explicit)

    # Convert the weights from base-e to base-2 logarithms.
    weights *= numpy.log2(numpy.e)

    # Build the classifier.
    return MaxentClassifier(encoding, weights)



def demo():
    from nltk.classify.util import names_demo
    classifier = names_demo(MaxentClassifier.train)

if __name__ == '__main__':
    demo()