"""
Brill's transformational rule-based tagger.
"""

import bisect
import random
import yaml
import textwrap

from nltk import defaultdict

from util import untag
from api import *
24
25
26
27
class BrillTagger(TaggerI, yaml.YAMLObject):
    """
    Brill's transformational rule-based tagger.  Brill taggers use an
    X{initial tagger} (such as L{tag.DefaultTagger}) to assign an initial
    tag sequence to a text; and then apply an ordered list of
    transformational rules to correct the tags of individual tokens.
    These transformation rules are specified by the L{BrillRule}
    interface.

    Brill taggers can be created directly, from an initial tagger and
    a list of transformational rules; but more often, Brill taggers
    are created by learning rules from a training corpus, using either
    L{BrillTaggerTrainer} or L{FastBrillTaggerTrainer}.
    """

    yaml_tag = '!nltk.BrillTagger'

    def __init__(self, initial_tagger, rules):
        """
        @param initial_tagger: The initial tagger
        @type initial_tagger: L{TaggerI}
        @param rules: An ordered list of transformation rules that
            should be used to correct the initial tagging.
        @type rules: C{list} of L{BrillRule}
        """
        self._initial_tagger = initial_tagger
        self._rules = tuple(rules)

    def rules(self):
        """
        @return: The ordered tuple of transformation rules that this
            tagger applies after the initial tagging.
        @rtype: C{tuple} of L{BrillRule}
        """
        return self._rules

    def tag(self, tokens):
        """
        Tag C{tokens} with the initial tagger, then apply each
        transformation rule, in order, to correct the tags.

        @param tokens: The sentence to tag.
        @type tokens: C{list} of C{str}
        @return: The corrected tagged sentence.
        @rtype: C{list} of C{(str, str)}
        """
        # Inherit docs from TaggerI.
        tagged_tokens = self._initial_tagger.tag(tokens)

        # Collect the indices at which each tag currently occurs, so
        # each rule only needs to be tried where its original tag is.
        tag_to_positions = defaultdict(set)
        for i, (token, tag) in enumerate(tagged_tokens):
            tag_to_positions[tag].add(i)

        # Apply each rule, in order.
        for rule in self._rules:
            # Find the positions where it might apply.
            positions = tag_to_positions.get(rule.original_tag, [])
            # Apply the rule at those positions.
            changed = rule.apply(tagged_tokens, positions)
            # Keep the tag->positions index in sync with the changes.
            for i in changed:
                tag_to_positions[rule.original_tag].remove(i)
                tag_to_positions[rule.replacement_tag].add(i)

        return tagged_tokens
84
85
86
87
88
class BrillRule(yaml.YAMLObject):
    """
    An interface for tag transformations on a tagged corpus, as
    performed by brill taggers.  Each transformation finds all tokens
    in the corpus that are tagged with a specific X{original tag} and
    satisfy a specific X{condition}, and replaces their tags with a
    X{replacement tag}.  For any given transformation, the original
    tag, replacement tag, and condition are fixed.  Conditions may
    depend on the token under consideration, as well as any other
    tokens in the corpus.

    Brill rules must be comparable and hashable.
    """
    def __init__(self, original_tag, replacement_tag):
        assert self.__class__ != BrillRule, \
               "BrillRule is an abstract base class"

        # The tag which this BrillRule may cause to be replaced.
        self.original_tag = original_tag

        # The tag with which this BrillRule may replace another tag.
        self.replacement_tag = replacement_tag

    def apply(self, tokens, positions=None):
        """
        Apply this rule at every position in C{positions} where it
        applies to the given sentence.  I.e., for each position M{p}
        in C{positions}, if C{tokens[M{p}]} is tagged with this rule's
        original tag, and satisfies this rule's condition, then set
        its tag to be this rule's replacement tag.

        @param tokens: The tagged sentence
        @type tokens: list of Token
        @type positions: C{list} of C{int}
        @param positions: The positions where the transformation is to
            be tried.  If not specified, try it at all positions.
        @return: The indices of tokens whose tags were changed by this
            rule.
        @rtype: C{list} of C{int}
        """
        if positions is None:
            positions = range(len(tokens))

        # Determine the indices at which this rule applies.
        change = [i for i in positions if self.applies(tokens, i)]

        # Make the changes.  Note: this must be done in a separate
        # step from finding the applicable positions, since we don't
        # want the rule to interact with its own changes.
        for i in change:
            tokens[i] = (tokens[i][0], self.replacement_tag)

        return change

    def applies(self, tokens, index):
        """
        @return: True if the rule would change the tag of
            C{tokens[index]}, False otherwise
        @rtype: Boolean

        @param tokens: A tagged sentence
        @type tokens: list of Token
        @param index: The index to check
        @type index: int
        """
        assert False, "Brill rules must define applies()"

    # Rules must be comparable and hashable for the algorithms to work.
    def __eq__(self, other):
        assert False, "Brill rules must be comparable"

    def __ne__(self, other):
        assert False, "Brill rules must be comparable"

    def __hash__(self):
        assert False, "Brill rules must be hashable"
163
164
class ProximateTokensRule(BrillRule):
    """
    An abstract base class for brill rules whose condition checks for
    the presence of tokens with given properties at given ranges of
    positions, relative to the token.

    Each subclass of proximate tokens brill rule defines a method
    M{extract_property}, which extracts a specific property from the
    token, such as its text or tag.  Each instance is
    parameterized by a set of tuples, specifying ranges of positions
    and property values to check for in those ranges:

      - (M{start}, M{end}, M{value})

    The brill rule is then applicable to the M{n}th token iff:

      - The M{n}th token is tagged with the rule's original tag; and
      - For each (M{start}, M{end}, M{value}) triple:
        - The property value of at least one token between
          M{n+start} and M{n+end} (inclusive) is M{value}.

    For example, a proximate token brill template with M{start=end=-1}
    generates rules that check just the property of the preceding
    token.  Note that multiple properties may be included in a single
    rule; the rule applies if they all hold.
    """

    def __init__(self, original_tag, replacement_tag, *conditions):
        """
        Construct a new brill rule that changes a token's tag from
        C{original_tag} to C{replacement_tag} if all of the properties
        specified in C{conditions} hold.

        @type conditions: C{tuple} of C{(int, int, *)}
        @param conditions: A list of 3-tuples C{(start, end, value)},
            each of which specifies that the property of at least one
            token between M{n}+C{start} and M{n}+C{end} (inclusive) is
            C{value}.
        @raise ValueError: If C{start}>C{end} for any condition.
        """
        assert self.__class__ != ProximateTokensRule, \
               "ProximateTokensRule is an abstract base class"
        BrillRule.__init__(self, original_tag, replacement_tag)
        self._conditions = conditions
        for (s, e, v) in conditions:
            if s > e:
                raise ValueError('Condition %s has an invalid range' %
                                 ((s, e, v),))

    # Serialization support (PyYAML).

    @classmethod
    def to_yaml(cls, dumper, data):
        # NOTE(review): body reconstructed to mirror from_yaml below --
        # confirm the emitted mapping keys against previously dumped
        # rule files.
        return dumper.represent_mapping(cls.yaml_tag, dict(
            description=str(data),
            conditions=list(list(x) for x in data._conditions),
            original=data.original_tag,
            replacement=data.replacement_tag))

    @classmethod
    def from_yaml(cls, loader, node):
        # `mapping` (not `map`) to avoid shadowing the builtin.
        mapping = loader.construct_mapping(node, deep=True)
        return cls(mapping['original'], mapping['replacement'],
                   *(tuple(x) for x in mapping['conditions']))

    @staticmethod
    def extract_property(token):
        """
        Returns some property characterizing this token, such as its
        base lexical item or its tag.

        Each implementation of this method should correspond to an
        implementation of the method with the same name in a subclass
        of L{ProximateTokensTemplate}.

        @param token: The token
        @type token: Token
        @return: The property
        @rtype: any
        """
        assert False, "ProximateTokenRules must define extract_property()"

    def applies(self, tokens, index):
        # Does the given token have this rule's original tag?
        if tokens[index][1] != self.original_tag:
            return False

        # Check that every condition holds.
        for (start, end, val) in self._conditions:
            # Find the (clipped) start and end indices of the window.
            s = max(0, index + start)
            e = min(index + end + 1, len(tokens))

            # Look for *any* token in the window that satisfies the
            # condition.
            for i in range(s, e):
                if self.extract_property(tokens[i]) == val:
                    break
            else:
                # No token satisfied the condition.
                return False

        # Every condition checked out, so the rule applies.
        return True

    def __eq__(self, other):
        return (self is other or
                (other is not None and
                 other.__class__ == self.__class__ and
                 self.original_tag == other.original_tag and
                 self.replacement_tag == other.replacement_tag and
                 self._conditions == other._conditions))

    def __ne__(self, other):
        return not (self == other)

    def __hash__(self):
        # Cache our hash value; rules are hashed heavily during
        # training.  (Was a bare except: -- narrowed to the exception
        # actually raised by the missing attribute.)
        try:
            return self.__hash
        except AttributeError:
            self.__hash = hash((self.original_tag, self.replacement_tag,
                                self._conditions, self.__class__.__name__))
            return self.__hash

    def __repr__(self):
        # Cache our repr; it is used as a sort key when training
        # deterministically.
        try:
            return self.__repr
        except AttributeError:
            conditions = ' and '.join(['%s in %d...%d' % (v, s, e)
                                       for (s, e, v) in self._conditions])
            self.__repr = ('<%s: %s->%s if %s>' %
                           (self.__class__.__name__, self.original_tag,
                            self.replacement_tag, conditions))
            return self.__repr

    def __str__(self):
        replacement = '%s -> %s' % (self.original_tag,
                                    self.replacement_tag)
        if len(self._conditions) == 0:
            conditions = ''
        else:
            conditions = ' if ' + ', and '.join([self._condition_to_str(c)
                                                 for c in self._conditions])
        return replacement + conditions

    def _condition_to_str(self, condition):
        """
        Return a string representation of the given condition.
        This helper method is used by L{__str__}.
        """
        (start, end, value) = condition
        return ('the %s of %s is %r' %
                (self.PROPERTY_NAME, self._range_to_str(start, end), value))

    def _range_to_str(self, start, end):
        """
        Return a string representation for the given range.  This
        helper method is used by L{__str__}.
        """
        if start == end == 0:
            return 'this word'
        if start == end == -1:
            return 'the preceding word'
        elif start == end == 1:
            return 'the following word'
        elif start == end and start < 0:
            return 'word i-%d' % -start
        elif start == end and start > 0:
            return 'word i+%d' % start
        else:
            if start >= 0: start = '+%d' % start
            if end >= 0: end = '+%d' % end
            return 'words i%s...i%s' % (start, end)
355
class ProximateWordsRule(ProximateTokensRule):
    """
    A rule which examines the base types of nearby tokens.
    @see: L{ProximateTokensRule} for details.
    @see: L{SymmetricProximateTokensTemplate}, which generates these rules.
    """
    PROPERTY_NAME = 'text'  # for printing.
    yaml_tag = '!ProximateWordsRule'

    @staticmethod
    def extract_property(token):
        """@return: The given token's text."""
        return token[0]
368
369
370
371
372
class BrillTemplateI(object):
    """
    An interface for generating lists of transformational rules that
    apply at given sentence positions.  C{BrillTemplateI} is used by
    C{Brill} training algorithms to generate candidate rules.
    """
    def __init__(self):
        # Parenthesized raise replaces the Python 2-only
        # "raise AssertionError, msg" form (works on both 2 and 3).
        raise AssertionError("BrillTemplateI is an abstract interface")

    def applicable_rules(self, tokens, i, correctTag):
        """
        Return a list of the transformational rules that would correct
        the C{i}th subtoken's tag in the given token.  In particular,
        return a list of zero or more rules that would change
        C{tagged_tokens[i][1]} to C{correctTag}, if applied
        to C{token}.

        If the C{i}th subtoken already has the correct tag (i.e., if
        C{tagged_tokens[i][1]} == C{correctTag}), then
        C{applicable_rules} should return the empty list.

        @param tokens: The tagged tokens being tagged.
        @type tokens: C{list} of C{tuple}
        @param i: The index of the token whose tag should be corrected.
        @type i: C{int}
        @param correctTag: The correct tag for the C{i}th token.
        @type correctTag: (any)
        @rtype: C{list} of L{BrillRule}
        """
        raise AssertionError("BrillTemplateI is an abstract interface")

    def get_neighborhood(self, token, index):
        """
        Returns the set of indices C{i} such that
        C{applicable_rules(token, i, ...)} depends on the value of
        the C{index}th subtoken of C{token}.

        This method is used by the \"fast\" Brill tagger trainer.

        @param token: The tokens being tagged.
        @type token: C{list} of C{tuple}
        @param index: The index whose neighborhood should be returned.
        @type index: C{int}
        @rtype: C{Set}
        """
        raise AssertionError("BrillTemplateI is an abstract interface")
419
class ProximateTokensTemplate(BrillTemplateI):
    """
    A brill template that generates a list of
    L{ProximateTokensRule}s that apply at a given sentence
    position.  In particular, each C{ProximateTokensTemplate} is
    parameterized by a proximate token brill rule class and a list of
    boundaries, and generates all rules that:

      - use the given brill rule class
      - use the given list of boundaries as the C{start} and C{end}
        points for their conditions
      - are applicable to the given token.
    """
    def __init__(self, rule_class, *boundaries):
        """
        Construct a template for generating proximate token brill
        rules.

        @type rule_class: C{class}
        @param rule_class: The proximate token brill rule class that
            should be used to generate new rules.  This class must be a
            subclass of L{ProximateTokensRule}.
        @type boundaries: C{tuple} of C{(int, int)}
        @param boundaries: A list of tuples C{(start, end)}, each of
            which specifies a range for which a condition should be
            created by each rule.
        @raise ValueError: If C{start}>C{end} for any boundary.
        """
        self._rule_class = rule_class
        self._boundaries = boundaries
        for (s, e) in boundaries:
            if s > e:
                raise ValueError('Boundary %s has an invalid range' %
                                 ((s, e),))

    def applicable_rules(self, tokens, index, correct_tag):
        """See L{BrillTemplateI} for full specifications."""
        if tokens[index][1] == correct_tag:
            return []

        # For each boundary, list the conditions that are satisfied
        # at this position.
        applicable_conditions = \
            [self._applicable_conditions(tokens, index, start, end)
             for (start, end) in self._boundaries]

        # Form all combinations of conditions: one condition drawn
        # from the possibilities for each boundary.
        condition_combos = [[]]
        for conditions in applicable_conditions:
            condition_combos = [old_conditions + [new_condition]
                                for old_conditions in condition_combos
                                for new_condition in conditions]

        # Translate the condition sets into rules.
        return [self._rule_class(tokens[index][1], correct_tag, *conds)
                for conds in condition_combos]

    def _applicable_conditions(self, tokens, index, start, end):
        """
        @return: A set of all conditions for proximate token rules
        that are applicable to C{tokens[index]}, given boundaries of
        C{(start, end)}.  I.e., return a list of all tuples C{(start,
        end, M{value})}, such the property value of at least one token
        between M{index+start} and M{index+end} (inclusive) is
        M{value}.
        """
        conditions = []
        s = max(0, index + start)
        e = min(index + end + 1, len(tokens))
        for i in range(s, e):
            value = self._rule_class.extract_property(tokens[i])
            conditions.append((start, end, value))
        return conditions

    def get_neighborhood(self, tokens, index):
        """See L{BrillTemplateI} for full specifications."""
        # NOTE(review): body reconstructed -- the neighborhood is the
        # token itself plus every position reachable from it through
        # one of this template's boundary windows; confirm against the
        # fast trainer's expectations.
        neighborhood = set([index])
        for (start, end) in self._boundaries:
            s = max(0, index + start)
            e = min(index + end + 1, len(tokens))
            for i in range(s, e):
                neighborhood.add(i)
        return neighborhood
494
510
512 """
513 Simulates two L{ProximateTokensTemplate}s which are symmetric
514 across the location of the token. For rules of the form \"If the
515 M{n}th token is tagged C{A}, and any tag preceding B{or} following
516 the M{n}th token by a distance between M{x} and M{y} is C{B}, and
517 ... , then change the tag of the nth token from C{A} to C{C}.\"
518
519 One C{ProximateTokensTemplate} is formed by passing in the
520 same arguments given to this class's constructor: tuples
521 representing intervals in which a tag may be found. The other
522 C{ProximateTokensTemplate} is constructed with the negative
523 of all the arguments in reversed order. For example, a
524 C{SymmetricProximateTokensTemplate} using the pair (-2,-1) and the
525 constructor C{SymmetricProximateTokensTemplate} generates the same rules as a
526 C{SymmetricProximateTokensTemplate} using (-2,-1) plus a second
527 C{SymmetricProximateTokensTemplate} using (1,2).
528
529 This is useful because we typically don't want templates to
530 specify only \"following\" or only \"preceding\"; we'd like our
531 rules to be able to look in either direction.
532 """
533 - def __init__(self, rule_class, *boundaries):
534 """
535 Construct a template for generating proximate token brill
536 rules.
537
538 @type rule_class: C{class}
539 @param rule_class: The proximate token brill rule class that
540 should be used to generate new rules. This class must be a
541 subclass of L{ProximateTokensRule}.
542 @type boundaries: C{tuple} of C{(int, int)}
543 @param boundaries: A list of tuples C{(start, end)}, each of
544 which specifies a range for which a condition should be
545 created by each rule.
546 @raise ValueError: If C{start}>C{end} for any boundary.
547 """
548 self._ptt1 = ProximateTokensTemplate(rule_class, *boundaries)
549 reversed = [(-e,-s) for (s,e) in boundaries]
550 self._ptt2 = ProximateTokensTemplate(rule_class, *reversed)
551
552
561
567
568
569
570
571
class BrillTaggerTrainer(object):
    """
    A trainer for brill taggers.
    """
    def __init__(self, initial_tagger, templates, trace=0,
                 deterministic=None):
        """
        @param deterministic: If true, then choose between rules that
            have the same score by picking the one whose __repr__
            is lexicographically smaller.  If false, then just pick the
            first rule we find with a given score -- this will depend
            on the order in which keys are returned from dictionaries,
            and so may not be the same from one run to the next.  If
            not specified, treat as true iff trace > 0.
        """
        if deterministic is None:
            deterministic = (trace > 0)
        self._initial_tagger = initial_tagger
        self._templates = templates
        self._trace = trace
        self._deterministic = deterministic

    #////////////////////////////////////////////////////////////
    # Training
    #////////////////////////////////////////////////////////////

    def train(self, train_sents, max_rules=200, min_score=2):
        """
        Trains the Brill tagger on the corpus C{train_sents},
        producing at most C{max_rules} transformations, each of which
        reduces the net number of errors in the corpus by at least
        C{min_score}.

        @type train_sents: C{list} of C{list} of L{tuple}
        @param train_sents: The corpus of tagged tokens
        @type max_rules: C{int}
        @param max_rules: The maximum number of transformations to be created
        @type min_score: C{int}
        @param min_score: The minimum acceptable net error reduction
            that each transformation must produce in the corpus.
        """
        # (print() with a single argument is valid Python 2 and 3.)
        if self._trace > 0:
            print("Training Brill tagger on %d "
                  "sentences..." % len(train_sents))

        # Create a new copy of the training corpus, and run the
        # initial tagger on it.  We will progressively update this
        # test corpus to look more like the training corpus.
        test_sents = [self._initial_tagger.tag(untag(sent))
                      for sent in train_sents]

        if self._trace > 2:
            self._trace_header()

        # Look for useful rules.
        rules = []
        try:
            while len(rules) < max_rules:
                (rule, score, fixscore) = self._best_rule(test_sents,
                                                          train_sents)
                if rule is None or score < min_score:
                    if self._trace > 1:
                        print('Insufficient improvement; stopping')
                    break
                else:
                    # Add the rule to our list of rules.
                    rules.append(rule)
                    # Use the rules to update the test corpus, and
                    # keep track of how many times the rule applied.
                    k = 0
                    for sent in test_sents:
                        k += len(rule.apply(sent))
                    # Display trace output.
                    if self._trace > 1:
                        self._trace_rule(rule, score, fixscore, k)
        # The user can cancel training manually:
        except KeyboardInterrupt:
            print("Training stopped manually -- %d rules found" % len(rules))

        # Create and return a tagger from the rules we found.
        return BrillTagger(self._initial_tagger, rules)

    #////////////////////////////////////////////////////////////
    # Finding the best rule
    #////////////////////////////////////////////////////////////

    def _best_rule(self, test_sents, train_sents):
        """
        Find the rule that makes the biggest net improvement in the
        corpus.

        @return: A tuple C{(rule, score, fixscore)}, where C{score} is
            the rule's net error reduction and C{fixscore} the number
            of errors it fixes (ignoring the errors it introduces).
        """
        # Collect the positions at which each tag currently appears
        # *correctly*; a rule that rewrites one of these tags risks
        # breaking them, which lowers its score.
        correct_indices = defaultdict(list)
        for sentnum, sent in enumerate(test_sents):
            for wordnum, tagged_word in enumerate(sent):
                if tagged_word[1] == train_sents[sentnum][wordnum][1]:
                    tag = tagged_word[1]
                    correct_indices[tag].append((sentnum, wordnum))

        # Candidate rules with their fixscores, sorted by decreasing
        # fixscore (an upper bound on a rule's actual score).
        rules = self._find_rules(test_sents, train_sents)

        best_rule, best_score, best_fixscore = None, 0, 0

        for (rule, fixscore) in rules:
            # Since rules come in decreasing fixscore order, stop as
            # soon as no remaining rule can beat the current best.
            if best_score > fixscore or (best_score == fixscore and
                                         not self._deterministic):
                return best_rule, best_score, best_fixscore

            # Compute the rule's actual score by subtracting one for
            # every correctly tagged token it would break.
            score = fixscore
            if rule.original_tag in correct_indices:
                for (sentnum, wordnum) in correct_indices[rule.original_tag]:
                    if rule.applies(test_sents[sentnum], wordnum):
                        score -= 1
                        # Once this rule can no longer beat the best,
                        # stop scanning its positions early.
                        if score < best_score or (score == best_score and
                                                  not self._deterministic):
                            break

            # Update the best rule if this one is better.  Ties are
            # broken lexicographically on repr when deterministic.
            if score > best_score or (score == best_score and
                                      self._deterministic and
                                      repr(rule) < repr(best_rule)):
                best_rule, best_score, best_fixscore = rule, score, fixscore

        return best_rule, best_score, best_fixscore

    def _find_rules(self, test_sents, train_sents):
        """
        Find all rules that correct at least one token's tag in
        C{test_sents}.

        @return: A list of tuples C{(rule, fixscore)}, where C{rule}
            is a brill rule and C{fixscore} is the number of tokens
            whose tag the rule corrects.  Note that C{fixscore} does
            I{not} include the number of tokens whose tags are changed
            to incorrect values.
        """
        # Find the positions of all tagging errors.
        error_indices = []
        for sentnum, sent in enumerate(test_sents):
            for wordnum, tagged_word in enumerate(sent):
                if tagged_word[1] != train_sents[sentnum][wordnum][1]:
                    error_indices.append((sentnum, wordnum))

        # Count how many errors each candidate rule would fix.
        rule_score_dict = defaultdict(int)
        for (sentnum, wordnum) in error_indices:
            test_sent = test_sents[sentnum]
            train_sent = train_sents[sentnum]
            for rule in self._find_rules_at(test_sent, train_sent, wordnum):
                rule_score_dict[rule] += 1

        # Return the rules sorted by decreasing score.  (An indexed
        # lambda replaces the Python 2-only tuple-unpacking lambda.)
        return sorted(rule_score_dict.items(),
                      key=lambda rule_score: -rule_score[1])

    def _find_rules_at(self, test_sent, train_sent, i):
        """
        @rtype: C{Set}
        @return: the set of all rules (based on the templates) that
            correct token C{i}'s tag in C{test_sent}.
        """
        applicable_rules = set()
        if test_sent[i][1] != train_sent[i][1]:
            correct_tag = train_sent[i][1]
            for template in self._templates:
                new_rules = template.applicable_rules(test_sent, i,
                                                      correct_tag)
                applicable_rules.update(new_rules)

        return applicable_rules

    #////////////////////////////////////////////////////////////
    # Tracing
    #////////////////////////////////////////////////////////////

    def _trace_header(self):
        print("""
           B      |
   S   F   r   O  |        Score = Fixed - Broken
   c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
   o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
   r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
   e   d   n   r  |  e
------------------+-------------------------------------------------------
        """.rstrip())

    def _trace_rule(self, rule, score, fixscore, numchanges):
        if self._trace > 2:
            # Combined into one print call; output is identical to the
            # original two-statement Python 2 form.
            print('%4d%4d%4d%4d  | %s' %
                  (score, fixscore, fixscore - score,
                   numchanges - fixscore * 2 + score) +
                  textwrap.fill(str(rule), initial_indent=' ' * 20,
                                subsequent_indent=' ' * 18 + '| ').strip())
        else:
            print(rule)
783
784
785
786
787
class FastBrillTaggerTrainer(object):
    """
    A faster trainer for brill taggers.
    """
    def __init__(self, initial_tagger, templates, trace=0,
                 deterministic=None):
        """
        @param deterministic: If true, then break score ties between
            rules by picking the one whose __repr__ is
            lexicographically smaller.  If not specified, treat as
            true iff trace > 0.
        """
        if deterministic is None:
            deterministic = (trace > 0)
        self._initial_tagger = initial_tagger
        self._templates = templates
        self._trace = trace
        self._deterministic = deterministic

        # Mapping from tags to lists of positions that use that tag.
        self._tag_positions = None

        # Mapping from positions to the set of rules that are known
        # to occur at that position.  Position is (sentnum, wordnum).
        # Initially, this will only contain positions where each rule
        # applies in a helpful way; but when we examine a rule, we'll
        # extend this list to also include positions where each rule
        # applies in a harmful or neutral way.
        self._rules_by_position = None

        # Mapping from rule to position to effect, specifying the
        # effect that each rule has on the overall score, at each
        # position.  Position is (sentnum, wordnum); and effect is
        # -1, 0, or 1.  As with _rules_by_position, this mapping starts
        # out only containing rules with positive effects; but when
        # we examine a rule, we'll extend this mapping to include
        # the positions where the rule is harmful or neutral.
        self._positions_by_rule = None

        # Mapping from scores to the set of rules whose effect on the
        # overall score is upper bounded by that score.  Invariant:
        # rulesByScore[s] will contain r iff the sum of
        # _positions_by_rule[r] is s.
        self._rules_by_score = None

        # Mapping from rules to upper bounds on their effects on the
        # overall score.  This is the inverse mapping to _rules_by_score.
        # Invariant: ruleScores[r] = sum(_positions_by_rule[r])
        self._rule_scores = None

        # Mapping from rules to the first position where we're unsure
        # if the rule applies.  This records the next position we
        # need to check to see if the rule messed anything up.
        self._first_unknown_position = None

    #////////////////////////////////////////////////////////////
    # Training
    #////////////////////////////////////////////////////////////

    def train(self, train_sents, max_rules=200, min_score=2):
        """
        Trains the Brill tagger on the corpus C{train_sents},
        producing at most C{max_rules} transformations, each of which
        reduces the net number of errors in the corpus by at least
        C{min_score}.
        """
        if self._trace > 0:
            print("Training Brill tagger on %d "
                  "sentences..." % len(train_sents))

        # Create a new copy of the training corpus, and run the
        # initial tagger on it.  We will progressively update this
        # test corpus to look more like the training corpus.
        test_sents = [self._initial_tagger.tag(untag(sent))
                      for sent in train_sents]

        # Initialize the mappings; this finds every error made by the
        # initial tagger and the candidate rules that would fix it.
        if self._trace > 0:
            print("Finding initial useful rules...")
        self._init_mappings(test_sents, train_sents)
        if self._trace > 0:
            print("    Found %d useful rules." % len(self._rule_scores))

        # Let the user know what we're up to.
        if self._trace > 2:
            self._trace_header()
        elif self._trace == 1:
            print("Selecting rules...")

        # Repeatedly select the best rule, and add it to `rules`.
        rules = []
        try:
            while (len(rules) < max_rules):
                # Find the best rule, and add it to our rule list.
                rule = self._best_rule(train_sents, test_sents, min_score)
                if rule:
                    rules.append(rule)
                else:
                    break  # No more good rules left!

                # Report the rule that we found.
                if self._trace > 1:
                    self._trace_rule(rule)

                # Apply the new rule at the relevant sites.
                self._apply_rule(rule, test_sents)

                # Update _tag_positions[rule.original_tag] and
                # _tag_positions[rule.replacement_tag] for the affected
                # positions (i.e., self._positions_by_rule[rule]).
                self._update_tag_positions(rule)

                # Update the rules that were affected by the change.
                self._update_rules(rule, train_sents, test_sents)

        # The user can cancel training manually:
        except KeyboardInterrupt:
            print("Training stopped manually -- %d rules found" % len(rules))

        # Discard our tag position mapping & rule mappings.
        self._clean()

        # Create and return a tagger from the rules we found.
        return BrillTagger(self._initial_tagger, rules)

    def _init_mappings(self, test_sents, train_sents):
        """
        Initialize the tag position mapping & the rule related
        mappings.  For each error in test_sents, find new rules that
        would correct them, and add them to the rule mappings.
        """
        self._tag_positions = defaultdict(list)
        self._rules_by_position = defaultdict(set)
        self._positions_by_rule = defaultdict(dict)
        self._rules_by_score = defaultdict(set)
        self._rule_scores = defaultdict(int)
        self._first_unknown_position = defaultdict(int)

        # Scan through the corpus, initializing the tag_positions
        # mapping and all of the rule-related mappings.
        for sentnum, sent in enumerate(test_sents):
            for wordnum, (word, tag) in enumerate(sent):

                # Initialize tag_positions.
                self._tag_positions[tag].append((sentnum, wordnum))

                # If it's an error token, update the rule-related
                # mappings with the rules that would fix it.
                correct_tag = train_sents[sentnum][wordnum][1]
                if tag != correct_tag:
                    for rule in self._find_rules(sent, wordnum, correct_tag):
                        self._update_rule_applies(rule, sentnum, wordnum,
                                                  train_sents)

    def _clean(self):
        # Drop the (large) training-time data structures.
        self._tag_positions = None
        self._rules_by_position = None
        self._positions_by_rule = None
        self._rules_by_score = None
        self._rule_scores = None
        self._first_unknown_position = None

    def _find_rules(self, sent, wordnum, new_tag):
        """
        Use the templates to find rules that apply at index C{wordnum}
        in the sentence C{sent} and generate the tag C{new_tag}.
        """
        for template in self._templates:
            for rule in template.applicable_rules(sent, wordnum, new_tag):
                yield rule

    def _update_rule_applies(self, rule, sentnum, wordnum, train_sents):
        """
        Update the rule data tables to reflect the fact that
        C{rule} applies at the position C{(sentnum, wordnum)}.
        """
        pos = sentnum, wordnum

        # If the rule is already known to apply here, ignore.
        # (This only happens if the position's tag hasn't changed.)
        if pos in self._positions_by_rule[rule]:
            return

        # Record the effect (+1 fixes, -1 breaks, 0 neutral).
        correct_tag = train_sents[sentnum][wordnum][1]
        if rule.replacement_tag == correct_tag:
            self._positions_by_rule[rule][pos] = 1
        elif rule.original_tag == correct_tag:
            self._positions_by_rule[rule][pos] = -1
        else:  # was wrong, remains wrong
            self._positions_by_rule[rule][pos] = 0

        # Update _rules_by_position.
        self._rules_by_position[pos].add(rule)

        # Update _rule_scores.
        old_score = self._rule_scores[rule]
        self._rule_scores[rule] += self._positions_by_rule[rule][pos]

        # Update _rules_by_score.
        self._rules_by_score[old_score].discard(rule)
        self._rules_by_score[self._rule_scores[rule]].add(rule)

    def _update_rule_not_applies(self, rule, sentnum, wordnum):
        """
        Update the rule data tables to reflect the fact that C{rule}
        does not apply at the position C{(sentnum, wordnum)}.
        """
        pos = sentnum, wordnum

        # Update _rule_scores.
        old_score = self._rule_scores[rule]
        self._rule_scores[rule] -= self._positions_by_rule[rule][pos]

        # Update _rules_by_score.
        self._rules_by_score[old_score].discard(rule)
        self._rules_by_score[self._rule_scores[rule]].add(rule)

        # Update _positions_by_rule.
        del self._positions_by_rule[rule][pos]
        self._rules_by_position[pos].remove(rule)

    #////////////////////////////////////////////////////////////
    # Finding the best rule
    #////////////////////////////////////////////////////////////

    def _best_rule(self, train_sents, test_sents, min_score):
        """
        Find the next best rule.  This is done by repeatedly taking a
        rule with the highest score and stepping through the corpus to
        see where it applies.  When it makes an error (decreasing its
        score) it's bumped down, and we try a new rule with the
        highest score.  When we find a rule which has the highest
        score AND which has been tested against the entire corpus, we
        can conclude that it's the next best rule.
        """
        # Robustness guard: if no candidate rules exist at all, the
        # max() below would raise ValueError.
        if not self._rules_by_score:
            return None
        max_score = max(self._rules_by_score)

        while max_score >= min_score:
            best_rules = list(self._rules_by_score[max_score])
            if self._deterministic:
                best_rules.sort(key=repr)
            for rule in best_rules:
                positions = self._tag_positions[rule.original_tag]

                # Start the search at the position where we last left
                # off for this rule.
                unk = self._first_unknown_position.get(rule, (0, -1))
                start = bisect.bisect_left(positions, unk)

                # Scan forward looking for unexamined positions where
                # the rule applies; stop as soon as its score drops
                # below the current maximum.
                for i in range(start, len(positions)):
                    sentnum, wordnum = positions[i]
                    if rule.applies(test_sents[sentnum], wordnum):
                        self._update_rule_applies(rule, sentnum, wordnum,
                                                  train_sents)
                        if self._rule_scores[rule] < max_score:
                            self._first_unknown_position[rule] = (sentnum,
                                                                  wordnum + 1)
                            break  # The update demoted this rule.

                if self._rule_scores[rule] == max_score:
                    # The rule kept its score over the whole corpus;
                    # it is the next best rule.
                    self._first_unknown_position[rule] = (len(train_sents)
                                                          + 1, 0)
                    return rule

            # All rules with score==max_score were demoted.
            assert not self._rules_by_score[max_score]
            del self._rules_by_score[max_score]
            if len(self._rules_by_score) == 0:
                return None
            max_score = max(self._rules_by_score)

        # We reached the min-score threshold.
        return None

    def _apply_rule(self, rule, test_sents):
        """
        Update C{test_sents} by applying C{rule} everywhere where its
        conditions are met.
        """
        update_positions = set(self._positions_by_rule[rule])
        new_tag = rule.replacement_tag

        if self._trace > 3:
            self._trace_apply(len(update_positions))

        # Update the tags of all the positions where the rule applies.
        for (sentnum, wordnum) in update_positions:
            text = test_sents[sentnum][wordnum][0]
            test_sents[sentnum][wordnum] = (text, new_tag)

    def _update_tag_positions(self, rule):
        """
        Update _tag_positions to reflect the changes to tags that are
        made by C{rule}.
        """
        # Update the tag index.
        for pos in self._positions_by_rule[rule]:
            # Delete the old tag.
            old_tag_positions = self._tag_positions[rule.original_tag]
            old_index = bisect.bisect_left(old_tag_positions, pos)
            del old_tag_positions[old_index]
            # Insert the new tag, keeping the list sorted.
            new_tag_positions = self._tag_positions[rule.replacement_tag]
            bisect.insort_left(new_tag_positions, pos)

    def _update_rules(self, rule, train_sents, test_sents):
        """
        Check if we should add or remove any rules from consideration,
        given the changes made by C{rule}.
        """
        # Collect every position whose candidate rules could have been
        # affected by the rule's changes.
        neighbors = set()
        for sentnum, wordnum in self._positions_by_rule[rule]:
            for template in self._templates:
                n = template.get_neighborhood(test_sents[sentnum], wordnum)
                neighbors.update([(sentnum, i) for i in n])

        # Update the rule data tables at each affected position.
        num_obsolete = num_new = num_unseen = 0
        for sentnum, wordnum in neighbors:
            test_sent = test_sents[sentnum]
            correct_tag = train_sents[sentnum][wordnum][1]

            # Check if the change causes any rules that applied at
            # this position to stop matching.
            old_rules = set(self._rules_by_position[sentnum, wordnum])
            for old_rule in old_rules:
                if not old_rule.applies(test_sent, wordnum):
                    num_obsolete += 1
                    self._update_rule_not_applies(old_rule, sentnum, wordnum)

            # Check if the change causes our templates to propose any
            # new rules for this position.
            for template in self._templates:
                for new_rule in template.applicable_rules(test_sent, wordnum,
                                                          correct_tag):
                    if new_rule not in old_rules:
                        num_new += 1
                        if new_rule not in self._rule_scores:
                            num_unseen += 1
                        old_rules.add(new_rule)
                        self._update_rule_applies(new_rule, sentnum,
                                                  wordnum, train_sents)

            # We may have caused other (harmful or neutral) rules that
            # were already partially examined to start matching here;
            # re-check any rule whose first unknown position is past
            # this site.  (Iterate over a snapshot, since
            # _update_rule_applies mutates the score tables.)
            for new_rule, pos in list(self._first_unknown_position.items()):
                if pos > (sentnum, wordnum):
                    if new_rule not in old_rules:
                        num_new += 1
                        if new_rule.applies(test_sent, wordnum):
                            self._update_rule_applies(new_rule, sentnum,
                                                      wordnum, train_sents)

        if self._trace > 3:
            self._trace_update_rules(num_obsolete, num_new, num_unseen)

    #////////////////////////////////////////////////////////////
    # Tracing
    #////////////////////////////////////////////////////////////

    def _trace_header(self):
        print("""
           B      |
   S   F   r   O  |        Score = Fixed - Broken
   c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
   o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
   r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
   e   d   n   r  |  e
------------------+-------------------------------------------------------
        """.rstrip())

    def _trace_rule(self, rule):
        assert self._rule_scores[rule] == \
               sum(self._positions_by_rule[rule].values())

        changes = self._positions_by_rule[rule].values()
        num_fixed = len([c for c in changes if c == 1])
        num_broken = len([c for c in changes if c == -1])
        num_other = len([c for c in changes if c == 0])
        score = self._rule_scores[rule]

        if self._trace > 2:
            # Combined into one print call; output is identical to the
            # original two-statement Python 2 form.
            print('%4d%4d%4d%4d | %s' %
                  (score, num_fixed, num_broken, num_other,
                   textwrap.fill(str(rule), initial_indent=' ' * 20,
                                 subsequent_indent=' ' * 18 + '| ').strip()))
        else:
            print(rule)

    def _trace_apply(self, num_updates):
        prefix = ' ' * 18 + '|'
        print(prefix)
        print('%s Applying rule to %d positions.' % (prefix, num_updates))

    def _trace_update_rules(self, num_obsolete, num_new, num_unseen):
        prefix = ' ' * 18 + '|'
        print('%s Updated rule tables:' % prefix)
        print('%s  - %d rule applications removed' % (prefix, num_obsolete))
        print('%s  - %d rule applications added (%d novel)' %
              (prefix, num_new, num_unseen))
        print(prefix)
1177
1178
1179
1180
1181
1182
1183
1184
def error_list(train_sents, test_sents, radius=2):
    """
    Return a list of human-readable strings indicating the errors in the
    given tagging of the corpus.  The first element is a column header;
    each subsequent element describes one mistagged token with up to
    C{radius} tokens of context on either side.

    @param train_sents: The correct (gold) tagging of the corpus
    @type train_sents: C{list} of C{tuple}
    @param test_sents: The tagged corpus
    @type test_sents: C{list} of C{tuple}
    @param radius: How many tokens on either side of a wrongly-tagged token
        to include in the error string.  For example, if C{radius}=2,
        each error string will show the incorrect token plus two
        tokens on either side.
    @type radius: int
    """
    hdr = (('%25s | %s | %s\n' + '-'*26+'+'+'-'*24+'+'+'-'*26) %
           ('left context', 'word/test->gold'.center(22), 'right context'))
    errors = [hdr]
    for (train_sent, test_sent) in zip(train_sents, test_sents):
        for wordnum, (word, train_pos) in enumerate(train_sent):
            test_pos = test_sent[wordnum][1]
            if train_pos != test_pos:
                # BUGFIX: the original ignored `radius` entirely and always
                # used the whole sentence as context (then truncated the
                # rendered string to 25 characters).  Honor the documented
                # contract by slicing the context window first.
                start = max(0, wordnum - radius)
                left = ' '.join('%s/%s' % w
                                for w in train_sent[start:wordnum])
                right = ' '.join('%s/%s' % w
                                 for w in train_sent[wordnum+1:wordnum+1+radius])
                mid = '%s/%s->%s' % (word, test_pos, train_pos)
                errors.append('%25s | %s | %s' %
                              (left[-25:], mid.center(22), right[:25]))

    return errors
1214
1215
1216
1217
1218
def demo(num_sents=100, max_rules=200, min_score=3,
         error_output="errors.out", rule_output="rules.yaml",
         randomize=False, train=.8, trace=3):
    """
    Brill Tagger Demonstration: train a regexp -> unigram -> bigram backoff
    chain on treebank data, learn Brill transformation rules on top of it,
    report accuracies, and save the learned rules and tagging errors to disk.

    @param num_sents: how many sentences of training and testing data to use
    @type num_sents: L{int}
    @param max_rules: maximum number of rule instances to create
    @type max_rules: L{int}
    @param min_score: the minimum score for a rule in order for it to
        be considered
    @type min_score: L{int}
    @param error_output: the file where errors will be saved
    @type error_output: C{string}
    @param rule_output: the file where rules will be saved
    @type rule_output: C{string}
    @param randomize: whether the training data should be a random subset
        of the corpus
    @type randomize: boolean
    @param train: the fraction of the corpus to be used for training (1=all)
    @type train: L{float}
    @param trace: the level of diagnostic tracing output to produce (0-4)
    @type trace: L{int}
    """
    from nltk.corpus import treebank
    from nltk import tag
    from nltk.tag import brill

    # Baseline tagger: numbers -> CD, everything else -> NN.
    nn_cd_tagger = tag.RegexpTagger([(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
                                     (r'.*', 'NN')])

    # Load the corpus, then split it into training data and gold
    # (evaluation) data at `cutoff`.
    print("Loading tagged data... ")
    tagged_data = treebank.tagged_sents()
    if randomize:
        # BUGFIX: this branch previously referenced an undefined name
        # `sents`, raising NameError whenever randomize=True.  Materialize
        # the (possibly lazy) corpus view so it can be shuffled in place.
        tagged_data = list(tagged_data)
        random.seed(len(tagged_data))
        random.shuffle(tagged_data)
    cutoff = int(num_sents*train)
    training_data = tagged_data[:cutoff]
    gold_data = tagged_data[cutoff:num_sents]
    # Untagged version of the evaluation sentences, to be tagged below.
    testing_data = [[t[0] for t in sent] for sent in gold_data]
    print("Done loading.")  # BUGFIX: message was misspelled "Done lodaing."

    # Unigram tagger, backing off to the regexp baseline.
    print("Training unigram tagger:")
    unigram_tagger = tag.UnigramTagger(training_data,
                                       backoff=nn_cd_tagger)
    if gold_data:
        print(" [accuracy: %f]" % tag.accuracy(unigram_tagger, gold_data))

    # Bigram tagger, backing off to the unigram tagger.
    print("Training bigram tagger:")
    bigram_tagger = tag.BigramTagger(training_data,
                                     backoff=unigram_tagger)
    if gold_data:
        print(" [accuracy: %f]" % tag.accuracy(bigram_tagger, gold_data))

    # Brill tagger: learn transformation rules drawn from these templates
    # (tag/word context windows of various sizes) on top of the bigram tagger.
    templates = [
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,1)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (2,2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,3)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,1)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (2,2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,3)),
        brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1), (1,1)),
        brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1), (1,1)),
        ]
    trainer = brill.FastBrillTaggerTrainer(bigram_tagger, templates, trace)
    brill_tagger = trainer.train(training_data, max_rules, min_score)

    if gold_data:
        print("\nBrill accuracy: %f" % tag.accuracy(brill_tagger, gold_data))

    if trace <= 1:
        print("\nRules: ")
        for rule in brill_tagger.rules():
            print(str(rule))

    # Save the learned rules as YAML.  open() replaces the file() builtin
    # (removed in Python 3); try/finally ensures the handle is closed even
    # if serialization fails.
    print_rules = open(rule_output, 'w')
    try:
        yaml.dump(brill_tagger, print_rules)
    finally:
        print_rules.close()

    # Tag the held-out sentences with the Brill tagger and record its errors.
    testing_data = brill_tagger.batch_tag(testing_data)
    error_file = open(error_output, 'w')
    try:
        error_file.write('Errors for Brill Tagger %r\n\n' % rule_output)
        for e in error_list(gold_data, testing_data):
            error_file.write(e+'\n')
    finally:
        error_file.close()
    print("Done; rules and errors saved to %s and %s." %
          (rule_output, error_output))
1317
# Run the demonstration when this module is executed as a script.
if __name__ == '__main__':
    demo()
1320
1321