10 """
11 Classes for tagging sentences sequentially, left to right. The
12 abstract base class L{SequentialBackoffTagger} serves as the base
13 class for all the taggers in this module. Tagging of individual words
14 is performed by the method L{choose_tag()
15 <SequentialBackoffTagger.choose_tag>}, which is defined by
16 subclasses of L{SequentialBackoffTagger}. If a tagger is unable to
17 determine a tag for the specified token, then its I{backoff tagger} is
18 consulted instead. Any C{SequentialBackoffTagger} may serve as a
19 backoff tagger for any other C{SequentialBackoffTagger}.
20 """

import re, yaml

from nltk import FreqDist, ConditionalFreqDist
from nltk.internals import deprecated, Deprecated
from nltk.classify.naivebayes import NaiveBayesClassifier

from api import *
from util import *


class SequentialBackoffTagger(TaggerI):
35 """
36 An abstract base class for taggers that tags words sequentially,
37 left to right. Tagging of individual words is performed by the
38 method L{choose_tag()}, which should be defined by subclasses. If
39 a tagger is unable to determine a tag for the specified token,
40 then its backoff tagger is consulted.
41
42 @ivar _taggers: A list of all the taggers that should be tried to
43 tag a token (i.e., C{self} and its backoff taggers).
44 """
    def __init__(self, backoff=None):
        if backoff is None:
            self._taggers = [self]
        else:
            self._taggers = [self] + backoff._taggers

    def _get_backoff(self):
        if len(self._taggers) < 2: return None
        else: return self._taggers[1]
    backoff = property(_get_backoff, doc='''
        The backoff tagger for this tagger.''')

    def tag(self, tokens):
        # docs inherited from TaggerI
        tags = []
        for i in range(len(tokens)):
            tags.append(self.tag_one(tokens, i, tags))
        return zip(tokens, tags)

    def tag_one(self, tokens, index, history):
        """
        Determine an appropriate tag for the specified token, and
        return that tag.  If this tagger is unable to determine a tag
        for the specified token, then its backoff tagger is consulted.

        @rtype: C{str}
        @type tokens: C{list}
        @param tokens: The list of words that are being tagged.
        @type index: C{int}
        @param index: The index of the word whose tag should be
            returned.
        @type history: C{list} of C{str}
        @param history: A list of the tags for all words before
            C{index}.
        """
        tag = None
        for tagger in self._taggers:
            tag = tagger.choose_tag(tokens, index, history)
            if tag is not None: break
        return tag

87 """
88 Decide which tag should be used for the specified token, and
89 return that tag. If this tagger is unable to determine a tag
90 for the specified token, return C{None} -- do I{not} consult
91 the backoff tagger. This method should be overridden by
92 subclasses of C{SequentialBackoffTagger}.
93
94 @rtype: C{str}
95 @type tokens: C{list}
96 @param tokens: The list of words that are being tagged.
97 @type index: C{int}
98 @param index: The index of the word whose tag should be
99 returned.
100 @type history: C{list} of C{str}
101 @param history: A list of the tags for all words before
102 C{index}.
103 """
104 raise AssertionError('SequentialBackoffTagger is an abstract class')

    @deprecated('Use batch_tag instead.')
    def tag_sents(self, sents, verbose=False):
        return self.tag_batch(sents, verbose)

class ContextTagger(SequentialBackoffTagger):
    """
    An abstract base class for sequential backoff taggers that choose
    a tag for a token based on the value of its "context".  Different
    subclasses are used to define different contexts.

    A C{ContextTagger} chooses the tag for a token by calculating the
    token's context, and looking up the corresponding tag in a table.
    This table can be constructed manually; or it can be automatically
    constructed based on a training corpus, using the L{_train()}
    factory method.

    @ivar _context_to_tag: Dictionary mapping contexts to tags.
    """
    def __init__(self, context_to_tag, backoff=None):
        """
        @param context_to_tag: A dictionary mapping contexts to tags.
        @param backoff: The backoff tagger that should be used for this tagger.
        """
        SequentialBackoffTagger.__init__(self, backoff)
        if context_to_tag:
            self._context_to_tag = context_to_tag
        else:
            self._context_to_tag = {}

    def context(self, tokens, index, history):
        """
        @return: the context that should be used to look up the tag
            for the specified token; or C{None} if the specified token
            should not be handled by this tagger.
        @rtype: (hashable)
        """
        raise AssertionError('Abstract base class')

    def choose_tag(self, tokens, index, history):
        context = self.context(tokens, index, history)
        return self._context_to_tag.get(context)


    def size(self):
        """
        @return: The number of entries in the table used by this
            tagger to map from contexts to tags.
        """
        return len(self._context_to_tag)

    def __repr__(self):
        return '<%s: size=%d>' % (self.__class__.__name__, self.size())

    def _train(self, tagged_corpus, cutoff=1, verbose=False):
        """
        Initialize this C{ContextTagger}'s L{_context_to_tag} table
        based on the given training data.  In particular, for each
        context C{I{c}} in the training data, set
        C{_context_to_tag[I{c}]} to the most frequent tag for that
        context.  However, exclude any contexts that are already
        tagged perfectly by the backoff tagger(s).

        The old value of C{self._context_to_tag} (if any) is discarded.

        @param tagged_corpus: A tagged corpus.  Each item should be
            a C{list} of C{(word, tag)} tuples.
        @param cutoff: If the most likely tag for a context occurs
            C{cutoff} times or fewer, then exclude it from the
            context-to-tag table for the new tagger.
        """
        token_count = hit_count = 0

        # A context is considered 'useful' if it is not already
        # tagged perfectly by the backoff tagger(s).
        useful_contexts = set()

        # Count how many times each tag occurs in each context.
        fd = ConditionalFreqDist()
        for sentence in tagged_corpus:
            tokens, tags = zip(*sentence)
            for index, (token, tag) in enumerate(sentence):
                # Record the event.
                token_count += 1
                context = self.context(tokens, index, tags[:index])
                if context is None: continue
                fd[context].inc(tag)
                # If the backoff got it wrong, this context is useful:
                if (self.backoff is None or
                    tag != self.backoff.tag_one(tokens, index, tags[:index])):
                    useful_contexts.add(context)

        # Build the context-to-tag table: for each useful context,
        # record the most frequent tag, as long as it occurred more
        # than `cutoff` times.
        for context in useful_contexts:
            best_tag = fd[context].max()
            hits = fd[context][best_tag]
            if hits > cutoff:
                self._context_to_tag[context] = best_tag
                hit_count += hits

        # Display some stats, if requested.
        if verbose:
            size = len(self._context_to_tag)
            backoff = 100 - (hit_count * 100.0) / token_count
            pruning = 100 - (size * 100.0) / len(fd.conditions())
            print "[Trained %s:" % self.__class__.__name__,
            print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
                size, backoff, pruning)
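
    # Here "backoff" is the percentage of training tokens whose tagging
    # is left to the backoff tagger(s), and "pruning" is the percentage
    # of observed contexts that were dropped from the table (either
    # because they fell at or below the cutoff, or because the backoff
    # tagger already handled them correctly).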


class DefaultTagger(SequentialBackoffTagger, yaml.YAMLObject):
    """
    A tagger that assigns the same tag to every token.
    """
    yaml_tag = '!nltk.DefaultTagger'

    def __init__(self, tag):
        """
        Construct a new tagger that assigns C{tag} to all tokens.
        """
        self._tag = tag
        SequentialBackoffTagger.__init__(self, None)

    def choose_tag(self, tokens, index, history):
        return self._tag  # ignore token and history

    def __repr__(self):
        return '<DefaultTagger: tag=%s>' % self._tag


class NgramTagger(ContextTagger, yaml.YAMLObject):
    """
    A tagger that chooses a token's tag based on its word string and
    on the preceding I{n-1} words' tags.  In particular, a tuple
    C{(tags[i-n+1:i], words[i])} is looked up in a table, and the
    corresponding tag is returned.  N-gram taggers are typically
    trained on a tagged corpus.
    """
    yaml_tag = '!nltk.NgramTagger'

    def __init__(self, n, train=None, model=None, backoff=None,
                 cutoff=1, verbose=False):
        """
        Train a new C{NgramTagger} using the given training data or
        the supplied model.  In particular, construct a new tagger
        whose table maps from each context C{(tags[i-n+1:i], words[i])}
        to the most frequent tag for that context.  But exclude any
        contexts that are already tagged perfectly by the backoff
        tagger.

        @param train: A tagged corpus.  Each item should be
            a C{list} of C{(word, tag)} tuples.
        @param backoff: A backoff tagger, to be used by the new
            tagger if it encounters an unknown context.
        @param cutoff: If the most likely tag for a context occurs
            C{cutoff} times or fewer, then exclude it from the
            context-to-tag table for the new tagger.
        """
        self._n = n

        if (train and model) or (not train and not model):
            raise ValueError('Must specify either training data or trained model.')
        ContextTagger.__init__(self, model, backoff)
        if train:
            self._train(train, cutoff, verbose)

    def context(self, tokens, index, history):
        tag_context = tuple(history[max(0, index-self._n+1):index])
        return (tag_context, tokens[index])
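
    # For example, with n=3 the context for the third token of
    # ['the', 'dog', 'barked'] given history ['DT', 'NN'] is
    # (('DT', 'NN'), 'barked'): the two previous tags plus the word.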


class UnigramTagger(NgramTagger):
    """
    A tagger that chooses a token's tag based on its word string.
    Unigram taggers are typically trained on a tagged corpus.
    """
    yaml_tag = '!nltk.UnigramTagger'

    def __init__(self, train=None, model=None, backoff=None,
                 cutoff=1, verbose=False):
        NgramTagger.__init__(self, 1, train, model, backoff,
                             cutoff, verbose)

    def context(self, tokens, index, history):
        return tokens[index]
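
# Example: a unigram tagger's context is just the word itself.  A
# minimal sketch (toy data; note that with the default cutoff=1 a
# context must be seen more than once to enter the table):
#
#     >>> train = [[('the', 'DT'), ('dog', 'NN')],
#     ...          [('the', 'DT'), ('cat', 'NN')]]
#     >>> tagger = UnigramTagger(train=train, backoff=DefaultTagger('NN'))
#     >>> tagger.tag(['the', 'fox'])
#     [('the', 'DT'), ('fox', 'NN')]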


class BigramTagger(NgramTagger):
    """
    A tagger that chooses a token's tag based on its word string and on
    the preceding word's tag.  In particular, a tuple consisting
    of the previous tag and the word is looked up in a table, and
    the corresponding tag is returned.  Bigram taggers are typically
    trained on a tagged corpus.
    """
    yaml_tag = '!nltk.BigramTagger'

    def __init__(self, train=None, model=None, backoff=None,
                 cutoff=1, verbose=False):
        NgramTagger.__init__(self, 2, train, model, backoff,
                             cutoff, verbose)


class TrigramTagger(NgramTagger):
    """
    A tagger that chooses a token's tag based on its word string and on
    the preceding two words' tags.  In particular, a tuple consisting
    of the previous two tags and the word is looked up in a table, and
    the corresponding tag is returned.  Trigram taggers are typically
    trained on a tagged corpus.
    """
    yaml_tag = '!nltk.TrigramTagger'

    def __init__(self, train=None, model=None, backoff=None,
                 cutoff=1, verbose=False):
        NgramTagger.__init__(self, 3, train, model, backoff,
                             cutoff, verbose)


class AffixTagger(ContextTagger, yaml.YAMLObject):
    """
    A tagger that chooses a token's tag based on a leading or trailing
    substring of its word string.  (It is important to note that these
    substrings are not necessarily "true" morphological affixes).  In
    particular, a fixed-length substring of the word is looked up in a
    table, and the corresponding tag is returned.  Affix taggers are
    typically constructed by training them on a tagged corpus; see
    L{the constructor <__init__>}.
    """
    yaml_tag = '!nltk.AffixTagger'

    def __init__(self, train=None, model=None, affix_length=-3,
                 min_stem_length=2, backoff=None, cutoff=1, verbose=False):
        """
        Construct a new affix tagger.

        @param affix_length: The length of the affixes that should be
            considered during training and tagging.  Use negative
            numbers for suffixes.
        @param min_stem_length: Any words whose length is less than
            C{min_stem_length+abs(affix_length)} will be assigned a
            tag of C{None} by this tagger.
        """
        if (train and model) or (not train and not model):
            raise ValueError('Must specify either training data or '
                             'trained model')
        ContextTagger.__init__(self, model, backoff)

        self._affix_length = affix_length
        self._min_word_length = min_stem_length + abs(affix_length)

        if train:
            self._train(train, cutoff, verbose)

    def context(self, tokens, index, history):
        token = tokens[index]
        if len(token) < self._min_word_length:
            return None
        elif self._affix_length > 0:
            return token[:self._affix_length]
        else:
            return token[self._affix_length:]
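
    # For example, with the default affix_length=-3 and min_stem_length=2,
    # the context for 'running' is its suffix 'ing', while 'cat' (shorter
    # than 2+3 characters) gets the context None and is therefore left to
    # the backoff tagger.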


class RegexpTagger(SequentialBackoffTagger, yaml.YAMLObject):
    """
    A tagger that assigns tags to words based on regular expressions
    over word strings.
    """
    yaml_tag = '!nltk.RegexpTagger'
    def __init__(self, regexps, backoff=None):
        """
        Construct a new regexp tagger.

        @type regexps: C{list} of C{(str, str)}
        @param regexps: A list of C{(regexp, tag)} pairs, each of
            which indicates that a word matching C{regexp} should
            be tagged with C{tag}.  The pairs will be evaluated in
            order.  If none of the regexps match a word, the
            optional backoff tagger is consulted; if no backoff
            tagger is given, the word is assigned the tag C{None}.
        """
        self._regexps = regexps
        SequentialBackoffTagger.__init__(self, backoff)

    def choose_tag(self, tokens, index, history):
        for regexp, tag in self._regexps:
            if re.match(regexp, tokens[index]): # ignore history
                return tag
        return None

    def __repr__(self):
        return '<Regexp Tagger: size=%d>' % len(self._regexps)

class ClassifierBasedTagger(SequentialBackoffTagger, FeaturesetTaggerI):
    """
    A sequential tagger that uses a classifier to choose the tag for
    each token in a sentence.  The featureset input for the classifier
    is generated by a feature detector function::

        feature_detector(tokens, index, history) -> featureset

    Where C{tokens} is the list of unlabeled tokens in the sentence;
    C{index} is the index of the token for which feature detection
    should be performed; and C{history} is a list of the tags for all
    tokens before C{index}.
    """
    def __init__(self, feature_detector, train=None,
                 classifier_builder=NaiveBayesClassifier.train,
                 classifier=None, backoff=None,
                 cutoff_prob=None, verbose=False):
        """
        Construct a new classifier-based sequential tagger.

        @param feature_detector: A function used to generate the
            featureset input for the classifier::
                feature_detector(tokens, index, history) -> featureset

        @param train: A tagged corpus.  Each item should be a C{list}
            of C{(word, tag)} tuples.

        @param classifier_builder: A function used to train a new
            classifier based on the data in C{train}.  It should take
            one argument, a list of labeled featuresets (i.e.,
            C{(featureset, label)} tuples).

        @param classifier: The classifier that should be used by the
            tagger.  This is only useful if you want to manually
            construct the classifier; normally, you would use C{train}
            instead.

        @param backoff: A backoff tagger, used if this tagger is
            unable to determine a tag for a given token.

        @param cutoff_prob: If specified, then this tagger will fall
            back on its backoff tagger if the probability of the most
            likely tag is less than C{cutoff_prob}.
        """
        SequentialBackoffTagger.__init__(self, backoff)

        if (train and classifier) or (not train and not classifier):
            raise ValueError('Must specify either training data or '
                             'trained classifier.')

        self._feature_detector = feature_detector
        """The feature detector function, used to generate a featureset
           for each token::
               feature_detector(tokens, index, history) -> featureset"""

        self._cutoff_prob = cutoff_prob
        """Cutoff probability for tagging -- if the probability of the
           most likely tag is less than this, then use backoff."""

        self._classifier = classifier
        """The classifier used to choose a tag for each token."""

        if train:
            self._train(train, classifier_builder, verbose)

    def choose_tag(self, tokens, index, history):
        # Use our feature detector to get the featureset.
        featureset = self._feature_detector(tokens, index, history)

        # Use our classifier to pick a tag.  If a cutoff probability
        # was specified, check that the most likely tag's probability
        # meets that cutoff first; otherwise, return None.
        if self._cutoff_prob is None:
            return self._classifier.classify(featureset)
        else:
            pdist = self._classifier.prob_classify(featureset)
            tag = pdist.max()
            if pdist.prob(tag) >= self._cutoff_prob:
                return tag
            else:
                return None

    def _train(self, tagged_corpus, classifier_builder, verbose):
        """
        Build a new classifier, based on the given training data
        (C{tagged_corpus}).
        """
        classifier_corpus = []
        if verbose:
            print 'Constructing training corpus for classifier.'

        for sentence in tagged_corpus:
            history = []
            untagged_sentence = [tok for (tok, tag) in sentence]
            tags = [tag for (tok, tag) in sentence]
            for index in range(len(sentence)):
                featureset = self._feature_detector(untagged_sentence,
                                                    index, history)
                classifier_corpus.append( (featureset, tags[index]) )
                history.append(tags[index])

        if verbose:
            print 'Training classifier (%d instances)' % len(classifier_corpus)
        self._classifier = classifier_builder(classifier_corpus)

512 return '<ClassifierBasedTagger: %r>' % self._classifier
513
515 """
516 Return the feature detector that this tagger uses to generate
517 featuresets for its classifier. The feature detector is a
518 function with the signature::
519
520 feature_detector(tokens, index, history) -> featureset
521
522 @see: L{classifier()}
523 """
524 return self._feature_detector

    def classifier(self):
        """
        Return the classifier that this tagger uses to choose a tag
        for each word in a sentence.  The input for this classifier is
        generated using this tagger's feature detector.

        @see: L{feature_detector()}
        """
        return self._classifier