10 """
11 Classes for tagging sentences sequentially, left to right. The
12 abstract base class L{SequentialBackoffTagger} serves as the base
13 class for all the taggers in this module. Tagging of individual words
14 is performed by the method L{choose_tag()
15 <SequentialBackoffTagger.choose_tag>}, which is defined by
16 subclasses of L{SequentialBackoffTagger}. If a tagger is unable to
17 determine a tag for the specified token, then its I{backoff tagger} is
18 consulted instead. Any C{SequentialBackoffTagger} may serve as a
19 backoff tagger for any other C{SequentialBackoffTagger}.
20 """

import re, yaml

from nltk import FreqDist, ConditionalFreqDist
from nltk.internals import deprecated, Deprecated
from nltk.classify.naivebayes import NaiveBayesClassifier

from api import *
from util import *


class SequentialBackoffTagger(TaggerI):
35 """
36 An abstract base class for taggers that tags words sequentially,
37 left to right. Tagging of individual words is performed by the
38 method L{choose_tag()}, which should be defined by subclasses. If
39 a tagger is unable to determine a tag for the specified token,
40 then its backoff tagger is consulted.
41
42 @ivar _taggers: A list of all the taggers that should be tried to
43 tag a token (i.e., C{self} and its backoff taggers).
44 """
    def __init__(self, backoff=None):
        if backoff is None:
            self._taggers = [self]
        else:
            self._taggers = [self] + backoff._taggers

    def _get_backoff(self):
        if len(self._taggers) < 2: return None
        else: return self._taggers[1]
    backoff = property(_get_backoff, doc='''
        The backoff tagger for this tagger.''')

    def tag(self, tokens):
        # docs inherited from TaggerI
        tags = []
        for i in range(len(tokens)):
            tags.append(self.tag_one(tokens, i, tags))
        return zip(tokens, tags)

    def tag_one(self, tokens, index, history):
        """
        Determine an appropriate tag for the specified token, and
        return that tag.  If this tagger is unable to determine a tag
        for the specified token, then its backoff tagger is consulted.

        @rtype: C{str}
        @type tokens: C{list}
        @param tokens: The list of words that are being tagged.
        @type index: C{int}
        @param index: The index of the word whose tag should be
            returned.
        @type history: C{list} of C{str}
        @param history: A list of the tags for all words before
            C{index}.
        """
        tag = None
        for tagger in self._taggers:
            tag = tagger.choose_tag(tokens, index, history)
            if tag is not None: break
        return tag

87 """
88 Decide which tag should be used for the specified token, and
89 return that tag. If this tagger is unable to determine a tag
90 for the specified token, return C{None} -- do I{not} consult
91 the backoff tagger. This method should be overridden by
92 subclasses of C{SequentialBackoffTagger}.
93
94 @rtype: C{str}
95 @type tokens: C{list}
96 @param tokens: The list of words that are being tagged.
97 @type index: C{int}
98 @param index: The index of the word whose tag should be
99 returned.
100 @type history: C{list} of C{str}
101 @param history: A list of the tags for all words before
102 C{index}.
103 """
104 raise AssertionError('SequentialBackoffTagger is an abstract class')

    @deprecated('Use batch_tag instead.')
    def tag_sents(self, sents, verbose=False):
        return self.tag_batch(sents, verbose)

class ContextTagger(SequentialBackoffTagger):
    """
    An abstract base class for sequential backoff taggers that choose
    a tag for a token based on the value of its "context".  Different
    subclasses are used to define different contexts.

    A C{ContextTagger} chooses the tag for a token by calculating the
    token's context, and looking up the corresponding tag in a table.
    This table can be constructed manually; or it can be automatically
    constructed based on a training corpus, using the L{_train()}
    factory method.

    @ivar _context_to_tag: Dictionary mapping contexts to tags.
    """
    def __init__(self, context_to_tag, backoff=None):
        """
        @param context_to_tag: A dictionary mapping contexts to tags.
        @param backoff: The backoff tagger that should be used for this tagger.
        """
        SequentialBackoffTagger.__init__(self, backoff)
        if context_to_tag:
            self._context_to_tag = context_to_tag
        else:
            self._context_to_tag = {}

    def context(self, tokens, index, history):
        """
        @return: the context that should be used to look up the tag
            for the specified token; or C{None} if the specified token
            should not be handled by this tagger.
        @rtype: (hashable)
        """
        raise AssertionError('Abstract base class')

    def choose_tag(self, tokens, index, history):
        context = self.context(tokens, index, history)
        return self._context_to_tag.get(context)


    def size(self):
        """
        @return: The number of entries in the table used by this
            tagger to map from contexts to tags.
        """
        return len(self._context_to_tag)

    def __repr__(self):
        return '<%s: size=%d>' % (self.__class__.__name__, self.size())

    def _train(self, tagged_corpus, cutoff=1, verbose=False):
        """
        Initialize this C{ContextTagger}'s L{_context_to_tag} table
        based on the given training data.  In particular, for each
        context C{I{c}} in the training data, set
        C{_context_to_tag[I{c}]} to the most frequent tag for that
        context.  However, exclude any contexts that are already
        tagged perfectly by the backoff tagger(s).

        The old value of C{self._context_to_tag} (if any) is discarded.

        @param tagged_corpus: A tagged corpus.  Each item should be
            a C{list} of C{(word, tag)} tuples.
        @param cutoff: If the most likely tag for a context occurs
            C{cutoff} times or fewer, then exclude it from the
            context-to-tag table for the new tagger.
        """
        token_count = hit_count = 0

        # A context is considered 'useful' if it is not already
        # tagged perfectly by the backoff tagger(s).
        useful_contexts = set()

        # Count how many times each tag occurs in each context.
        fd = ConditionalFreqDist()
        for sentence in tagged_corpus:
            tokens, tags = zip(*sentence)
            for index, (token, tag) in enumerate(sentence):
                # Record the event.
                token_count += 1
                context = self.context(tokens, index, tags[:index])
                if context is None: continue
                fd[context].inc(tag)
                # If the backoff got it wrong, this context is useful:
                if (self.backoff is None or
                    tag != self.backoff.tag_one(tokens, index, tags[:index])):
                    useful_contexts.add(context)

        # Build the context-to-tag table: for each useful context,
        # record the most frequent tag, as long as it occurred more
        # than `cutoff` times.
        for context in useful_contexts:
            best_tag = fd[context].max()
            hits = fd[context][best_tag]
            if hits > cutoff:
                self._context_to_tag[context] = best_tag
                hit_count += hits

        # Display some stats, if requested.
        if verbose:
            size = len(self._context_to_tag)
            backoff = 100 - (hit_count * 100.0) / token_count
            pruning = 100 - (size * 100.0) / len(fd.conditions())
            print "[Trained %s:" % self.__class__.__name__,
            print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
                size, backoff, pruning)
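
    # Here "backoff" is the percentage of training tokens whose tagging
    # is left to the backoff tagger(s), and "pruning" is the percentage
    # of observed contexts that were dropped from the table (either
    # because they fell at or below the cutoff, or because the backoff
    # tagger already handled them correctly).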


class DefaultTagger(SequentialBackoffTagger, yaml.YAMLObject):
    """
    A tagger that assigns the same tag to every token.
    """
    yaml_tag = '!nltk.DefaultTagger'

    def __init__(self, tag):
        """
        Construct a new tagger that assigns C{tag} to all tokens.
        """
        self._tag = tag
        SequentialBackoffTagger.__init__(self, None)

    def choose_tag(self, tokens, index, history):
        return self._tag  # ignore token and history

    def __repr__(self):
        return '<DefaultTagger: tag=%s>' % self._tag


class NgramTagger(ContextTagger, yaml.YAMLObject):
    """
    A tagger that chooses a token's tag based on its word string and
    on the preceding I{n-1} words' tags.  In particular, a tuple
    C{(tags[i-n+1:i], words[i])} is looked up in a table, and the
    corresponding tag is returned.  N-gram taggers are typically
    trained on a tagged corpus.
    """
    yaml_tag = '!nltk.NgramTagger'

    def __init__(self, n, train=None, model=None, backoff=None,
                 cutoff=1, verbose=False):
        """
        Train a new C{NgramTagger} using the given training data or
        the supplied model.  In particular, construct a new tagger
        whose table maps from each context C{(tags[i-n+1:i], words[i])}
        to the most frequent tag for that context.  But exclude any
        contexts that are already tagged perfectly by the backoff
        tagger.

        @param train: A tagged corpus.  Each item should be
            a C{list} of C{(word, tag)} tuples.
        @param backoff: A backoff tagger, to be used by the new
            tagger if it encounters an unknown context.
        @param cutoff: If the most likely tag for a context occurs
            C{cutoff} times or fewer, then exclude it from the
            context-to-tag table for the new tagger.
        """
        self._n = n

        if (train and model) or (not train and not model):
            raise ValueError('Must specify either training data or trained model.')
        ContextTagger.__init__(self, model, backoff)
        if train:
            self._train(train, cutoff, verbose)

    def context(self, tokens, index, history):
        tag_context = tuple(history[max(0, index-self._n+1):index])
        return (tag_context, tokens[index])
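
    # For example, with n=3 the context for the third token of
    # ['the', 'dog', 'barked'] given history ['DT', 'NN'] is
    # (('DT', 'NN'), 'barked'): the two previous tags plus the word.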


class UnigramTagger(NgramTagger):
    """
    A tagger that chooses a token's tag based on its word string.
    Unigram taggers are typically trained on a tagged corpus.
    """
    yaml_tag = '!nltk.UnigramTagger'

    def __init__(self, train=None, model=None, backoff=None,
                 cutoff=1, verbose=False):
        NgramTagger.__init__(self, 1, train, model, backoff,
                             cutoff, verbose)

    def context(self, tokens, index, history):
        return tokens[index]
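
# Example: a unigram tagger's context is just the word itself.  A
# minimal sketch (toy data; note that with the default cutoff=1 a
# context must be seen more than once to enter the table):
#
#     >>> train = [[('the', 'DT'), ('dog', 'NN')],
#     ...          [('the', 'DT'), ('cat', 'NN')]]
#     >>> tagger = UnigramTagger(train=train, backoff=DefaultTagger('NN'))
#     >>> tagger.tag(['the', 'fox'])
#     [('the', 'DT'), ('fox', 'NN')]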


class BigramTagger(NgramTagger):
    """
    A tagger that chooses a token's tag based on its word string and on
    the preceding word's tag.  In particular, a tuple consisting
    of the previous tag and the word is looked up in a table, and
    the corresponding tag is returned.  Bigram taggers are typically
    trained on a tagged corpus.
    """
    yaml_tag = '!nltk.BigramTagger'

    def __init__(self, train=None, model=None, backoff=None,
                 cutoff=1, verbose=False):
        NgramTagger.__init__(self, 2, train, model, backoff,
                             cutoff, verbose)


class TrigramTagger(NgramTagger):
    """
    A tagger that chooses a token's tag based on its word string and on
    the preceding two words' tags.  In particular, a tuple consisting
    of the previous two tags and the word is looked up in a table, and
    the corresponding tag is returned.  Trigram taggers are typically
    trained on a tagged corpus.
    """
    yaml_tag = '!nltk.TrigramTagger'

    def __init__(self, train=None, model=None, backoff=None,
                 cutoff=1, verbose=False):
        NgramTagger.__init__(self, 3, train, model, backoff,
                             cutoff, verbose)


class AffixTagger(ContextTagger, yaml.YAMLObject):
    """
    A tagger that chooses a token's tag based on a leading or trailing
    substring of its word string.  (It is important to note that these
    substrings are not necessarily "true" morphological affixes).  In
    particular, a fixed-length substring of the word is looked up in a
    table, and the corresponding tag is returned.  Affix taggers are
    typically constructed by training them on a tagged corpus; see
    L{the constructor <__init__>}.
    """
    yaml_tag = '!nltk.AffixTagger'

    def __init__(self, train=None, model=None, affix_length=-3,
                 min_stem_length=2, backoff=None, cutoff=1, verbose=False):
        """
        Construct a new affix tagger.

        @param affix_length: The length of the affixes that should be
            considered during training and tagging.  Use negative
            numbers for suffixes.
        @param min_stem_length: Any words whose length is less than
            C{min_stem_length+abs(affix_length)} will be assigned a
            tag of C{None} by this tagger.
        """
        if (train and model) or (not train and not model):
            raise ValueError('Must specify either training data or '
                             'trained model')
        ContextTagger.__init__(self, model, backoff)

        self._affix_length = affix_length
        self._min_word_length = min_stem_length + abs(affix_length)

        if train:
            self._train(train, cutoff, verbose)

    def context(self, tokens, index, history):
        token = tokens[index]
        if len(token) < self._min_word_length:
            return None
        elif self._affix_length > 0:
            return token[:self._affix_length]
        else:
            return token[self._affix_length:]
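
    # For example, with the default affix_length=-3 and min_stem_length=2,
    # the context for 'running' is its suffix 'ing', while 'cat' (shorter
    # than 2+3 characters) gets the context None and is therefore left to
    # the backoff tagger.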


class RegexpTagger(SequentialBackoffTagger, yaml.YAMLObject):
    """
    A tagger that assigns tags to words based on regular expressions
    over word strings.
    """
    yaml_tag = '!nltk.RegexpTagger'
    def __init__(self, regexps, backoff=None):
        """
        Construct a new regexp tagger.

        @type regexps: C{list} of C{(str, str)}
        @param regexps: A list of C{(regexp, tag)} pairs, each of
            which indicates that a word matching C{regexp} should
            be tagged with C{tag}.  The pairs will be evaluated in
            order.  If none of the regexps match a word, the
            optional backoff tagger is consulted; if no backoff
            tagger is given, the word is assigned the tag C{None}.
        """
        self._regexps = regexps
        SequentialBackoffTagger.__init__(self, backoff)

    def choose_tag(self, tokens, index, history):
        for regexp, tag in self._regexps:
            if re.match(regexp, tokens[index]): # ignore history
                return tag
        return None

    def __repr__(self):
        return '<Regexp Tagger: size=%d>' % len(self._regexps)

class ClassifierBasedTagger(SequentialBackoffTagger, FeaturesetTaggerI):
    """
    A sequential tagger that uses a classifier to choose the tag for
    each token in a sentence.  The featureset input for the classifier
    is generated by a feature detector function::

        feature_detector(tokens, index, history) -> featureset

    Where C{tokens} is the list of unlabeled tokens in the sentence;
    C{index} is the index of the token for which feature detection
    should be performed; and C{history} is a list of the tags for all
    tokens before C{index}.
    """
    def __init__(self, feature_detector, train=None,
                 classifier_builder=NaiveBayesClassifier.train,
                 classifier=None, backoff=None,
                 cutoff_prob=None, verbose=False):
        """
        Construct a new classifier-based sequential tagger.

        @param feature_detector: A function used to generate the
            featureset input for the classifier::
                feature_detector(tokens, index, history) -> featureset

        @param train: A tagged corpus.  Each item should be a C{list}
            of C{(word, tag)} tuples.

        @param classifier_builder: A function used to train a new
            classifier based on the data in C{train}.  It should take
            one argument, a list of labeled featuresets (i.e.,
            C{(featureset, label)} tuples).

        @param classifier: The classifier that should be used by the
            tagger.  This is only useful if you want to manually
            construct the classifier; normally, you would use C{train}
            instead.

        @param backoff: A backoff tagger, used if this tagger is
            unable to determine a tag for a given token.

        @param cutoff_prob: If specified, then this tagger will fall
            back on its backoff tagger if the probability of the most
            likely tag is less than C{cutoff_prob}.
        """
        SequentialBackoffTagger.__init__(self, backoff)

        if (train and classifier) or (not train and not classifier):
            raise ValueError('Must specify either training data or '
                             'trained classifier.')

        self._feature_detector = feature_detector
        """The feature detector function, used to generate a featureset
           for each token::
               feature_detector(tokens, index, history) -> featureset"""

        self._cutoff_prob = cutoff_prob
        """Cutoff probability for tagging -- if the probability of the
           most likely tag is less than this, then use backoff."""

        self._classifier = classifier
        """The classifier used to choose a tag for each token."""

        if train:
            self._train(train, classifier_builder, verbose)

    def choose_tag(self, tokens, index, history):
        # Use our feature detector to get the featureset.
        featureset = self._feature_detector(tokens, index, history)

        # Use our classifier to pick a tag.  If a cutoff probability
        # was specified, check that the most likely tag's probability
        # meets that cutoff first; otherwise, return None.
        if self._cutoff_prob is None:
            return self._classifier.classify(featureset)
        else:
            pdist = self._classifier.prob_classify(featureset)
            tag = pdist.max()
            if pdist.prob(tag) >= self._cutoff_prob:
                return tag
            else:
                return None

    def _train(self, tagged_corpus, classifier_builder, verbose):
        """
        Build a new classifier, based on the given training data
        (C{tagged_corpus}).
        """
        classifier_corpus = []
        if verbose:
            print 'Constructing training corpus for classifier.'

        for sentence in tagged_corpus:
            history = []
            untagged_sentence = [tok for (tok, tag) in sentence]
            tags = [tag for (tok, tag) in sentence]
            for index in range(len(sentence)):
                featureset = self._feature_detector(untagged_sentence,
                                                    index, history)
                classifier_corpus.append( (featureset, tags[index]) )
                history.append(tags[index])

        if verbose:
            print 'Training classifier (%d instances)' % len(classifier_corpus)
        self._classifier = classifier_builder(classifier_corpus)

512 return '<ClassifierBasedTagger: %r>' % self._classifier
513
515 """
516 Return the feature detector that this tagger uses to generate
517 featuresets for its classifier. The feature detector is a
518 function with the signature::
519
520 feature_detector(tokens, index, history) -> featureset
521
522 @see: L{classifier()}
523 """
524 return self._feature_detector

    def classifier(self):
        """
        Return the classifier that this tagger uses to choose a tag
        for each word in a sentence.  The input for this classifier is
        generated using this tagger's feature detector.

        @see: L{feature_detector()}
        """
        return self._classifier