Code Coverage for nltk.tag.sequential
Untested Functions
|
Partially Tested Functions
|
"""
Classes for tagging sentences sequentially, left to right. The
abstract base class L{SequentialBackoffTagger} serves as the base
class for all the taggers in this module. Tagging of individual words
is performed by the method L{choose_tag()
<SequentialBackoffTagger.choose_tag>}, which is defined by
subclasses of L{SequentialBackoffTagger}. If a tagger is unable to
determine a tag for the specified token, then its I{backoff tagger} is
consulted instead. Any C{SequentialBackoffTagger} may serve as a
backoff tagger for any other C{SequentialBackoffTagger}.
"""
import re, yaml
from nltk import FreqDist, ConditionalFreqDist
from nltk.internals import deprecated, Deprecated
from nltk.classify.naivebayes import NaiveBayesClassifier
from api import *
from util import *
class SequentialBackoffTagger(TaggerI):
"""
An abstract base class for taggers that tags words sequentially,
left to right. Tagging of individual words is performed by the
method L{choose_tag()}, which should be defined by subclasses. If
a tagger is unable to determine a tag for the specified token,
then its backoff tagger is consulted.
@ivar _taggers: A list of all the taggers that should be tried to
tag a token (i.e., C{self} and its backoff taggers).
"""
def __init__(self, backoff=None):
if backoff is None:
self._taggers = [self]
else:
self._taggers = [self] + backoff._taggers
def _get_backoff(self):
if len(self._taggers) < 2: return None
else: return self._taggers[1]
backoff = property(_get_backoff, doc='''
The backoff tagger for this tagger.''')
def tag(self, tokens):
tags = []
for i in range(len(tokens)):
tags.append(self.tag_one(tokens, i, tags))
return zip(tokens, tags)
def tag_one(self, tokens, index, history):
"""
Determine an appropriate tag for the specified token, and
return that tag. If this tagger is unable to determine a tag
for the specified token, then its backoff tagger is consulted.
@rtype: C{str}
@type tokens: C{list}
@param tokens: The list of words that are being tagged.
@type index: C{int}
@param index: The index of the word whose tag should be
returned.
@type history: C{list} of C{str}
@param history: A list of the tags for all words before
C{index}.
"""
tag = None
for tagger in self._taggers:
tag = tagger.choose_tag(tokens, index, history)
if tag is not None: break
return tag
def choose_tag(self, tokens, index, history):
"""
Decide which tag should be used for the specified token, and
return that tag. If this tagger is unable to determine a tag
for the specified token, return C{None} -- do I{not} consult
the backoff tagger. This method should be overridden by
subclasses of C{SequentialBackoffTagger}.
@rtype: C{str}
@type tokens: C{list}
@param tokens: The list of words that are being tagged.
@type index: C{int}
@param index: The index of the word whose tag should be
returned.
@type history: C{list} of C{str}
@param history: A list of the tags for all words before
C{index}.
"""
raise AssertionError('SequentialBackoffTagger is an abstract class')
@deprecated('Use batch_tag instead.')
def tag_sents(self, sents, verbose=False):
self.tag_batch(sents, verbose)
class ContextTagger(SequentialBackoffTagger):
"""
An abstract base class for sequential backoff taggers that choose
a tag for a token based on the value of its "context". Different
subclasses are used to define different contexts.
A C{ContextTagger} chooses the tag for a token by calculating the
token's context, and looking up the corresponding tag in a table.
This table can be constructed manually; or it can be automatically
constructed based on a training corpus, using the L{_train()}
factory method.
@ivar _context_to_tag: Dictionary mapping contexts to tags.
"""
def __init__(self, context_to_tag, backoff=None):
"""
@param context_to_tag: A dictionary mapping contexts to tags.
@param backoff: The backoff tagger that should be used for this tagger.
"""
SequentialBackoffTagger.__init__(self, backoff)
if context_to_tag:
self._context_to_tag = context_to_tag
else:
self._context_to_tag = {}
def context(self, tokens, index, history):
"""
@return: the context that should be used to look up the tag
for the specified token; or C{None} if the specified token
should not be handled by this tagger.
@rtype: (hashable)
"""
raise AssertionError('Abstract base class')
def choose_tag(self, tokens, index, history):
context = self.context(tokens, index, history)
return self._context_to_tag.get(context)
def size(self):
"""
@return: The number of entries in the table used by this
tagger to map from contexts to tags.
"""
return len(self._context_to_tag)
def __repr__(self):
return '<%s: size=%d>' % (self.__class__.__name__, self.size())
def _train(self, tagged_corpus, cutoff=1, verbose=False):
"""
Initialize this C{ContextTagger}'s L{_context_to_tag} table
based on the given training data. In particular, for each
context C{I{c}} in the training data, set
C{_context_to_tag[I{c}]} to the most frequent tag for that
context. However, exclude any contexts that are already
tagged perfectly by the backoff tagger(s).
The old value of C{self._context_to_tag} (if any) is discarded.
@param tagged_corpus: A tagged corpus. Each item should be
a C{list} of C{(word, tag)} tuples.
@param cutoff: If the most likely tag for a context occurs
fewer than C{cutoff} times, then exclude it from the
context-to-tag table for the new tagger.
"""
token_count = hit_count = 0
useful_contexts = set()
fd = ConditionalFreqDist()
for sentence in tagged_corpus:
tokens, tags = zip(*sentence)
for index, (token, tag) in enumerate(sentence):
token_count += 1
context = self.context(tokens, index, tags[:index])
if context is None: continue
fd[context].inc(tag)
if (self.backoff is None or
tag != self.backoff.tag_one(tokens, index, tags[:index])):
useful_contexts.add(context)
for context in useful_contexts:
best_tag = fd[context].max()
hits = fd[context][best_tag]
if hits > cutoff:
self._context_to_tag[context] = best_tag
hit_count += hits
if verbose:
size = len(self._context_to_tag)
backoff = 100 - (hit_count * 100.0)/ token_count
pruning = 100 - (size * 100.0) / len(fd.conditions())
print "[Trained Unigram tagger:",
print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
size, backoff, pruning)
class DefaultTagger(SequentialBackoffTagger, yaml.YAMLObject):
"""
A tagger that assigns the same tag to every token.
"""
yaml_tag = '!nltk.DefaultTagger'
def __init__(self, tag):
"""
Construct a new tagger that assigns C{tag} to all tokens.
"""
self._tag = tag
SequentialBackoffTagger.__init__(self, None)
def choose_tag(self, tokens, index, history):
return self._tag
def __repr__(self):
return '<DefaultTagger: tag=%s>' % self._tag
class NgramTagger(ContextTagger, yaml.YAMLObject):
"""
A tagger that chooses a token's tag based on its word string and
on the preceeding I{n} word's tags. In particular, a tuple
C{(tags[i-n:i-1], words[i])} is looked up in a table, and the
corresponding tag is returned. N-gram taggers are typically
trained on a tagged corpus.
"""
yaml_tag = '!nltk.NgramTagger'
def __init__(self, n, train=None, model=None, backoff=None,
cutoff=1, verbose=False):
"""
Train a new C{NgramTagger} using the given training data or
the supplied model. In particular, construct a new tagger
whose table maps from each context C{(tag[i-n:i-1], word[i])}
to the most frequent tag for that context. But exclude any
contexts that are already tagged perfectly by the backoff
tagger.
@param train: A tagged corpus. Each item should be
a C{list} of C{(word, tag)} tuples.
@param backoff: A backoff tagger, to be used by the new
tagger if it encounters an unknown context.
@param cutoff: If the most likely tag for a context occurs
fewer than C{cutoff} times, then exclude it from the
context-to-tag table for the new tagger.
"""
self._n = n
if (train and model) or (not train and not model):
raise ValueError('Must specify either training data or trained model.')
ContextTagger.__init__(self, model, backoff)
if train:
self._train(train, cutoff, verbose)
def context(self, tokens, index, history):
tag_context = tuple(history[max(0,index-self._n+1):index])
return (tag_context, tokens[index])
class UnigramTagger(NgramTagger):
"""
A tagger that chooses a token's tag based its word string.
Unigram taggers are typically trained on a tagged corpus.
"""
yaml_tag = '!nltk.UnigramTagger'
def __init__(self, train=None, model=None, backoff=None,
cutoff=1, verbose=False):
NgramTagger.__init__(self, 1, train, model, backoff, cutoff, verbose)
def context(self, tokens, index, history):
return tokens[index]
class BigramTagger(NgramTagger):
"""
A tagger that chooses a token's tag based its word string and on
the preceeding words' tag. In particular, a tuple consisting
of the previous tag and the word is looked up in a table, and
the corresponding tag is returned. Bigram taggers are typically
trained on a tagged corpus.
"""
yaml_tag = '!nltk.BigramTagger'
def __init__(self, train=None, model=None, backoff=None,
cutoff=1, verbose=False):
NgramTagger.__init__(self, 2, train, model, backoff, cutoff, verbose)
class TrigramTagger(NgramTagger):
"""
A tagger that chooses a token's tag based its word string and on
the preceeding two words' tags. In particular, a tuple consisting
of the previous two tags and the word is looked up in a table, and
the corresponding tag is returned. Trigram taggers are typically
trained them on a tagged corpus.
"""
yaml_tag = '!nltk.TrigramTagger'
def __init__(self, train=None, model=None, backoff=None,
cutoff=1, verbose=False):
NgramTagger.__init__(self, 3, train, model, backoff, cutoff, verbose)
class AffixTagger(ContextTagger, yaml.YAMLObject):
"""
A tagger that chooses a token's tag based on a leading or trailing
substring of its word string. (It is important to note that these
substrings are not necessarily "true" morphological affixes). In
particular, a fixed-length substring of the word is looked up in a
table, and the corresponding tag is returned. Affix taggers are
typically constructed by training them on a tagged corpus; see
L{the constructor <__init__>}.
"""
yaml_tag = '!nltk.AffixTagger'
def __init__(self, train=None, model=None, affix_length=-3,
min_stem_length=2, backoff=None, cutoff=1, verbose=False):
"""
Construct a new affix tagger.
@param affix_length: The length of the affixes that should be
considered during training and tagging. Use negative
numbers for suffixes.
@param min_stem_length: Any words whose length is less than
C{min_stem_length+abs(affix_length)} will be assigned a
tag of C{None} by this tagger.
"""
if (train and model) or (not train and not model):
raise ValueError('Must specify either training data or '
'trained model')
ContextTagger.__init__(self, model, backoff)
self._affix_length = affix_length
self._min_word_length = min_stem_length + abs(affix_length)
if train:
self._train(train, cutoff, verbose)
def context(self, tokens, index, history):
token = tokens[index]
if len(token) < self._min_word_length:
return None
elif self._affix_length > 0:
return token[:self._affix_length]
else:
return token[self._affix_length:]
class RegexpTagger(SequentialBackoffTagger, yaml.YAMLObject):
"""
A tagger that assigns tags to words based on regular expressions
over word strings.
"""
yaml_tag = '!nltk.RegexpTagger'
def __init__(self, regexps, backoff=None):
"""
Construct a new regexp tagger.
@type regexps: C{list} of C{(str, str)}
@param regexps: A list of C{(regexp, tag)} pairs, each of
which indicates that a word matching C{regexp} should
be tagged with C{tag}. The pairs will be evalutated in
order. If none of the regexps match a word, then the
optional backoff tagger is invoked, else it is
assigned the tag C{None}.
"""
self._regexps = regexps
SequentialBackoffTagger.__init__(self, backoff)
def choose_tag(self, tokens, index, history):
for regexp, tag in self._regexps:
if re.match(regexp, tokens[index]):
return tag
return None
def __repr__(self):
return '<Regexp Tagger: size=%d>' % len(self._regexps)
class ClassifierBasedTagger(SequentialBackoffTagger, FeaturesetTaggerI):
"""
A sequential tagger that uses a classifier to choose the tag for
each token in a sentence. The featureset input for the classifier
is generated by a feature detector function::
feature_detector(tokens, index, history) -> featureset
Where C{tokens} is the list of unlabeled tokens in the sentence;
C{index} is the index of the token for which feature detection
should be performed; and C{history} is list of the tags for all
tokens before C{index}.
"""
def __init__(self, feature_detector, train=None,
classifier_builder=NaiveBayesClassifier.train,
classifier=None, backoff=None,
cutoff_prob=None, verbose=False):
"""
Construct a new classifier-based sequential tagger.
@param feature_detector: A function used to generate the
featureset input for the classifier::
feature_detector(tokens, index, history) -> featureset
@param train: A tagged corpus. Each item should be a C{list}
of C{(word, tag)} tuples.
@param backoff: A backoff tagger, to be used by the new tagger
if it encounters an unknown context.
@param classifier_builder: A function used to train a new
classifier based on the data in C{train}. It should take
one argument, a list of labeled featuresets (i.e.,
C{(featureset, label)} tuples).
@param classifier: The classifier that should be used by the
tagger. This is only useful if you want to manually
construct the classifier; normally, you would use C{train}
instead.
@param backoff: A backoff tagger, used if this tagger is
unable to determine a tag for a given token.
@param cutoff_prob: If specified, then this tagger will fall
back on its backoff tagger if the probability of the most
likely tag is less than C{cutoff_prob}.
"""
SequentialBackoffTagger.__init__(self, backoff)
if (train and classifier) or (not train and not classifier):
raise ValueError('Must specify either training data or '
'trained classifier.')
self._feature_detector = feature_detector
"""The feature detector function, used to generate a featureset
for each token::
feature_detector(tokens, index, history) -> featureset"""
self._cutoff_prob = cutoff_prob
"""Cutoff probability for tagging -- if the probability of the
most likely tag is less than this, then use backoff."""
self._classifier = classifier
"""The classifier used to choose a tag for each token."""
if train:
self._train(train, classifier_builder, verbose)
def choose_tag(self, tokens, index, history):
featureset = self._feature_detector(tokens, index, history)
if self._cutoff_prob is None:
return self._classifier.classify(featureset)
else:
pdist = self._classifier.prob_classify(featureset)
tag = pdist.max()
if pdist.prob(tag) >= self._cutoff_prob:
return tag
else:
return None
def _train(self, tagged_corpus, classifier_builder, verbose):
"""
Build a new classifier, based on the given training data
(C{tagged_corpus}).
"""
classifier_corpus = []
if verbose:
print 'Constructing training corpus for classifier.'
for sentence in tagged_corpus:
history = []
untagged_sentence = [tok for (tok, tag) in sentence]
tags = [tag for (tok, tag) in sentence]
for index in range(len(sentence)):
featureset = self._feature_detector(untagged_sentence,
index, history)
classifier_corpus.append( (featureset, tags[index]) )
history.append(tags[index])
if verbose:
print 'Training classifier (%d instances)' % len(classifier_corpus)
self._classifier = classifier_builder(classifier_corpus)
def __repr__(self):
return '<ClassifierBasedTagger: %r>' % self._classifier
def feature_detector(self):
"""
Return the feature detector that this tagger uses to generate
featuresets for its classifier. The feature detector is a
function with the signature::
feature_detector(tokens, index, history) -> featureset
@see: L{classifier()}
"""
return self._feature_detector
def classifier(self):
"""
Return the classifier that this tagger uses to choose a tag
for each word in a sentence. The input for this classifier is
generated using this tagger's feature detector.
@see: L{feature_detector()}
"""
return self._classifier