Source code for nltk.tbl.feature

# -*- coding: utf-8 -*-
# Natural Language Toolkit: Transformation-based learning
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Marcus Uneson <[email protected]>
#   based on previous (nltk2) version by
#   Christopher Maloof, Edward Loper, Steven Bird
# URL: <http://nltk.org/>
# For license information, see  LICENSE.TXT

from __future__ import division, print_function, unicode_literals


class Feature(object):
    """
    An abstract base class for Features. A Feature is a combination of
    a specific property-computing method and a list of relative positions
    to apply that method to.

    The property-computing method, M{extract_property(tokens, index)},
    must be implemented by every subclass. It extracts or computes a specific
    property for the token at the current index. Typical extract_property()
    methods return features such as the token text or tag; but more involved
    methods may consider the entire sequence M{tokens} and
    for instance compute the length of the sentence the token belongs to.

    In addition, the subclass may have a PROPERTY_NAME, which is how
    it will be printed (in Rules and Templates, etc). If not given, defaults
    to the classname.

    Features compare equal when they are of exactly the same class and look
    at the same positions; they are hashable consistently with that notion of
    equality, and are ordered by class name first and positions second.
    """
    # !!FOR_FUTURE: when targeting python3 only, consider @abc.abstractmethod
    # and metaclass=abc.ABCMeta rather than NotImplementedError
    # http://julien.danjou.info/blog/2013/guide-python-static-class-abstract-methods

    json_tag = 'nltk.tbl.Feature'
    PROPERTY_NAME = None

    def __init__(self, positions, end=None):
        """
        Construct a Feature which may apply at C{positions}.

        #For instance, importing some concrete subclasses (Feature is abstract)
        >>> from nltk.tag.brill import Word, Pos

        #Feature Word, applying at one of [-2, -1]
        >>> Word([-2,-1])
        Word([-2, -1])

        #Positions need not be contiguous
        >>> Word([-2,-1, 1])
        Word([-2, -1, 1])

        #Contiguous ranges can alternatively be specified giving the
        #two endpoints (inclusive)
        >>> Pos(-3, -1)
        Pos([-3, -2, -1])

        #In two-arg form, start <= end is enforced
        >>> Pos(2, 1)
        Traceback (most recent call last):
          ...
        ValueError: illegal interval specification: (start=2, end=1)

        :type positions: list of int
        :param positions: the positions at which this features should apply
        :raises ValueError: illegal position specifications

        An alternative calling convention, for contiguous positions only,
        is Feature(start, end):

        :type start: int
        :param start: start of range where this feature should apply
        :type end: int
        :param end: end of range (NOTE: inclusive!) where this feature should apply
        """
        self.positions = None  # to avoid warnings
        if end is None:
            # One-arg form: deduplicate and sort so that equal position sets
            # always yield the same tuple (relied upon by __eq__ / __hash__).
            self.positions = tuple(sorted(set(int(i) for i in positions)))
        else:
            # Two-arg form: positions was actually not a list, but only the
            # start index of an inclusive range.
            try:
                if positions > end:
                    raise TypeError
                self.positions = tuple(range(positions, end + 1))
            except TypeError:
                # let any kind of erroneous spec raise ValueError
                raise ValueError(
                    "illegal interval specification: (start={0}, end={1})".format(
                        positions, end))

        # set property name given in subclass, or otherwise name of subclass
        self.PROPERTY_NAME = self.__class__.PROPERTY_NAME or self.__class__.__name__

    def encode_json_obj(self):
        """
        Return the JSON-serializable representation of this Feature,
        which is simply its positions.
        """
        return self.positions

    @classmethod
    def decode_json_obj(cls, obj):
        """
        Rebuild a Feature of class C{cls} from the object produced by
        encode_json_obj() (a sequence of positions).
        """
        positions = obj
        return cls(positions)

    def __repr__(self):
        return "%s(%r)" % (
            self.__class__.__name__, list(self.positions))

    @classmethod
    def expand(cls, starts, winlens, excludezero=False):
        """
        Return a list of features, one for each start point in starts
        and for each window length in winlen. If excludezero is True,
        no Features containing 0 in its positions will be generated
        (many tbl trainers have a special representation for the
        target feature at [0])

        For instance, importing a concrete subclass (Feature is abstract)
        >>> from nltk.tag.brill import Word

        First argument gives the possible start positions, second the
        possible window lengths
        >>> Word.expand([-3,-2,-1], [1])
        [Word([-3]), Word([-2]), Word([-1])]

        >>> Word.expand([-2,-1], [1])
        [Word([-2]), Word([-1])]

        >>> Word.expand([-3,-2,-1], [1,2])
        [Word([-3]), Word([-2]), Word([-1]), Word([-3, -2]), Word([-2, -1])]

        >>> Word.expand([-2,-1], [1])
        [Word([-2]), Word([-1])]

        a third optional argument excludes all Features whose positions contain zero
        >>> Word.expand([-2,-1,0], [1,2], excludezero=False)
        [Word([-2]), Word([-1]), Word([0]), Word([-2, -1]), Word([-1, 0])]

        >>> Word.expand([-2,-1,0], [1,2], excludezero=True)
        [Word([-2]), Word([-1]), Word([-2, -1])]

        All window lengths must be positive
        >>> Word.expand([-2,-1], [0])
        Traceback (most recent call last):
          ...
        ValueError: non-positive window length in [0]

        :param starts: where to start looking for Feature
        :type starts: list of ints
        :param winlens: window lengths where to look for Feature
        :type winlens: list of ints
        :param excludezero: do not output any Feature with 0 in any of its positions.
        :type excludezero: bool
        :returns: list of Features
        :raises ValueError: for non-positive window lengths
        """
        if not all(x > 0 for x in winlens):
            raise ValueError("non-positive window length in {0}".format(winlens))
        # Slide a window of each requested length over starts; a window
        # longer than starts simply yields nothing.
        xs = (starts[i:i + w] for w in winlens for i in range(len(starts) - w + 1))
        return [cls(x) for x in xs if not (excludezero and 0 in x)]

    def issuperset(self, other):
        """
        Return True if this Feature always returns True when other does

        More precisely, return True if this feature refers to the same property as other;
        and this Feature looks at all positions that other does (and possibly
        other positions in addition).

        #For instance, importing a concrete subclass (Feature is abstract)
        >>> from nltk.tag.brill import Word, Pos

        >>> Word([-3,-2,-1]).issuperset(Word([-3,-2]))
        True

        >>> Word([-3,-2,-1]).issuperset(Word([-3,-2, 0]))
        False

        #Feature subclasses must agree
        >>> Word([-3,-2,-1]).issuperset(Pos([-3,-2]))
        False

        :param other: feature with which to compare
        :type other: (subclass of) Feature
        :return: True if this feature is superset, otherwise False
        :rtype: bool
        """
        return (self.__class__ is other.__class__ and
                set(self.positions) >= set(other.positions))

    def intersects(self, other):
        """
        Return True if the positions of this Feature intersects with those of other

        More precisely, return True if this feature refers to the same property as other;
        and there is some overlap in the positions they look at.

        #For instance, importing a concrete subclass (Feature is abstract)
        >>> from nltk.tag.brill import Word, Pos

        >>> Word([-3,-2,-1]).intersects(Word([-3,-2]))
        True

        >>> Word([-3,-2,-1]).intersects(Word([-3,-2, 0]))
        True

        >>> Word([-3,-2,-1]).intersects(Word([0]))
        False

        #Feature subclasses must agree
        >>> Word([-3,-2,-1]).intersects(Pos([-3,-2]))
        False

        :param other: feature with which to compare
        :type other: (subclass of) Feature
        :return: True if feature classes agree and there is some overlap in the positions they look at
        :rtype: bool
        """
        return bool(self.__class__ is other.__class__ and
                    set(self.positions) & set(other.positions))

    # Rich comparisons for Features. With @functools.total_ordering (Python 2.7+),
    # it will be enough to define __lt__ and __eq__

    def _ordering_key(self):
        # Sort key: class name first, then positions (a sorted tuple of ints).
        return (self.__class__.__name__, self.positions)

    def __eq__(self, other):
        return (self.__class__ is other.__class__ and
                self.positions == other.positions)

    def __ne__(self, other):
        return not (self == other)

    def __hash__(self):
        # Must agree with __eq__ (same class, same positions). Defining
        # __eq__ alone would make instances unhashable on Python 3.
        return hash((self.__class__, self.positions))

    def __lt__(self, other):
        # Lexicographic comparison on (class name, positions). Positions
        # only break ties between same-named classes, which keeps the
        # relation asymmetric and transitive so sorting is well-defined.
        if not isinstance(other, Feature):
            return NotImplemented
        return self._ordering_key() < other._ordering_key()

    def __gt__(self, other):
        # The guard also stops the reflected-operator recursion that
        # "other < self" would trigger for a non-Feature operand.
        if not isinstance(other, Feature):
            return NotImplemented
        return other < self

    def __ge__(self, other):
        if not isinstance(other, Feature):
            return NotImplemented
        return not self < other

    def __le__(self, other):
        if not isinstance(other, Feature):
            return NotImplemented
        return self < other or self == other

    @staticmethod
    def extract_property(tokens, index):
        """
        Any subclass of Feature must define static method extract_property(tokens, index)

        :param tokens: the sequence of tokens
        :type tokens: list of tokens
        :param index: the current index
        :type index: int
        :return: feature value
        :rtype: any (but usually scalar)
        :raises NotImplementedError: always, in this abstract base class
        """
        raise NotImplementedError
if __name__ == "__main__":
    # When executed as a script, run the doctests embedded in this module,
    # ignoring differences in whitespace between expected and actual output.
    from doctest import NORMALIZE_WHITESPACE, testmod

    testmod(optionflags=NORMALIZE_WHITESPACE)