Code Coverage for nltk.classify.megam

Untested Functions

# Natural Language Toolkit: Interface to Megam Classifier
#
# Copyright (C) 2008 NLTK Project
# Author: Edward Loper <[email protected]>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
#
# $Id: naivebayes.py 2063 2004-07-17 21:02:24Z edloper $

"""
A set of functions used to interface with the external U{megam
<http://www.cs.utah.edu/~hal/megam/>} maxent optimization package.
Before C{megam} can be used, you should tell NLTK where it can find
the C{megam} binary, using the L{config_megam()} function.  Typical
usage:

    >>> import nltk
    >>> nltk.config_megam('.../path/to/megam')
    >>> classifier = nltk.MaxentClassifier.train(corpus, 'megam')

"""
__docformat__ = 'epytext en'

import os
import os.path
import subprocess

from nltk.internals import find_binary
try:
    import numpy
except ImportError:
    numpy = None

######################################################################
#{ Configuration
######################################################################

# Path to the megam executable; stays None until config_megam() sets it.
_megam_bin = None
def config_megam(bin=None):
    """
    Configure NLTK's interface to the C{megam} maxent optimization
    package.  The located binary is remembered in the module-level
    C{_megam_bin} variable.

    @param bin: The full path to the C{megam} binary.  If not specified,
        then nltk will search the system for a C{megam} binary; and if
        one is not found, it will raise a C{LookupError} exception.
    @type bin: C{string}
    """
    global _megam_bin
    _megam_bin = find_binary(
        'megam', bin,
        env_vars=['MEGAM',  'MEGAM_HOME'],
        # 'megam_686' and 'megam_i686.opt' are the likely intended
        # spellings of the last two historical entries, which are kept
        # so that any installation relying on them still resolves.
        binary_names=['megam.opt', 'megam', 'megam_686', 'megam_i686.opt',
                      'megam_686o', 'megam_i686.opto'],
        url='http://www.cs.utah.edu/~hal/megam/')

######################################################################
#{ Megam Interface Functions
######################################################################

def write_megam_file(train_toks, encoding, stream,
                     bernoulli=True, explicit=True):
    """
    Serialize a corpus of classified tokens as C{megam} input, writing
    one line per training instance to C{stream}.

    Each line starts with the index of the instance's label (its
    position in C{encoding.labels()}), followed by the encoded
    features, and ends with a newline.

    @type train_toks: C{list} of C{tuples} of (C{dict}, C{str})
    @param train_toks: The training corpus: a list of
        (featureset, label) pairs.

    @type encoding: L{MaxentFeatureEncodingI}
    @param encoding: The feature encoding that supplies the label
        list (C{labels()}) and turns a (featureset, label) pair into
        a feature vector (C{encode()}).

    @type stream: C{stream}
    @param stream: The output stream that receives the megam input
        file.

    @param bernoulli: If true, emit the 'bernoulli' format: only the
        ids of features whose value is 1 are listed, and every
        feature must be binary.  Otherwise each feature id is
        followed by its value, and C{megam} must be run with the
        C{-fvals} option.

    @param explicit: If true, emit the 'explicit' format: for every
        instance, list the features that would fire under each
        possible label, each group introduced by C{' #'}.  C{megam}
        must then be run with the C{-explicit} option.  If false,
        only the features for the instance's actual label are listed.
    """
    # Map every label onto its position in the encoding's label list.
    all_labels = encoding.labels()
    label_index = {}
    for position, name in enumerate(all_labels):
        label_index[name] = position

    for featureset, label in train_toks:
        # Each line begins with the index of the instance's true label.
        stream.write('%d' % label_index[label])

        if explicit:
            # Explicit format: one '#'-introduced feature group per
            # candidate label.
            for candidate in all_labels:
                stream.write(' #')
                vector = encoding.encode(featureset, candidate)
                _write_megam_features(vector, stream, bernoulli)
        else:
            # Implicit format: only the features that fire for the
            # instance's actual label.
            vector = encoding.encode(featureset, label)
            _write_megam_features(vector, stream, bernoulli)

        # Terminate this instance's line.
        stream.write('\n')

def parse_megam_weights(s, explicit=True, features_count=None):
    """
    Given the stdout output generated by C{megam} when training a
    model, return a C{numpy} array containing the corresponding weight
    vector.  This function does not currently handle bias features.

    Every non-blank line of C{s} must consist of two
    whitespace-separated fields: an integer feature id and a float
    weight.  Feature ids that do not appear in C{s} keep a weight of
    zero.

    @param s: The text written by C{megam} to its standard output.
    @type s: C{string}
    @param explicit: Must be true; the non-explicit format is not
        supported yet.
    @param features_count: The length of the returned vector.  If
        C{None} (the default), the vector is made just long enough to
        hold one entry per line of C{s} and to hold the largest
        feature id seen.
    @type features_count: C{int} or C{None}
    @return: The weight vector, indexed by feature id.
    @rtype: C{numpy.ndarray} of C{float}
    @raise ValueError: If C{numpy} is not installed, or if a line
        cannot be parsed.
    @raise AssertionError: If C{explicit} is false.
    """
    if numpy is None:
        raise ValueError('This function requires that numpy be installed')
    # Raised explicitly (rather than via an assert statement) so that
    # the check is not stripped when Python runs with -O.
    if not explicit:
        raise AssertionError('non-explicit not supported yet')

    lines = s.strip().split('\n')

    # Parse everything first, so the vector can be sized to fit the
    # largest feature id instead of assuming one line per feature.
    entries = []
    for line in lines:
        if line.strip():
            fid, weight = line.split()
            entries.append((int(fid), float(weight)))

    if features_count is None:
        # Never shorter than the line count (the historical size), and
        # always long enough for the highest id, so sparse output no
        # longer raises IndexError.
        features_count = len(lines)
        if entries:
            highest = max(fid for (fid, _) in entries)
            features_count = max(features_count, highest + 1)

    weights = numpy.zeros(features_count, 'd')
    for (fid, weight) in entries:
        weights[fid] = weight
    return weights

def _write_megam_features(vector, stream, bernoulli):
    """
    Write a single encoded feature vector to C{stream}, in the form
    used inside a C{megam} input line.  Every item written is preceded
    by a single space; no newline is written.

    @param vector: A non-empty sequence of (feature id, feature value)
        pairs.
    @param stream: The stream to write to.
    @param bernoulli: If true, write only the ids of the features
        whose value is 1 and skip those whose value is 0.  Otherwise,
        write each feature id followed by its value.
    @raise ValueError: If C{vector} is empty, or if C{bernoulli} is
        true and some feature value is neither 0 nor 1.
    """
    if not vector:
        raise ValueError('MEGAM classifier requires the use of an '
                         'always-on feature.')
    for (fid, fval) in vector:
        if bernoulli:
            if fval == 1:
                # Wrap in a tuple so that the formatting is correct
                # even if a feature id happens to be a tuple itself.
                stream.write(' %s' % (fid,))
            elif fval != 0:
                raise ValueError('If bernoulli=True, then all '
                                 'features must be binary.')
        else:
            stream.write(' %s %s' % (fid, fval))

def call_megam(args):
    """
    Call the C{megam} binary with the given arguments.

    If the location of the binary has not been configured yet,
    L{config_megam()} is called first to find it.

    @param args: The command-line arguments to pass to C{megam}, not
        including the name of the binary itself.
    @type args: C{list} of C{string}
    @return: Everything that C{megam} wrote to its standard output.
    @rtype: C{string}
    @raise TypeError: If C{args} is a single string rather than a
        sequence of strings.
    @raise OSError: If C{megam} exits with a non-zero status.
    """
    # A bare string would be split into single characters by list()
    # below, so reject it up front.  basestring only exists on
    # Python 2; fall back to the Python 3 string types otherwise.
    try:
        string_types = basestring
    except NameError:
        string_types = (str, bytes)
    if isinstance(args, string_types):
        raise TypeError('args should be a list of strings')
    if _megam_bin is None:
        config_megam()

    # Call megam via a subprocess.  Only stdout is captured; stderr is
    # inherited from this process, so megam's own progress and error
    # messages stay visible to the user.
    cmd = [_megam_bin] + list(args)
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    stdout = p.communicate()[0]

    # Check the return code.  stderr was never piped, so there is no
    # captured error text to show here; report the exit status instead.
    if p.returncode != 0:
        raise OSError('megam command failed! (exit status %d)'
                      % p.returncode)

    # On Python 3 the pipe yields bytes; hand text back to the caller.
    if isinstance(stdout, str):
        return stdout
    return stdout.decode('utf-8')