Code Coverage for nltk.classify.megam
Untested Functions
"""
A set of functions used to interface with the external U{megam
<http://www.cs.utah.edu/~hal/megam/>} maxent optimization package.
Before C{megam} can be used, you should tell NLTK where it can find
the C{megam} binary, using the L{config_megam()} function. Typical
usage:
>>> import nltk
>>> nltk.config_megam('.../path/to/megam')
>>> classifier = nltk.MaxentClassifier.train(corpus, 'megam')
"""
# Docstrings in this module are written in epytext markup (C{...}, L{...},
# @param, ...); this tells documentation tools how to parse them.
__docformat__ = 'epytext en'
import os
import os.path
import subprocess
from nltk.internals import find_binary
try:
import numpy
except ImportError:
numpy = None
# Cached path of the megam binary.  Stays None until config_megam() has
# located the binary; call_megam() triggers that lookup on first use.
_megam_bin = None
def config_megam(bin=None):
    """
    Configure NLTK's interface to the C{megam} maxent optimization
    package.  The located binary is remembered in the module-level
    C{_megam_bin} variable, which L{call_megam()} uses afterwards.

    @param bin: The full path to the C{megam} binary.  If not specified,
        then nltk will search the system for a C{megam} binary; and if
        one is not found, it will raise a C{LookupError} exception.
    @type bin: C{string}
    """
    global _megam_bin
    # NOTE(review): the last two names ('megam_686o', 'megam_i686.opto')
    # are the spellings this module searched for historically.  They look
    # like corrupted forms of 'megam_686' / 'megam_i686.opt' -- confirm
    # against the megam download page.  The canonical spellings are
    # searched first; the historical ones are kept as fallbacks so that
    # no installation that was found before stops being found.
    _megam_bin = find_binary(
        'megam', bin,
        env_vars=['MEGAM', 'MEGAM_HOME'],
        binary_names=['megam.opt', 'megam',
                      'megam_686', 'megam_i686.opt',
                      'megam_686o', 'megam_i686.opto'],
        url='http://www.cs.utah.edu/~hal/megam/')
def write_megam_file(train_toks, encoding, stream,
                     bernoulli=True, explicit=True):
    """
    Write a C{megam} input file for a corpus of classified tokens.
    One line is written per training token: the numeric index of the
    token's label, followed by its encoded features.

    @type train_toks: C{list} of C{tuples} of (C{dict}, C{str})
    @param train_toks: Training data, represented as a list of
        pairs, the first member of which is a feature dictionary,
        and the second of which is a classification label.

    @type encoding: L{MaxentFeatureEncodingI}
    @param encoding: A feature encoding, used to convert featuresets
        into feature vectors.  Its C{labels()} method also fixes the
        numbering of the labels in the output.

    @type stream: C{stream}
    @param stream: The stream to which the megam input file should be
        written.

    @param bernoulli: If true, then use the 'bernoulli' format.  I.e.,
        all joint features have binary values, and are listed iff they
        are true.  Otherwise, list feature values explicitly.  If
        C{bernoulli=False}, then you must call C{megam} with the
        C{-fvals} option.

    @param explicit: If true, then use the 'explicit' format.  I.e.,
        list the features that would fire for any of the possible
        labels, for each token.  If C{explicit=True}, then you must
        call C{megam} with the C{-explicit} option.
    """
    all_labels = encoding.labels()

    # Position of each label within encoding.labels(); this is the
    # number written at the start of every line.
    label_index = {}
    for position, name in enumerate(all_labels):
        label_index[name] = position

    for featureset, label in train_toks:
        # The line opens with the index of the token's actual label.
        stream.write('%d' % label_index[label])

        if explicit:
            # Explicit format: one '#'-introduced group per candidate
            # label, holding the features that label would fire.
            for candidate in all_labels:
                stream.write(' #')
                encoded = encoding.encode(featureset, candidate)
                _write_megam_features(encoded, stream, bernoulli)
        else:
            # Implicit format: only the features of the actual label.
            encoded = encoding.encode(featureset, label)
            _write_megam_features(encoded, stream, bernoulli)

        # Terminate this token's line.
        stream.write('\n')
def parse_megam_weights(s, explicit=True):
    """
    Given the stdout output generated by C{megam} when training a
    model, return a C{numpy} array containing the corresponding weight
    vector.  This function does not currently handle bias features.

    Every non-blank line of C{s} must consist of two
    whitespace-separated fields: an integer feature id and a weight.
    Blank lines are ignored and do not contribute to the length of the
    returned vector.

    @param s: The text printed by C{megam}.
    @type s: C{string}
    @param explicit: Must be true; output produced without the
        explicit format is not supported yet.
    @rtype: C{numpy.ndarray}
    @return: A vector of doubles with one entry per non-blank line of
        C{s}, where entry C{fid} holds the weight listed for feature
        id C{fid}.  Empty input yields an empty vector.
    @raise ValueError: If C{numpy} is not installed, or if a line does
        not consist of exactly one integer and one number.
    @raise AssertionError: If C{explicit} is false.
    @raise IndexError: If a feature id is not smaller than the number
        of non-blank lines.
    """
    if numpy is None:
        raise ValueError('This function requires that numpy be installed')
    # Raised explicitly (rather than via an ``assert`` statement) so the
    # check is still performed when Python runs with -O.
    if not explicit:
        raise AssertionError('non-explicit not supported yet')

    # Drop blank lines *before* sizing the vector, so that stray empty
    # lines (or completely empty output) do not add spurious entries.
    rows = [line.split() for line in s.split('\n') if line.strip()]
    weights = numpy.zeros(len(rows), 'd')
    for fid, weight in rows:
        weights[int(fid)] = float(weight)
    return weights
def _write_megam_features(vector, stream, bernoulli):
    """
    Write one encoded feature vector to C{stream}, as a run of
    space-prefixed fields appended to the current line.

    @param vector: The (feature id, feature value) pairs to write.
    @param stream: The stream to write to; only its C{write()} method
        is used.
    @param bernoulli: If true, write only the ids of features whose
        value is 1 and skip features whose value is 0.  Otherwise write
        every feature as an id followed by its value.
    @raise ValueError: If C{vector} is empty, or if C{bernoulli} is
        true and some feature value is neither 0 nor 1.
    """
    if not vector:
        raise ValueError('MEGAM classifier requires the use of an '
                         'always-on feature.')
    for (fid, fval) in vector:
        if bernoulli:
            if fval == 1:
                stream.write(' %s' % fid)
            elif fval != 0:
                # The trailing space after 'all' matters: the two
                # literals are concatenated into one message.
                raise ValueError('If bernoulli=True, then all '
                                 'features must be binary.')
        else:
            stream.write(' %s %s' % (fid, fval))
def call_megam(args):
    """
    Call the C{megam} binary with the given arguments.

    If the binary has not been located yet, L{config_megam()} is called
    first to find it.

    @param args: The command-line arguments to pass to C{megam}, as a
        list (or other sequence) of strings.  A single string is
        rejected.
    @return: Everything C{megam} wrote to its standard output.
    @raise TypeError: If C{args} is a single string.
    @raise OSError: If C{megam} exits with a non-zero status.
    """
    # ``basestring`` only exists on Python 2; fall back to ``str`` so the
    # argument check itself never fails with a NameError.
    try:
        string_types = basestring
    except NameError:
        string_types = str
    if isinstance(args, string_types):
        raise TypeError('args should be a list of strings')
    if _megam_bin is None:
        config_megam()

    # list() lets callers pass any sequence (e.g. a tuple) of arguments.
    cmd = [_megam_bin] + list(args)

    # Only stdout is piped.  megam's stderr is inherited from this
    # process, so its messages reach the user directly; communicate()
    # therefore returns None for stderr and there is nothing useful to
    # print from here on failure.
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    stdout, _ = p.communicate()

    if p.returncode != 0:
        raise OSError('megam command failed! (exit status %d)'
                      % p.returncode)
    return stdout