Package nltk :: Package classify :: Module megam
[hide private]
[frames | no frames]

Source Code for Module nltk.classify.megam

  1  # Natural Language Toolkit: Interface to Megam Classifier 
  2  # 
  3  # Copyright (C) 2008 NLTK Project 
  4  # Author: Edward Loper <[email protected]> 
  5  # URL: <http://nltk.org> 
  6  # For license information, see LICENSE.TXT 
  7  # 
  8  # $Id: naivebayes.py 2063 2004-07-17 21:02:24Z edloper $ 
  9   
 10  """ 
 11  A set of functions used to interface with the external U{megam 
 12  <http://www.cs.utah.edu/~hal/megam/>} maxent optimization package. 
 13  Before C{megam} can be used, you should tell NLTK where it can find 
 14  the C{megam} binary, using the L{config_megam()} function.  Typical 
 15  usage: 
 16   
 17      >>> import nltk 
 18      >>> nltk.config_megam('.../path/to/megam') 
 19      >>> classifier = nltk.MaxentClassifier.train(corpus, 'megam') 
 20   
 21  """ 
 22  __docformat__ = 'epytext en' 
 23   
 24  import os 
 25  import os.path 
 26  import subprocess 
 27   
 28  from nltk.internals import find_binary 
 29  try: 
 30      import numpy 
 31  except ImportError: 
 32      numpy = None 
 33   
 34  ###################################################################### 
 35  #{ Configuration 
 36  ###################################################################### 
 37   
# Path to the C{megam} binary.  Stays None until config_megam() stores
# the result of find_binary() here; call_megam() triggers that lookup
# itself when it finds this still unset.
_megam_bin = None
def config_megam(bin=None):
    """
    Tell NLTK where the C{megam} maxent optimization binary lives.

    The lookup itself is delegated to L{find_binary}; its result is
    stored in the module-level C{_megam_bin} variable.

    @param bin: The full path to the C{megam} binary.  When omitted,
        the system is searched for a C{megam} binary, and a
        C{LookupError} is raised if none can be found.
    @type bin: C{string}
    """
    global _megam_bin

    # Environment variables that may point at the binary, and the file
    # names under which it may be installed.
    search_env_vars = ['MEGAM', 'MEGAM_HOME']
    candidate_names = ['megam.opt', 'megam', 'megam_686o', 'megam_i686.opto']

    _megam_bin = find_binary(
        'megam', bin,
        env_vars=search_env_vars,
        binary_names=candidate_names,
        url='http://www.cs.utah.edu/~hal/megam/')
55 56 ###################################################################### 57 #{ Megam Interface Functions 58 ###################################################################### 59
def write_megam_file(train_toks, encoding, stream,
                     bernoulli=True, explicit=True):
    """
    Write a C{megam} input file describing a corpus of classified
    tokens.  One line is written per training instance: the index of
    the instance's label (its position in C{encoding.labels()}),
    followed by the encoded features.

    @type train_toks: C{list} of C{tuples} of (C{dict}, C{str})
    @param train_toks: Training data, given as a list of pairs whose
        first member is a feature dictionary and whose second member
        is a classification label.

    @type encoding: L{MaxentFeatureEncodingI}
    @param encoding: The feature encoding used to turn featuresets
        into feature vectors.

    @type stream: C{stream}
    @param stream: The stream that receives the megam input file.

    @param bernoulli: If true, write the 'bernoulli' format: all joint
        features are binary and are listed iff they are true.
        Otherwise feature values are listed explicitly, and C{megam}
        must be called with the C{-fvals} option.

    @param explicit: If true, write the 'explicit' format: for each
        token, list the features that would fire for every possible
        label.  In that case C{megam} must be called with the
        C{-explicit} option.
    """
    # Map each label to its position in the encoding's label list.
    all_labels = encoding.labels()
    label_index = dict((lab, idx) for (idx, lab) in enumerate(all_labels))

    for (fset, gold_label) in train_toks:
        # Each line starts with the index of the instance's label.
        stream.write('%d' % label_index[gold_label])

        if explicit:
            # Explicit format: one '#'-introduced feature group per
            # candidate label.
            for candidate in all_labels:
                stream.write(' #')
                _write_megam_features(encoding.encode(fset, candidate),
                                      stream, bernoulli)
        else:
            # Implicit format: only the features that fire for the
            # instance's actual label.
            _write_megam_features(encoding.encode(fset, gold_label),
                                  stream, bernoulli)

        # Terminate the instance's line.
        stream.write('\n')
115
def parse_megam_weights(s, explicit=True):
    """
    Given the stdout output generated by C{megam} when training a
    model, return a C{numpy} array containing the corresponding weight
    vector.  This function does not currently handle bias features.

    Every non-blank line of C{s} must hold exactly two
    whitespace-separated fields: an integer feature id and a
    floating-point weight.  The weight is stored at index I{id} of the
    returned array; ids that never appear keep a weight of zero.

    The array has at least as many entries as C{s} (stripped) has
    lines, and is enlarged when necessary so that the highest feature
    id seen is a valid index.

    @param s: The text printed by C{megam}.
    @type s: C{string}
    @param explicit: Must be true; the non-explicit format is not
        supported yet.
    @rtype: C{numpy.ndarray}
    @return: The weight vector, with dtype C{'d'}.
    @raise ValueError: If C{numpy} is not installed, or if a line
        cannot be parsed as an (id, weight) pair.
    @raise AssertionError: If C{explicit} is false.
    """
    if numpy is None:
        raise ValueError('This function requires that numpy be installed')
    # Raised explicitly (rather than via an assert statement) so that
    # the check is not stripped when Python runs with -O.
    if not explicit:
        raise AssertionError('non-explicit not supported yet')

    lines = s.strip().split('\n')

    # Parse everything first, so the array can be sized to fit.
    entries = []
    for line in lines:
        if line.strip():
            fid, weight = line.split()
            entries.append((int(fid), float(weight)))

    # The line count alone is too small whenever the output skips a
    # feature id, which previously caused an IndexError.
    size = len(lines)
    if entries:
        size = max(size, max([fid for (fid, _) in entries]) + 1)

    weights = numpy.zeros(size, 'd')
    for (fid, weight) in entries:
        weights[fid] = weight
    return weights
132
def _write_megam_features(vector, stream, bernoulli):
    """
    Write one encoded feature vector to C{stream}, in the format used
    by L{write_megam_file}.  Nothing is written before the first
    feature or after the last one except the single leading space
    that precedes each item.

    @param vector: The feature vector, as a sequence of
        C{(fid, fval)} pairs.  It must not be empty.
    @param stream: The stream to write to.
    @param bernoulli: If true, write only the ids of features whose
        value is 1, and skip features whose value is 0.  Otherwise,
        write every feature as an id followed by its value.
    @raise ValueError: If C{vector} is empty, or if C{bernoulli} is
        true and some feature value is neither 0 nor 1.  In the latter
        case the features preceding the offending one have already
        been written.
    """
    if not vector:
        raise ValueError('MEGAM classifier requires the use of an '
                         'always-on feature.')
    for (fid, fval) in vector:
        if bernoulli:
            if fval == 1:
                stream.write(' %s' % fid)
            elif fval != 0:
                # The trailing space after 'all' was missing, which
                # produced the word "allfeatures" in the message.
                raise ValueError('If bernoulli=True, then all '
                                 'features must be binary.')
        else:
            stream.write(' %s %s' % (fid, fval))
146
def call_megam(args):
    """
    Call the C{megam} binary with the given arguments.

    If no binary has been configured yet, L{config_megam()} is called
    first to locate one.

    @param args: The command-line arguments to pass to C{megam}, as a
        sequence of strings (not a single string).
    @return: Everything C{megam} wrote to its standard output.
    @raise TypeError: If C{args} is a single string.
    @raise OSError: If C{megam} exits with a non-zero status.
    """
    if isinstance(args, basestring):
        raise TypeError('args should be a list of strings')
    if _megam_bin is None:
        config_megam()

    # Call megam via a subprocess.  Only stdout is piped: stderr stays
    # attached to this process's stderr, so whatever megam writes there
    # is not captured here.
    cmd = [_megam_bin] + list(args)
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    stdout = p.communicate()[0]

    # Check the return code.  Since stderr is not piped, communicate()
    # returns None for it; printing that value (as was done before)
    # only ever displayed "None", so the exit status is reported
    # instead.
    if p.returncode != 0:
        raise OSError('megam command failed! (exit status %d)'
                      % p.returncode)

    return stdout
168