1
2
3
4
5
6
7
8
9
10 """
11 A set of functions used to interface with the external U{megam
12 <http://www.cs.utah.edu/~hal/megam/>} maxent optimization package.
13 Before C{megam} can be used, you should tell NLTK where it can find
14 the C{megam} binary, using the L{config_megam()} function. Typical
15 usage:
16
17 >>> import nltk
18 >>> nltk.config_megam('.../path/to/megam')
19 >>> classifier = nltk.MaxentClassifier.train(corpus, 'megam')
20
21 """
22 __docformat__ = 'epytext en'
23
24 import os
25 import os.path
26 import subprocess
27
28 from nltk.internals import find_binary
29 try:
30 import numpy
31 except ImportError:
32 numpy = None
33
34
35
36
37
38 _megam_bin = None
def config_megam(bin=None):
    """
    Configure NLTK's interface to the C{megam} maxent optimization
    package.

    The path returned by C{find_binary} is stored in the module-level
    C{_megam_bin} variable.

    @param bin: The full path to the C{megam} binary.  If not specified,
        then nltk will search the system for a C{megam} binary; and if
        one is not found, it will raise a C{LookupError} exception.
    @type bin: C{string}
    """
    # NOTE(review): the ``def`` line was missing from the reviewed text.
    # The name and the ``bin`` parameter are taken from the module
    # docstring and from this body; the ``None`` default is inferred
    # from the fact that the function is also called without arguments
    # -- confirm against version control.
    global _megam_bin
    # NOTE(review): 'megam_686o' and 'megam_i686.opto' look like possible
    # typos -- verify them against the binary names shipped with megam.
    _megam_bin = find_binary(
        'megam', bin,
        env_vars=['MEGAM', 'MEGAM_HOME'],
        binary_names=['megam.opt', 'megam', 'megam_686o', 'megam_i686.opto'],
        url='http://www.cs.utah.edu/~hal/megam/')
55
56
57
58
59
def write_megam_file(train_toks, encoding, stream,
                     bernoulli=True, explicit=True):
    """
    Write a C{megam} input file for a corpus of classified tokens.

    One line is written to C{stream} per training token.  Each line
    starts with the index of the token's label (its position in
    C{encoding.labels()}), followed by the encoded features.

    @type train_toks: C{list} of C{tuples} of (C{dict}, C{str})
    @param train_toks: Training data, represented as a list of
        pairs, the first member of which is a feature dictionary,
        and the second of which is a classification label.

    @type encoding: L{MaxentFeatureEncodingI}
    @param encoding: A feature encoding, used to convert featuresets
        into feature vectors.

    @type stream: C{stream}
    @param stream: The stream to which the megam input file should be
        written.

    @param bernoulli: If true, then use the 'bernoulli' format.  I.e.,
        all joint features have binary values, and are listed iff they
        are true.  Otherwise, list feature values explicitly.  If
        C{bernoulli=False}, then you must call C{megam} with the
        C{-fvals} option.

    @param explicit: If true, then use the 'explicit' format.  I.e.,
        list the features that would fire for any of the possible
        labels, for each token.  If C{explicit=True}, then you must
        call C{megam} with the C{-explicit} option.
    """
    # Map every label known to the encoding onto its position.
    all_labels = encoding.labels()
    label_index = dict((name, idx) for (idx, name) in enumerate(all_labels))

    for featureset, label in train_toks:
        # Every line opens with the number of the correct label.
        stream.write('%d' % label_index[label])

        if explicit:
            # Explicit format: one ' #'-introduced feature group per
            # candidate label, in the encoding's label order.
            for candidate in all_labels:
                stream.write(' #')
                _write_megam_features(
                    encoding.encode(featureset, candidate),
                    stream, bernoulli)
        else:
            # Implicit format: only the features of the correct label.
            _write_megam_features(
                encoding.encode(featureset, label),
                stream, bernoulli)

        stream.write('\n')
115
def parse_megam_weights(s, explicit=True):
    """
    Given the stdout output generated by C{megam} when training a
    model, return a C{numpy} array containing the corresponding weight
    vector.  This function does not currently handle bias features.

    Every non-blank line of C{s} must consist of a feature id and a
    weight separated by whitespace.  The returned vector has one slot
    per non-blank line, and each weight is stored at the index given
    by its feature id.

    @param s: The text printed by C{megam}.
    @type s: C{string}
    @param explicit: Must be true; the non-explicit format is not
        supported.
    @return: The weight vector, as an array of doubles.
    @raise ValueError: If numpy is not installed, or if a non-blank
        line does not contain exactly two fields.
    @raise AssertionError: If C{explicit} is false.
    @raise IndexError: If a feature id is not smaller than the number
        of non-blank lines.
    """
    # NOTE(review): the ``def`` line was missing from the reviewed text.
    # The function name, parameter order and the ``explicit=True``
    # default are reconstructed from how this body uses ``s`` and
    # ``explicit`` -- confirm against version control.
    if numpy is None:
        raise ValueError('This function requires that numpy be installed')
    # Raised explicitly (instead of via ``assert``) so that the check
    # is not stripped under ``python -O``; the exception type is kept.
    if not explicit:
        raise AssertionError('non-explicit not supported yet')

    # Drop blank lines *before* sizing the vector, so that empty output
    # yields an empty vector and stray blank lines add no extra slots.
    entries = [line.split() for line in s.split('\n') if line.strip()]
    weights = numpy.zeros(len(entries), 'd')
    for fid, weight in entries:
        weights[int(fid)] = float(weight)
    return weights
132
def _write_megam_features(vector, stream, bernoulli):
    """
    Write one encoded feature vector to C{stream} in C{megam} format.

    @param vector: A non-empty sequence of C{(fid, fval)} pairs.
    @param stream: The stream to write to; only its C{write} method
        is used.
    @param bernoulli: If true, write just the id of every feature
        whose value equals 1 and skip features whose value equals 0.
        Otherwise write each id followed by its value.
    @raise ValueError: If C{vector} is empty, or if C{bernoulli} is
        true and a feature value is neither 0 nor 1.
    """
    if not vector:
        raise ValueError('MEGAM classifier requires the use of an '
                         'always-on feature.')
    for (fid, fval) in vector:
        if bernoulli:
            if fval == 1:
                stream.write(' %s' % fid)
            elif fval != 0:
                # The two literals are concatenated, so the first one
                # needs the trailing space ("all features", not
                # "allfeatures").
                raise ValueError('If bernoulli=True, then all '
                                 'features must be binary.')
        else:
            stream.write(' %s %s' % (fid, fval))
146
def call_megam(args):
    """
    Call the C{megam} binary with the given arguments.

    If no binary has been configured yet, C{config_megam()} is called
    first to locate one.

    @param args: The command-line arguments to pass to C{megam}, as a
        sequence of strings (not a single string).
    @return: Everything C{megam} wrote to its standard output.
    @raise TypeError: If C{args} is a single string.
    @raise OSError: If C{megam} exits with a non-zero status.
    """
    # NOTE(review): the ``def`` line was missing from the reviewed text.
    # The function name is reconstructed; the ``args`` parameter comes
    # from this body -- confirm against version control.

    # ``basestring`` only exists on Python 2; fall back to ``str``.
    try:
        string_types = basestring
    except NameError:
        string_types = str
    if isinstance(args, string_types):
        raise TypeError('args should be a list of strings')
    if _megam_bin is None:
        config_megam()

    # Call megam via a subprocess.  ``list(args)`` also accepts tuples.
    cmd = [_megam_bin] + list(args)
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    # Only stdout is piped, so the second item returned by
    # communicate() is always None; megam's stderr goes straight to
    # the parent's stderr and there is nothing to echo here.
    stdout = p.communicate()[0]

    # Check the return code.
    if p.returncode != 0:
        raise OSError('megam command failed! (exit status %d)'
                      % p.returncode)

    return stdout
168