Code Coverage for nltk.evaluate
Untested Functions
- _edit_dist_init(), _edit_dist_step(), approxrand(), demo(), edit_dist(), f_measure(), log_likelihood(), precision(), recall(), windowdiff()
- ConfusionMatrix: __getitem__(), __init__(), __repr__(), __str__(), key(), pp()

Partially Tested Functions

"""
Utility functions for evaluating processing modules.
"""
import sys
import math
import random
try:
from scipy.stats.stats import betai
except ImportError:
betai = None
from util import LazyConcatenation, LazyMap, LazyZip
from probability import FreqDist
def accuracy(reference, test):
    """
    Given a list of reference values and a corresponding list of test
    values, return the fraction of corresponding values that are
    equal.  In particular, return the fraction of indices
    C{0<=i<len(test)} such that C{test[i] == reference[i]}.

    @type reference: C{list}
    @param reference: An ordered list of reference values.
    @type test: C{list}
    @param test: A list of values to compare against the corresponding
        reference values.
    @rtype: C{float}
    @raise ValueError: If C{reference} and C{test} do not have the
        same length.
    """
    if len(reference) != len(test):
        raise ValueError("Lists must have the same length.")
    # Both arguments are sized sequences (len() is taken above), so the
    # builtin zip suffices; booleans sum as 0/1.
    # NOTE: two empty lists raise ZeroDivisionError, as before.
    num_correct = sum(x == y for (x, y) in zip(reference, test))
    return float(num_correct) / len(reference)
def precision(reference, test):
    """
    Given a set of reference values and a set of test values, return
    the fraction of the test values that also appear in the reference
    set: |C{reference}S{cap}C{test}|/|C{test}|.

    @type reference: C{Set}
    @param reference: A set of reference values.
    @type test: C{Set}
    @param test: A set of values to compare against the reference set.
    @rtype: C{float} or C{None}
    @return: The precision, or C{None} when C{test} is empty.
    """
    # Guard clause: precision is undefined for an empty test set.
    if not test:
        return None
    hits = reference.intersection(test)
    return len(hits) / float(len(test))
def recall(reference, test):
    """
    Given a set of reference values and a set of test values, return
    the fraction of the reference values that also appear in the test
    set: |C{reference}S{cap}C{test}|/|C{reference}|.

    @type reference: C{Set}
    @param reference: A set of reference values.
    @type test: C{Set}
    @param test: A set of values to compare against the reference set.
    @rtype: C{float} or C{None}
    @return: The recall, or C{None} when C{reference} is empty.
    """
    # Guard clause: recall is undefined for an empty reference set.
    if not reference:
        return None
    hits = reference.intersection(test)
    return len(hits) / float(len(reference))
def f_measure(reference, test, alpha=0.5):
    """
    Given a set of reference values and a set of test values, return
    the f-measure of the test values against the reference values.
    The f-measure is the harmonic mean of the precision
    M{p} = |C{reference}S{cap}C{test}|/|C{test}| and the recall
    M{r} = |C{reference}S{cap}C{test}|/|C{reference}|, weighted by
    C{alpha}:

        - 1/(C{alpha}/M{p} + (1-C{alpha})/M{r})

    @type reference: C{Set}
    @param reference: A set of reference values.
    @type test: C{Set}
    @param test: A set of values to compare against the reference set.
    @rtype: C{float} or C{None}
    @return: The f-measure; C{None} when either input set is empty,
        and C{0} when the sets are disjoint.
    """
    # Either side empty: precision or recall is undefined.
    if not reference or not test:
        return None
    overlap = len(reference.intersection(test))
    if overlap == 0:
        # Precision and recall are both zero.
        return 0
    p = float(overlap) / len(test)
    r = float(overlap) / len(reference)
    return 1.0 / (alpha / p + (1 - alpha) / r)
def log_likelihood(reference, test):
    """
    Given a list of reference values and a corresponding list of test
    probability distributions, return the average log likelihood of
    the reference values under those distributions.

    @param reference: A list of reference values
    @type reference: C{list}
    @param test: A list of probability distributions over values to
        compare against the corresponding reference values.
    @type test: C{list} of L{ProbDistI}
    @raise ValueError: If the two lists differ in length.
    """
    if len(reference) != len(test):
        raise ValueError("Lists must have the same length.")
    # Accumulate each value's log-probability under its paired
    # distribution, then average.
    total = 0
    for val, dist in zip(reference, test):
        total += dist.logprob(val)
    return total / len(reference)
def approxrand(a, b, **kwargs):
    """
    Returns an approximate significance level between two lists of
    independently generated test values.

    Approximate randomization calculates significance by randomly drawing
    from a sample of the possible permutations. At the limit of the number
    of possible permutations, the significance level is exact. The
    approximate significance level is the sample mean number of times the
    statistic of the permutated lists varies from the actual statistic of
    the unpermuted argument lists.

    Recognized keyword arguments: C{shuffles} (number of random
    permutations to draw, default 999), C{statistic} (callable mapping a
    list to a number, default: mean), C{verbose} (print progress,
    default False).

    NOTE: this function is Python 2 only (print statements, ``xrange``,
    builtin ``reduce``).

    @return: a tuple containing an approximate significance level, the count
             of the number of times the pseudo-statistic varied from the
             actual statistic, and the number of shuffles
    @rtype: C{tuple}
    @param a: a list of test values
    @type a: C{list}
    @param b: another list of independently generated test values
    @type b: C{list}
    """
    shuffles = kwargs.get('shuffles', 999)
    # Cap the shuffle count at the number of distinct permutations of the
    # pooled data: the reduce(...) expression is (len(a)+len(b))!.
    shuffles = \
        min(shuffles, reduce(lambda x, y: x * y, xrange(1, len(a) + len(b) + 1)))
    stat = kwargs.get('statistic', lambda lst: float(sum(lst)) / len(lst))
    verbose = kwargs.get('verbose', False)

    if verbose:
        print 'shuffles: %d' % shuffles

    # The observed (unpermuted) absolute difference of the statistic.
    actual_stat = math.fabs(stat(a) - stat(b))

    if verbose:
        print 'actual statistic: %f' % actual_stat
        print '-' * 60

    # Count of permutations whose pseudo-statistic is at least as extreme
    # as the observed one.  Starts at a tiny positive value rather than 0
    # -- presumably so the betai() call below never receives an exact
    # zero; TODO confirm.
    c = 1e-100
    # Lazy view over the pooled data; LazyConcatenation/LazyMap come from
    # the project's util module (not shown here).
    lst = LazyConcatenation([a, b])
    indices = range(len(a) + len(b))

    for i in range(shuffles):
        if verbose and i % 10 == 0:
            print 'shuffle: %d' % i

        # Shuffle the index list in place to draw one random permutation.
        random.shuffle(indices)

        # Split the permuted pool back into two pseudo-samples of the
        # original sizes and recompute the statistic difference.
        pseudo_stat_a = stat(LazyMap(lambda i: lst[i], indices[:len(a)]))
        pseudo_stat_b = stat(LazyMap(lambda i: lst[i], indices[len(a):]))
        pseudo_stat = math.fabs(pseudo_stat_a - pseudo_stat_b)

        if pseudo_stat >= actual_stat:
            c += 1

        if verbose and i % 10 == 0:
            print 'pseudo-statistic: %f' % pseudo_stat
            print 'significance: %f' % (float(c + 1) / (i + 1))
            print '-' * 60

    # Standard add-one estimate of the significance level.
    significance = float(c + 1) / (shuffles + 1)

    if verbose:
        print 'significance: %f' % significance
        if betai:
            # Incomplete beta function from scipy, when available.
            for phi in [0.01, 0.05, 0.10, 0.15, 0.25, 0.50]:
                print "prob(phi<=%f): %f" % (phi, betai(c, shuffles, phi))

    return (significance, c, shuffles)
class ConfusionMatrix(object):
    """
    The confusion matrix between a list of reference values and a
    corresponding list of test values.  Entry [M{r},M{t}] of this
    matrix is a count of the number of times that the reference value
    M{r} corresponds to the test value M{t}.  E.g.:

        >>> ref = 'DET NN VB DET JJ NN NN IN DET NN'.split()
        >>> test = 'DET VB VB DET NN NN NN IN DET NN'.split()
        >>> cm = ConfusionMatrix(ref, test)
        >>> cm['NN', 'NN']
        3

    Note that the diagonal entries (M{Ri}=M{Tj}) of this matrix
    corresponds to correct values; and the off-diagonal entries
    correspond to incorrect values.
    """

    def __init__(self, reference, test, sort_by_count=False):
        """
        Construct a new confusion matrix from a list of reference
        values and a corresponding list of test values.

        @type reference: C{list}
        @param reference: An ordered list of reference values.
        @type test: C{list}
        @param test: A list of values to compare against the
            corresponding reference values.
        @param sort_by_count: If true, order rows/columns by decreasing
            total frequency instead of sorted value order.
        @raise ValueError: If C{reference} and C{test} do not have
            the same length.
        """
        if len(reference) != len(test):
            raise ValueError('Lists must have the same length.')

        # Decide the row/column ordering of the distinct values.
        if sort_by_count:
            # FreqDist comes from the project's probability module.
            ref_fdist = FreqDist(reference)
            test_fdist = FreqDist(test)
            def key(v):
                return -(ref_fdist[v] + test_fdist[v])
            values = sorted(set(reference + test), key=key)
        else:
            values = sorted(set(reference + test))

        # Map each value to its row/column index.
        indices = dict((val, i) for (i, val) in enumerate(values))

        # Tally (reference, test) co-occurrences; track the largest
        # single cell so pp() can size its columns.
        confusion = [[0 for _ in values] for _ in values]
        max_conf = 0
        for w, g in zip(reference, test):
            confusion[indices[w]][indices[g]] += 1
            max_conf = max(max_conf, confusion[indices[w]][indices[g]])

        self._values = values        # ordered distinct values (labels)
        self._indices = indices      # value -> row/column index
        self._confusion = confusion  # the count matrix itself
        self._max_conf = max_conf    # largest single cell count
        self._total = len(reference)  # number of compared pairs
        self._correct = sum(confusion[i][i] for i in range(len(values)))

    def __getitem__(self, li_lj):
        """
        @param li_lj: A C{(li, lj)} pair of values, as in C{cm[li, lj]}.
        @return: The number of times that value C{li} was expected and
            value C{lj} was given.
        @rtype: C{int}
        """
        # Unpack in the body instead of the Python 2-only tuple
        # parameter syntax ``def __getitem__(self, (li, lj))``;
        # callers still write cm[r, t].
        (li, lj) = li_lj
        i = self._indices[li]
        j = self._indices[lj]
        return self._confusion[i][j]

    def __repr__(self):
        return '<ConfusionMatrix: %s/%s correct>' % (self._correct,
                                                     self._total)

    def __str__(self):
        return self.pp()

    def pp(self, show_percents=False, values_in_chart=True):
        """
        @param show_percents: If true, show cell values as percentages
            of the total rather than raw counts.
        @param values_in_chart: If true, label rows/columns with the
            values themselves; otherwise use numeric indices and append
            a value key.
        @return: A multi-line string representation of this confusion
            matrix.
        @todo: add marginals?
        """
        confusion = self._confusion

        # Row/column labels: the values themselves, or their indices.
        if values_in_chart:
            values = self._values
        else:
            values = range(len(self._values))

        # Column widths.  repr() replaces the removed Python 2
        # backquote syntax; for ints the result is identical.
        valuelen = max(len(str(val)) for val in values)
        value_format = '%' + repr(valuelen) + 's | '
        if show_percents:
            entrylen = 6
            entry_format = '%5.1f%%'
            # NOTE(review): source shows a width-2 placeholder here,
            # narrower than entrylen -- confirm intended alignment.
            zerostr = ' .'
        else:
            entrylen = len(repr(self._max_conf))
            entry_format = '%' + repr(entrylen) + 'd'
            zerostr = ' ' * (entrylen - 1) + '.'

        value_strings = [str(val) for val in values]

        # Write the column labels vertically, one character per line,
        # bottom-aligned above each column.
        s = ''
        for i in range(valuelen):
            s += (' ' * valuelen) + ' |'
            for val in value_strings:
                if i >= valuelen - len(val):
                    s += val[i - valuelen + len(val)].rjust(entrylen + 1)
                else:
                    s += ' ' * (entrylen + 1)
            s += ' |\n'
        s += '%s-+-%s+\n' % ('-' * valuelen,
                             '-' * ((entrylen + 1) * len(values)))

        # Matrix body: one row per reference value.
        for i in range(len(values)):
            s += value_format % values[i]
            for j in range(len(values)):
                if confusion[i][j] == 0:
                    s += zerostr
                elif show_percents:
                    s += entry_format % (100.0 * confusion[i][j] /
                                         self._total)
                else:
                    s += entry_format % confusion[i][j]
                if i == j:
                    # Wrap diagonal (correct) entries in <...>.
                    prevspace = s.rfind(' ')
                    s = s[:prevspace] + '<' + s[prevspace + 1:] + '>'
                else:
                    s += ' '
            s += '|\n'
        s += '%s-+-%s+\n' % ('-' * valuelen,
                             '-' * ((entrylen + 1) * len(values)))
        s += '(row = reference; col = test)\n'

        if not values_in_chart:
            s += 'Value key:\n'
            for i, value in enumerate(self._values):
                s += '%6d: %s\n' % (i, value)
        return s

    def key(self):
        """
        @return: A multi-line string mapping each row/column index to
            its value, suitable for use with C{pp(values_in_chart=False)}.
        @rtype: C{str}
        """
        values = self._values
        # (Renamed from ``str``, which shadowed the builtin.)
        text = 'Value key:\n'
        indexlen = len(repr(len(values) - 1))
        key_format = ' %' + repr(indexlen) + 'd: %s\n'
        for i in range(len(values)):
            text += key_format % (i, values[i])
        return text
def windowdiff(seg1, seg2, k, boundary="1"):
    """
    Compute the windowdiff score for a pair of segmentations.  A
    segmentation is any sequence over a vocabulary of two items
    (e.g. "0", "1"), where the specified boundary value is used to
    mark the edge of a segmentation.

    @param seg1: a segmentation
    @type seg1: C{string} or C{list}
    @param seg2: a segmentation
    @type seg2: C{string} or C{list}
    @param k: window width
    @type k: C{int}
    @param boundary: boundary value
    @type boundary: C{string} or C{int} or C{bool}
    @rtype: C{int}
    @raise ValueError: If the segmentations differ in length.
    """
    if len(seg1) != len(seg2):
        # Modern call-style raise; ``raise E, msg`` is Python 2 only.
        raise ValueError("Segmentations have unequal length")
    # Slide a window over both segmentations and sum the absolute
    # differences in boundary counts.
    # NOTE(review): each window spans k+1 positions (seg[i:i+k+1]);
    # confirm against the intended WindowDiff definition, which is
    # usually stated with windows of exactly k.
    wd = 0
    for i in range(len(seg1) - k):
        wd += abs(seg1[i:i + k + 1].count(boundary)
                  - seg2[i:i + k + 1].count(boundary))
    return wd
def _edit_dist_init(len1, len2):
lev = []
for i in range(len1):
lev.append([0] * len2)
for i in range(len1):
lev[i][0] = i
for j in range(len2):
lev[0][j] = j
return lev
def _edit_dist_step(lev, i, j, c1, c2):
a = lev[i-1][j ] + 1
b = lev[i-1][j-1] + (c1 != c2)
c = lev[i ][j-1] + 1
lev[i][j] = min(a,b,c)
def edit_dist(s1, s2):
    """
    Calculate the Levenshtein edit-distance between two strings.
    The edit distance is the number of characters that need to be
    substituted, inserted, or deleted, to transform s1 into s2.  For
    example, transforming "rain" to "shine" requires three steps,
    consisting of two substitutions and one insertion:
    "rain" -> "sain" -> "shin" -> "shine".  These operations could
    have been done in other orders, but at least three steps are
    needed.

    @param s1: The first string to be analysed
    @type s1: C{string}
    @param s2: The second string to be analysed
    @type s2: C{string}
    @rtype: C{int}
    """
    len1, len2 = len(s1), len(s2)
    # (len1+1) x (len2+1) table; lev[i][j] is the edit distance between
    # the first i characters of s1 and the first j characters of s2.
    # The per-cell work is done inline rather than through the
    # _edit_dist_* helpers, avoiding a Python call per cell of the
    # O(len1*len2) loop; results are identical.
    lev = [[0] * (len2 + 1) for _ in range(len1 + 1)]
    for i in range(len1 + 1):
        lev[i][0] = i       # i deletions turn a prefix of s1 into ""
    for j in range(len2 + 1):
        lev[0][j] = j       # j insertions turn "" into a prefix of s2
    for i in range(1, len1 + 1):
        for j in range(1, len2 + 1):
            deletion = lev[i - 1][j] + 1
            substitution = lev[i - 1][j - 1] + (s1[i - 1] != s2[j - 1])
            insertion = lev[i][j - 1] + 1
            lev[i][j] = min(deletion, substitution, insertion)
    return lev[len1][len2]
def demo():
print '-'*75
reference = 'DET NN VB DET JJ NN NN IN DET NN'.split()
test = 'DET VB VB DET NN NN NN IN DET NN'.split()
print 'Reference =', reference
print 'Test =', test
print 'Confusion matrix:'
print ConfusionMatrix(reference, test)
print 'Accuracy:', accuracy(reference, test)
print '-'*75
reference_set = set(reference)
test_set = set(test)
print 'Reference =', reference_set
print 'Test = ', test_set
print 'Precision:', precision(reference_set, test_set)
print ' Recall:', recall(reference_set, test_set)
print 'F-Measure:', f_measure(reference_set, test_set)
print '-'*75
# When executed as a script, show the demonstration output.
if __name__ == '__main__':
    demo()

# Names exported by ``import *``.
# NOTE(review): approxrand and the _edit_dist_* helpers are not listed
# -- confirm the omission of approxrand is intentional.
__all__ = ['ConfusionMatrix', 'accuracy', 'demo',
           'f_measure', 'log_likelihood', 'precision', 'recall',
           'edit_dist', 'windowdiff']