1
2
3
4
5
6
7
8
9
10 """
11 Utility functions for evaluating processing modules.
12 """
13
14 import sys
15 import math
16 import random
17
18 try:
19 from scipy.stats.stats import betai
20 except ImportError:
21 betai = None
22
23 from util import LazyConcatenation, LazyMap, LazyZip
24 from probability import FreqDist
25
def accuracy(reference, test):
    """
    Given a list of reference values and a corresponding list of test
    values, return the percentage of corresponding values that are
    equal.  In particular, return the percentage of indices
    C{0<i<=len(test)} such that C{test[i] == reference[i]}.

    @type reference: C{list}
    @param reference: An ordered list of reference values.
    @type test: C{list}
    @param test: A list of values to compare against the corresponding
        reference values.
    @raise ValueError: If C{reference} and C{test} do not have the
        same length.
    """
    if len(reference) != len(test):
        raise ValueError("Lists must have the same length.")
    # The inputs are required to be same-length sequences, so the
    # builtin zip suffices here; no need for the lazy project helper.
    num_correct = sum(1 for x, y in zip(reference, test) if x == y)
    return float(num_correct) / len(reference)
48
def precision(reference, test):
    """
    Given a set of reference values and a set of test values, return
    the percentage of test values that appear in the reference set.
    In particular, return |C{reference}S{cap}C{test}|/|C{test}|.
    If C{test} is empty, then return C{None}.

    @type reference: C{Set}
    @param reference: A set of reference values.
    @type test: C{Set}
    @param test: A set of values to compare against the reference set.
    @rtype: C{float} or C{None}
    """
    # Precision is undefined for an empty test set.
    if len(test) == 0:
        return None
    overlap = reference.intersection(test)
    return len(overlap) / float(len(test))
66
def recall(reference, test):
    """
    Given a set of reference values and a set of test values, return
    the percentage of reference values that appear in the test set.
    In particular, return |C{reference}S{cap}C{test}|/|C{reference}|.
    If C{reference} is empty, then return C{None}.

    @type reference: C{Set}
    @param reference: A set of reference values.
    @type test: C{Set}
    @param test: A set of values to compare against the reference set.
    @rtype: C{float} or C{None}
    """
    # Recall is undefined for an empty reference set.
    if not reference:
        return None
    matched = reference.intersection(test)
    return len(matched) / float(len(reference))
84
def f_measure(reference, test, alpha=0.5):
    """
    Given a set of reference values and a set of test values, return
    the f-measure of the test values, when compared against the
    reference values.  The f-measure is the harmonic mean of the
    L{precision} and L{recall}, weighted by C{alpha}.  In particular,
    given the precision M{p} and recall M{r} defined by:
        - M{p} = |C{reference}S{cap}C{test}|/|C{test}|
        - M{r} = |C{reference}S{cap}C{test}|/|C{reference}|
    The f-measure is:
        - 1/(C{alpha}/M{p} + (1-C{alpha})/M{r})

    If either C{reference} or C{test} is empty, then C{f_measure}
    returns C{None}.

    @type reference: C{Set}
    @param reference: A set of reference values.
    @type test: C{Set}
    @param test: A set of values to compare against the reference set.
    @rtype: C{float} or C{None}
    """
    prec = precision(reference, test)
    rec = recall(reference, test)
    # Either component undefined (empty set) -> f-measure undefined.
    if prec is None or rec is None:
        return None
    # Either component zero would make the harmonic mean divide by zero.
    if prec == 0 or rec == 0:
        return 0
    return 1.0 / (alpha / prec + (1 - alpha) / rec)
113
def log_likelihood(reference, test):
    """
    Given a list of reference values and a corresponding list of test
    probability distributions, return the average log likelihood of
    the reference values, given the probability distributions.

    @param reference: A list of reference values
    @type reference: C{list}
    @param test: A list of probability distributions over values to
        compare against the corresponding reference values.
    @type test: C{list} of L{ProbDistI}
    @raise ValueError: If C{reference} and C{test} do not have the
        same length.
    """
    if len(reference) != len(test):
        raise ValueError("Lists must have the same length.")

    # Score each reference value under its paired distribution, then
    # average over all positions.
    scores = [dist.logprob(value) for (value, dist) in zip(reference, test)]
    return sum(scores) / len(reference)
133
def approxrand(a, b, **kwargs):
    """
    Returns an approximate significance level between two lists of
    independently generated test values.

    Approximate randomization calculates significance by randomly drawing
    from a sample of the possible permutations. At the limit of the number
    of possible permutations, the significance level is exact. The
    approximate significance level is the sample mean number of times the
    statistic of the permutated lists varies from the actual statistic of
    the unpermuted argument lists.

    @return: a tuple containing an approximate significance level, the count
             of the number of times the pseudo-statistic varied from the
             actual statistic, and the number of shuffles
    @rtype: C{tuple}
    @param a: a list of test values
    @type a: C{list}
    @param b: another list of independently generated test values
    @type b: C{list}
    @kwarg shuffles: the number of random permutations to sample
        (default 999)
    @kwarg statistic: a callable computing the test statistic of a list
        (default: the sample mean)
    @kwarg verbose: if true, print progress to stdout (default False)
    """
    shuffles = kwargs.get('shuffles', 999)
    # There is no point in sampling more permutations than there are
    # distinct orderings of the pooled data, i.e. (len(a) + len(b))!.
    shuffles = \
        min(shuffles, reduce(lambda x, y: x * y, xrange(1, len(a) + len(b) + 1)))
    stat = kwargs.get('statistic', lambda lst: float(sum(lst)) / len(lst))
    verbose = kwargs.get('verbose', False)

    if verbose:
        print 'shuffles: %d' % shuffles

    # The observed (absolute) difference of the statistic between a and b.
    actual_stat = math.fabs(stat(a) - stat(b))

    if verbose:
        print 'actual statistic: %f' % actual_stat
        print '-' * 60

    # Count of permutations at least as extreme as the observed one.
    # NOTE(review): starts at a tiny non-zero value, presumably to keep
    # downstream ratios (e.g. betai below) away from exactly zero --
    # TODO confirm.
    c = 1e-100
    lst = LazyConcatenation([a, b])
    indices = range(len(a) + len(b))

    for i in range(shuffles):
        if verbose and i % 10 == 0:
            print 'shuffle: %d' % i

        # Shuffle the pooled indices in place, then split them back
        # into two pseudo-samples of the original sizes.
        random.shuffle(indices)

        pseudo_stat_a = stat(LazyMap(lambda i: lst[i], indices[:len(a)]))
        pseudo_stat_b = stat(LazyMap(lambda i: lst[i], indices[len(a):]))
        pseudo_stat = math.fabs(pseudo_stat_a - pseudo_stat_b)

        if pseudo_stat >= actual_stat:
            c += 1

        if verbose and i % 10 == 0:
            print 'pseudo-statistic: %f' % pseudo_stat
            print 'significance: %f' % (float(c + 1) / (i + 1))
            print '-' * 60

    # Add-one smoothed permutation p-value.
    significance = float(c + 1) / (shuffles + 1)

    if verbose:
        print 'significance: %f' % significance
        # betai may be None if the scipy import at the top of this
        # module failed.
        if betai:
            for phi in [0.01, 0.05, 0.10, 0.15, 0.25, 0.50]:
                print "prob(phi<=%f): %f" % (phi, betai(c, shuffles, phi))

    return (significance, c, shuffles)
202
203
class ConfusionMatrix(object):
    """
    The confusion matrix between a list of reference values and a
    corresponding list of test values.  Entry [M{r},M{t}] of this
    matrix is a count of the number of times that the reference value
    M{r} corresponds to the test value M{t}.  E.g.:

        >>> ref = 'DET NN VB DET JJ NN NN IN DET NN'.split()
        >>> test = 'DET VB VB DET NN NN NN IN DET NN'.split()
        >>> cm = ConfusionMatrix(ref, test)
        >>> print cm['NN', 'NN']
        3

    Note that the diagonal entries (M{Ri}=M{Tj}) of this matrix
    corresponds to correct values; and the off-diagonal entries
    correspond to incorrect values.
    """
    def __init__(self, reference, test, sort_by_count=False):
        """
        Construct a new confusion matrix from a list of reference
        values and a corresponding list of test values.

        @type reference: C{list}
        @param reference: An ordered list of reference values.
        @type test: C{list}
        @param test: A list of values to compare against the
            corresponding reference values.
        @raise ValueError: If C{reference} and C{test} do not have
            the same length.
        """
        if len(reference) != len(test):
            raise ValueError('Lists must have the same length.')

        # Get a sorted list of all values seen in either list; with
        # sort_by_count, the most frequent values come first.
        if sort_by_count:
            ref_fdist = FreqDist(reference)
            test_fdist = FreqDist(test)
            def key(v): return -(ref_fdist[v]+test_fdist[v])
            values = sorted(set(reference+test), key=key)
        else:
            values = sorted(set(reference+test))

        # Construct a value -> index dictionary.
        indices = dict((val,i) for (i,val) in enumerate(values))

        # Tabulate the counts (row = reference, column = test), keeping
        # track of the largest single count for later pretty-printing.
        confusion = [[0 for val in values] for val in values]
        max_conf = 0
        for w,g in zip(reference, test):
            confusion[indices[w]][indices[g]] += 1
            max_conf = max(max_conf, confusion[indices[w]][indices[g]])

        #: A sorted list of all values in C{reference} or C{test}.
        self._values = values
        #: A dictionary mapping values to their indices.
        self._indices = indices
        #: The confusion matrix itself (a list of lists of counts).
        self._confusion = confusion
        #: The greatest count in the matrix (for column sizing).
        self._max_conf = max_conf
        #: The total number of compared values.
        self._total = len(reference)
        #: The number of correct (on-diagonal) values.
        self._correct = sum(confusion[i][i] for i in range(len(values)))

    def __getitem__(self, (li,lj)):
        """
        @return: The number of times that value C{li} was expected and
        value C{lj} was given.
        @rtype: C{int}
        """
        i = self._indices[li]
        j = self._indices[lj]
        return self._confusion[i][j]

    def __repr__(self):
        return '<ConfusionMatrix: %s/%s correct>' % (self._correct,
                                                     self._total)

    def __str__(self):
        return self.pp()

    def pp(self, show_percents=False, values_in_chart=True):
        """
        @return: A multi-line string representation of this confusion
        matrix.
        @todo: add marginals?
        """
        confusion = self._confusion

        if values_in_chart:
            values = self._values
        else:
            values = range(len(self._values))

        # Construct a format string for row values and for entries.
        valuelen = max(len(str(val)) for val in values)
        value_format = '%' + `valuelen` + 's | '

        if show_percents:
            entrylen = 6
            entry_format = '%5.1f%%'
            zerostr = '     .'
        else:
            entrylen = len(`self._max_conf`)
            entry_format = '%' + `entrylen` + 'd'
            zerostr = ' '*(entrylen-1) + '.'

        # Write the column headers vertically, one character per line,
        # above their columns.
        value_strings = [str(val) for val in values]
        s = ''
        for i in range(valuelen):
            s += (' '*valuelen)+' |'
            for val in value_strings:
                if i >= valuelen-len(val):
                    s += val[i-valuelen+len(val)].rjust(entrylen+1)
                else:
                    s += ' '*(entrylen+1)
            s += ' |\n'

        # Write a dividing line.
        s += '%s-+-%s+\n' % ('-'*valuelen, '-'*((entrylen+1)*len(values)))

        # Write the entries; the diagonal (correct) entry of each row
        # is wrapped in angle brackets.
        for i in range(len(values)):
            s += value_format % values[i]
            for j in range(len(values)):
                if confusion[i][j] == 0:
                    s += zerostr
                elif show_percents:
                    s += entry_format % (100.0*confusion[i][j]/self._total)
                else:
                    s += entry_format % confusion[i][j]
                if i == j:
                    prevspace = s.rfind(' ')
                    s = s[:prevspace] + '<' + s[prevspace+1:] + '>'
                else: s += ' '
            s += '|\n'

        # Write a dividing line.
        s += '%s-+-%s+\n' % ('-'*valuelen, '-'*((entrylen+1)*len(values)))

        # Write a key, plus an index legend if values were suppressed.
        s += '(row = reference; col = test)\n'
        if not values_in_chart:
            s += 'Value key:\n'
            for i, value in enumerate(self._values):
                s += '%6d: %s\n' % (i, value)

        return s

    def key(self):
        """
        @return: A multi-line string listing each index used in this
            matrix together with the value it stands for.
        """
        values = self._values
        # NOTE(review): the local name 'str' shadows the builtin for
        # the rest of this method.
        str = 'Value key:\n'
        indexlen = len(`len(values)-1`)
        key_format = '  %'+`indexlen`+'d: %s\n'
        for i in range(len(values)):
            str += key_format % (i, values[i])

        return str
364
365
366
367
368
369
370
def windowdiff(seg1, seg2, k, boundary="1"):
    """
    Compute the windowdiff score for a pair of segmentations.  A segmentation is any sequence
    over a vocabulary of two items (e.g. "0", "1"), where the specified boundary value is used
    to mark the edge of a segmentation.

    @param seg1: a segmentation
    @type seg1: C{string} or C{list}
    @param seg2: a segmentation
    @type seg2: C{string} or C{list}
    @param k: window width
    @type k: C{int}
    @param boundary: boundary value
    @type boundary: C{string} or C{int} or C{bool}
    @rtype: C{int}
    @raise ValueError: If the segmentations have unequal length.
    """
    if len(seg1) != len(seg2):
        # Call-style raise, for consistency with the other raises in
        # this module (the 'raise E, msg' form is Python-2-only).
        raise ValueError("Segmentations have unequal length")
    # Slide a window of width k+1 over both segmentations, summing the
    # absolute difference in boundary counts at each position.
    wd = 0
    for i in range(len(seg1) - k):
        wd += abs(seg1[i:i+k+1].count(boundary) - seg2[i:i+k+1].count(boundary))
    return wd
394
395
396
398 lev = []
399 for i in range(len1):
400 lev.append([0] * len2)
401 for i in range(len1):
402 lev[i][0] = i
403 for j in range(len2):
404 lev[0][j] = j
405 return lev
406
408 a = lev[i-1][j ] + 1
409 b = lev[i-1][j-1] + (c1 != c2)
410 c = lev[i ][j-1] + 1
411 lev[i][j] = min(a,b,c)
412
def edit_dist(s1, s2):
    """
    Calculate the Levenshtein edit-distance between two strings.
    The edit distance is the number of characters that need to be
    substituted, inserted, or deleted, to transform s1 into s2.  For
    example, transforming "rain" to "shine" requires three steps,
    consisting of two substitutions and one insertion:
    "rain" -> "sain" -> "shin" -> "shine".  These operations could have
    been done in other orders, but at least three steps are needed.

    @param s1, s2: The strings to be analysed
    @type s1: C{string}
    @type s2: C{string}
    @rtype C{int}
    """
    n1, n2 = len(s1), len(s2)
    table = _edit_dist_init(n1 + 1, n2 + 1)

    # Fill the table row by row; each cell depends only on the cells
    # above and to its left.
    for row in range(n1):
        for col in range(n2):
            _edit_dist_step(table, row + 1, col + 1, s1[row], s2[col])
    return table[n1][n2]
437
438
def demo():
    """
    Demonstrate the metrics in this module on a small hand-built
    part-of-speech tagging example: first the list-based metrics
    (confusion matrix, accuracy), then the set-based ones
    (precision, recall, f-measure).
    """
    # Position-by-position comparison of two tag sequences.
    print '-'*75
    reference = 'DET NN VB DET JJ NN NN IN DET NN'.split()
    test = 'DET VB VB DET NN NN NN IN DET NN'.split()
    print 'Reference =', reference
    print 'Test =', test
    print 'Confusion matrix:'
    print ConfusionMatrix(reference, test)
    print 'Accuracy:', accuracy(reference, test)

    # Set-membership comparison of the same data.
    print '-'*75
    reference_set = set(reference)
    test_set = set(test)
    print 'Reference =', reference_set
    print 'Test = ', test_set
    print 'Precision:', precision(reference_set, test_set)
    print ' Recall:', recall(reference_set, test_set)
    print 'F-Measure:', f_measure(reference_set, test_set)
    print '-'*75
458
# Run the demonstration when executed as a script.
if __name__ == '__main__':
    demo()

#: Names exported by ``from <module> import *``.
#: NOTE(review): approxrand is defined above but not listed here --
#: confirm whether that omission is intentional.
__all__ = ['ConfusionMatrix', 'accuracy', 'demo',
           'f_measure', 'log_likelihood', 'precision', 'recall',
           'edit_dist', 'windowdiff']
465