Package nltk :: Package chunk :: Module util
[hide private]
[frames] | [no frames]

Source Code for Module nltk.chunk.util

  1  # Natural Language Toolkit: Chunk format conversions 
  2  # 
  3  # Copyright (C) 2001-2008 NLTK Project 
  4  # Author: Edward Loper <[email protected]> 
  5  #         Steven Bird <[email protected]> (minor additions) 
  6  # URL: <http://nltk.org> 
  7  # For license information, see LICENSE.TXT 
  8   
  9  import re 
 10  import string 
 11   
 12  from nltk import Tree 
 13  import nltk.tag.util 
 14   
 15  from api import * 
 16   
 17  ##////////////////////////////////////////////////////// 
 18  ## EVALUATION 
 19  ##////////////////////////////////////////////////////// 
 20   
 21  from nltk import evaluate 
def accuracy(chunker, gold):
    """
    Score the accuracy of the chunker against the gold standard.
    Strip the chunk information from the gold standard and rechunk it using
    the chunker, then compute the accuracy score.

    @type chunker: C{ChunkParserI}
    @param chunker: The chunker being evaluated.
    @type gold: C{tree}
    @param gold: The chunk structures to score the chunker on.
    @rtype: C{float}
    """
    gold_tags, test_tags = [], []
    for reference in gold:
        # Flatten the gold tree to recover the raw tagged tokens,
        # then let the chunker re-chunk them.
        hypothesis = chunker.parse(reference.flatten())
        gold_tags.extend(tree2conlltags(reference))
        test_tags.extend(tree2conlltags(hypothesis))
    return evaluate.accuracy(gold_tags, test_tags)
45 46 47 # Patched for increased performance by Yoav Goldberg <[email protected]>, 2006-01-13 48 # -- statistics are evaluated only on demand, instead of at every sentence evaluation 49 # 50 # SB: use nltk.evaluate for precision/recall scoring? 51 #
class ChunkScore(object):
    """
    A utility class for scoring chunk parsers.  C{ChunkScore} can
    evaluate a chunk parser's output, based on a number of statistics
    (precision, recall, f-measure, missed chunks, incorrect chunks).
    It can also combine the scores from the parsing of multiple texts;
    this makes it significantly easier to evaluate a chunk parser that
    operates one sentence at a time.

    Texts are evaluated with the C{score} method.  The results of
    evaluation can be accessed via a number of accessor methods, such
    as C{precision} and C{f_measure}.  A typical use of the
    C{ChunkScore} class is::

        >>> chunkscore = ChunkScore()
        >>> for correct in correct_sentences:
        ...     guess = chunkparser.parse(correct.leaves())
        ...     chunkscore.score(correct, guess)
        >>> print 'F Measure:', chunkscore.f_measure()
        F Measure: 0.823

    @ivar kwargs: Keyword arguments:

        - max_tp_examples: The maximum number actual examples of true
          positives to record.  This affects the C{correct} member
          function: C{correct} will not return more than this number
          of true positive examples.  This does *not* affect any of
          the numerical metrics (precision, recall, or f-measure)

        - max_fp_examples: The maximum number actual examples of false
          positives to record.  This affects the C{incorrect} member
          function and the C{guessed} member function: C{incorrect}
          will not return more than this number of examples, and
          C{guessed} will not return more than this number of true
          positive examples.  This does *not* affect any of the
          numerical metrics (precision, recall, or f-measure)

        - max_fn_examples: The maximum number actual examples of false
          negatives to record.  This affects the C{missed} member
          function and the C{correct} member function: C{missed}
          will not return more than this number of examples, and
          C{correct} will not return more than this number of true
          negative examples.  This does *not* affect any of the
          numerical metrics (precision, recall, or f-measure)

        - chunk_node: A regular expression indicating which chunks
          should be compared.  Defaults to C{'.*'} (i.e., all chunks).

    @type _tp: C{list} of C{Token}
    @ivar _tp: List of true positives
    @type _fp: C{list} of C{Token}
    @ivar _fp: List of false positives
    @type _fn: C{list} of C{Token}
    @ivar _fn: List of false negatives

    @type _tp_num: C{int}
    @ivar _tp_num: Number of true positives
    @type _fp_num: C{int}
    @ivar _fp_num: Number of false positives
    @type _fn_num: C{int}
    @ivar _fn_num: Number of false negatives.
    """
    def __init__(self, **kwargs):
        self._correct = set()
        self._guessed = set()
        self._tp = set()
        self._fp = set()
        self._fn = set()
        # NOTE(review): the max_*_examples limits are stored but never
        # consulted by the accessor methods below; confirm against
        # callers before enforcing (or removing) them.
        self._max_tp = kwargs.get('max_tp_examples', 100)
        self._max_fp = kwargs.get('max_fp_examples', 100)
        self._max_fn = kwargs.get('max_fn_examples', 100)
        self._chunk_node = kwargs.get('chunk_node', '.*')
        self._tp_num = 0
        self._fp_num = 0
        self._fn_num = 0
        self._count = 0  # number of sentences scored so far

        # Statistics are recomputed lazily, on demand (see
        # _updateMeasures), instead of at every sentence evaluation.
        self._measuresNeedUpdate = False

    def _updateMeasures(self):
        # Recompute the tp/fp/fn sets and counters if a score() call
        # has invalidated them since the last update.
        if self._measuresNeedUpdate:
            self._tp = self._guessed & self._correct
            self._fn = self._correct - self._guessed
            self._fp = self._guessed - self._correct
            self._tp_num = len(self._tp)
            self._fp_num = len(self._fp)
            self._fn_num = len(self._fn)
            self._measuresNeedUpdate = False

    def score(self, correct, guessed):
        """
        Given a correctly chunked sentence, score another chunked
        version of the same sentence.

        @type correct: chunk structure
        @param correct: The known-correct ("gold standard") chunked
            sentence.
        @type guessed: chunk structure
        @param guessed: The chunked sentence to be scored.
        """
        self._correct |= _chunksets(correct, self._count, self._chunk_node)
        self._guessed |= _chunksets(guessed, self._count, self._chunk_node)
        self._count += 1
        self._measuresNeedUpdate = True

    def precision(self):
        """
        @return: the overall precision for all texts that have been
            scored by this C{ChunkScore}.
        @rtype: C{float}
        """
        self._updateMeasures()
        div = self._tp_num + self._fp_num
        if div == 0:
            return 0
        return float(self._tp_num) / div

    def recall(self):
        """
        @return: the overall recall for all texts that have been
            scored by this C{ChunkScore}.
        @rtype: C{float}
        """
        self._updateMeasures()
        div = self._tp_num + self._fn_num
        if div == 0:
            return 0
        return float(self._tp_num) / div

    def f_measure(self, alpha=0.5):
        """
        @return: the overall F measure for all texts that have been
            scored by this C{ChunkScore}.
        @rtype: C{float}

        @param alpha: the relative weighting of precision and recall.
            Larger alpha biases the score towards the precision value,
            while smaller alpha biases the score towards the recall
            value.  C{alpha} should have a value in the range [0,1].
        @type alpha: C{float}
        """
        self._updateMeasures()
        p = self.precision()
        r = self.recall()
        if p == 0 or r == 0:    # what if alpha is 0 or 1?
            return 0
        return 1 / (alpha / p + (1 - alpha) / r)

    def missed(self):
        """
        @rtype: C{list} of chunks
        @return: the chunks which were included in the
            correct chunk structures, but not in the guessed chunk
            structures, listed in input order.
        """
        self._updateMeasures()
        # Sort by the (sentence index, word offset) id so the chunks
        # really come back in input order, as documented (sets are
        # unordered); the id is then discarded.
        return [chunk for _pos, chunk in sorted(self._fn)]

    def incorrect(self):
        """
        @rtype: C{list} of chunks
        @return: the chunks which were included in the
            guessed chunk structures, but not in the correct chunk
            structures, listed in input order.
        """
        self._updateMeasures()
        return [chunk for _pos, chunk in sorted(self._fp)]

    def correct(self):
        """
        @rtype: C{list} of chunks
        @return: the chunks which were included in the correct
            chunk structures, listed in input order.
        """
        return [chunk for _pos, chunk in sorted(self._correct)]

    def guessed(self):
        """
        @rtype: C{list} of chunks
        @return: the chunks which were included in the guessed
            chunk structures, listed in input order.
        """
        return [chunk for _pos, chunk in sorted(self._guessed)]

    def __len__(self):
        self._updateMeasures()
        return self._tp_num + self._fn_num

    def __repr__(self):
        """
        @rtype: C{String}
        @return: a concise representation of this C{ChunkScoring}.
        """
        # %d yields the same text as the old Python-2-only backtick
        # repr of an int.
        return '<ChunkScoring of %d chunks>' % len(self)

    def __str__(self):
        """
        @rtype: C{String}
        @return: a verbose representation of this C{ChunkScoring}.
            This representation includes the precision, recall, and
            f-measure scores.  For other information about the score,
            use the accessor methods (e.g., C{missed()} and
            C{incorrect()}).
        """
        return ("ChunkParse score:\n" +
                (" Precision: %5.1f%%\n" % (self.precision() * 100)) +
                (" Recall: %5.1f%%\n" % (self.recall() * 100)) +
                (" F-Measure: %5.1f%%" % (self.f_measure() * 100)))
def _chunksets(t, count, chunk_node):
    """
    Extract the chunks from a chunked sentence whose node label matches
    C{chunk_node}, assigning each a unique id: the pair of the sentence
    counter and the absolute position of the chunk's first word.
    Returns a set of C{(id, frozen-chunk)} pairs.
    """
    chunks = set()
    offset = 0
    for child in t:
        if isinstance(child, Tree):
            if re.match(chunk_node, child.node):
                chunks.add(((count, offset), tuple(child.freeze())))
            offset += len(child.leaves())
        else:
            # A bare token occupies a single word position.
            offset += 1
    return chunks
276 277
def tagstr2tree(s, chunk_node="NP", top_node="S", sep='/'):
    """
    Divide a string of bracketted tagged text into
    chunks and unchunked tokens, and produce a C{Tree}.
    Chunks are marked by square brackets (C{[...]}).  Words are
    delimited by whitespace, and each word should have the form
    C{I{text}/I{tag}}.  Words that do not contain a slash are
    assigned a C{tag} of C{None}.

    @return: A tree corresponding to the string representation.
    @rtype: C{tree}
    @param s: The string to be converted
    @type s: C{string}
    @param chunk_node: The label to use for chunk nodes
    @type chunk_node: C{string}
    @param top_node: The label to use for the root of the tree
    @type top_node: C{string}
    """
    WORD_OR_BRACKET = re.compile(r'\[|\]|[^\[\]\s]+')

    root = Tree(top_node, [])
    stack = [root]
    for match in WORD_OR_BRACKET.finditer(s):
        token = match.group()
        if token == '[':
            # Chunks may not nest: a '[' is only legal at the top level.
            if len(stack) != 1:
                raise ValueError('Unexpected [ at char %d' % match.start())
            chunk = Tree(chunk_node, [])
            stack[-1].append(chunk)
            stack.append(chunk)
        elif token == ']':
            # A ']' is only legal while exactly one chunk is open.
            if len(stack) != 2:
                raise ValueError('Unexpected ] at char %d' % match.start())
            stack.pop()
        elif sep is None:
            stack[-1].append(token)
        else:
            stack[-1].append(nltk.tag.util.str2tuple(token, sep))

    if len(stack) != 1:
        raise ValueError('Expected ] at char %d' % len(s))
    return root
321 322 ### CONLL 323 324 _LINE_RE = re.compile('(\S+)\s+(\S+)\s+([IOB])-?(\S+)?')
def conllstr2tree(s, chunk_types=('NP', 'PP', 'VP'), top_node="S"):
    """
    Convert a CoNLL IOB string into a tree.  Uses the specified chunk types
    (defaults to NP, PP and VP), and creates a tree rooted at a node
    labeled S (by default).

    @param s: The CoNLL string to be converted.
    @type s: C{string}
    @param chunk_types: The chunk types to be converted.
    @type chunk_types: C{tuple}
    @param top_node: The node label to use for the root.
    @type top_node: C{string}
    @return: A chunk structure for a single sentence
        encoded in the given CONLL 2000 style string.
    @rtype: L{Tree}
    @raise ValueError: If a non-blank line does not match the CoNLL
        word/tag/IOB format.
    """
    stack = [Tree(top_node, [])]

    for lineno, line in enumerate(s.split('\n')):
        if not line.strip():
            continue

        # Decode the line.
        match = _LINE_RE.match(line)
        if match is None:
            # Parenthesized raise: the old "raise ValueError, msg" form
            # is Python-2-only syntax.
            raise ValueError('Error on line %d' % lineno)
        (word, tag, state, chunk_type) = match.groups()

        # If it's a chunk type we don't care about, treat it as O.
        if chunk_types is not None and chunk_type not in chunk_types:
            state = 'O'

        # For "Begin"/"Outside", finish any completed chunk -
        # also do so for "Inside" tags which don't match the open chunk.
        mismatch_I = state == 'I' and chunk_type != stack[-1].node
        if state in 'BO' or mismatch_I:
            if len(stack) == 2:
                stack.pop()

        # For "Begin" (or a mismatched "Inside"), start a new chunk.
        if state == 'B' or mismatch_I:
            chunk = Tree(chunk_type, [])
            stack[-1].append(chunk)
            stack.append(chunk)

        # Add the new word token.
        stack[-1].append((word, tag))

    return stack[0]
374
def tree2conlltags(t):
    """
    Convert a tree to the CoNLL IOB tag format.

    @param t: The tree to be converted.
    @type t: C{Tree}
    @return: A list of 3-tuples containing word, tag and IOB tag.
    @rtype: C{list} of C{tuple}
    @raise ValueError: If the tree contains chunks nested inside chunks,
        which cannot be expressed in CoNLL format.
    """
    tags = []
    for child in t:
        # The try is narrowed to the attribute probe itself: the
        # original wrapped the whole chunk loop, so an AttributeError
        # raised mid-chunk would silently re-emit the child as "O".
        try:
            category = child.node
        except AttributeError:
            # A bare (word, tag) token outside any chunk.
            tags.append((child[0], child[1], "O"))
        else:
            prefix = "B-"
            for contents in child:
                if isinstance(contents, Tree):
                    # Parenthesized raise (was Python-2-only syntax).
                    raise ValueError(
                        "Tree is too deeply nested to be printed in CoNLL format")
                tags.append((contents[0], contents[1], prefix + category))
                prefix = "I-"
    return tags
398
def tree2conllstr(t):
    """
    Convert a tree to the CoNLL IOB string format.

    @param t: The tree to be converted.
    @type t: C{Tree}
    @return: A multiline string where each line contains a word, tag and IOB tag.
    @rtype: C{string}
    """
    # ' '.join replaces string.join (removed in Python 3); a single
    # space was also string.join's default separator, so the output
    # is unchanged.
    lines = [' '.join(token) for token in tree2conlltags(t)]
    return '\n'.join(lines)
410 411 ### IEER 412 413 _IEER_DOC_RE = re.compile(r'<DOC>\s*' 414 r'(<DOCNO>\s*(?P<docno>.+?)\s*</DOCNO>\s*)?' 415 r'(<DOCTYPE>\s*(?P<doctype>.+?)\s*</DOCTYPE>\s*)?' 416 r'(<DATE_TIME>\s*(?P<date_time>.+?)\s*</DATE_TIME>\s*)?' 417 r'<BODY>\s*' 418 r'(<HEADLINE>\s*(?P<headline>.+?)\s*</HEADLINE>\s*)?' 419 r'<TEXT>(?P<text>.*?)</TEXT>\s*' 420 r'</BODY>\s*</DOC>\s*', re.DOTALL) 421 422 _IEER_TYPE_RE = re.compile('<b_\w+\s+[^>]*?type="(?P<type>\w+)"') 423
def _ieer_read_text(s, top_node):
    """
    Convert the body text of an IEER document into a chunk tree rooted
    at C{top_node}.  Entities delimited by C{<b_...>}/C{<e_...>} tags
    become subtrees labeled with the entity type.

    @raise ValueError: If the string is not well-formed IEER markup.
    """
    stack = [Tree(top_node, [])]
    # s will be None if there is no headline in the text;
    # return the empty list in place of a Tree.
    if s is None:
        return []
    for piece_m in re.finditer(r'<[^>]+>|[^\s<]+', s):
        piece = piece_m.group()
        try:
            if piece.startswith('<b_'):
                m = _IEER_TYPE_RE.match(piece)
                if m is None:
                    # Formerly printed debug output ("XXXX") and then
                    # crashed with an AttributeError; raise ValueError
                    # instead, which the handler below converts into a
                    # position-annotated error.
                    raise ValueError
                chunk = Tree(m.group('type'), [])
                stack[-1].append(chunk)
                stack.append(chunk)
            elif piece.startswith('<e_'):
                # Close the current entity; popping the root raises
                # IndexError, handled below.
                stack.pop()
            else:
                stack[-1].append(piece)
        except (IndexError, ValueError):
            raise ValueError('Bad IEER string (error at character %d)' %
                             piece_m.start())
    if len(stack) != 1:
        raise ValueError('Bad IEER string')
    return stack[0]
452
def ieerstr2tree(s, chunk_types=('LOCATION', 'ORGANIZATION', 'PERSON',
                                 'DURATION', 'DATE', 'CARDINAL', 'PERCENT',
                                 'MONEY', 'MEASURE'), top_node="S"):
    """
    Convert a string of chunked tagged text in the IEER named
    entity format into a chunk structure.  Chunks are of several
    types, LOCATION, ORGANIZATION, PERSON, DURATION, DATE, CARDINAL,
    PERCENT, MONEY, and MEASURE.

    @return: A chunk structure containing the chunked tagged text that is
        encoded in the given IEER style string.
    @rtype: L{Tree}
    """
    # The default for chunk_types is now a tuple: a mutable (list)
    # default argument is shared across calls and could be mutated.
    # NOTE(review): chunk_types is currently unused by this body.

    # Try looking for a single document.  If that doesn't work, then just
    # treat everything as if it was within the <TEXT>...</TEXT>.
    m = _IEER_DOC_RE.match(s)
    if m:
        return {
            'text': _ieer_read_text(m.group('text'), top_node),
            'docno': m.group('docno'),
            'doctype': m.group('doctype'),
            'date_time': m.group('date_time'),
            # Read the headline as a chunk tree too, so that named
            # entities appearing in it are also captured.
            'headline': _ieer_read_text(m.group('headline'), top_node),
        }
    else:
        return _ieer_read_text(s, top_node)
481 482
def demo():
    """
    Demonstrate the chunk-format conversions: bracketted tagged text,
    CoNLL IOB input, and CoNLL output.
    """
    # All prints use the single-argument print(...) form, which behaves
    # identically under Python 2 and Python 3 (the originals were
    # Python-2-only print statements).
    s = "[ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./."
    from nltk import chunk
    t = chunk.tagstr2tree(s, chunk_node='NP')
    print(t.pprint())
    print('')

    s = """
These DT B-NP
research NN I-NP
protocols NNS I-NP
offer VBP B-VP
to TO B-PP
the DT B-NP
patient NN I-NP
not RB O
only RB O
the DT B-NP
very RB I-NP
best JJS I-NP
therapy NN I-NP
which WDT B-NP
we PRP B-NP
have VBP B-VP
established VBN I-VP
today NN B-NP
but CC B-NP
also RB I-NP
the DT B-NP
hope NN I-NP
of IN B-PP
something NN B-NP
still RB B-ADJP
better JJR I-ADJP
. . O
"""

    conll_tree = conllstr2tree(s, chunk_types=('NP', 'PP'))
    print(conll_tree.pprint())

    # Demonstrate CoNLL output
    print("CoNLL output:")
    print(chunk.tree2conllstr(conll_tree))
    print('')
# Run the demonstration when this module is executed as a script.
if __name__ == '__main__':
    demo()