nltk.chunk.regexp

1 # Natural Language Toolkit: Regular Expression Chunkers 2 # 3 # Copyright (C) 2001-2008 NLTK Project 4 # Author: Edward Loper <[email protected]> 5 # Steven Bird <[email protected]> (minor additions) 6 # URL: <http://nltk.org> 7 # For license information, see LICENSE.TXT 8 9 import re 10 import types 11 12 from nltk import Tree 13 14 from nltk.chunk.api import * 15 from nltk.chunk.util import * 16 17 ##////////////////////////////////////////////////////// 18 ## ChunkString 19 ##////////////////////////////////////////////////////// 20

21 -class ChunkString(object):

22 """ 23 A string-based encoding of a particular chunking of a text. 24 Internally, the C{ChunkString} class uses a single string to 25 encode the chunking of the input text. This string contains a 26 sequence of angle-bracket delimited tags, with chunking indicated 27 by braces. An example of this encoding is:: 28 29 {<DT><JJ><NN>}<VBN><IN>{<DT><NN>}<.>{<DT><NN>}<VBD><.> 30 31 C{ChunkString} are created from tagged texts (i.e., C{list}s of 32 C{tokens} whose type is C{TaggedType}). Initially, nothing is 33 chunked. 34 35 The chunking of a C{ChunkString} can be modified with the C{xform} 36 method, which uses a regular expression to transform the string 37 representation. These transformations should only add and remove 38 braces; they should I{not} modify the sequence of angle-bracket 39 delimited tags. 40 41 @type _str: C{string} 42 @ivar _str: The internal string representation of the text's 43 encoding. This string representation contains a sequence of 44 angle-bracket delimited tags, with chunking indicated by 45 braces. An example of this encoding is:: 46 47 {<DT><JJ><NN>}<VBN><IN>{<DT><NN>}<.>{<DT><NN>}<VBD><.> 48 49 @type _pieces: C{list} of pieces (tagged tokens and chunks) 50 @ivar _pieces: The tagged tokens and chunks encoded by this C{ChunkString}. 51 @ivar _debug: The debug level. See the constructor docs. 52 53 @cvar IN_CHUNK_PATTERN: A zero-width regexp pattern string that 54 will only match positions that are in chunks. 55 @cvar IN_CHINK_PATTERN: A zero-width regexp pattern string that 56 will only match positions that are in chinks. 57 """ 58 CHUNK_TAG_CHAR = r'[^\{\}<>]' 59 CHUNK_TAG = r'(<%s+?>)' % CHUNK_TAG_CHAR 60 61 IN_CHUNK_PATTERN = r'(?=[^\{]*\})' 62 IN_CHINK_PATTERN = r'(?=[^\}]*(\{|$))' 63 64 # These are used by _verify 65 _CHUNK = r'(\{%s+?\})+?' % CHUNK_TAG 66 _CHINK = r'(%s+?)+?' % CHUNK_TAG 67 _VALID = re.compile(r'^(\{?%s\}?)*?$' % CHUNK_TAG) 68 _BRACKETS = re.compile('[^\{\}]+') 69 _BALANCED_BRACKETS = re.compile(r'(\{\})*$') 70

71 - def __init__(self, chunk_struct, debug_level=1):

72 """ 73 Construct a new C{ChunkString} that encodes the chunking of 74 the text C{tagged_tokens}. 75 76 @type chunk_struct: C{Tree} 77 @param chunk_struct: The chunk structure to be further chunked. 78 @type debug_level: int 79 @param debug_level: The level of debugging which should be 80 applied to transformations on the C{ChunkString}. The 81 valid levels are: 82 - 0: no checks 83 - 1: full check on to_chunkstruct 84 - 2: full check on to_chunkstruct and cursory check after 85 each transformation. 86 - 3: full check on to_chunkstruct and full check after 87 each transformation. 88 We recommend you use at least level 1. You should 89 probably use level 3 if you use any non-standard 90 subclasses of C{RegexpChunkRule}. 91 """ 92 self._top_node = chunk_struct.node 93 self._pieces = chunk_struct[:] 94 tags = [self._tag(tok) for tok in self._pieces] 95 self._str = '<' + '><'.join(tags) + '>' 96 self._debug = debug_level

97

98 - def _tag(self, tok):

99 if type(tok) == types.TupleType: 100 return tok[1] 101 elif isinstance(tok, Tree): 102 return tok.node 103 else: 104 raise ValueError('chunk structures must contain tagged ' 105 'tokens or trees')

106

107 - def _verify(self, s, verify_tags):

108 """ 109 Check to make sure that C{s} still corresponds to some chunked 110 version of C{_pieces}. 111 112 @type verify_tags: C{boolean} 113 @param verify_tags: Whether the individual tags should be 114 checked. If this is false, C{_verify} will check to make 115 sure that C{_str} encodes a chunked version of I{some} 116 list of tokens. If this is true, then C{_verify} will 117 check to make sure that the tags in C{_str} match those in 118 C{_pieces}. 119 120 @raise ValueError: if this C{ChunkString}'s internal string 121 representation is invalid or not consistent with _pieces. 122 """ 123 # Check overall form 124 if not ChunkString._VALID.match(s): 125 raise ValueError('Transformation generated invalid ' 126 'chunkstring:\n %s' % s) 127 128 # Check that parens are balanced. If the string is long, we 129 # have to do this in pieces, to avoid a maximum recursion 130 # depth limit for regular expressions. 131 brackets = ChunkString._BRACKETS.sub('', s) 132 for i in range(1+len(brackets)/5000): 133 substr = brackets[i*5000:i*5000+5000] 134 if not ChunkString._BALANCED_BRACKETS.match(substr): 135 raise ValueError('Transformation generated invalid ' 136 'chunkstring:\n %s' % s) 137 138 if verify_tags<=0: return 139 140 tags1 = (re.split(r'[\{\}<>]+', s))[1:-1] 141 tags2 = [self._tag(piece) for piece in self._pieces] 142 if tags1 != tags2: 143 raise ValueError('Transformation generated invalid ' 144 'chunkstring: tag changed')

145

146 - def to_chunkstruct(self, chunk_node='CHUNK'):

147 """ 148 @return: the chunk structure encoded by this C{ChunkString}. 149 @rtype: C{Tree} 150 @raise ValueError: If a transformation has generated an 151 invalid chunkstring. 152 """ 153 if self._debug > 0: self._verify(self._str, 1) 154 155 # Use this alternating list to create the chunkstruct. 156 pieces = [] 157 index = 0 158 piece_in_chunk = 0 159 for piece in re.split('[{}]', self._str): 160 161 # Find the list of tokens contained in this piece. 162 length = piece.count('<') 163 subsequence = self._pieces[index:index+length] 164 165 # Add this list of tokens to our pieces. 166 if piece_in_chunk: 167 pieces.append(Tree(chunk_node, subsequence)) 168 else: 169 pieces += subsequence 170 171 # Update index, piece_in_chunk 172 index += length 173 piece_in_chunk = not piece_in_chunk 174 175 return Tree(self._top_node, pieces)

176

177 - def xform(self, regexp, repl):

178 """ 179 Apply the given transformation to this C{ChunkString}'s string 180 encoding. In particular, find all occurrences that match 181 C{regexp}, and replace them using C{repl} (as done by 182 C{re.sub}). 183 184 This transformation should only add and remove braces; it 185 should I{not} modify the sequence of angle-bracket delimited 186 tags. Furthermore, this transformation may not result in 187 improper bracketing. Note, in particular, that bracketing may 188 not be nested. 189 190 @type regexp: C{string} or C{regexp} 191 @param regexp: A regular expression matching the substring 192 that should be replaced. This will typically include a 193 named group, which can be used by C{repl}. 194 @type repl: C{string} 195 @param repl: An expression specifying what should replace the 196 matched substring. Typically, this will include a named 197 replacement group, specified by C{regexp}. 198 @rtype: C{None} 199 @raise ValueError: If this transformation generated an 200 invalid chunkstring. 201 """ 202 # Do the actual substitution 203 s = re.sub(regexp, repl, self._str) 204 205 # The substitution might have generated "empty chunks" 206 # (substrings of the form "{}"). Remove them, so they don't 207 # interfere with other transformations. 208 s = re.sub('\{\}', '', s) 209 210 # Make sure that the transformation was legal. 211 if self._debug > 1: self._verify(s, self._debug-2) 212 213 # Commit the transformation. 214 self._str = s

215

216 - def __repr__(self):

217 """ 218 @rtype: C{string} 219 @return: A string representation of this C{ChunkString}. This 220 string representation has the form:: 221 222 <ChunkString: '{<DT><JJ><NN>}<VBN><IN>{<DT><NN>}'> 223 224 """ 225 return '<ChunkString: %s>' % `self._str`

226

227 - def __str__(self):

228 """ 229 @rtype: C{string} 230 @return: A formatted representation of this C{ChunkString}'s 231 string encoding. This representation will include extra 232 spaces to ensure that tags will line up with the 233 representation of other C{ChunkStrings} for the same text, 234 regardless of the chunking. 235 """ 236 # Add spaces to make everything line up. 237 str = re.sub(r'>(?!\})', r'> ', self._str) 238 str = re.sub(r'([^\{])<', r'\1 <', str) 239 if str[0] == '<': str = ' ' + str 240 return str

241 242 ##////////////////////////////////////////////////////// 243 ## Chunking Rules 244 ##////////////////////////////////////////////////////// 245

246 -class RegexpChunkRule(object):

247 """ 248 A rule specifying how to modify the chunking in a C{ChunkString}, 249 using a transformational regular expression. The 250 C{RegexpChunkRule} class itself can be used to implement any 251 transformational rule based on regular expressions. There are 252 also a number of subclasses, which can be used to implement 253 simpler types of rules, based on matching regular expressions. 254 255 Each C{RegexpChunkRule} has a regular expression and a 256 replacement expression. When a C{RegexpChunkRule} is X{applied} 257 to a C{ChunkString}, it searches the C{ChunkString} for any 258 substring that matches the regular expression, and replaces it 259 using the replacement expression. This search/replace operation 260 has the same semantics as C{re.sub}. 261 262 Each C{RegexpChunkRule} also has a description string, which 263 gives a short (typically less than 75 characters) description of 264 the purpose of the rule. 265 266 This transformation defined by this C{RegexpChunkRule} should 267 only add and remove braces; it should I{not} modify the sequence 268 of angle-bracket delimited tags. Furthermore, this transformation 269 may not result in nested or mismatched bracketing. 270 """

271 - def __init__(self, regexp, repl, descr):

272 """ 273 Construct a new RegexpChunkRule. 274 275 @type regexp: C{regexp} or C{string} 276 @param regexp: This C{RegexpChunkRule}'s regular expression. 277 When this rule is applied to a C{ChunkString}, any 278 substring that matches C{regexp} will be replaced using 279 the replacement string C{repl}. Note that this must be a 280 normal regular expression, not a tag pattern. 281 @type repl: C{string} 282 @param repl: This C{RegexpChunkRule}'s replacement 283 expression. When this rule is applied to a 284 C{ChunkString}, any substring that matches C{regexp} will 285 be replaced using C{repl}. 286 @type descr: C{string} 287 @param descr: A short description of the purpose and/or effect 288 of this rule. 289 """ 290 if isinstance(regexp, basestring): 291 regexp = re.compile(regexp) 292 self._repl = repl 293 self._descr = descr 294 self._regexp = regexp

295

296 - def apply(self, chunkstr):

297 # Keep docstring generic so we can inherit it. 298 """ 299 Apply this rule to the given C{ChunkString}. See the 300 class reference documentation for a description of what it 301 means to apply a rule. 302 303 @type chunkstr: C{ChunkString} 304 @param chunkstr: The chunkstring to which this rule is 305 applied. 306 @rtype: C{None} 307 @raise ValueError: If this transformation generated an 308 invalid chunkstring. 309 """ 310 chunkstr.xform(self._regexp, self._repl)

311

312 - def descr(self):

313 """ 314 @rtype: C{string} 315 @return: a short description of the purpose and/or effect of 316 this rule. 317 """ 318 return self._descr

319

320 - def __repr__(self):

321 """ 322 @rtype: C{string} 323 @return: A string representation of this rule. This 324 string representation has the form:: 325 326 <RegexpChunkRule: '{<IN|VB.*>}'->'<IN>'> 327 328 Note that this representation does not include the 329 description string; that string can be accessed 330 separately with the C{descr} method. 331 """ 332 return ('<RegexpChunkRule: '+`self._regexp.pattern`+ 333 '->'+`self._repl`+'>')

334 335 @staticmethod

336 - def parse(s):

337 """ 338 Create a RegexpChunkRule from a string description. 339 Currently, the following formats are supported:: 340 341 {regexp} # chunk rule 342 }regexp{ # chink rule 343 regexp}{regexp # split rule 344 regexp{}regexp # merge rule 345 346 Where C{regexp} is a regular expression for the rule. Any 347 text following the comment marker (C{#}) will be used as 348 the rule's description: 349 350 >>> RegexpChunkRule.parse('{<DT>?<NN.*>+}') 351 <ChunkRule: '<DT>?<NN.*>+'> 352 """ 353 # Split off the comment (but don't split on '\#') 354 m = re.match(r'(?P<rule>(\\.|[^#])*)(?P<comment>#.*)?', s) 355 rule = m.group('rule').strip() 356 comment = (m.group('comment') or '')[1:].strip() 357 358 # Pattern bodies: chunk, chink, split, merge 359 try: 360 if not rule: 361 raise ValueError('Empty chunk pattern') 362 if rule[0] == '{' and rule[-1] == '}': 363 return ChunkRule(rule[1:-1], comment) 364 elif rule[0] == '}' and rule[-1] == '{': 365 return ChinkRule(rule[1:-1], comment) 366 elif '}{' in rule: 367 left, right = rule.split('}{') 368 return SplitRule(left, right, comment) 369 elif '{}' in rule: 370 left, right = rule.split('{}') 371 return MergeRule(left, right, comment) 372 elif re.match('[^{}]*{[^{}]*}[^{}]*', rule): 373 left, chunk, right = re.split('[{}]', rule) 374 return ChunkRuleWithContext(left, chunk, right, comment) 375 else: 376 raise ValueError('Illegal chunk pattern: %s' % rule) 377 except (ValueError, re.error): 378 raise ValueError('Illegal chunk pattern: %s' % rule)

379 380

381 -class ChunkRule(RegexpChunkRule):

382 """ 383 A rule specifying how to add chunks to a C{ChunkString}, using a 384 matching tag pattern. When applied to a C{ChunkString}, it will 385 find any substring that matches this tag pattern and that is not 386 already part of a chunk, and create a new chunk containing that 387 substring. 388 """

389 - def __init__(self, tag_pattern, descr):

390 391 """ 392 Construct a new C{ChunkRule}. 393 394 @type tag_pattern: C{string} 395 @param tag_pattern: This rule's tag pattern. When 396 applied to a C{ChunkString}, this rule will 397 chunk any substring that matches this tag pattern and that 398 is not already part of a chunk. 399 @type descr: C{string} 400 @param descr: A short description of the purpose and/or effect 401 of this rule. 402 """ 403 self._pattern = tag_pattern 404 regexp = re.compile('(?P<chunk>%s)%s' % 405 (tag_pattern2re_pattern(tag_pattern), 406 ChunkString.IN_CHINK_PATTERN)) 407 RegexpChunkRule.__init__(self, regexp, '{\g<chunk>}', descr)

408

409 - def __repr__(self):

410 """ 411 @rtype: C{string} 412 @return: A string representation of this rule. This 413 string representation has the form:: 414 415 <ChunkRule: '<IN|VB.*>'> 416 417 Note that this representation does not include the 418 description string; that string can be accessed 419 separately with the C{descr} method. 420 """ 421 return '<ChunkRule: '+`self._pattern`+'>'

422

423 -class ChinkRule(RegexpChunkRule):

424 """ 425 A rule specifying how to remove chinks to a C{ChunkString}, 426 using a matching tag pattern. When applied to a 427 C{ChunkString}, it will find any substring that matches this 428 tag pattern and that is contained in a chunk, and remove it 429 from that chunk, thus creating two new chunks. 430 """

431 - def __init__(self, tag_pattern, descr):

432 """ 433 Construct a new C{ChinkRule}. 434 435 @type tag_pattern: C{string} 436 @param tag_pattern: This rule's tag pattern. When 437 applied to a C{ChunkString}, this rule will 438 find any substring that matches this tag pattern and that 439 is contained in a chunk, and remove it from that chunk, 440 thus creating two new chunks. 441 @type descr: C{string} 442 @param descr: A short description of the purpose and/or effect 443 of this rule. 444 """ 445 self._pattern = tag_pattern 446 regexp = re.compile('(?P<chink>%s)%s' % 447 (tag_pattern2re_pattern(tag_pattern), 448 ChunkString.IN_CHUNK_PATTERN)) 449 RegexpChunkRule.__init__(self, regexp, '}\g<chink>{', descr)

450

451 - def __repr__(self):

452 """ 453 @rtype: C{string} 454 @return: A string representation of this rule. This 455 string representation has the form:: 456 457 <ChinkRule: '<IN|VB.*>'> 458 459 Note that this representation does not include the 460 description string; that string can be accessed 461 separately with the C{descr} method. 462 """ 463 return '<ChinkRule: '+`self._pattern`+'>'

464

465 -class UnChunkRule(RegexpChunkRule):

466 """ 467 A rule specifying how to remove chunks to a C{ChunkString}, 468 using a matching tag pattern. When applied to a 469 C{ChunkString}, it will find any complete chunk that matches this 470 tag pattern, and un-chunk it. 471 """

472 - def __init__(self, tag_pattern, descr):

473 """ 474 Construct a new C{UnChunkRule}. 475 476 @type tag_pattern: C{string} 477 @param tag_pattern: This rule's tag pattern. When 478 applied to a C{ChunkString}, this rule will 479 find any complete chunk that matches this tag pattern, 480 and un-chunk it. 481 @type descr: C{string} 482 @param descr: A short description of the purpose and/or effect 483 of this rule. 484 """ 485 self._pattern = tag_pattern 486 regexp = re.compile('\{(?P<chunk>%s)\}' % 487 tag_pattern2re_pattern(tag_pattern)) 488 RegexpChunkRule.__init__(self, regexp, '\g<chunk>', descr)

489

490 - def __repr__(self):

491 """ 492 @rtype: C{string} 493 @return: A string representation of this rule. This 494 string representation has the form:: 495 496 <UnChunkRule: '<IN|VB.*>'> 497 498 Note that this representation does not include the 499 description string; that string can be accessed 500 separately with the C{descr} method. 501 """ 502 return '<UnChunkRule: '+`self._pattern`+'>'

503

504 -class MergeRule(RegexpChunkRule):

505 """ 506 A rule specifying how to merge chunks in a C{ChunkString}, using 507 two matching tag patterns: a left pattern, and a right pattern. 508 When applied to a C{ChunkString}, it will find any chunk whose end 509 matches left pattern, and immediately followed by a chunk whose 510 beginning matches right pattern. It will then merge those two 511 chunks into a single chunk. 512 """

513 - def __init__(self, left_tag_pattern, right_tag_pattern, descr):

514 """ 515 Construct a new C{MergeRule}. 516 517 @type right_tag_pattern: C{string} 518 @param right_tag_pattern: This rule's right tag 519 pattern. When applied to a C{ChunkString}, this 520 rule will find any chunk whose end matches 521 C{left_tag_pattern}, and immediately followed by a chunk 522 whose beginning matches this pattern. It will 523 then merge those two chunks into a single chunk. 524 @type left_tag_pattern: C{string} 525 @param left_tag_pattern: This rule's left tag 526 pattern. When applied to a C{ChunkString}, this 527 rule will find any chunk whose end matches 528 this pattern, and immediately followed by a chunk 529 whose beginning matches C{right_tag_pattern}. It will 530 then merge those two chunks into a single chunk. 531 532 @type descr: C{string} 533 @param descr: A short description of the purpose and/or effect 534 of this rule. 535 """ 536 # Ensure that the individual patterns are coherent. E.g., if 537 # left='(' and right=')', then this will raise an exception: 538 re.compile(tag_pattern2re_pattern(left_tag_pattern)) 539 re.compile(tag_pattern2re_pattern(right_tag_pattern)) 540 541 self._left_tag_pattern = left_tag_pattern 542 self._right_tag_pattern = right_tag_pattern 543 regexp = re.compile('(?P<left>%s)}{(?=%s)' % 544 (tag_pattern2re_pattern(left_tag_pattern), 545 tag_pattern2re_pattern(right_tag_pattern))) 546 RegexpChunkRule.__init__(self, regexp, '\g<left>', descr)

547

548 - def __repr__(self):

549 """ 550 @rtype: C{string} 551 @return: A string representation of this rule. This 552 string representation has the form:: 553 554 <MergeRule: '<NN|DT|JJ>', '<NN|JJ>'> 555 556 Note that this representation does not include the 557 description string; that string can be accessed 558 separately with the C{descr} method. 559 """ 560 return ('<MergeRule: '+`self._left_tag_pattern`+', '+ 561 `self._right_tag_pattern`+'>')

562

563 -class SplitRule(RegexpChunkRule):

564 """ 565 A rule specifying how to split chunks in a C{ChunkString}, using 566 two matching tag patterns: a left pattern, and a right pattern. 567 When applied to a C{ChunkString}, it will find any chunk that 568 matches the left pattern followed by the right pattern. It will 569 then split the chunk into two new chunks, at the point between the 570 two pattern matches. 571 """

572 - def __init__(self, left_tag_pattern, right_tag_pattern, descr):

573 """ 574 Construct a new C{SplitRule}. 575 576 @type right_tag_pattern: C{string} 577 @param right_tag_pattern: This rule's right tag 578 pattern. When applied to a C{ChunkString}, this rule will 579 find any chunk containing a substring that matches 580 C{left_tag_pattern} followed by this pattern. It will 581 then split the chunk into two new chunks at the point 582 between these two matching patterns. 583 @type left_tag_pattern: C{string} 584 @param left_tag_pattern: This rule's left tag 585 pattern. When applied to a C{ChunkString}, this rule will 586 find any chunk containing a substring that matches this 587 pattern followed by C{right_tag_pattern}. It will then 588 split the chunk into two new chunks at the point between 589 these two matching patterns. 590 @type descr: C{string} 591 @param descr: A short description of the purpose and/or effect 592 of this rule. 593 """ 594 # Ensure that the individual patterns are coherent. E.g., if 595 # left='(' and right=')', then this will raise an exception: 596 re.compile(tag_pattern2re_pattern(left_tag_pattern)) 597 re.compile(tag_pattern2re_pattern(right_tag_pattern)) 598 599 self._left_tag_pattern = left_tag_pattern 600 self._right_tag_pattern = right_tag_pattern 601 regexp = re.compile('(?P<left>%s)(?=%s)' % 602 (tag_pattern2re_pattern(left_tag_pattern), 603 tag_pattern2re_pattern(right_tag_pattern))) 604 RegexpChunkRule.__init__(self, regexp, r'\g<left>}{', descr)

605

606 - def __repr__(self):

607 """ 608 @rtype: C{string} 609 @return: A string representation of this rule. This 610 string representation has the form:: 611 612 <SplitRule: '<NN>', '<DT>'> 613 614 Note that this representation does not include the 615 description string; that string can be accessed 616 separately with the C{descr} method. 617 """ 618 return ('<SplitRule: '+`self._left_tag_pattern`+', '+ 619 `self._right_tag_pattern`+'>')

620

621 -class ExpandLeftRule(RegexpChunkRule):

622 """ 623 A rule specifying how to expand chunks in a C{ChunkString} to the left, 624 using two matching tag patterns: a left pattern, and a right pattern. 625 When applied to a C{ChunkString}, it will find any chunk whose beginning 626 matches right pattern, and immediately preceded by a chink whose 627 end matches left pattern. It will then expand the chunk to incorporate 628 the new material on the left. 629 """

630 - def __init__(self, left_tag_pattern, right_tag_pattern, descr):

631 """ 632 Construct a new C{ExpandRightRule}. 633 634 @type right_tag_pattern: C{string} 635 @param right_tag_pattern: This rule's right tag 636 pattern. When applied to a C{ChunkString}, this 637 rule will find any chunk whose beginning matches 638 C{right_tag_pattern}, and immediately preceded by a chink 639 whose end matches this pattern. It will 640 then merge those two chunks into a single chunk. 641 @type left_tag_pattern: C{string} 642 @param left_tag_pattern: This rule's left tag 643 pattern. When applied to a C{ChunkString}, this 644 rule will find any chunk whose beginning matches 645 this pattern, and immediately preceded by a chink 646 whose end matches C{left_tag_pattern}. It will 647 then expand the chunk to incorporate the new material on the left. 648 649 @type descr: C{string} 650 @param descr: A short description of the purpose and/or effect 651 of this rule. 652 """ 653 # Ensure that the individual patterns are coherent. E.g., if 654 # left='(' and right=')', then this will raise an exception: 655 re.compile(tag_pattern2re_pattern(left_tag_pattern)) 656 re.compile(tag_pattern2re_pattern(right_tag_pattern)) 657 658 self._left_tag_pattern = left_tag_pattern 659 self._right_tag_pattern = right_tag_pattern 660 regexp = re.compile('(?P<left>%s)\{(?P<right>%s)' % 661 (tag_pattern2re_pattern(left_tag_pattern), 662 tag_pattern2re_pattern(right_tag_pattern))) 663 RegexpChunkRule.__init__(self, regexp, '{\g<left>\g<right>', descr)

664

665 - def __repr__(self):

666 """ 667 @rtype: C{string} 668 @return: A string representation of this rule. This 669 string representation has the form:: 670 671 <ExpandLeftRule: '<NN|DT|JJ>', '<NN|JJ>'> 672 673 Note that this representation does not include the 674 description string; that string can be accessed 675 separately with the C{descr} method. 676 """ 677 return ('<ExpandLeftRule: '+`self._left_tag_pattern`+', '+ 678 `self._right_tag_pattern`+'>')

679

680 -class ExpandRightRule(RegexpChunkRule):

681 """ 682 A rule specifying how to expand chunks in a C{ChunkString} to the 683 right, using two matching tag patterns: a left pattern, and a 684 right pattern. When applied to a C{ChunkString}, it will find any 685 chunk whose end matches left pattern, and immediately followed by 686 a chink whose beginning matches right pattern. It will then 687 expand the chunk to incorporate the new material on the right. 688 """

689 - def __init__(self, left_tag_pattern, right_tag_pattern, descr):

690 """ 691 Construct a new C{ExpandRightRule}. 692 693 @type right_tag_pattern: C{string} 694 @param right_tag_pattern: This rule's right tag 695 pattern. When applied to a C{ChunkString}, this 696 rule will find any chunk whose end matches 697 C{left_tag_pattern}, and immediately followed by a chink 698 whose beginning matches this pattern. It will 699 then merge those two chunks into a single chunk. 700 @type left_tag_pattern: C{string} 701 @param left_tag_pattern: This rule's left tag 702 pattern. When applied to a C{ChunkString}, this 703 rule will find any chunk whose end matches 704 this pattern, and immediately followed by a chink 705 whose beginning matches C{right_tag_pattern}. It will 706 then expand the chunk to incorporate the new material on the right. 707 708 @type descr: C{string} 709 @param descr: A short description of the purpose and/or effect 710 of this rule. 711 """ 712 # Ensure that the individual patterns are coherent. E.g., if 713 # left='(' and right=')', then this will raise an exception: 714 re.compile(tag_pattern2re_pattern(left_tag_pattern)) 715 re.compile(tag_pattern2re_pattern(right_tag_pattern)) 716 717 self._left_tag_pattern = left_tag_pattern 718 self._right_tag_pattern = right_tag_pattern 719 regexp = re.compile('(?P<left>%s)\}(?P<right>%s)' % 720 (tag_pattern2re_pattern(left_tag_pattern), 721 tag_pattern2re_pattern(right_tag_pattern))) 722 RegexpChunkRule.__init__(self, regexp, '\g<left>\g<right>}', descr)

723

724 - def __repr__(self):

725 """ 726 @rtype: C{string} 727 @return: A string representation of this rule. This 728 string representation has the form:: 729 730 <ExpandRightRule: '<NN|DT|JJ>', '<NN|JJ>'> 731 732 Note that this representation does not include the 733 description string; that string can be accessed 734 separately with the C{descr} method. 735 """ 736 return ('<ExpandRightRule: '+`self._left_tag_pattern`+', '+ 737 `self._right_tag_pattern`+'>')

738

739 -class ChunkRuleWithContext(RegexpChunkRule):

740 """ 741 A rule specifying how to add chunks to a C{ChunkString}, using 742 three matching tag patterns: one for the left context, one for the 743 chunk, and one for the right context. When applied to a 744 C{ChunkString}, it will find any substring that matches the chunk 745 tag pattern, is surrounded by substrings that match the two 746 context patterns, and is not already part of a chunk; and create a 747 new chunk containing the substring that matched the chunk tag 748 pattern. 749 750 Caveat: Both the left and right context are consumed when this 751 rule matches; therefore, if you need to find overlapping matches, 752 you will need to apply your rule more than once. 753 """

754 - def __init__(self, left_context_tag_pattern, chunk_tag_pattern, 755 right_context_tag_pattern, descr):

756 """ 757 Construct a new C{ChunkRuleWithContext}. 758 759 @type left_context_tag_pattern: C{string} 760 @param left_context_tag_pattern: A tag pattern that must match 761 the left context of C{chunk_tag_pattern} for this rule to 762 apply. 763 @type chunk_tag_pattern: C{string} 764 @param chunk_tag_pattern: A tag pattern that must match for this 765 rule to apply. If the rule does apply, then this pattern 766 also identifies the substring that will be made into a chunk. 767 @type right_context_tag_pattern: C{string} 768 @param right_context_tag_pattern: A tag pattern that must match 769 the right context of C{chunk_tag_pattern} for this rule to 770 apply. 771 @type descr: C{string} 772 @param descr: A short description of the purpose and/or effect 773 of this rule. 774 """ 775 # Ensure that the individual patterns are coherent. E.g., if 776 # left='(' and right=')', then this will raise an exception: 777 re.compile(tag_pattern2re_pattern(left_context_tag_pattern)) 778 re.compile(tag_pattern2re_pattern(chunk_tag_pattern)) 779 re.compile(tag_pattern2re_pattern(right_context_tag_pattern)) 780 781 self._left_context_tag_pattern = left_context_tag_pattern 782 self._chunk_tag_pattern = chunk_tag_pattern 783 self._right_context_tag_pattern = right_context_tag_pattern 784 regexp = re.compile('(?P<left>%s)(?P<chunk>%s)(?P<right>%s)%s' % 785 (tag_pattern2re_pattern(left_context_tag_pattern), 786 tag_pattern2re_pattern(chunk_tag_pattern), 787 tag_pattern2re_pattern(right_context_tag_pattern), 788 ChunkString.IN_CHINK_PATTERN)) 789 replacement = r'\g<left>{\g<chunk>}\g<right>' 790 RegexpChunkRule.__init__(self, regexp, replacement, descr)

791

792 - def __repr__(self):

793 """ 794 @rtype: C{string} 795 @return: A string representation of this rule. This 796 string representation has the form:: 797 798 <ChunkRuleWithContext: '<IN>', '<NN>', '<DT>'> 799 800 Note that this representation does not include the 801 description string; that string can be accessed 802 separately with the C{descr} method. 803 """ 804 return '<ChunkRuleWithContext: %r, %r, %r>' % ( 805 self._left_context_tag_pattern, self._chunk_tag_pattern, 806 self._right_context_tag_pattern)

807 808 ##////////////////////////////////////////////////////// 809 ## Tag Pattern Format Conversion 810 ##////////////////////////////////////////////////////// 811 812 # this should probably be made more strict than it is -- e.g., it 813 # currently accepts 'foo'. 814 CHUNK_TAG_PATTERN = re.compile(r'^((%s|<%s>)*)$' % 815 ('[^\{\}<>]+', 816 '[^\{\}<>]+')) 817

818 -def tag_pattern2re_pattern(tag_pattern):

819 """ 820 Convert a tag pattern to a regular expression pattern. A X{tag 821 pattern} is a modified version of a regular expression, designed 822 for matching sequences of tags. The differences between regular 823 expression patterns and tag patterns are: 824 825 - In tag patterns, C{'<'} and C{'>'} act as parentheses; so 826 C{'<NN>+'} matches one or more repetitions of C{'<NN>'}, not 827 C{'<NN'} followed by one or more repetitions of C{'>'}. 828 - Whitespace in tag patterns is ignored. So 829 C{'<DT> | <NN>'} is equivalant to C{'<DT>|<NN>'} 830 - In tag patterns, C{'.'} is equivalant to C{'[^{}<>]'}; so 831 C{'<NN.*>'} matches any single tag starting with C{'NN'}. 832 833 In particular, C{tag_pattern2re_pattern} performs the following 834 transformations on the given pattern: 835 836 - Replace '.' with '[^<>{}]' 837 - Remove any whitespace 838 - Add extra parens around '<' and '>', to make '<' and '>' act 839 like parentheses. E.g., so that in '<NN>+', the '+' has scope 840 over the entire '<NN>'; and so that in '<NN|IN>', the '|' has 841 scope over 'NN' and 'IN', but not '<' or '>'. 842 - Check to make sure the resulting pattern is valid. 843 844 @type tag_pattern: C{string} 845 @param tag_pattern: The tag pattern to convert to a regular 846 expression pattern. 847 @raise ValueError: If C{tag_pattern} is not a valid tag pattern. 848 In particular, C{tag_pattern} should not include braces; and it 849 should not contain nested or mismatched angle-brackets. 850 @rtype: C{string} 851 @return: A regular expression pattern corresponding to 852 C{tag_pattern}. 853 """ 854 # Clean up the regular expression 855 tag_pattern = re.sub(r'\s', '', tag_pattern) 856 tag_pattern = re.sub(r'<', '(<(', tag_pattern) 857 tag_pattern = re.sub(r'>', ')>)', tag_pattern) 858 859 # Check the regular expression 860 if not CHUNK_TAG_PATTERN.match(tag_pattern): 861 raise ValueError('Bad tag pattern: %r' % tag_pattern) 862 863 # Replace "." with CHUNK_TAG_CHAR. 864 # We have to do this after, since it adds {}[]<>s, which would 865 # confuse CHUNK_TAG_PATTERN. 866 # PRE doesn't have lookback assertions, so reverse twice, and do 867 # the pattern backwards (with lookahead assertions). This can be 868 # made much cleaner once we can switch back to SRE. 869 def reverse_str(str): 870 lst = list(str) 871 lst.reverse() 872 return ''.join(lst)

873 tc_rev = reverse_str(ChunkString.CHUNK_TAG_CHAR) 874 reversed = reverse_str(tag_pattern) 875 reversed = re.sub(r'\.(?!\$\\\$*($|[^\\]))', tc_rev, reversed) 876 tag_pattern = reverse_str(reversed) 877 878 return tag_pattern 879 880 881 ##////////////////////////////////////////////////////// 882 ## RegexpChunkParser 883 ##////////////////////////////////////////////////////// 884

885 -class RegexpChunkParser(ChunkParserI):

886 """ 887 A regular expression based chunk parser. C{RegexpChunkParser} uses a 888 sequence of X{rules} to find chunks of a single type within a 889 text. The chunking of the text is encoded using a C{ChunkString}, 890 and each rule acts by modifying the chunking in the 891 C{ChunkString}. The rules are all implemented using regular 892 expression matching and substitution. 893 894 The C{RegexpChunkRule} class and its subclasses (C{ChunkRule}, 895 C{ChinkRule}, C{UnChunkRule}, C{MergeRule}, and C{SplitRule}) 896 define the rules that are used by C{RegexpChunkParser}. Each rule 897 defines an C{apply} method, which modifies the chunking encoded 898 by a given C{ChunkString}. 899 900 @type _rules: C{list} of C{RegexpChunkRule} 901 @ivar _rules: The list of rules that should be applied to a text. 902 @type _trace: C{int} 903 @ivar _trace: The default level of tracing. 904 905 """

906 - def __init__(self, rules, chunk_node='NP', top_node='S', trace=0):

907 """ 908 Construct a new C{RegexpChunkParser}. 909 910 @type rules: C{list} of C{RegexpChunkRule} 911 @param rules: The sequence of rules that should be used to 912 generate the chunking for a tagged text. 913 @type chunk_node: C{string} 914 @param chunk_node: The node value that should be used for 915 chunk subtrees. This is typically a short string 916 describing the type of information contained by the chunk, 917 such as C{"NP"} for base noun phrases. 918 @type top_node: C{string} 919 @param top_node: The node value that should be used for the 920 top node of the chunk structure. 921 @type trace: C{int} 922 @param trace: The level of tracing that should be used when 923 parsing a text. C{0} will generate no tracing output; 924 C{1} will generate normal tracing output; and C{2} or 925 higher will generate verbose tracing output. 926 """ 927 self._rules = rules 928 self._trace = trace 929 self._chunk_node = chunk_node 930 self._top_node = top_node

931

932 - def _trace_apply(self, chunkstr, verbose):

933 """ 934 Apply each of this C{RegexpChunkParser}'s rules to C{chunkstr}, in 935 turn. Generate trace output between each rule. If C{verbose} 936 is true, then generate verbose output. 937 938 @type chunkstr: C{ChunkString} 939 @param chunkstr: The chunk string to which each rule should be 940 applied. 941 @type verbose: C{boolean} 942 @param verbose: Whether output should be verbose. 943 @rtype: C{None} 944 """ 945 print '# Input:' 946 print chunkstr 947 for rule in self._rules: 948 rule.apply(chunkstr) 949 if verbose: 950 print '#', rule.descr()+' ('+`rule`+'):' 951 else: 952 print '#', rule.descr()+':' 953 print chunkstr

954

955 - def _notrace_apply(self, chunkstr):

956 """ 957 Apply each of this C{RegexpChunkParser}'s rules to C{chunkstr}, in 958 turn. 959 960 @param chunkstr: The chunk string to which each rule should be 961 applied. 962 @type chunkstr: C{ChunkString} 963 @rtype: C{None} 964 """ 965 966 for rule in self._rules: 967 rule.apply(chunkstr)

968

969 - def parse(self, chunk_struct, trace=None):

970 from nltk import Tree 971 """ 972 @type chunk_struct: C{Tree} 973 @param chunk_struct: the chunk structure to be (further) chunked 974 @type trace: C{int} 975 @param trace: The level of tracing that should be used when 976 parsing a text. C{0} will generate no tracing output; 977 C{1} will generate normal tracing output; and C{2} or 978 highter will generate verbose tracing output. This value 979 overrides the trace level value that was given to the 980 constructor. 981 @rtype: C{Tree} 982 @return: a chunk structure that encodes the chunks in a given 983 tagged sentence. A chunk is a non-overlapping linguistic 984 group, such as a noun phrase. The set of chunks 985 identified in the chunk structure depends on the rules 986 used to define this C{RegexpChunkParser}. 987 """ 988 if len(chunk_struct) == 0: 989 print 'Warning: parsing empty text' 990 return Tree(self._top_node, []) 991 992 try: 993 chunk_struct.node 994 except AttributeError: 995 chunk_struct = Tree(self._top_node, chunk_struct) 996 997 # Use the default trace value? 998 if trace == None: trace = self._trace 999 1000 chunkstr = ChunkString(chunk_struct) 1001 1002 # Apply the sequence of rules to the chunkstring. 1003 if trace: 1004 verbose = (trace>1) 1005 self._trace_apply(chunkstr, verbose) 1006 else: 1007 self._notrace_apply(chunkstr) 1008 1009 # Use the chunkstring to create a chunk structure. 1010 return chunkstr.to_chunkstruct(self._chunk_node)

1011

1012 - def rules(self):

1013 """ 1014 @return: the sequence of rules used by C{RegexpChunkParser}. 1015 @rtype: C{list} of C{RegexpChunkRule} 1016 """ 1017 return self._rules

1018

1019 - def __repr__(self):

1020 """ 1021 @return: a concise string representation of this 1022 C{RegexpChunkParser}. 1023 @rtype: C{string} 1024 """ 1025 return "<RegexpChunkParser with %d rules>" % len(self._rules)

1026

1027 - def __str__(self):

1028 """ 1029 @return: a verbose string representation of this C{RegexpChunkParser}. 1030 @rtype: C{string} 1031 """ 1032 s = "RegexpChunkParser with %d rules:\n" % len(self._rules) 1033 margin = 0 1034 for rule in self._rules: 1035 margin = max(margin, len(rule.descr())) 1036 if margin < 35: 1037 format = " %" + `-(margin+3)` + "s%s\n" 1038 else: 1039 format = " %s\n %s\n" 1040 for rule in self._rules: 1041 s += format % (rule.descr(), `rule`) 1042 return s[:-1]

1043 1044 ##////////////////////////////////////////////////////// 1045 ## Chunk Grammar 1046 ##////////////////////////////////////////////////////// 1047

1048 -class RegexpParser(ChunkParserI):

1049 """ 1050 A grammar based chunk parser. C{chunk.RegexpParser} uses a set of 1051 regular expression patterns to specify the behavior of the parser. 1052 The chunking of the text is encoded using a C{ChunkString}, and 1053 each rule acts by modifying the chunking in the C{ChunkString}. 1054 The rules are all implemented using regular expression matching 1055 and substitution. 1056 1057 A grammar contains one or more clauses in the following form:: 1058 1059 NP: 1060 {<DT|JJ>} # chunk determiners and adjectives 1061 }<[\.VI].*>+{ # chink any tag beginning with V, I, or . 1062 <.*>}{<DT> # split a chunk at a determiner 1063 <DT|JJ>{}<NN.*> # merge chunk ending with det/adj 1064 # with one starting with a noun 1065 1066 The patterns of a clause are executed in order. An earlier 1067 pattern may introduce a chunk boundary that prevents a later 1068 pattern from executing. Sometimes an individual pattern will 1069 match on multiple, overlapping extents of the input. As with 1070 regular expression substitution more generally, the chunker will 1071 identify the first match possible, then continue looking for matches 1072 after this one has ended. 1073 1074 The clauses of a grammar are also executed in order. A cascaded 1075 chunk parser is one having more than one clause. The maximum depth 1076 of a parse tree created by this chunk parser is the same as the 1077 number of clauses in the grammar. 1078 1079 When tracing is turned on, the comment portion of a line is displayed 1080 each time the corresponding pattern is applied. 1081 1082 @type _start: C{string} 1083 @ivar _start: The start symbol of the grammar (the root node of 1084 resulting trees) 1085 @type _stages: C{int} 1086 @ivar _stages: The list of parsing stages corresponding to the grammar 1087 1088 """

1089 - def __init__(self, grammar, top_node='S', loop=1, trace=0):

1090 """ 1091 Create a new chunk parser, from the given start state 1092 and set of chunk patterns. 1093 1094 @param grammar: The list of patterns that defines the grammar 1095 @type grammar: C{list} of C{string} 1096 @param top_node: The top node of the tree being created 1097 @type top_node: L{string} or L{Nonterminal} 1098 @param loop: The number of times to run through the patterns 1099 @type loop: L{int} 1100 @type trace: C{int} 1101 @param trace: The level of tracing that should be used when 1102 parsing a text. C{0} will generate no tracing output; 1103 C{1} will generate normal tracing output; and C{2} or 1104 higher will generate verbose tracing output. 1105 """ 1106 self._trace = trace 1107 self._stages = [] 1108 self._grammar = grammar 1109 self._loop = loop 1110 1111 if isinstance(grammar, basestring): 1112 self._parse_grammar(grammar, top_node, trace) 1113 else: 1114 # Make sur the grammar looks like it has the right type: 1115 type_err = ('Expected string or list of RegexpChunkParsers ' 1116 'for the grammar.') 1117 try: grammar = list(grammar) 1118 except: raise TypeError(type_err) 1119 for elt in grammar: 1120 if not isinstance(elt, RegexpChunkParser): 1121 raise TypeError(type_err) 1122 self._stages = grammar

1123

1124 - def _parse_grammar(self, grammar, top_node, trace):

1125 """ 1126 Helper function for __init__: parse the grammar if it is a 1127 string. 1128 """ 1129 rules = [] 1130 lhs = None 1131 for line in grammar.split('\n'): 1132 line = line.strip() 1133 1134 # New stage begins if there's an unescaped ':' 1135 m = re.match('(?P<nonterminal>(\\.|[^:])*)(:(?P<rule>.*))', line) 1136 if m: 1137 # Record the stage that we just completed. 1138 self._add_stage(rules, lhs, top_node, trace) 1139 # Start a new stage. 1140 lhs = m.group('nonterminal').strip() 1141 rules = [] 1142 line = m.group('rule').strip() 1143 1144 # Skip blank & comment-only lines 1145 if line=='' or line.startswith('#'): continue 1146 1147 # Add the rule 1148 rules.append(RegexpChunkRule.parse(line)) 1149 1150 # Record the final stage 1151 self._add_stage(rules, lhs, top_node, trace)

1152

1153 - def _add_stage(self, rules, lhs, top_node, trace):

1154 """ 1155 Helper function for __init__: add a new stage to the parser. 1156 """ 1157 if rules != []: 1158 if not lhs: 1159 raise ValueError('Expected stage marker (eg NP:)') 1160 parser = RegexpChunkParser(rules, chunk_node=lhs, 1161 top_node=top_node, trace=trace) 1162 self._stages.append(parser)

1163

1164 - def parse(self, chunk_struct, trace=None):

1165 """ 1166 Apply the chunk parser to this input. 1167 1168 @type chunk_struct: C{Tree} 1169 @param chunk_struct: the chunk structure to be (further) chunked 1170 (this tree is modified, and is also returned) 1171 @type trace: C{int} 1172 @param trace: The level of tracing that should be used when 1173 parsing a text. C{0} will generate no tracing output; 1174 C{1} will generate normal tracing output; and C{2} or 1175 highter will generate verbose tracing output. This value 1176 overrides the trace level value that was given to the 1177 constructor. 1178 @return: the chunked output. 1179 @rtype: C{Tree} 1180 """ 1181 if trace == None: trace = self._trace 1182 for i in range(self._loop): 1183 for parser in self._stages: 1184 chunk_struct = parser.parse(chunk_struct, trace=trace) 1185 return chunk_struct

1186

1187 - def __repr__(self):

1188 """ 1189 @return: a concise string representation of this C{chunk.RegexpParser}. 1190 @rtype: C{string} 1191 """ 1192 return "<chunk.RegexpParser with %d stages>" % len(self._stages)

1193

1194 - def __str__(self):

1195 """ 1196 @return: a verbose string representation of this 1197 C{RegexpChunkParser}. 1198 @rtype: C{string} 1199 """ 1200 s = "chunk.RegexpParser with %d stages:\n" % len(self._stages) 1201 margin = 0 1202 for parser in self._stages: 1203 s += parser.__str__() + "\n" 1204 return s[:-1]

1205 1206 ##////////////////////////////////////////////////////// 1207 ## Demonstration code 1208 ##////////////////////////////////////////////////////// 1209

1210 -def demo_eval(chunkparser, text):

1211 """ 1212 Demonstration code for evaluating a chunk parser, using a 1213 C{ChunkScore}. This function assumes that C{text} contains one 1214 sentence per line, and that each sentence has the form expected by 1215 C{tree.chunk}. It runs the given chunk parser on each sentence in 1216 the text, and scores the result. It prints the final score 1217 (precision, recall, and f-measure); and reports the set of chunks 1218 that were missed and the set of chunks that were incorrect. (At 1219 most 10 missing chunks and 10 incorrect chunks are reported). 1220 1221 @param chunkparser: The chunkparser to be tested 1222 @type chunkparser: C{ChunkParserI} 1223 @param text: The chunked tagged text that should be used for 1224 evaluation. 1225 @type text: C{string} 1226 """ 1227 1228 from nltk import chunk, Tree 1229 1230 # Evaluate our chunk parser. 1231 chunkscore = chunk.ChunkScore() 1232 1233 for sentence in text.split('\n'): 1234 print sentence 1235 sentence = sentence.strip() 1236 if not sentence: continue 1237 gold = chunk.tagstr2tree(sentence) 1238 tokens = gold.leaves() 1239 test = chunkparser.parse(Tree('S', tokens), trace=1) 1240 chunkscore.score(gold, test) 1241 print 1242 1243 print '/'+('='*75)+'\\' 1244 print 'Scoring', chunkparser 1245 print ('-'*77) 1246 print 'Precision: %5.1f%%' % (chunkscore.precision()*100), ' '*4, 1247 print 'Recall: %5.1f%%' % (chunkscore.recall()*100), ' '*6, 1248 print 'F-Measure: %5.1f%%' % (chunkscore.f_measure()*100) 1249 1250 1251 # Missed chunks. 1252 if chunkscore.missed(): 1253 print 'Missed:' 1254 missed = chunkscore.missed() 1255 for chunk in missed[:10]: 1256 print ' ', ' '.join(c.__str__() for c in chunk) 1257 if len(chunkscore.missed()) > 10: 1258 print ' ...' 1259 1260 # Incorrect chunks. 1261 if chunkscore.incorrect(): 1262 print 'Incorrect:' 1263 incorrect = chunkscore.incorrect() 1264 for chunk in incorrect[:10]: 1265 print ' ', ' '.join(c.__str__() for c in chunk) 1266 if len(chunkscore.incorrect()) > 10: 1267 print ' ...' 1268 1269 print '\\'+('='*75)+'/' 1270 print

1271

1272 -def demo():

1273 """ 1274 A demonstration for the C{RegexpChunkParser} class. A single text is 1275 parsed with four different chunk parsers, using a variety of rules 1276 and strategies. 1277 """ 1278 1279 from nltk import chunk, Tree 1280 1281 text = """\ 1282 [ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] ./. 1283 [ John/NNP ] saw/VBD [the/DT cats/NNS] [the/DT dog/NN] chased/VBD ./. 1284 [ John/NNP ] thinks/VBZ [ Mary/NN ] saw/VBD [ the/DT cat/NN ] sit/VB on/IN [ the/DT mat/NN ]./. 1285 """ 1286 1287 print '*'*75 1288 print 'Evaluation text:' 1289 print text 1290 print '*'*75 1291 print 1292 1293 grammar = r""" 1294 NP: # NP stage 1295 {<DT>?<JJ>*<NN>} # chunk determiners, adjectives and nouns 1296 {<NNP>+} # chunk proper nouns 1297 """ 1298 cp = chunk.RegexpParser(grammar) 1299 chunk.demo_eval(cp, text) 1300 1301 grammar = r""" 1302 NP: 1303 {<.*>} # start by chunking each tag 1304 }<[\.VI].*>+{ # unchunk any verbs, prepositions or periods 1305 <DT|JJ>{}<NN.*> # merge det/adj with nouns 1306 """ 1307 cp = chunk.RegexpParser(grammar) 1308 chunk.demo_eval(cp, text) 1309 1310 grammar = r""" 1311 NP: {<DT>?<JJ>*<NN>} # chunk determiners, adjectives and nouns 1312 VP: {<TO>?<VB.*>} # VP = verb words 1313 """ 1314 cp = chunk.RegexpParser(grammar) 1315 chunk.demo_eval(cp, text) 1316 1317 grammar = r""" 1318 NP: {<.*>*} # start by chunking everything 1319 }<[\.VI].*>+{ # chink any verbs, prepositions or periods 1320 <.*>}{<DT> # separate on determiners 1321 PP: {<IN><NP>} # PP = preposition + noun phrase 1322 VP: {<VB.*><NP|PP>*} # VP = verb words + NPs and PPs 1323 """ 1324 cp = chunk.RegexpParser(grammar) 1325 chunk.demo_eval(cp, text) 1326 1327 # Evaluation 1328 1329 from nltk.corpus import conll2000 1330 1331 print 1332 print "Demonstration of empty grammar:" 1333 1334 cp = chunk.RegexpParser("") 1335 print chunk.accuracy(cp, conll2000.chunked_sents('test.txt', 1336 chunk_types=('NP',))) 1337 1338 print 1339 print "Demonstration of accuracy evaluation using CoNLL tags:" 1340 1341 grammar = r""" 1342 NP: 1343 {<.*>} # start by chunking each tag 1344 }<[\.VI].*>+{ # unchunk any verbs, prepositions or periods 1345 <DT|JJ>{}<NN.*> # merge det/adj with nouns 1346 """ 1347 cp = chunk.RegexpParser(grammar) 1348 print chunk.accuracy(cp, conll2000.chunked_sents('test.txt')[:5]) 1349 1350 print 1351 print "Demonstration of tagged token input" 1352 1353 grammar = r""" 1354 NP: {<.*>*} # start by chunking everything 1355 }<[\.VI].*>+{ # chink any verbs, prepositions or periods 1356 <.*>}{<DT> # separate on determiners 1357 PP: {<IN><NP>} # PP = preposition + noun phrase 1358 VP: {<VB.*><NP|PP>*} # VP = verb words + NPs and PPs 1359 """ 1360 cp = chunk.RegexpParser(grammar) 1361 print cp.parse([("the","DT"), ("little","JJ"), ("cat", "NN"), 1362 ("sat", "VBD"), ("on", "IN"), ("the", "DT"), 1363 ("mat", "NN"), (".", ".")])

1364 1365 if __name__ == '__main__': 1366 demo() 1367

Source Code for Module nltk.chunk.regexp