1
2
3
4
5
6
7
8
9 import re
10 import types
11
12 from nltk import Tree
13
14 from nltk.chunk.api import *
15 from nltk.chunk.util import *
16
17
18
19
20
22 """
23 A string-based encoding of a particular chunking of a text.
24 Internally, the C{ChunkString} class uses a single string to
25 encode the chunking of the input text. This string contains a
26 sequence of angle-bracket delimited tags, with chunking indicated
27 by braces. An example of this encoding is::
28
29 {<DT><JJ><NN>}<VBN><IN>{<DT><NN>}<.>{<DT><NN>}<VBD><.>
30
31 C{ChunkString} are created from tagged texts (i.e., C{list}s of
32 C{tokens} whose type is C{TaggedType}). Initially, nothing is
33 chunked.
34
35 The chunking of a C{ChunkString} can be modified with the C{xform}
36 method, which uses a regular expression to transform the string
37 representation. These transformations should only add and remove
38 braces; they should I{not} modify the sequence of angle-bracket
39 delimited tags.
40
41 @type _str: C{string}
42 @ivar _str: The internal string representation of the text's
43 encoding. This string representation contains a sequence of
44 angle-bracket delimited tags, with chunking indicated by
45 braces. An example of this encoding is::
46
47 {<DT><JJ><NN>}<VBN><IN>{<DT><NN>}<.>{<DT><NN>}<VBD><.>
48
49 @type _pieces: C{list} of pieces (tagged tokens and chunks)
50 @ivar _pieces: The tagged tokens and chunks encoded by this C{ChunkString}.
51 @ivar _debug: The debug level. See the constructor docs.
52
53 @cvar IN_CHUNK_PATTERN: A zero-width regexp pattern string that
54 will only match positions that are in chunks.
55 @cvar IN_CHINK_PATTERN: A zero-width regexp pattern string that
56 will only match positions that are in chinks.
57 """
58 CHUNK_TAG_CHAR = r'[^\{\}<>]'
59 CHUNK_TAG = r'(<%s+?>)' % CHUNK_TAG_CHAR
60
61 IN_CHUNK_PATTERN = r'(?=[^\{]*\})'
62 IN_CHINK_PATTERN = r'(?=[^\}]*(\{|$))'
63
64
65 _CHUNK = r'(\{%s+?\})+?' % CHUNK_TAG
66 _CHINK = r'(%s+?)+?' % CHUNK_TAG
67 _VALID = re.compile(r'^(\{?%s\}?)*?$' % CHUNK_TAG)
68 _BRACKETS = re.compile('[^\{\}]+')
69 _BALANCED_BRACKETS = re.compile(r'(\{\})*$')
70
71 - def __init__(self, chunk_struct, debug_level=1):
72 """
73 Construct a new C{ChunkString} that encodes the chunking of
74 the text C{tagged_tokens}.
75
76 @type chunk_struct: C{Tree}
77 @param chunk_struct: The chunk structure to be further chunked.
78 @type debug_level: int
79 @param debug_level: The level of debugging which should be
80 applied to transformations on the C{ChunkString}. The
81 valid levels are:
82 - 0: no checks
83 - 1: full check on to_chunkstruct
84 - 2: full check on to_chunkstruct and cursory check after
85 each transformation.
86 - 3: full check on to_chunkstruct and full check after
87 each transformation.
88 We recommend you use at least level 1. You should
89 probably use level 3 if you use any non-standard
90 subclasses of C{RegexpChunkRule}.
91 """
92 self._top_node = chunk_struct.node
93 self._pieces = chunk_struct[:]
94 tags = [self._tag(tok) for tok in self._pieces]
95 self._str = '<' + '><'.join(tags) + '>'
96 self._debug = debug_level
97
98 - def _tag(self, tok):
99 if type(tok) == types.TupleType:
100 return tok[1]
101 elif isinstance(tok, Tree):
102 return tok.node
103 else:
104 raise ValueError('chunk structures must contain tagged '
105 'tokens or trees')
106
107 - def _verify(self, s, verify_tags):
108 """
109 Check to make sure that C{s} still corresponds to some chunked
110 version of C{_pieces}.
111
112 @type verify_tags: C{boolean}
113 @param verify_tags: Whether the individual tags should be
114 checked. If this is false, C{_verify} will check to make
115 sure that C{_str} encodes a chunked version of I{some}
116 list of tokens. If this is true, then C{_verify} will
117 check to make sure that the tags in C{_str} match those in
118 C{_pieces}.
119
120 @raise ValueError: if this C{ChunkString}'s internal string
121 representation is invalid or not consistent with _pieces.
122 """
123
124 if not ChunkString._VALID.match(s):
125 raise ValueError('Transformation generated invalid '
126 'chunkstring:\n %s' % s)
127
128
129
130
131 brackets = ChunkString._BRACKETS.sub('', s)
132 for i in range(1+len(brackets)/5000):
133 substr = brackets[i*5000:i*5000+5000]
134 if not ChunkString._BALANCED_BRACKETS.match(substr):
135 raise ValueError('Transformation generated invalid '
136 'chunkstring:\n %s' % s)
137
138 if verify_tags<=0: return
139
140 tags1 = (re.split(r'[\{\}<>]+', s))[1:-1]
141 tags2 = [self._tag(piece) for piece in self._pieces]
142 if tags1 != tags2:
143 raise ValueError('Transformation generated invalid '
144 'chunkstring: tag changed')
145
147 """
148 @return: the chunk structure encoded by this C{ChunkString}.
149 @rtype: C{Tree}
150 @raise ValueError: If a transformation has generated an
151 invalid chunkstring.
152 """
153 if self._debug > 0: self._verify(self._str, 1)
154
155
156 pieces = []
157 index = 0
158 piece_in_chunk = 0
159 for piece in re.split('[{}]', self._str):
160
161
162 length = piece.count('<')
163 subsequence = self._pieces[index:index+length]
164
165
166 if piece_in_chunk:
167 pieces.append(Tree(chunk_node, subsequence))
168 else:
169 pieces += subsequence
170
171
172 index += length
173 piece_in_chunk = not piece_in_chunk
174
175 return Tree(self._top_node, pieces)
176
215
217 """
218 @rtype: C{string}
219 @return: A string representation of this C{ChunkString}. This
220 string representation has the form::
221
222 <ChunkString: '{<DT><JJ><NN>}<VBN><IN>{<DT><NN>}'>
223
224 """
225 return '<ChunkString: %s>' % `self._str`
226
228 """
229 @rtype: C{string}
230 @return: A formatted representation of this C{ChunkString}'s
231 string encoding. This representation will include extra
232 spaces to ensure that tags will line up with the
233 representation of other C{ChunkStrings} for the same text,
234 regardless of the chunking.
235 """
236
237 str = re.sub(r'>(?!\})', r'> ', self._str)
238 str = re.sub(r'([^\{])<', r'\1 <', str)
239 if str[0] == '<': str = ' ' + str
240 return str
241
242
243
244
245
247 """
248 A rule specifying how to modify the chunking in a C{ChunkString},
249 using a transformational regular expression. The
250 C{RegexpChunkRule} class itself can be used to implement any
251 transformational rule based on regular expressions. There are
252 also a number of subclasses, which can be used to implement
253 simpler types of rules, based on matching regular expressions.
254
255 Each C{RegexpChunkRule} has a regular expression and a
256 replacement expression. When a C{RegexpChunkRule} is X{applied}
257 to a C{ChunkString}, it searches the C{ChunkString} for any
258 substring that matches the regular expression, and replaces it
259 using the replacement expression. This search/replace operation
260 has the same semantics as C{re.sub}.
261
262 Each C{RegexpChunkRule} also has a description string, which
263 gives a short (typically less than 75 characters) description of
264 the purpose of the rule.
265
266 This transformation defined by this C{RegexpChunkRule} should
267 only add and remove braces; it should I{not} modify the sequence
268 of angle-bracket delimited tags. Furthermore, this transformation
269 may not result in nested or mismatched bracketing.
270 """
271 - def __init__(self, regexp, repl, descr):
272 """
273 Construct a new RegexpChunkRule.
274
275 @type regexp: C{regexp} or C{string}
276 @param regexp: This C{RegexpChunkRule}'s regular expression.
277 When this rule is applied to a C{ChunkString}, any
278 substring that matches C{regexp} will be replaced using
279 the replacement string C{repl}. Note that this must be a
280 normal regular expression, not a tag pattern.
281 @type repl: C{string}
282 @param repl: This C{RegexpChunkRule}'s replacement
283 expression. When this rule is applied to a
284 C{ChunkString}, any substring that matches C{regexp} will
285 be replaced using C{repl}.
286 @type descr: C{string}
287 @param descr: A short description of the purpose and/or effect
288 of this rule.
289 """
290 if isinstance(regexp, basestring):
291 regexp = re.compile(regexp)
292 self._repl = repl
293 self._descr = descr
294 self._regexp = regexp
295
296 - def apply(self, chunkstr):
297
298 """
299 Apply this rule to the given C{ChunkString}. See the
300 class reference documentation for a description of what it
301 means to apply a rule.
302
303 @type chunkstr: C{ChunkString}
304 @param chunkstr: The chunkstring to which this rule is
305 applied.
306 @rtype: C{None}
307 @raise ValueError: If this transformation generated an
308 invalid chunkstring.
309 """
310 chunkstr.xform(self._regexp, self._repl)
311
313 """
314 @rtype: C{string}
315 @return: a short description of the purpose and/or effect of
316 this rule.
317 """
318 return self._descr
319
321 """
322 @rtype: C{string}
323 @return: A string representation of this rule. This
324 string representation has the form::
325
326 <RegexpChunkRule: '{<IN|VB.*>}'->'<IN>'>
327
328 Note that this representation does not include the
329 description string; that string can be accessed
330 separately with the C{descr} method.
331 """
332 return ('<RegexpChunkRule: '+`self._regexp.pattern`+
333 '->'+`self._repl`+'>')
334
335 @staticmethod
337 """
338 Create a RegexpChunkRule from a string description.
339 Currently, the following formats are supported::
340
341 {regexp} # chunk rule
342 }regexp{ # chink rule
343 regexp}{regexp # split rule
344 regexp{}regexp # merge rule
345
346 Where C{regexp} is a regular expression for the rule. Any
347 text following the comment marker (C{#}) will be used as
348 the rule's description:
349
350 >>> RegexpChunkRule.parse('{<DT>?<NN.*>+}')
351 <ChunkRule: '<DT>?<NN.*>+'>
352 """
353
354 m = re.match(r'(?P<rule>(\\.|[^#])*)(?P<comment>#.*)?', s)
355 rule = m.group('rule').strip()
356 comment = (m.group('comment') or '')[1:].strip()
357
358
359 try:
360 if not rule:
361 raise ValueError('Empty chunk pattern')
362 if rule[0] == '{' and rule[-1] == '}':
363 return ChunkRule(rule[1:-1], comment)
364 elif rule[0] == '}' and rule[-1] == '{':
365 return ChinkRule(rule[1:-1], comment)
366 elif '}{' in rule:
367 left, right = rule.split('}{')
368 return SplitRule(left, right, comment)
369 elif '{}' in rule:
370 left, right = rule.split('{}')
371 return MergeRule(left, right, comment)
372 elif re.match('[^{}]*{[^{}]*}[^{}]*', rule):
373 left, chunk, right = re.split('[{}]', rule)
374 return ChunkRuleWithContext(left, chunk, right, comment)
375 else:
376 raise ValueError('Illegal chunk pattern: %s' % rule)
377 except (ValueError, re.error):
378 raise ValueError('Illegal chunk pattern: %s' % rule)
379
380
382 """
383 A rule specifying how to add chunks to a C{ChunkString}, using a
384 matching tag pattern. When applied to a C{ChunkString}, it will
385 find any substring that matches this tag pattern and that is not
386 already part of a chunk, and create a new chunk containing that
387 substring.
388 """
389 - def __init__(self, tag_pattern, descr):
390
391 """
392 Construct a new C{ChunkRule}.
393
394 @type tag_pattern: C{string}
395 @param tag_pattern: This rule's tag pattern. When
396 applied to a C{ChunkString}, this rule will
397 chunk any substring that matches this tag pattern and that
398 is not already part of a chunk.
399 @type descr: C{string}
400 @param descr: A short description of the purpose and/or effect
401 of this rule.
402 """
403 self._pattern = tag_pattern
404 regexp = re.compile('(?P<chunk>%s)%s' %
405 (tag_pattern2re_pattern(tag_pattern),
406 ChunkString.IN_CHINK_PATTERN))
407 RegexpChunkRule.__init__(self, regexp, '{\g<chunk>}', descr)
408
410 """
411 @rtype: C{string}
412 @return: A string representation of this rule. This
413 string representation has the form::
414
415 <ChunkRule: '<IN|VB.*>'>
416
417 Note that this representation does not include the
418 description string; that string can be accessed
419 separately with the C{descr} method.
420 """
421 return '<ChunkRule: '+`self._pattern`+'>'
422
424 """
425 A rule specifying how to remove chinks to a C{ChunkString},
426 using a matching tag pattern. When applied to a
427 C{ChunkString}, it will find any substring that matches this
428 tag pattern and that is contained in a chunk, and remove it
429 from that chunk, thus creating two new chunks.
430 """
431 - def __init__(self, tag_pattern, descr):
432 """
433 Construct a new C{ChinkRule}.
434
435 @type tag_pattern: C{string}
436 @param tag_pattern: This rule's tag pattern. When
437 applied to a C{ChunkString}, this rule will
438 find any substring that matches this tag pattern and that
439 is contained in a chunk, and remove it from that chunk,
440 thus creating two new chunks.
441 @type descr: C{string}
442 @param descr: A short description of the purpose and/or effect
443 of this rule.
444 """
445 self._pattern = tag_pattern
446 regexp = re.compile('(?P<chink>%s)%s' %
447 (tag_pattern2re_pattern(tag_pattern),
448 ChunkString.IN_CHUNK_PATTERN))
449 RegexpChunkRule.__init__(self, regexp, '}\g<chink>{', descr)
450
452 """
453 @rtype: C{string}
454 @return: A string representation of this rule. This
455 string representation has the form::
456
457 <ChinkRule: '<IN|VB.*>'>
458
459 Note that this representation does not include the
460 description string; that string can be accessed
461 separately with the C{descr} method.
462 """
463 return '<ChinkRule: '+`self._pattern`+'>'
464
466 """
467 A rule specifying how to remove chunks to a C{ChunkString},
468 using a matching tag pattern. When applied to a
469 C{ChunkString}, it will find any complete chunk that matches this
470 tag pattern, and un-chunk it.
471 """
472 - def __init__(self, tag_pattern, descr):
473 """
474 Construct a new C{UnChunkRule}.
475
476 @type tag_pattern: C{string}
477 @param tag_pattern: This rule's tag pattern. When
478 applied to a C{ChunkString}, this rule will
479 find any complete chunk that matches this tag pattern,
480 and un-chunk it.
481 @type descr: C{string}
482 @param descr: A short description of the purpose and/or effect
483 of this rule.
484 """
485 self._pattern = tag_pattern
486 regexp = re.compile('\{(?P<chunk>%s)\}' %
487 tag_pattern2re_pattern(tag_pattern))
488 RegexpChunkRule.__init__(self, regexp, '\g<chunk>', descr)
489
491 """
492 @rtype: C{string}
493 @return: A string representation of this rule. This
494 string representation has the form::
495
496 <UnChunkRule: '<IN|VB.*>'>
497
498 Note that this representation does not include the
499 description string; that string can be accessed
500 separately with the C{descr} method.
501 """
502 return '<UnChunkRule: '+`self._pattern`+'>'
503
505 """
506 A rule specifying how to merge chunks in a C{ChunkString}, using
507 two matching tag patterns: a left pattern, and a right pattern.
508 When applied to a C{ChunkString}, it will find any chunk whose end
509 matches left pattern, and immediately followed by a chunk whose
510 beginning matches right pattern. It will then merge those two
511 chunks into a single chunk.
512 """
513 - def __init__(self, left_tag_pattern, right_tag_pattern, descr):
514 """
515 Construct a new C{MergeRule}.
516
517 @type right_tag_pattern: C{string}
518 @param right_tag_pattern: This rule's right tag
519 pattern. When applied to a C{ChunkString}, this
520 rule will find any chunk whose end matches
521 C{left_tag_pattern}, and immediately followed by a chunk
522 whose beginning matches this pattern. It will
523 then merge those two chunks into a single chunk.
524 @type left_tag_pattern: C{string}
525 @param left_tag_pattern: This rule's left tag
526 pattern. When applied to a C{ChunkString}, this
527 rule will find any chunk whose end matches
528 this pattern, and immediately followed by a chunk
529 whose beginning matches C{right_tag_pattern}. It will
530 then merge those two chunks into a single chunk.
531
532 @type descr: C{string}
533 @param descr: A short description of the purpose and/or effect
534 of this rule.
535 """
536
537
538 re.compile(tag_pattern2re_pattern(left_tag_pattern))
539 re.compile(tag_pattern2re_pattern(right_tag_pattern))
540
541 self._left_tag_pattern = left_tag_pattern
542 self._right_tag_pattern = right_tag_pattern
543 regexp = re.compile('(?P<left>%s)}{(?=%s)' %
544 (tag_pattern2re_pattern(left_tag_pattern),
545 tag_pattern2re_pattern(right_tag_pattern)))
546 RegexpChunkRule.__init__(self, regexp, '\g<left>', descr)
547
549 """
550 @rtype: C{string}
551 @return: A string representation of this rule. This
552 string representation has the form::
553
554 <MergeRule: '<NN|DT|JJ>', '<NN|JJ>'>
555
556 Note that this representation does not include the
557 description string; that string can be accessed
558 separately with the C{descr} method.
559 """
560 return ('<MergeRule: '+`self._left_tag_pattern`+', '+
561 `self._right_tag_pattern`+'>')
562
564 """
565 A rule specifying how to split chunks in a C{ChunkString}, using
566 two matching tag patterns: a left pattern, and a right pattern.
567 When applied to a C{ChunkString}, it will find any chunk that
568 matches the left pattern followed by the right pattern. It will
569 then split the chunk into two new chunks, at the point between the
570 two pattern matches.
571 """
572 - def __init__(self, left_tag_pattern, right_tag_pattern, descr):
573 """
574 Construct a new C{SplitRule}.
575
576 @type right_tag_pattern: C{string}
577 @param right_tag_pattern: This rule's right tag
578 pattern. When applied to a C{ChunkString}, this rule will
579 find any chunk containing a substring that matches
580 C{left_tag_pattern} followed by this pattern. It will
581 then split the chunk into two new chunks at the point
582 between these two matching patterns.
583 @type left_tag_pattern: C{string}
584 @param left_tag_pattern: This rule's left tag
585 pattern. When applied to a C{ChunkString}, this rule will
586 find any chunk containing a substring that matches this
587 pattern followed by C{right_tag_pattern}. It will then
588 split the chunk into two new chunks at the point between
589 these two matching patterns.
590 @type descr: C{string}
591 @param descr: A short description of the purpose and/or effect
592 of this rule.
593 """
594
595
596 re.compile(tag_pattern2re_pattern(left_tag_pattern))
597 re.compile(tag_pattern2re_pattern(right_tag_pattern))
598
599 self._left_tag_pattern = left_tag_pattern
600 self._right_tag_pattern = right_tag_pattern
601 regexp = re.compile('(?P<left>%s)(?=%s)' %
602 (tag_pattern2re_pattern(left_tag_pattern),
603 tag_pattern2re_pattern(right_tag_pattern)))
604 RegexpChunkRule.__init__(self, regexp, r'\g<left>}{', descr)
605
607 """
608 @rtype: C{string}
609 @return: A string representation of this rule. This
610 string representation has the form::
611
612 <SplitRule: '<NN>', '<DT>'>
613
614 Note that this representation does not include the
615 description string; that string can be accessed
616 separately with the C{descr} method.
617 """
618 return ('<SplitRule: '+`self._left_tag_pattern`+', '+
619 `self._right_tag_pattern`+'>')
620
622 """
623 A rule specifying how to expand chunks in a C{ChunkString} to the left,
624 using two matching tag patterns: a left pattern, and a right pattern.
625 When applied to a C{ChunkString}, it will find any chunk whose beginning
626 matches right pattern, and immediately preceded by a chink whose
627 end matches left pattern. It will then expand the chunk to incorporate
628 the new material on the left.
629 """
630 - def __init__(self, left_tag_pattern, right_tag_pattern, descr):
631 """
632 Construct a new C{ExpandRightRule}.
633
634 @type right_tag_pattern: C{string}
635 @param right_tag_pattern: This rule's right tag
636 pattern. When applied to a C{ChunkString}, this
637 rule will find any chunk whose beginning matches
638 C{right_tag_pattern}, and immediately preceded by a chink
639 whose end matches this pattern. It will
640 then merge those two chunks into a single chunk.
641 @type left_tag_pattern: C{string}
642 @param left_tag_pattern: This rule's left tag
643 pattern. When applied to a C{ChunkString}, this
644 rule will find any chunk whose beginning matches
645 this pattern, and immediately preceded by a chink
646 whose end matches C{left_tag_pattern}. It will
647 then expand the chunk to incorporate the new material on the left.
648
649 @type descr: C{string}
650 @param descr: A short description of the purpose and/or effect
651 of this rule.
652 """
653
654
655 re.compile(tag_pattern2re_pattern(left_tag_pattern))
656 re.compile(tag_pattern2re_pattern(right_tag_pattern))
657
658 self._left_tag_pattern = left_tag_pattern
659 self._right_tag_pattern = right_tag_pattern
660 regexp = re.compile('(?P<left>%s)\{(?P<right>%s)' %
661 (tag_pattern2re_pattern(left_tag_pattern),
662 tag_pattern2re_pattern(right_tag_pattern)))
663 RegexpChunkRule.__init__(self, regexp, '{\g<left>\g<right>', descr)
664
666 """
667 @rtype: C{string}
668 @return: A string representation of this rule. This
669 string representation has the form::
670
671 <ExpandLeftRule: '<NN|DT|JJ>', '<NN|JJ>'>
672
673 Note that this representation does not include the
674 description string; that string can be accessed
675 separately with the C{descr} method.
676 """
677 return ('<ExpandLeftRule: '+`self._left_tag_pattern`+', '+
678 `self._right_tag_pattern`+'>')
679
681 """
682 A rule specifying how to expand chunks in a C{ChunkString} to the
683 right, using two matching tag patterns: a left pattern, and a
684 right pattern. When applied to a C{ChunkString}, it will find any
685 chunk whose end matches left pattern, and immediately followed by
686 a chink whose beginning matches right pattern. It will then
687 expand the chunk to incorporate the new material on the right.
688 """
689 - def __init__(self, left_tag_pattern, right_tag_pattern, descr):
690 """
691 Construct a new C{ExpandRightRule}.
692
693 @type right_tag_pattern: C{string}
694 @param right_tag_pattern: This rule's right tag
695 pattern. When applied to a C{ChunkString}, this
696 rule will find any chunk whose end matches
697 C{left_tag_pattern}, and immediately followed by a chink
698 whose beginning matches this pattern. It will
699 then merge those two chunks into a single chunk.
700 @type left_tag_pattern: C{string}
701 @param left_tag_pattern: This rule's left tag
702 pattern. When applied to a C{ChunkString}, this
703 rule will find any chunk whose end matches
704 this pattern, and immediately followed by a chink
705 whose beginning matches C{right_tag_pattern}. It will
706 then expand the chunk to incorporate the new material on the right.
707
708 @type descr: C{string}
709 @param descr: A short description of the purpose and/or effect
710 of this rule.
711 """
712
713
714 re.compile(tag_pattern2re_pattern(left_tag_pattern))
715 re.compile(tag_pattern2re_pattern(right_tag_pattern))
716
717 self._left_tag_pattern = left_tag_pattern
718 self._right_tag_pattern = right_tag_pattern
719 regexp = re.compile('(?P<left>%s)\}(?P<right>%s)' %
720 (tag_pattern2re_pattern(left_tag_pattern),
721 tag_pattern2re_pattern(right_tag_pattern)))
722 RegexpChunkRule.__init__(self, regexp, '\g<left>\g<right>}', descr)
723
725 """
726 @rtype: C{string}
727 @return: A string representation of this rule. This
728 string representation has the form::
729
730 <ExpandRightRule: '<NN|DT|JJ>', '<NN|JJ>'>
731
732 Note that this representation does not include the
733 description string; that string can be accessed
734 separately with the C{descr} method.
735 """
736 return ('<ExpandRightRule: '+`self._left_tag_pattern`+', '+
737 `self._right_tag_pattern`+'>')
738
739 -class ChunkRuleWithContext(RegexpChunkRule):
740 """
741 A rule specifying how to add chunks to a C{ChunkString}, using
742 three matching tag patterns: one for the left context, one for the
743 chunk, and one for the right context. When applied to a
744 C{ChunkString}, it will find any substring that matches the chunk
745 tag pattern, is surrounded by substrings that match the two
746 context patterns, and is not already part of a chunk; and create a
747 new chunk containing the substring that matched the chunk tag
748 pattern.
749
750 Caveat: Both the left and right context are consumed when this
751 rule matches; therefore, if you need to find overlapping matches,
752 you will need to apply your rule more than once.
753 """
754 - def __init__(self, left_context_tag_pattern, chunk_tag_pattern,
755 right_context_tag_pattern, descr):
756 """
757 Construct a new C{ChunkRuleWithContext}.
758
759 @type left_context_tag_pattern: C{string}
760 @param left_context_tag_pattern: A tag pattern that must match
761 the left context of C{chunk_tag_pattern} for this rule to
762 apply.
763 @type chunk_tag_pattern: C{string}
764 @param chunk_tag_pattern: A tag pattern that must match for this
765 rule to apply. If the rule does apply, then this pattern
766 also identifies the substring that will be made into a chunk.
767 @type right_context_tag_pattern: C{string}
768 @param right_context_tag_pattern: A tag pattern that must match
769 the right context of C{chunk_tag_pattern} for this rule to
770 apply.
771 @type descr: C{string}
772 @param descr: A short description of the purpose and/or effect
773 of this rule.
774 """
775
776
777 re.compile(tag_pattern2re_pattern(left_context_tag_pattern))
778 re.compile(tag_pattern2re_pattern(chunk_tag_pattern))
779 re.compile(tag_pattern2re_pattern(right_context_tag_pattern))
780
781 self._left_context_tag_pattern = left_context_tag_pattern
782 self._chunk_tag_pattern = chunk_tag_pattern
783 self._right_context_tag_pattern = right_context_tag_pattern
784 regexp = re.compile('(?P<left>%s)(?P<chunk>%s)(?P<right>%s)%s' %
785 (tag_pattern2re_pattern(left_context_tag_pattern),
786 tag_pattern2re_pattern(chunk_tag_pattern),
787 tag_pattern2re_pattern(right_context_tag_pattern),
788 ChunkString.IN_CHINK_PATTERN))
789 replacement = r'\g<left>{\g<chunk>}\g<right>'
790 RegexpChunkRule.__init__(self, regexp, replacement, descr)
791
792 - def __repr__(self):
793 """
794 @rtype: C{string}
795 @return: A string representation of this rule. This
796 string representation has the form::
797
798 <ChunkRuleWithContext: '<IN>', '<NN>', '<DT>'>
799
800 Note that this representation does not include the
801 description string; that string can be accessed
802 separately with the C{descr} method.
803 """
804 return '<ChunkRuleWithContext: %r, %r, %r>' % (
805 self._left_context_tag_pattern, self._chunk_tag_pattern,
806 self._right_context_tag_pattern)
807
808
809
810
811
812
813
814 CHUNK_TAG_PATTERN = re.compile(r'^((%s|<%s>)*)$' %
815 ('[^\{\}<>]+',
816 '[^\{\}<>]+'))
817
819 """
820 Convert a tag pattern to a regular expression pattern. A X{tag
821 pattern} is a modified version of a regular expression, designed
822 for matching sequences of tags. The differences between regular
823 expression patterns and tag patterns are:
824
825 - In tag patterns, C{'<'} and C{'>'} act as parentheses; so
826 C{'<NN>+'} matches one or more repetitions of C{'<NN>'}, not
827 C{'<NN'} followed by one or more repetitions of C{'>'}.
828 - Whitespace in tag patterns is ignored. So
829 C{'<DT> | <NN>'} is equivalant to C{'<DT>|<NN>'}
830 - In tag patterns, C{'.'} is equivalant to C{'[^{}<>]'}; so
831 C{'<NN.*>'} matches any single tag starting with C{'NN'}.
832
833 In particular, C{tag_pattern2re_pattern} performs the following
834 transformations on the given pattern:
835
836 - Replace '.' with '[^<>{}]'
837 - Remove any whitespace
838 - Add extra parens around '<' and '>', to make '<' and '>' act
839 like parentheses. E.g., so that in '<NN>+', the '+' has scope
840 over the entire '<NN>'; and so that in '<NN|IN>', the '|' has
841 scope over 'NN' and 'IN', but not '<' or '>'.
842 - Check to make sure the resulting pattern is valid.
843
844 @type tag_pattern: C{string}
845 @param tag_pattern: The tag pattern to convert to a regular
846 expression pattern.
847 @raise ValueError: If C{tag_pattern} is not a valid tag pattern.
848 In particular, C{tag_pattern} should not include braces; and it
849 should not contain nested or mismatched angle-brackets.
850 @rtype: C{string}
851 @return: A regular expression pattern corresponding to
852 C{tag_pattern}.
853 """
854
855 tag_pattern = re.sub(r'\s', '', tag_pattern)
856 tag_pattern = re.sub(r'<', '(<(', tag_pattern)
857 tag_pattern = re.sub(r'>', ')>)', tag_pattern)
858
859
860 if not CHUNK_TAG_PATTERN.match(tag_pattern):
861 raise ValueError('Bad tag pattern: %r' % tag_pattern)
862
863
864
865
866
867
868
869 def reverse_str(str):
870 lst = list(str)
871 lst.reverse()
872 return ''.join(lst)
873 tc_rev = reverse_str(ChunkString.CHUNK_TAG_CHAR)
874 reversed = reverse_str(tag_pattern)
875 reversed = re.sub(r'\.(?!\\(\\\\)*($|[^\\]))', tc_rev, reversed)
876 tag_pattern = reverse_str(reversed)
877
878 return tag_pattern
879
880
881
882
883
884
886 """
887 A regular expression based chunk parser. C{RegexpChunkParser} uses a
888 sequence of X{rules} to find chunks of a single type within a
889 text. The chunking of the text is encoded using a C{ChunkString},
890 and each rule acts by modifying the chunking in the
891 C{ChunkString}. The rules are all implemented using regular
892 expression matching and substitution.
893
894 The C{RegexpChunkRule} class and its subclasses (C{ChunkRule},
895 C{ChinkRule}, C{UnChunkRule}, C{MergeRule}, and C{SplitRule})
896 define the rules that are used by C{RegexpChunkParser}. Each rule
897 defines an C{apply} method, which modifies the chunking encoded
898 by a given C{ChunkString}.
899
900 @type _rules: C{list} of C{RegexpChunkRule}
901 @ivar _rules: The list of rules that should be applied to a text.
902 @type _trace: C{int}
903 @ivar _trace: The default level of tracing.
904
905 """
906 - def __init__(self, rules, chunk_node='NP', top_node='S', trace=0):
907 """
908 Construct a new C{RegexpChunkParser}.
909
910 @type rules: C{list} of C{RegexpChunkRule}
911 @param rules: The sequence of rules that should be used to
912 generate the chunking for a tagged text.
913 @type chunk_node: C{string}
914 @param chunk_node: The node value that should be used for
915 chunk subtrees. This is typically a short string
916 describing the type of information contained by the chunk,
917 such as C{"NP"} for base noun phrases.
918 @type top_node: C{string}
919 @param top_node: The node value that should be used for the
920 top node of the chunk structure.
921 @type trace: C{int}
922 @param trace: The level of tracing that should be used when
923 parsing a text. C{0} will generate no tracing output;
924 C{1} will generate normal tracing output; and C{2} or
925 higher will generate verbose tracing output.
926 """
927 self._rules = rules
928 self._trace = trace
929 self._chunk_node = chunk_node
930 self._top_node = top_node
931
933 """
934 Apply each of this C{RegexpChunkParser}'s rules to C{chunkstr}, in
935 turn. Generate trace output between each rule. If C{verbose}
936 is true, then generate verbose output.
937
938 @type chunkstr: C{ChunkString}
939 @param chunkstr: The chunk string to which each rule should be
940 applied.
941 @type verbose: C{boolean}
942 @param verbose: Whether output should be verbose.
943 @rtype: C{None}
944 """
945 print '# Input:'
946 print chunkstr
947 for rule in self._rules:
948 rule.apply(chunkstr)
949 if verbose:
950 print '#', rule.descr()+' ('+`rule`+'):'
951 else:
952 print '#', rule.descr()+':'
953 print chunkstr
954
956 """
957 Apply each of this C{RegexpChunkParser}'s rules to C{chunkstr}, in
958 turn.
959
960 @param chunkstr: The chunk string to which each rule should be
961 applied.
962 @type chunkstr: C{ChunkString}
963 @rtype: C{None}
964 """
965
966 for rule in self._rules:
967 rule.apply(chunkstr)
968
969 - def parse(self, chunk_struct, trace=None):
970 from nltk import Tree
971 """
972 @type chunk_struct: C{Tree}
973 @param chunk_struct: the chunk structure to be (further) chunked
974 @type trace: C{int}
975 @param trace: The level of tracing that should be used when
976 parsing a text. C{0} will generate no tracing output;
977 C{1} will generate normal tracing output; and C{2} or
978 highter will generate verbose tracing output. This value
979 overrides the trace level value that was given to the
980 constructor.
981 @rtype: C{Tree}
982 @return: a chunk structure that encodes the chunks in a given
983 tagged sentence. A chunk is a non-overlapping linguistic
984 group, such as a noun phrase. The set of chunks
985 identified in the chunk structure depends on the rules
986 used to define this C{RegexpChunkParser}.
987 """
988 if len(chunk_struct) == 0:
989 print 'Warning: parsing empty text'
990 return Tree(self._top_node, [])
991
992 try:
993 chunk_struct.node
994 except AttributeError:
995 chunk_struct = Tree(self._top_node, chunk_struct)
996
997
998 if trace == None: trace = self._trace
999
1000 chunkstr = ChunkString(chunk_struct)
1001
1002
1003 if trace:
1004 verbose = (trace>1)
1005 self._trace_apply(chunkstr, verbose)
1006 else:
1007 self._notrace_apply(chunkstr)
1008
1009
1010 return chunkstr.to_chunkstruct(self._chunk_node)
1011
1013 """
1014 @return: the sequence of rules used by C{RegexpChunkParser}.
1015 @rtype: C{list} of C{RegexpChunkRule}
1016 """
1017 return self._rules
1018
1020 """
1021 @return: a concise string representation of this
1022 C{RegexpChunkParser}.
1023 @rtype: C{string}
1024 """
1025 return "<RegexpChunkParser with %d rules>" % len(self._rules)
1026
1028 """
1029 @return: a verbose string representation of this C{RegexpChunkParser}.
1030 @rtype: C{string}
1031 """
1032 s = "RegexpChunkParser with %d rules:\n" % len(self._rules)
1033 margin = 0
1034 for rule in self._rules:
1035 margin = max(margin, len(rule.descr()))
1036 if margin < 35:
1037 format = " %" + `-(margin+3)` + "s%s\n"
1038 else:
1039 format = " %s\n %s\n"
1040 for rule in self._rules:
1041 s += format % (rule.descr(), `rule`)
1042 return s[:-1]
1043
1044
1045
1046
1047
1049 """
1050 A grammar based chunk parser. C{chunk.RegexpParser} uses a set of
1051 regular expression patterns to specify the behavior of the parser.
1052 The chunking of the text is encoded using a C{ChunkString}, and
1053 each rule acts by modifying the chunking in the C{ChunkString}.
1054 The rules are all implemented using regular expression matching
1055 and substitution.
1056
1057 A grammar contains one or more clauses in the following form::
1058
1059 NP:
1060 {<DT|JJ>} # chunk determiners and adjectives
1061 }<[\.VI].*>+{ # chink any tag beginning with V, I, or .
1062 <.*>}{<DT> # split a chunk at a determiner
1063 <DT|JJ>{}<NN.*> # merge chunk ending with det/adj
1064 # with one starting with a noun
1065
1066 The patterns of a clause are executed in order. An earlier
1067 pattern may introduce a chunk boundary that prevents a later
1068 pattern from executing. Sometimes an individual pattern will
1069 match on multiple, overlapping extents of the input. As with
1070 regular expression substitution more generally, the chunker will
1071 identify the first match possible, then continue looking for matches
1072 after this one has ended.
1073
1074 The clauses of a grammar are also executed in order. A cascaded
1075 chunk parser is one having more than one clause. The maximum depth
1076 of a parse tree created by this chunk parser is the same as the
1077 number of clauses in the grammar.
1078
1079 When tracing is turned on, the comment portion of a line is displayed
1080 each time the corresponding pattern is applied.
1081
1082 @type _start: C{string}
1083 @ivar _start: The start symbol of the grammar (the root node of
1084 resulting trees)
1085 @type _stages: C{int}
1086 @ivar _stages: The list of parsing stages corresponding to the grammar
1087
1088 """
1089 - def __init__(self, grammar, top_node='S', loop=1, trace=0):
1090 """
1091 Create a new chunk parser, from the given start state
1092 and set of chunk patterns.
1093
1094 @param grammar: The list of patterns that defines the grammar
1095 @type grammar: C{list} of C{string}
1096 @param top_node: The top node of the tree being created
1097 @type top_node: L{string} or L{Nonterminal}
1098 @param loop: The number of times to run through the patterns
1099 @type loop: L{int}
1100 @type trace: C{int}
1101 @param trace: The level of tracing that should be used when
1102 parsing a text. C{0} will generate no tracing output;
1103 C{1} will generate normal tracing output; and C{2} or
1104 higher will generate verbose tracing output.
1105 """
1106 self._trace = trace
1107 self._stages = []
1108 self._grammar = grammar
1109 self._loop = loop
1110
1111 if isinstance(grammar, basestring):
1112 self._parse_grammar(grammar, top_node, trace)
1113 else:
1114
1115 type_err = ('Expected string or list of RegexpChunkParsers '
1116 'for the grammar.')
1117 try: grammar = list(grammar)
1118 except: raise TypeError(type_err)
1119 for elt in grammar:
1120 if not isinstance(elt, RegexpChunkParser):
1121 raise TypeError(type_err)
1122 self._stages = grammar
1123
1152
1153 - def _add_stage(self, rules, lhs, top_node, trace):
1154 """
1155 Helper function for __init__: add a new stage to the parser.
1156 """
1157 if rules != []:
1158 if not lhs:
1159 raise ValueError('Expected stage marker (eg NP:)')
1160 parser = RegexpChunkParser(rules, chunk_node=lhs,
1161 top_node=top_node, trace=trace)
1162 self._stages.append(parser)
1163
1164 - def parse(self, chunk_struct, trace=None):
1165 """
1166 Apply the chunk parser to this input.
1167
1168 @type chunk_struct: C{Tree}
1169 @param chunk_struct: the chunk structure to be (further) chunked
1170 (this tree is modified, and is also returned)
1171 @type trace: C{int}
1172 @param trace: The level of tracing that should be used when
1173 parsing a text. C{0} will generate no tracing output;
1174 C{1} will generate normal tracing output; and C{2} or
1175 highter will generate verbose tracing output. This value
1176 overrides the trace level value that was given to the
1177 constructor.
1178 @return: the chunked output.
1179 @rtype: C{Tree}
1180 """
1181 if trace == None: trace = self._trace
1182 for i in range(self._loop):
1183 for parser in self._stages:
1184 chunk_struct = parser.parse(chunk_struct, trace=trace)
1185 return chunk_struct
1186
1188 """
1189 @return: a concise string representation of this C{chunk.RegexpParser}.
1190 @rtype: C{string}
1191 """
1192 return "<chunk.RegexpParser with %d stages>" % len(self._stages)
1193
1195 """
1196 @return: a verbose string representation of this
1197 C{RegexpChunkParser}.
1198 @rtype: C{string}
1199 """
1200 s = "chunk.RegexpParser with %d stages:\n" % len(self._stages)
1201 margin = 0
1202 for parser in self._stages:
1203 s += parser.__str__() + "\n"
1204 return s[:-1]
1205
1206
1207
1208
1209
1211 """
1212 Demonstration code for evaluating a chunk parser, using a
1213 C{ChunkScore}. This function assumes that C{text} contains one
1214 sentence per line, and that each sentence has the form expected by
1215 C{tree.chunk}. It runs the given chunk parser on each sentence in
1216 the text, and scores the result. It prints the final score
1217 (precision, recall, and f-measure); and reports the set of chunks
1218 that were missed and the set of chunks that were incorrect. (At
1219 most 10 missing chunks and 10 incorrect chunks are reported).
1220
1221 @param chunkparser: The chunkparser to be tested
1222 @type chunkparser: C{ChunkParserI}
1223 @param text: The chunked tagged text that should be used for
1224 evaluation.
1225 @type text: C{string}
1226 """
1227
1228 from nltk import chunk, Tree
1229
1230
1231 chunkscore = chunk.ChunkScore()
1232
1233 for sentence in text.split('\n'):
1234 print sentence
1235 sentence = sentence.strip()
1236 if not sentence: continue
1237 gold = chunk.tagstr2tree(sentence)
1238 tokens = gold.leaves()
1239 test = chunkparser.parse(Tree('S', tokens), trace=1)
1240 chunkscore.score(gold, test)
1241 print
1242
1243 print '/'+('='*75)+'\\'
1244 print 'Scoring', chunkparser
1245 print ('-'*77)
1246 print 'Precision: %5.1f%%' % (chunkscore.precision()*100), ' '*4,
1247 print 'Recall: %5.1f%%' % (chunkscore.recall()*100), ' '*6,
1248 print 'F-Measure: %5.1f%%' % (chunkscore.f_measure()*100)
1249
1250
1251
1252 if chunkscore.missed():
1253 print 'Missed:'
1254 missed = chunkscore.missed()
1255 for chunk in missed[:10]:
1256 print ' ', ' '.join(c.__str__() for c in chunk)
1257 if len(chunkscore.missed()) > 10:
1258 print ' ...'
1259
1260
1261 if chunkscore.incorrect():
1262 print 'Incorrect:'
1263 incorrect = chunkscore.incorrect()
1264 for chunk in incorrect[:10]:
1265 print ' ', ' '.join(c.__str__() for c in chunk)
1266 if len(chunkscore.incorrect()) > 10:
1267 print ' ...'
1268
1269 print '\\'+('='*75)+'/'
1270 print
1271
1273 """
1274 A demonstration for the C{RegexpChunkParser} class. A single text is
1275 parsed with four different chunk parsers, using a variety of rules
1276 and strategies.
1277 """
1278
1279 from nltk import chunk, Tree
1280
1281 text = """\
1282 [ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] ./.
1283 [ John/NNP ] saw/VBD [the/DT cats/NNS] [the/DT dog/NN] chased/VBD ./.
1284 [ John/NNP ] thinks/VBZ [ Mary/NN ] saw/VBD [ the/DT cat/NN ] sit/VB on/IN [ the/DT mat/NN ]./.
1285 """
1286
1287 print '*'*75
1288 print 'Evaluation text:'
1289 print text
1290 print '*'*75
1291 print
1292
1293 grammar = r"""
1294 NP: # NP stage
1295 {<DT>?<JJ>*<NN>} # chunk determiners, adjectives and nouns
1296 {<NNP>+} # chunk proper nouns
1297 """
1298 cp = chunk.RegexpParser(grammar)
1299 chunk.demo_eval(cp, text)
1300
1301 grammar = r"""
1302 NP:
1303 {<.*>} # start by chunking each tag
1304 }<[\.VI].*>+{ # unchunk any verbs, prepositions or periods
1305 <DT|JJ>{}<NN.*> # merge det/adj with nouns
1306 """
1307 cp = chunk.RegexpParser(grammar)
1308 chunk.demo_eval(cp, text)
1309
1310 grammar = r"""
1311 NP: {<DT>?<JJ>*<NN>} # chunk determiners, adjectives and nouns
1312 VP: {<TO>?<VB.*>} # VP = verb words
1313 """
1314 cp = chunk.RegexpParser(grammar)
1315 chunk.demo_eval(cp, text)
1316
1317 grammar = r"""
1318 NP: {<.*>*} # start by chunking everything
1319 }<[\.VI].*>+{ # chink any verbs, prepositions or periods
1320 <.*>}{<DT> # separate on determiners
1321 PP: {<IN><NP>} # PP = preposition + noun phrase
1322 VP: {<VB.*><NP|PP>*} # VP = verb words + NPs and PPs
1323 """
1324 cp = chunk.RegexpParser(grammar)
1325 chunk.demo_eval(cp, text)
1326
1327
1328
1329 from nltk.corpus import conll2000
1330
1331 print
1332 print "Demonstration of empty grammar:"
1333
1334 cp = chunk.RegexpParser("")
1335 print chunk.accuracy(cp, conll2000.chunked_sents('test.txt',
1336 chunk_types=('NP',)))
1337
1338 print
1339 print "Demonstration of accuracy evaluation using CoNLL tags:"
1340
1341 grammar = r"""
1342 NP:
1343 {<.*>} # start by chunking each tag
1344 }<[\.VI].*>+{ # unchunk any verbs, prepositions or periods
1345 <DT|JJ>{}<NN.*> # merge det/adj with nouns
1346 """
1347 cp = chunk.RegexpParser(grammar)
1348 print chunk.accuracy(cp, conll2000.chunked_sents('test.txt')[:5])
1349
1350 print
1351 print "Demonstration of tagged token input"
1352
1353 grammar = r"""
1354 NP: {<.*>*} # start by chunking everything
1355 }<[\.VI].*>+{ # chink any verbs, prepositions or periods
1356 <.*>}{<DT> # separate on determiners
1357 PP: {<IN><NP>} # PP = preposition + noun phrase
1358 VP: {<VB.*><NP|PP>*} # VP = verb words + NPs and PPs
1359 """
1360 cp = chunk.RegexpParser(grammar)
1361 print cp.parse([("the","DT"), ("little","JJ"), ("cat", "NN"),
1362 ("sat", "VBD"), ("on", "IN"), ("the", "DT"),
1363 ("mat", "NN"), (".", ".")])
1364
1365 if __name__ == '__main__':
1366 demo()
1367