Chunking

>>> from nltk.chunk import *
>>> from nltk.chunk.util import *
>>> from nltk.chunk.regexp import *
>>> from nltk import Tree
>>> tagged_text = "[ The/DT cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] [ the/DT dog/NN ] chewed/VBD ./."
>>> gold_chunked_text = tagstr2tree(tagged_text)
>>> unchunked_text = gold_chunked_text.flatten()

Chunking uses a special regexp syntax for rules that delimit the chunks. These rules must be converted to 'regular' regular expressions before a sentence can be chunked.

>>> tag_pattern = "<DT>?<JJ>*<NN.*>"
>>> regexp_pattern = tag_pattern2re_pattern(tag_pattern)
>>> regexp_pattern
'(<(DT)>)?(<(JJ)>)*(<(NN[^\\{\\}<>]*)>)'

Construct some new chunking rules.

>>> chunk_rule = ChunkRule("<.*>+", "Chunk everything")
>>> chink_rule = ChinkRule(r"<VBD|IN|\.>", "Chink on verbs/prepositions")
>>> split_rule = SplitRule("<DT><NN>", "<DT><NN>",
...                        "Split successive determiner/noun pairs")

Create and score a series of chunk parsers, successively more complex.

>>> chunk_parser = RegexpChunkParser([chunk_rule], chunk_label='NP')
>>> chunked_text = chunk_parser.parse(unchunked_text)
>>> print(chunked_text)
(S
  (NP
    The/DT
    cat/NN
    sat/VBD
    on/IN
    the/DT
    mat/NN
    the/DT
    dog/NN
    chewed/VBD
    ./.))
>>> chunkscore = ChunkScore()
>>> chunkscore.score(gold_chunked_text, chunked_text)
>>> print(chunkscore.precision())
0.0
>>> print(chunkscore.recall())
0.0
>>> print(chunkscore.f_measure())
0
>>> for chunk in sorted(chunkscore.missed()): print(chunk)
(NP The/DT cat/NN)
(NP the/DT dog/NN)
(NP the/DT mat/NN)
>>> for chunk in chunkscore.incorrect(): print(chunk)
(NP
  The/DT
  cat/NN
  sat/VBD
  on/IN
  the/DT
  mat/NN
  the/DT
  dog/NN
  chewed/VBD
  ./.)
>>> chunk_parser = RegexpChunkParser([chunk_rule, chink_rule],
...                                  chunk_label='NP')
>>> chunked_text = chunk_parser.parse(unchunked_text)
>>> print(chunked_text)
(S
  (NP The/DT cat/NN)
  sat/VBD
  on/IN
  (NP the/DT mat/NN the/DT dog/NN)
  chewed/VBD
  ./.)
>>> assert chunked_text == chunk_parser.parse(list(unchunked_text))
>>> chunkscore = ChunkScore()
>>> chunkscore.score(gold_chunked_text, chunked_text)
>>> chunkscore.precision()
0.5
>>> print(chunkscore.recall())
0.33333333...
>>> print(chunkscore.f_measure())
0.4
>>> for chunk in sorted(chunkscore.missed()): print(chunk)
(NP the/DT dog/NN)
(NP the/DT mat/NN)
>>> for chunk in chunkscore.incorrect(): print(chunk)
(NP the/DT mat/NN the/DT dog/NN)
>>> chunk_parser = RegexpChunkParser([chunk_rule, chink_rule, split_rule],
...                                  chunk_label='NP')
>>> chunked_text = chunk_parser.parse(unchunked_text, trace=True)
# Input:
 <DT>  <NN>  <VBD>  <IN>  <DT>  <NN>  <DT>  <NN>  <VBD>  <.>
# Chunk everything:
{<DT>  <NN>  <VBD>  <IN>  <DT>  <NN>  <DT>  <NN>  <VBD>  <.>}
# Chink on verbs/prepositions:
{<DT>  <NN>} <VBD>  <IN> {<DT>  <NN>  <DT>  <NN>} <VBD>  <.>
# Split successive determiner/noun pairs:
{<DT>  <NN>} <VBD>  <IN> {<DT>  <NN>}{<DT>  <NN>} <VBD>  <.>
>>> print(chunked_text)
(S
  (NP The/DT cat/NN)
  sat/VBD
  on/IN
  (NP the/DT mat/NN)
  (NP the/DT dog/NN)
  chewed/VBD
  ./.)
>>> chunkscore = ChunkScore()
>>> chunkscore.score(gold_chunked_text, chunked_text)
>>> chunkscore.precision()
1.0
>>> chunkscore.recall()
1.0
>>> chunkscore.f_measure()
1.0
>>> chunkscore.missed()
[]
>>> chunkscore.incorrect()
[]
>>> chunk_parser.rules() # doctest: +NORMALIZE_WHITESPACE
[<ChunkRule: '<.*>+'>, <ChinkRule: '<VBD|IN|\\.>'>,
 <SplitRule: '<DT><NN>', '<DT><NN>'>]

Printing parsers:

>>> print(repr(chunk_parser))
<RegexpChunkParser with 3 rules>
>>> print(chunk_parser)
RegexpChunkParser with 3 rules:
    Chunk everything
      <ChunkRule: '<.*>+'>
    Chink on verbs/prepositions
      <ChinkRule: '<VBD|IN|\\.>'>
    Split successive determiner/noun pairs
      <SplitRule: '<DT><NN>', '<DT><NN>'>

Regression Tests

ChunkParserI

ChunkParserI is an abstract interface -- it is not meant to be instantiated directly.

>>> ChunkParserI().parse([])
Traceback (most recent call last):
  . . .
NotImplementedError

ChunkString

ChunkString can be built from a tree of tagged tuples, a tree of trees, or a tree containing a mixture of both:

>>> t1 = Tree('S', [('w%d' % i, 't%d' % i) for i in range(10)])
>>> t2 = Tree('S', [Tree('t0', []), Tree('t1', ['c1'])])
>>> t3 = Tree('S', [('w0', 't0'), Tree('t1', ['c1'])])
>>> ChunkString(t1)
<ChunkString: '<t0><t1><t2><t3><t4><t5><t6><t7><t8><t9>'>
>>> ChunkString(t2)
<ChunkString: '<t0><t1>'>
>>> ChunkString(t3)
<ChunkString: '<t0><t1>'>

Other values generate an error:

>>> ChunkString(Tree('S', ['x']))
Traceback (most recent call last):
  . . .
ValueError: chunk structures must contain tagged tokens or trees

The str() of a chunk string pads it with spaces, so that it lines up with the str() output of other chunk strings over the same underlying input.

>>> cs = ChunkString(t1)
>>> print(cs)
 <t0>  <t1>  <t2>  <t3>  <t4>  <t5>  <t6>  <t7>  <t8>  <t9>
>>> cs.xform('<t3>', '{<t3>}')
>>> print(cs)
 <t0>  <t1>  <t2> {<t3>} <t4>  <t5>  <t6>  <t7>  <t8>  <t9>

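The _verify() method makes sure that our transforms don't corrupt the chunk string. By setting debug_level to 2 or higher, _verify() will be called at the end of every call to xform. The example below uses debug_level=3, which also checks that the tags themselves have not been changed.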

>>> cs = ChunkString(t1, debug_level=3)
>>> # tag not marked with <...>:
>>> cs.xform('<t3>', 't3')
Traceback (most recent call last):
  . . .
ValueError: Transformation generated invalid chunkstring:
  <t0><t1><t2>t3<t4><t5><t6><t7><t8><t9>
>>> # brackets not balanced:
>>> cs.xform('<t3>', '{<t3>')
Traceback (most recent call last):
  . . .
ValueError: Transformation generated invalid chunkstring:
  <t0><t1><t2>{<t3><t4><t5><t6><t7><t8><t9>
>>> # nested brackets:
>>> cs.xform('<t3><t4><t5>', '{<t3>{<t4>}<t5>}')
Traceback (most recent call last):
  . . .
ValueError: Transformation generated invalid chunkstring:
  <t0><t1><t2>{<t3>{<t4>}<t5>}<t6><t7><t8><t9>
>>> # modified tags:
>>> cs.xform('<t3>', '<t9>')
Traceback (most recent call last):
  . . .
ValueError: Transformation generated invalid chunkstring: tag changed
>>> # added tags:
>>> cs.xform('<t9>', '<t9><t10>')
Traceback (most recent call last):
  . . .
ValueError: Transformation generated invalid chunkstring: tag changed

Chunking Rules

Test the different rule constructors & __repr__ methods:

>>> r1 = RegexpChunkRule('<a|b>'+ChunkString.IN_CHINK_PATTERN,
...                      '{<a|b>}', 'chunk <a> and <b>')
>>> r2 = RegexpChunkRule(re.compile('<a|b>'+ChunkString.IN_CHINK_PATTERN),
...                      '{<a|b>}', 'chunk <a> and <b>')
>>> r3 = ChunkRule('<a|b>', 'chunk <a> and <b>')
>>> r4 = ChinkRule('<a|b>', 'chink <a> and <b>')
>>> r5 = UnChunkRule('<a|b>', 'unchunk <a> and <b>')
>>> r6 = MergeRule('<a>', '<b>', 'merge <a> w/ <b>')
>>> r7 = SplitRule('<a>', '<b>', 'split <a> from <b>')
>>> r8 = ExpandLeftRule('<a>', '<b>', 'expand left <a> <b>')
>>> r9 = ExpandRightRule('<a>', '<b>', 'expand right <a> <b>')
>>> for rule in r1, r2, r3, r4, r5, r6, r7, r8, r9:
...     print(rule)
<RegexpChunkRule: '<a|b>(?=[^\\}]*(\\{|$))'->'{<a|b>}'>
<RegexpChunkRule: '<a|b>(?=[^\\}]*(\\{|$))'->'{<a|b>}'>
<ChunkRule: '<a|b>'>
<ChinkRule: '<a|b>'>
<UnChunkRule: '<a|b>'>
<MergeRule: '<a>', '<b>'>
<SplitRule: '<a>', '<b>'>
<ExpandLeftRule: '<a>', '<b>'>
<ExpandRightRule: '<a>', '<b>'>

tag_pattern2re_pattern() complains if the tag pattern looks problematic:

>>> tag_pattern2re_pattern('{}')
Traceback (most recent call last):
  . . .
ValueError: Bad tag pattern: '{}'

RegexpChunkParser

A warning is printed when parsing an empty sentence:

>>> parser = RegexpChunkParser([ChunkRule('<a>', '')])
>>> parser.parse(Tree('S', []))
Warning: parsing empty text
Tree('S', [])

RegexpParser

>>> parser = RegexpParser('''
... NP: {<DT>? <JJ>* <NN>*} # NP
... P: {<IN>}           # Preposition
... V: {<V.*>}          # Verb
... PP: {<P> <NP>}      # PP -> P NP
... VP: {<V> <NP|PP>*}  # VP -> V (NP|PP)*
... ''')
>>> print(repr(parser))
<chunk.RegexpParser with 5 stages>
>>> print(parser)
chunk.RegexpParser with 5 stages:
RegexpChunkParser with 1 rules:
    NP   <ChunkRule: '<DT>? <JJ>* <NN>*'>
RegexpChunkParser with 1 rules:
    Preposition   <ChunkRule: '<IN>'>
RegexpChunkParser with 1 rules:
    Verb   <ChunkRule: '<V.*>'>
RegexpChunkParser with 1 rules:
    PP -> P NP   <ChunkRule: '<P> <NP>'>
RegexpChunkParser with 1 rules:
    VP -> V (NP|PP)*   <ChunkRule: '<V> <NP|PP>*'>
>>> print(parser.parse(unchunked_text, trace=True))
# Input:
 <DT>  <NN>  <VBD>  <IN>  <DT>  <NN>  <DT>  <NN>  <VBD>  <.>
# NP:
{<DT>  <NN>} <VBD>  <IN> {<DT>  <NN>}{<DT>  <NN>} <VBD>  <.>
# Input:
 <NP>  <VBD>  <IN>  <NP>  <NP>  <VBD>  <.>
# Preposition:
 <NP>  <VBD> {<IN>} <NP>  <NP>  <VBD>  <.>
# Input:
 <NP>  <VBD>  <P>  <NP>  <NP>  <VBD>  <.>
# Verb:
 <NP> {<VBD>} <P>  <NP>  <NP> {<VBD>} <.>
# Input:
 <NP>  <V>  <P>  <NP>  <NP>  <V>  <.>
# PP -> P NP:
 <NP>  <V> {<P>  <NP>} <NP>  <V>  <.>
# Input:
 <NP>  <V>  <PP>  <NP>  <V>  <.>
# VP -> V (NP|PP)*:
 <NP> {<V>  <PP>  <NP>}{<V>} <.>
(S
  (NP The/DT cat/NN)
  (VP
    (V sat/VBD)
    (PP (P on/IN) (NP the/DT mat/NN))
    (NP the/DT dog/NN))
  (VP (V chewed/VBD))
  ./.)

Test parsing of other rule types:

>>> print(RegexpParser('''
... X:
...   }<a><b>{     # chink rule
...   <a>}{<b>     # split rule
...   <a>{}<b>     # merge rule
...   <a>{<b>}<c>  # chunk rule w/ context
... '''))
chunk.RegexpParser with 1 stages:
RegexpChunkParser with 4 rules:
    chink rule              <ChinkRule: '<a><b>'>
    split rule              <SplitRule: '<a>', '<b>'>
    merge rule              <MergeRule: '<a>', '<b>'>
    chunk rule w/ context   <ChunkRuleWithContext: '<a>', '<b>', '<c>'>

Illegal patterns give an error message:

>>> print(RegexpParser('X: {<foo>} {<bar>}'))
Traceback (most recent call last):
  . . .
ValueError: Illegal chunk pattern: {<foo>} {<bar>}