>>> from nltk.chunk import *
>>> from nltk.chunk.util import *
>>> from nltk.chunk.regexp import *
>>> from nltk import Tree

>>> tagged_text = "[ The/DT cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] [ the/DT dog/NN ] chewed/VBD ./."
>>> gold_chunked_text = tagstr2tree(tagged_text)
>>> unchunked_text = gold_chunked_text.flatten()

Chunking uses a special regexp syntax for rules that delimit the chunks. These rules must be converted to 'regular' regular expressions before a sentence can be chunked.
>>> tag_pattern = "<DT>?<JJ>*<NN.*>"
>>> regexp_pattern = tag_pattern2re_pattern(tag_pattern)
>>> regexp_pattern
'(<(DT)>)?(<(JJ)>)*(<(NN[^\\{\\}<>]*)>)'
Construct some new chunking rules.
>>> chunk_rule = ChunkRule("<.*>+", "Chunk everything")
>>> chink_rule = ChinkRule("<VBD|IN|\.>", "Chink on verbs/prepositions")
>>> split_rule = SplitRule("<DT><NN>", "<DT><NN>",
... "Split successive determiner/noun pairs")
Create and score a series of chunk parsers, successively more complex.
>>> chunk_parser = RegexpChunkParser([chunk_rule], chunk_label='NP')
>>> chunked_text = chunk_parser.parse(unchunked_text)
>>> print(chunked_text)
(S (NP The/DT cat/NN sat/VBD on/IN the/DT mat/NN the/DT dog/NN chewed/VBD ./.))

>>> chunkscore = ChunkScore()
>>> chunkscore.score(gold_chunked_text, chunked_text)
>>> print(chunkscore.precision())
0.0

>>> print(chunkscore.recall())
0.0

>>> print(chunkscore.f_measure())
0

>>> for chunk in sorted(chunkscore.missed()): print(chunk)
(NP The/DT cat/NN)
(NP the/DT dog/NN)
(NP the/DT mat/NN)

>>> for chunk in chunkscore.incorrect(): print(chunk)
(NP The/DT cat/NN sat/VBD on/IN the/DT mat/NN the/DT dog/NN chewed/VBD ./.)

>>> chunk_parser = RegexpChunkParser([chunk_rule, chink_rule],
... chunk_label='NP')
>>> chunked_text = chunk_parser.parse(unchunked_text)
>>> print(chunked_text)
(S (NP The/DT cat/NN) sat/VBD on/IN (NP the/DT mat/NN the/DT dog/NN) chewed/VBD ./.)
>>> assert chunked_text == chunk_parser.parse(list(unchunked_text))

>>> chunkscore = ChunkScore()
>>> chunkscore.score(gold_chunked_text, chunked_text)
>>> chunkscore.precision()
0.5

>>> print(chunkscore.recall())
0.33333333...

>>> print(chunkscore.f_measure())
0.4

>>> for chunk in sorted(chunkscore.missed()): print(chunk)
(NP the/DT dog/NN)
(NP the/DT mat/NN)

>>> for chunk in chunkscore.incorrect(): print(chunk)
(NP the/DT mat/NN the/DT dog/NN)

>>> chunk_parser = RegexpChunkParser([chunk_rule, chink_rule, split_rule],
... chunk_label='NP')
>>> chunked_text = chunk_parser.parse(unchunked_text, trace=True)
# Input:
<DT> <NN> <VBD> <IN> <DT> <NN> <DT> <NN> <VBD> <.>
# Chunk everything:
{<DT> <NN> <VBD> <IN> <DT> <NN> <DT> <NN> <VBD> <.>}
# Chink on verbs/prepositions:
{<DT> <NN>} <VBD> <IN> {<DT> <NN> <DT> <NN>} <VBD> <.>
# Split successive determiner/noun pairs:
{<DT> <NN>} <VBD> <IN> {<DT> <NN>}{<DT> <NN>} <VBD> <.>
>>> print(chunked_text)
(S (NP The/DT cat/NN) sat/VBD on/IN (NP the/DT mat/NN) (NP the/DT dog/NN) chewed/VBD ./.)

>>> chunkscore = ChunkScore()
>>> chunkscore.score(gold_chunked_text, chunked_text)
>>> chunkscore.precision()
1.0

>>> chunkscore.recall()
1.0

>>> chunkscore.f_measure()
1.0

>>> chunkscore.missed()
[]

>>> chunkscore.incorrect()
[]

>>> chunk_parser.rules() # doctest: +NORMALIZE_WHITESPACE
[<ChunkRule: '<.*>+'>, <ChinkRule: '<VBD|IN|\\.>'>, <SplitRule: '<DT><NN>', '<DT><NN>'>]

Printing parsers:
>>> print(repr(chunk_parser))
<RegexpChunkParser with 3 rules>
>>> print(chunk_parser)
RegexpChunkParser with 3 rules:
Chunk everything
<ChunkRule: '<.*>+'>
Chink on verbs/prepositions
<ChinkRule: '<VBD|IN|\\.>'>
Split successive determiner/noun pairs
<SplitRule: '<DT><NN>', '<DT><NN>'>
ChunkParserI is an abstract interface -- it is not meant to be instantiated directly.
>>> ChunkParserI().parse([])
Traceback (most recent call last):
. . .
NotImplementedError

ChunkString can be built from a tree of tagged tuples, a tree of trees, or a mixed list of both:
>>> t1 = Tree('S', [('w%d' % i, 't%d' % i) for i in range(10)])
>>> t2 = Tree('S', [Tree('t0', []), Tree('t1', ['c1'])])
>>> t3 = Tree('S', [('w0', 't0'), Tree('t1', ['c1'])])
>>> ChunkString(t1)
<ChunkString: '<t0><t1><t2><t3><t4><t5><t6><t7><t8><t9>'>
>>> ChunkString(t2)
<ChunkString: '<t0><t1>'>
>>> ChunkString(t3)
<ChunkString: '<t0><t1>'>
Other values generate an error:
>>> ChunkString(Tree('S', ['x']))
Traceback (most recent call last):
. . .
ValueError: chunk structures must contain tagged tokens or trees
The str() for a chunk string adds spaces to it, which makes it line up with str() output for other chunk strings over the same underlying input.
>>> cs = ChunkString(t1)
>>> print(cs)
<t0> <t1> <t2> <t3> <t4> <t5> <t6> <t7> <t8> <t9>
>>> cs.xform('<t3>', '{<t3>}')
>>> print(cs)
<t0> <t1> <t2> {<t3>} <t4> <t5> <t6> <t7> <t8> <t9>
The _verify() method makes sure that our transforms don't corrupt the chunk string. By setting debug_level to 2 or higher (the examples below use debug_level=3), _verify() will be called at the end of every call to xform.
>>> cs = ChunkString(t1, debug_level=3)

>>> # tag not marked with <...>:
>>> cs.xform('<t3>', 't3')
Traceback (most recent call last):
. . .
ValueError: Transformation generated invalid chunkstring: <t0><t1><t2>t3<t4><t5><t6><t7><t8><t9>

>>> # brackets not balanced:
>>> cs.xform('<t3>', '{<t3>')
Traceback (most recent call last):
. . .
ValueError: Transformation generated invalid chunkstring: <t0><t1><t2>{<t3><t4><t5><t6><t7><t8><t9>

>>> # nested brackets:
>>> cs.xform('<t3><t4><t5>', '{<t3>{<t4>}<t5>}')
Traceback (most recent call last):
. . .
ValueError: Transformation generated invalid chunkstring: <t0><t1><t2>{<t3>{<t4>}<t5>}<t6><t7><t8><t9>

>>> # modified tags:
>>> cs.xform('<t3>', '<t9>')
Traceback (most recent call last):
. . .
ValueError: Transformation generated invalid chunkstring: tag changed

>>> # added tags:
>>> cs.xform('<t9>', '<t9><t10>')
Traceback (most recent call last):
. . .
ValueError: Transformation generated invalid chunkstring: tag changed

Test the different rule constructors & __repr__ methods:
>>> r1 = RegexpChunkRule('<a|b>'+ChunkString.IN_CHINK_PATTERN,
... '{<a|b>}', 'chunk <a> and <b>')
>>> r2 = RegexpChunkRule(re.compile('<a|b>'+ChunkString.IN_CHINK_PATTERN),
... '{<a|b>}', 'chunk <a> and <b>')
>>> r3 = ChunkRule('<a|b>', 'chunk <a> and <b>')
>>> r4 = ChinkRule('<a|b>', 'chink <a> and <b>')
>>> r5 = UnChunkRule('<a|b>', 'unchunk <a> and <b>')
>>> r6 = MergeRule('<a>', '<b>', 'merge <a> w/ <b>')
>>> r7 = SplitRule('<a>', '<b>', 'split <a> from <b>')
>>> r8 = ExpandLeftRule('<a>', '<b>', 'expand left <a> <b>')
>>> r9 = ExpandRightRule('<a>', '<b>', 'expand right <a> <b>')
>>> for rule in r1, r2, r3, r4, r5, r6, r7, r8, r9:
... print(rule)
<RegexpChunkRule: '<a|b>(?=[^\\}]*(\\{|$))'->'{<a|b>}'>
<RegexpChunkRule: '<a|b>(?=[^\\}]*(\\{|$))'->'{<a|b>}'>
<ChunkRule: '<a|b>'>
<ChinkRule: '<a|b>'>
<UnChunkRule: '<a|b>'>
<MergeRule: '<a>', '<b>'>
<SplitRule: '<a>', '<b>'>
<ExpandLeftRule: '<a>', '<b>'>
<ExpandRightRule: '<a>', '<b>'>
tag_pattern2re_pattern() complains if the tag pattern looks problematic:
>>> tag_pattern2re_pattern('{}')
Traceback (most recent call last):
. . .
ValueError: Bad tag pattern: '{}'
A warning is printed when parsing an empty sentence:
>>> parser = RegexpChunkParser([ChunkRule('<a>', '')])
>>> parser.parse(Tree('S', []))
Warning: parsing empty text
Tree('S', [])
>>> parser = RegexpParser('''
... NP: {<DT>? <JJ>* <NN>*} # NP
... P: {<IN>} # Preposition
... V: {<V.*>} # Verb
... PP: {<P> <NP>} # PP -> P NP
... VP: {<V> <NP|PP>*} # VP -> V (NP|PP)*
... ''')
>>> print(repr(parser))
<chunk.RegexpParser with 5 stages>
>>> print(parser)
chunk.RegexpParser with 5 stages:
RegexpChunkParser with 1 rules:
NP <ChunkRule: '<DT>? <JJ>* <NN>*'>
RegexpChunkParser with 1 rules:
Preposition <ChunkRule: '<IN>'>
RegexpChunkParser with 1 rules:
Verb <ChunkRule: '<V.*>'>
RegexpChunkParser with 1 rules:
PP -> P NP <ChunkRule: '<P> <NP>'>
RegexpChunkParser with 1 rules:
VP -> V (NP|PP)* <ChunkRule: '<V> <NP|PP>*'>
>>> print(parser.parse(unchunked_text, trace=True))
# Input:
<DT> <NN> <VBD> <IN> <DT> <NN> <DT> <NN> <VBD> <.>
# NP:
{<DT> <NN>} <VBD> <IN> {<DT> <NN>}{<DT> <NN>} <VBD> <.>
# Input:
<NP> <VBD> <IN> <NP> <NP> <VBD> <.>
# Preposition:
<NP> <VBD> {<IN>} <NP> <NP> <VBD> <.>
# Input:
<NP> <VBD> <P> <NP> <NP> <VBD> <.>
# Verb:
<NP> {<VBD>} <P> <NP> <NP> {<VBD>} <.>
# Input:
<NP> <V> <P> <NP> <NP> <V> <.>
# PP -> P NP:
<NP> <V> {<P> <NP>} <NP> <V> <.>
# Input:
<NP> <V> <PP> <NP> <V> <.>
# VP -> V (NP|PP)*:
<NP> {<V> <PP> <NP>}{<V>} <.>
(S
(NP The/DT cat/NN)
(VP
(V sat/VBD)
(PP (P on/IN) (NP the/DT mat/NN))
(NP the/DT dog/NN))
(VP (V chewed/VBD))
./.)
Test parsing of other rule types:
>>> print(RegexpParser('''
... X:
... }<a><b>{ # chink rule
... <a>}{<b> # split rule
... <a>{}<b> # merge rule
... <a>{<b>}<c> # chunk rule w/ context
... '''))
chunk.RegexpParser with 1 stages:
RegexpChunkParser with 4 rules:
chink rule <ChinkRule: '<a><b>'>
split rule <SplitRule: '<a>', '<b>'>
merge rule <MergeRule: '<a>', '<b>'>
chunk rule w/ context <ChunkRuleWithContext: '<a>', '<b>', '<c>'>
Illegal patterns give an error message:
>>> print(RegexpParser('X: {<foo>} {<bar>}'))
Traceback (most recent call last):
. . .
ValueError: Illegal chunk pattern: {<foo>} {<bar>}