>>> from nltk.chunk import *
>>> from nltk.chunk.util import *
>>> from nltk.chunk.regexp import *
>>> from nltk import Tree

>>> tagged_text = "[ The/DT cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] [ the/DT dog/NN ] chewed/VBD ./."
>>> gold_chunked_text = tagstr2tree(tagged_text)
>>> unchunked_text = gold_chunked_text.flatten()
Chunking uses a special regexp syntax for rules that delimit the chunks. These rules must be converted to 'regular' regular expressions before a sentence can be chunked.
>>> tag_pattern = "<DT>?<JJ>*<NN.*>"
>>> regexp_pattern = tag_pattern2re_pattern(tag_pattern)
>>> regexp_pattern
'(<(DT)>)?(<(JJ)>)*(<(NN[^\\{\\}<>]*)>)'
Construct some new chunking rules.
>>> chunk_rule = ChunkRule("<.*>+", "Chunk everything")
>>> chink_rule = ChinkRule("<VBD|IN|\.>", "Chink on verbs/prepositions")
>>> split_rule = SplitRule("<DT><NN>", "<DT><NN>",
...     "Split successive determiner/noun pairs")
Create and score a series of chunk parsers, successively more complex.
>>> chunk_parser = RegexpChunkParser([chunk_rule], chunk_label='NP') >>> chunked_text = chunk_parser.parse(unchunked_text) >>> print(chunked_text) (S (NP The/DT cat/NN sat/VBD on/IN the/DT mat/NN the/DT dog/NN chewed/VBD ./.))>>> chunkscore = ChunkScore() >>> chunkscore.score(gold_chunked_text, chunked_text) >>> print(chunkscore.precision()) 0.0>>> print(chunkscore.recall()) 0.0>>> print(chunkscore.f_measure()) 0>>> for chunk in sorted(chunkscore.missed()): print(chunk) (NP The/DT cat/NN) (NP the/DT dog/NN) (NP the/DT mat/NN)>>> for chunk in chunkscore.incorrect(): print(chunk) (NP The/DT cat/NN sat/VBD on/IN the/DT mat/NN the/DT dog/NN chewed/VBD ./.)>>> chunk_parser = RegexpChunkParser([chunk_rule, chink_rule], ... chunk_label='NP') >>> chunked_text = chunk_parser.parse(unchunked_text) >>> print(chunked_text) (S (NP The/DT cat/NN) sat/VBD on/IN (NP the/DT mat/NN the/DT dog/NN) chewed/VBD ./.) >>> assert chunked_text == chunk_parser.parse(list(unchunked_text))>>> chunkscore = ChunkScore() >>> chunkscore.score(gold_chunked_text, chunked_text) >>> chunkscore.precision() 0.5>>> print(chunkscore.recall()) 0.33333333...>>> print(chunkscore.f_measure()) 0.4>>> for chunk in sorted(chunkscore.missed()): print(chunk) (NP the/DT dog/NN) (NP the/DT mat/NN)>>> for chunk in chunkscore.incorrect(): print(chunk) (NP the/DT mat/NN the/DT dog/NN)>>> chunk_parser = RegexpChunkParser([chunk_rule, chink_rule, split_rule], ... 
chunk_label='NP') >>> chunked_text = chunk_parser.parse(unchunked_text, trace=True) # Input: <DT> <NN> <VBD> <IN> <DT> <NN> <DT> <NN> <VBD> <.> # Chunk everything: {<DT> <NN> <VBD> <IN> <DT> <NN> <DT> <NN> <VBD> <.>} # Chink on verbs/prepositions: {<DT> <NN>} <VBD> <IN> {<DT> <NN> <DT> <NN>} <VBD> <.> # Split successive determiner/noun pairs: {<DT> <NN>} <VBD> <IN> {<DT> <NN>}{<DT> <NN>} <VBD> <.> >>> print(chunked_text) (S (NP The/DT cat/NN) sat/VBD on/IN (NP the/DT mat/NN) (NP the/DT dog/NN) chewed/VBD ./.)>>> chunkscore = ChunkScore() >>> chunkscore.score(gold_chunked_text, chunked_text) >>> chunkscore.precision() 1.0>>> chunkscore.recall() 1.0>>> chunkscore.f_measure() 1.0>>> chunkscore.missed() []>>> chunkscore.incorrect() []>>> chunk_parser.rules() # doctest: +NORMALIZE_WHITESPACE [<ChunkRule: '<.*>+'>, <ChinkRule: '<VBD|IN|\\.>'>, <SplitRule: '<DT><NN>', '<DT><NN>'>]
Printing parsers:
>>> print(repr(chunk_parser)) <RegexpChunkParser with 3 rules> >>> print(chunk_parser) RegexpChunkParser with 3 rules: Chunk everything <ChunkRule: '<.*>+'> Chink on verbs/prepositions <ChinkRule: '<VBD|IN|\\.>'> Split successive determiner/noun pairs <SplitRule: '<DT><NN>', '<DT><NN>'>
ChunkParserI is an abstract interface -- it is not meant to be instantiated directly.
>>> ChunkParserI().parse([])
Traceback (most recent call last):
  . . .
NotImplementedError
A ChunkString can be built from a tree of tagged tuples, a tree of subtrees, or a tree that mixes both:
>>> t1 = Tree('S', [('w%d' % i, 't%d' % i) for i in range(10)])
>>> t2 = Tree('S', [Tree('t0', []), Tree('t1', ['c1'])])
>>> t3 = Tree('S', [('w0', 't0'), Tree('t1', ['c1'])])
>>> ChunkString(t1)
<ChunkString: '<t0><t1><t2><t3><t4><t5><t6><t7><t8><t9>'>
>>> ChunkString(t2)
<ChunkString: '<t0><t1>'>
>>> ChunkString(t3)
<ChunkString: '<t0><t1>'>
Other values generate an error:
>>> ChunkString(Tree('S', ['x']))
Traceback (most recent call last):
  . . .
ValueError: chunk structures must contain tagged tokens or trees
Calling str() on a chunk string pads it with spaces, so that the result lines up with the str() output of other chunk strings over the same underlying input.
>>> cs = ChunkString(t1) >>> print(cs) <t0> <t1> <t2> <t3> <t4> <t5> <t6> <t7> <t8> <t9> >>> cs.xform('<t3>', '{<t3>}') >>> print(cs) <t0> <t1> <t2> {<t3>} <t4> <t5> <t6> <t7> <t8> <t9>
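The _verify() method makes sure that our transforms don't corrupt the chunk string. By setting debug_level=2 or higher, _verify() will be called at the end of every call to xform. The examples below use debug_level=3, which also reports transformations that change the tags.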
>>> cs = ChunkString(t1, debug_level=3)>>> # tag not marked with <...>: >>> cs.xform('<t3>', 't3') Traceback (most recent call last): . . . ValueError: Transformation generated invalid chunkstring: <t0><t1><t2>t3<t4><t5><t6><t7><t8><t9>>>> # brackets not balanced: >>> cs.xform('<t3>', '{<t3>') Traceback (most recent call last): . . . ValueError: Transformation generated invalid chunkstring: <t0><t1><t2>{<t3><t4><t5><t6><t7><t8><t9>>>> # nested brackets: >>> cs.xform('<t3><t4><t5>', '{<t3>{<t4>}<t5>}') Traceback (most recent call last): . . . ValueError: Transformation generated invalid chunkstring: <t0><t1><t2>{<t3>{<t4>}<t5>}<t6><t7><t8><t9>>>> # modified tags: >>> cs.xform('<t3>', '<t9>') Traceback (most recent call last): . . . ValueError: Transformation generated invalid chunkstring: tag changed>>> # added tags: >>> cs.xform('<t9>', '<t9><t10>') Traceback (most recent call last): . . . ValueError: Transformation generated invalid chunkstring: tag changed
Test the different rule constructors & __repr__ methods:
>>> r1 = RegexpChunkRule('<a|b>'+ChunkString.IN_CHINK_PATTERN, ... '{<a|b>}', 'chunk <a> and <b>') >>> r2 = RegexpChunkRule(re.compile('<a|b>'+ChunkString.IN_CHINK_PATTERN), ... '{<a|b>}', 'chunk <a> and <b>') >>> r3 = ChunkRule('<a|b>', 'chunk <a> and <b>') >>> r4 = ChinkRule('<a|b>', 'chink <a> and <b>') >>> r5 = UnChunkRule('<a|b>', 'unchunk <a> and <b>') >>> r6 = MergeRule('<a>', '<b>', 'merge <a> w/ <b>') >>> r7 = SplitRule('<a>', '<b>', 'split <a> from <b>') >>> r8 = ExpandLeftRule('<a>', '<b>', 'expand left <a> <b>') >>> r9 = ExpandRightRule('<a>', '<b>', 'expand right <a> <b>') >>> for rule in r1, r2, r3, r4, r5, r6, r7, r8, r9: ... print(rule) <RegexpChunkRule: '<a|b>(?=[^\\}]*(\\{|$))'->'{<a|b>}'> <RegexpChunkRule: '<a|b>(?=[^\\}]*(\\{|$))'->'{<a|b>}'> <ChunkRule: '<a|b>'> <ChinkRule: '<a|b>'> <UnChunkRule: '<a|b>'> <MergeRule: '<a>', '<b>'> <SplitRule: '<a>', '<b>'> <ExpandLeftRule: '<a>', '<b>'> <ExpandRightRule: '<a>', '<b>'>
tag_pattern2re_pattern() complains if the tag pattern looks problematic:
>>> tag_pattern2re_pattern('{}')
Traceback (most recent call last):
  . . .
ValueError: Bad tag pattern: '{}'
A warning is printed when parsing an empty sentence:
>>> parser = RegexpChunkParser([ChunkRule('<a>', '')])
>>> parser.parse(Tree('S', []))
Warning: parsing empty text
Tree('S', [])
>>> parser = RegexpParser(''' ... NP: {<DT>? <JJ>* <NN>*} # NP ... P: {<IN>} # Preposition ... V: {<V.*>} # Verb ... PP: {<P> <NP>} # PP -> P NP ... VP: {<V> <NP|PP>*} # VP -> V (NP|PP)* ... ''') >>> print(repr(parser)) <chunk.RegexpParser with 5 stages> >>> print(parser) chunk.RegexpParser with 5 stages: RegexpChunkParser with 1 rules: NP <ChunkRule: '<DT>? <JJ>* <NN>*'> RegexpChunkParser with 1 rules: Preposition <ChunkRule: '<IN>'> RegexpChunkParser with 1 rules: Verb <ChunkRule: '<V.*>'> RegexpChunkParser with 1 rules: PP -> P NP <ChunkRule: '<P> <NP>'> RegexpChunkParser with 1 rules: VP -> V (NP|PP)* <ChunkRule: '<V> <NP|PP>*'> >>> print(parser.parse(unchunked_text, trace=True)) # Input: <DT> <NN> <VBD> <IN> <DT> <NN> <DT> <NN> <VBD> <.> # NP: {<DT> <NN>} <VBD> <IN> {<DT> <NN>}{<DT> <NN>} <VBD> <.> # Input: <NP> <VBD> <IN> <NP> <NP> <VBD> <.> # Preposition: <NP> <VBD> {<IN>} <NP> <NP> <VBD> <.> # Input: <NP> <VBD> <P> <NP> <NP> <VBD> <.> # Verb: <NP> {<VBD>} <P> <NP> <NP> {<VBD>} <.> # Input: <NP> <V> <P> <NP> <NP> <V> <.> # PP -> P NP: <NP> <V> {<P> <NP>} <NP> <V> <.> # Input: <NP> <V> <PP> <NP> <V> <.> # VP -> V (NP|PP)*: <NP> {<V> <PP> <NP>}{<V>} <.> (S (NP The/DT cat/NN) (VP (V sat/VBD) (PP (P on/IN) (NP the/DT mat/NN)) (NP the/DT dog/NN)) (VP (V chewed/VBD)) ./.)
Test parsing of other rule types:
>>> print(RegexpParser(''' ... X: ... }<a><b>{ # chink rule ... <a>}{<b> # split rule ... <a>{}<b> # merge rule ... <a>{<b>}<c> # chunk rule w/ context ... ''')) chunk.RegexpParser with 1 stages: RegexpChunkParser with 4 rules: chink rule <ChinkRule: '<a><b>'> split rule <SplitRule: '<a>', '<b>'> merge rule <MergeRule: '<a>', '<b>'> chunk rule w/ context <ChunkRuleWithContext: '<a>', '<b>', '<c>'>
Illegal patterns give an error message:
>>> print(RegexpParser('X: {<foo>} {<bar>}'))
Traceback (most recent call last):
  . . .
ValueError: Illegal chunk pattern: {<foo>} {<bar>}