Parsing

1   Unit tests for the CFG (Context Free Grammar) class

 
>>> from nltk import cfg
 
>>> nt1 = cfg.Nonterminal('NP')
>>> nt2 = cfg.Nonterminal('VP')
 
>>> nt1.symbol()
'NP'
 
>>> nt1 == cfg.Nonterminal('NP')
True
 
>>> nt1 == nt2
False
 
>>> S, NP, VP, PP = cfg.nonterminals('S, NP, VP, PP')
>>> N, V, P, DT = cfg.nonterminals('N, V, P, DT')
 
>>> prod1 = cfg.Production(S, [NP, VP])
>>> prod2 = cfg.Production(NP, [DT, NP])
 
>>> prod1.lhs()
<S>
 
>>> prod1.rhs()
(<NP>, <VP>)
 
>>> prod1 == cfg.Production(S, [NP, VP])
True
 
>>> prod1 == prod2
False
 
>>> grammar = cfg.parse_cfg("""
... S -> NP VP
... PP -> P NP
... NP -> DT N | N PP | DT N PP
... VP -> V NP | V PP | V NP PP
... DT -> 'a'
... DT -> 'the'
... N -> 'cat'
... N -> 'dog'
... N -> 'rug'
... V -> 'chased'
... V -> 'sat'
... P -> 'in'
... P -> 'on'
... """)

2   Unit tests for the rd (Recursive Descent Parser) class

Create and run a recursive descent parser over both a syntactically ambiguous and unambiguous sentence.

 
>>> from nltk.parse import RecursiveDescentParser
>>> rd = RecursiveDescentParser(grammar)
 
>>> sentence1 = 'the cat chased the dog'.split()
>>> sentence2 = 'the cat chased the dog on the rug'.split()
 
>>> for t in rd.nbest_parse(sentence1):
...     print t
(S (NP (DT the) (N cat)) (VP (V chased) (NP (DT the) (N dog))))
 
>>> for t in rd.nbest_parse(sentence2):
...     print t
(S
  (NP (DT the) (N cat))
  (VP
    (V chased)
    (NP (DT the) (N dog) (PP (P on) (NP (DT the) (N rug))))))
(S
  (NP (DT the) (N cat))
  (VP
    (V chased)
    (NP (DT the) (N dog))
    (PP (P on) (NP (DT the) (N rug)))))

3   Unit tests for the sr (Shift Reduce Parser) class

Create and run a shift reduce parser over both a syntactically ambiguous and unambiguous sentence. Note that unlike the recursive descent parser, one and only one parse is ever returned.

 
>>> from nltk.parse import ShiftReduceParser
>>> sr = ShiftReduceParser(grammar)
 
>>> sentence1 = 'the cat chased the dog'.split()
>>> sentence2 = 'the cat chased the dog on the rug'.split()
 
>>> for t in sr.nbest_parse(sentence1):
...     print t
(S (NP (DT the) (N cat)) (VP (V chased) (NP (DT the) (N dog))))

The shift reduce parser uses heuristics to decide what to do when there are multiple possible shift or reduce operations available — for the supplied grammar, clearly the wrong operation is selected.

 
>>> for t in sr.nbest_parse(sentence2):
...     print t

4   Unit tests for the Chart parser class

 
>>> from nltk.parse import ChartParser, EarleyChartParser, BU_STRATEGY, TD_STRATEGY, FeatureEarleyChartParser

Define a grammar.

 
>>> grammar = cfg.parse_cfg("""
... S -> NP VP
... PP -> P NP
... NP -> DT N | N | NP PP
... VP -> V NP |  VP PP
... DT -> 'a' | 'the'
... N -> 'Marc' | 'man' | 'park' | 'telescope'
... V -> 'has' | 'saw'
... P -> 'in'
... """)

Some example sentences, one ambiguous and one unambiguous.

 
>>> sentence1 = "Marc has a telescope".split()
>>> sentence2 = "Marc saw a man in the park".split()

Create a chart parser. First give it a bottom-up strategy.

 
>>> parser = ChartParser(grammar, BU_STRATEGY)
 
>>> for t in parser.nbest_parse(sentence1):
...     print t
(S (NP (N Marc)) (VP (V has) (NP (DT a) (N telescope))))
 
>>> for t in parser.nbest_parse(sentence2):
...     print t
(S
  (NP (N Marc))
  (VP
    (V saw)
    (NP (NP (DT a) (N man)) (PP (P in) (NP (DT the) (N park))))))
(S
  (NP (N Marc))
  (VP
    (VP (V saw) (NP (DT a) (N man)))
    (PP (P in) (NP (DT the) (N park)))))

Redefine the chart parser to use a top-down strategy.

 
>>> parser = ChartParser(grammar, TD_STRATEGY)
 
>>> for t in parser.nbest_parse(sentence1):
...     print t
(S (NP (N Marc)) (VP (V has) (NP (DT a) (N telescope))))
 
>>> for t in parser.nbest_parse(sentence2):
...     print t
(S
  (NP (N Marc))
  (VP
    (V saw)
    (NP (NP (DT a) (N man)) (PP (P in) (NP (DT the) (N park))))))
(S
  (NP (N Marc))
  (VP
    (VP (V saw) (NP (DT a) (N man)))
    (PP (P in) (NP (DT the) (N park)))))

Create and test the Earley variant chart parser. This requires a lexicon.

 
>>> syntactic_productions = [
... cfg.Production(S, [NP, VP]),
... cfg.Production(PP, [P, NP]),
... cfg.Production(NP, [DT, N]),
... cfg.Production(NP, [N]),
... cfg.Production(NP, [NP, PP]),
... cfg.Production(VP, [V, NP]),
... cfg.Production(VP, [VP, PP])
... ]
 
>>> lexicon = {
...     'a': [DT], 'the': [DT],
...     'man': [N], 'Marc': [N], 'park': [N], 'telescope': [N],
...     'has': [V], 'saw': [V],
...     'in': [P]
...     }
 
>>> grammar = cfg.Grammar(S, syntactic_productions, lexicon)
 
>>> parser = EarleyChartParser(grammar)
 
>>> for t in parser.nbest_parse(sentence1):
...     print t
(S (NP (N Marc)) (VP (V has) (NP (DT a) (N telescope))))
 
>>> for t in parser.nbest_parse(sentence2):
...     print t
(S
  (NP (N Marc))
  (VP
    (V saw)
    (NP (NP (DT a) (N man)) (PP (P in) (NP (DT the) (N park))))))
(S
  (NP (N Marc))
  (VP
    (VP (V saw) (NP (DT a) (N man)))
    (PP (P in) (NP (DT the) (N park)))))

5   Unit tests for the Probabilistic CFG class

 
>>> from nltk.corpus import treebank
>>> from itertools import islice

Create a set of probabilistic CFG productions.

 
>>> grammar = cfg.parse_pcfg("""
... A -> B B [.3] | C B C [.7]
... B -> B D [.5] | C [.5]
... C -> 'a' [.1] | 'b' [0.9]
... D -> 'b' [1.0]
... """)
>>> prod = grammar.productions()[0]
>>> prod
A -> B B [0.3]
 
>>> prod.lhs()
<A>
 
>>> prod.rhs()
(<B>, <B>)
 
>>> prod.prob()
0.29999999999999999
 
>>> grammar.start()
<A>
 
>>> grammar.productions()
[A -> B B [0.3], A -> C B C [0.7], B -> B D [0.5], B -> C [0.5], C -> 'a' [0.1], C -> 'b' [0.9], D -> 'b' [1.0]]

Induce some productions using parsed Treebank data.

 
>>> productions = []
>>> for file in treebank.files()[:2]:
...     for t in treebank.parsed_sents(file):
...         productions += t.productions()
 
>>> grammar = cfg.induce_pcfg(S, productions)
>>> grammar
<Grammar with 71 productions>
 
>>> grammar.productions()[:5]
[PP -> IN NP [1.0], NNP -> 'Nov.' [0.0714285714286], NNP -> 'Agnew' [0.0714285714286], JJ -> 'industrial' [0.142857142857], NP -> CD NNS [0.133333333333]]

6   Unit tests for the Probabilistic Chart Parse classes

 
>>> tokens = "Jack saw Bob with my cookie".split()
>>> grammar = cfg.toy_pcfg2
>>> print grammar
Grammar with 23 productions (start state = S)
    S -> NP VP [1.0]
    VP -> V NP [0.59]
    VP -> V [0.4]
    VP -> VP PP [0.01]
    NP -> Det N [0.41]
    NP -> Name [0.28]
    NP -> NP PP [0.31]
    PP -> P NP [1.0]
    V -> 'saw' [0.21]
    V -> 'ate' [0.51]
    V -> 'ran' [0.28]
    N -> 'boy' [0.11]
    N -> 'cookie' [0.12]
    N -> 'table' [0.13]
    N -> 'telescope' [0.14]
    N -> 'hill' [0.5]
    Name -> 'Jack' [0.52]
    Name -> 'Bob' [0.48]
    P -> 'with' [0.61]
    P -> 'under' [0.39]
    Det -> 'the' [0.41]
    Det -> 'a' [0.31]
    Det -> 'my' [0.28]

Create several parsers using different queuing strategies and show the resulting parses.

 
>>> from nltk.parse import pchart
 
>>> parser = pchart.InsideChartParser(grammar)
>>> for t in parser.nbest_parse(tokens):
...     print t
(S
  (NP (Name Jack))
  (VP
    (V saw)
    (NP
      (NP (Name Bob))
      (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31606532355e-06)
(S
  (NP (Name Jack))
  (VP
    (VP (V saw) (NP (Name Bob)))
    (PP (P with) (NP (Det my) (N cookie))))) (p=2.03744042695e-07)
 
>>> parser = pchart.RandomChartParser(grammar)
>>> for t in parser.nbest_parse(tokens):
...     print t
(S
  (NP (Name Jack))
  (VP
    (V saw)
    (NP
      (NP (Name Bob))
      (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31606532355e-06)
(S
  (NP (Name Jack))
  (VP
    (VP (V saw) (NP (Name Bob)))
    (PP (P with) (NP (Det my) (N cookie))))) (p=2.03744042695e-07)
 
>>> parser = pchart.UnsortedChartParser(grammar)
>>> for t in parser.nbest_parse(tokens):
...     print t
(S
  (NP (Name Jack))
  (VP
    (V saw)
    (NP
      (NP (Name Bob))
      (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31606532355e-06)
(S
  (NP (Name Jack))
  (VP
    (VP (V saw) (NP (Name Bob)))
    (PP (P with) (NP (Det my) (N cookie))))) (p=2.03744042695e-07)
 
>>> parser = pchart.LongestChartParser(grammar)
>>> for t in parser.nbest_parse(tokens):
...     print t
(S
  (NP (Name Jack))
  (VP
    (V saw)
    (NP
      (NP (Name Bob))
      (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31606532355e-06)
(S
  (NP (Name Jack))
  (VP
    (VP (V saw) (NP (Name Bob)))
    (PP (P with) (NP (Det my) (N cookie))))) (p=2.03744042695e-07)
 
>>> parser = pchart.InsideChartParser(grammar, beam_size = len(tokens)+1)
>>> for t in parser.nbest_parse(tokens):
...     print t

7   Unit tests for the Viterbi Parse classes

 
>>> from nltk.parse import ViterbiParser
>>> tokens = "Jack saw Bob with my cookie".split()
>>> grammar = cfg.toy_pcfg2

Parse the tokenized sentence.

 
>>> parser = ViterbiParser(grammar)
>>> for t in parser.nbest_parse(tokens):
...     print t
(S
  (NP (Name Jack))
  (VP
    (V saw)
    (NP
      (NP (Name Bob))
      (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31606532355e-06)

8   Unit tests for the FeatStructNonterminal class

 
>>> from nltk.parse import FeatStructNonterminal
>>> FeatStructNonterminal(
...     pos='n', agr=FeatStructNonterminal(number='pl', gender='f'))
[agr=[gender='f', number='pl'], pos='n']
 
>>> FeatStructNonterminal('VP[+fin]/NP[+pl]')
VP[+fin]/NP[+pl]

9   Unit tests for the Feature Chart Parser classes

 
>>> S = FeatStructNonterminal('S')
>>> VP = FeatStructNonterminal('VP')
>>> NP = FeatStructNonterminal('NP')
>>> PP = FeatStructNonterminal('PP')
>>> V = FeatStructNonterminal('V')
>>> N = FeatStructNonterminal('N')
>>> P = FeatStructNonterminal('P')
>>> Name = FeatStructNonterminal('Name')
>>> Det = FeatStructNonterminal('Det')
>>> DetSg = FeatStructNonterminal('Det[-pl]')
>>> DetPl = FeatStructNonterminal('Det[+pl]')
>>> NSg = FeatStructNonterminal('N[-pl]')
>>> NPl = FeatStructNonterminal('N[+pl]')

Define some grammatical productions.

 
>>> grammatical_productions = [
...     cfg.Production(S, (NP, VP)),  cfg.Production(PP, (P, NP)),
...     cfg.Production(NP, (NP, PP)),
...     cfg.Production(VP, (VP, PP)), cfg.Production(VP, (V, NP)),
...     cfg.Production(VP, (V,)), cfg.Production(NP, (DetPl, NPl)),
...     cfg.Production(NP, (DetSg, NSg))]

Define some lexical productions.

 
>>> lexicon = {
...     'John': [NP], 'I': [NP],
...     'the': [Det], 'my': [Det], 'a': [Det],
...     'dog': [NSg], 'cookie': [NSg],
...     'ate': [V], 'saw': [V],
...     'with': [P], 'under': [P]
...     }
 
>>> earley_grammar = cfg.Grammar(S, grammatical_productions, lexicon)
>>> from nltk.compat import defaultdict
 
>>> tokens = 'I saw John with a dog with my cookie'.split()
>>> cp = FeatureEarleyChartParser(earley_grammar)
>>> trees = cp.nbest_parse(tokens)
>>> for t in sorted(trees, key=repr):
...     print t
(S[]
  (NP[] I)
  (VP[]
    (VP[]
      (VP[] (V[] saw) (NP[] John))
      (PP[] (P[] with) (NP[] (Det[] a) (N[-pl] dog))))
    (PP[] (P[] with) (NP[] (Det[] my) (N[-pl] cookie)))))
(S[]
  (NP[] I)
  (VP[]
    (VP[] (V[] saw) (NP[] John))
    (PP[]
      (P[] with)
      (NP[]
        (NP[] (Det[] a) (N[-pl] dog))
        (PP[] (P[] with) (NP[] (Det[] my) (N[-pl] cookie)))))))
(S[]
  (NP[] I)
  (VP[]
    (VP[]
      (V[] saw)
      (NP[]
        (NP[] John)
        (PP[] (P[] with) (NP[] (Det[] a) (N[-pl] dog)))))
    (PP[] (P[] with) (NP[] (Det[] my) (N[-pl] cookie)))))
(S[]
  (NP[] I)
  (VP[]
    (V[] saw)
    (NP[]
      (NP[] John)
      (PP[]
        (P[] with)
        (NP[]
          (NP[] (Det[] a) (N[-pl] dog))
          (PP[] (P[] with) (NP[] (Det[] my) (N[-pl] cookie))))))))
(S[]
  (NP[] I)
  (VP[]
    (V[] saw)
    (NP[]
      (NP[]
        (NP[] John)
        (PP[] (P[] with) (NP[] (Det[] a) (N[-pl] dog))))
      (PP[] (P[] with) (NP[] (Det[] my) (N[-pl] cookie))))))

10   Tests for loading grammar files

 
>>> from nltk import data
>>> simple_cfg = data.load('grammars/toy.cfg')
>>> rdp = RecursiveDescentParser(simple_cfg)
>>> prob_cfg = data.load('grammars/toy1.pcfg')
>>> vp = ViterbiParser(prob_cfg)
>>> feat_cfg = data.load('grammars/feat0.fcfg')
>>> fcp = FeatureEarleyChartParser(feat_cfg)