1
2
3
4
5
6
7
8
9
10 """
11 Extension of chart parsing implementation to handle grammars with
12 feature structures as nodes.
13 """
14
15 import yaml
16
17 from nltk.featstruct import FeatStruct, unify, FeatStructParser
18 from nltk.sem import logic
19 from nltk import cfg, defaultdict
20 from nltk.cfg import FeatStructNonterminal
21 import nltk.data
22
23 from api import *
24 from chart import *
25
def load_earley(filename, trace=0, cache=False, verbose=False,
                chart_class=Chart):
    """
    Load a grammar from a file, and build an Earley feature parser based on
    that grammar.

    You can optionally specify a tracing level, for how much output you
    want to see:

    0: No output.
    1: Show edges from scanner and completer rules (not predictor).
    2: Show all edges as they are added to the chart.
    3: Show all edges, plus the results of successful unifications.
    4: Show all edges, plus the results of all attempted unifications.
    5: Show all edges, plus the results of all attempted unifications,
       including those with cached results.

    If C{verbose} is set to C{True}, then more diagnostic information about
    grammar-loading is displayed.

    @param filename: Resource name of the grammar; passed straight
        through to C{nltk.data.load}.
    @param cache: Whether C{nltk.data.load} may cache the loaded grammar.
    @param chart_class: The chart class the resulting parser should use.
    @return: A C{FeatureEarleyChartParser} built from the loaded grammar.
    """
    grammar = nltk.data.load(filename, cache=cache, verbose=verbose)
    return FeatureEarleyChartParser(grammar, trace=trace,
                                    chart_class=chart_class)
49
51 """
52 A specialized tree edge that allows shared variable bindings
53 between nonterminals on the left-hand side and right-hand side.
54
55 Each C{FeatureTreeEdge} contains a set of C{bindings}, i.e., a
56 dictionary mapping from variables to values. If the edge is not
57 complete, then these bindings are simply stored. However, if the
58 edge is complete, then the constructor applies these bindings to
59 every nonterminal in the edge whose symbol implements the
60 interface L{SubstituteBindingsI}.
61 """
62 - def __init__(self, span, lhs, rhs, dot=0, bindings=None):
63 """
64 Construct a new edge. If the edge is incomplete (i.e., if
65 C{dot<len(rhs)}), then store the bindings as-is. If the edge
66 is complete (i.e., if C{dot==len(rhs)}), then apply the
67 bindings to all nonterminals in C{lhs} and C{rhs}, and then
68 clear the bindings. See L{TreeEdge} for a description of
69 the other arguments.
70 """
71 if bindings is None: bindings = {}
72
73
74
75
76
77
78 if dot == len(rhs) and bindings:
79 lhs = self._bind(lhs, bindings)
80 rhs = [self._bind(elt, bindings) for elt in rhs]
81 bindings = {}
82
83
84 TreeEdge.__init__(self, span, lhs, rhs, dot)
85 self._bindings = bindings
86
87 - def _bind(self, nt, bindings):
90
92 return self._bind(self.next(), self._bindings)
93
95 """
96 Return a copy of this edge's bindings dictionary.
97 """
98 return self._bindings.copy()
99
107
108
110 if self.__class__ != other.__class__: return -1
111 return cmp((self._span, self._lhs, self._rhs,
112 self._dot, self._bindings),
113 (other._span, other._lhs, other._rhs,
114 other._dot, other._bindings))
115
117
118 return hash((self._lhs, self._rhs, self._span, self._dot,
119 tuple(sorted(self._bindings))))
120
122 """
123 A specialized version of the fundamental rule that operates on
124 nonterminals whose symbols are C{FeatStructNonterminal}s. Rather
    than simply comparing the nonterminals for equality, they are
126 unified. Variable bindings from these unifications are collected
127 and stored in the chart using a L{FeatureTreeEdge}. When a
128 complete edge is generated, these bindings are applied to all
129 nonterminals in the edge.
130
131 The fundamental rule states that:
132 - [AS{->}S{alpha}*B1S{beta}][i:j]
133 - [B2S{->}S{gamma}*][j:k]
134 licenses the edge:
      - [AS{->}S{alpha}B3*S{beta}][i:k]
136 assuming that B1 and B2 can be unified to generate B3.
137 """
138 - def apply_iter(self, chart, grammar, left_edge, right_edge):
139
140 if not (left_edge.end() == right_edge.start() and
141 left_edge.is_incomplete() and
142 right_edge.is_complete() and
143 isinstance(left_edge, TreeEdge) and
144 isinstance(right_edge, TreeEdge)):
145 return
146
147
148
149 bindings = left_edge.bindings()
150 result = unify(left_edge.next(), right_edge.lhs(),
151 bindings, rename_vars=False)
152 if result is None: return
153
154
155 new_edge = FeatureTreeEdge(span=(left_edge.start(), right_edge.end()),
156 lhs=left_edge.lhs(), rhs=left_edge.rhs(),
157 dot=left_edge.dot()+1, bindings=bindings)
158
159
160 changed_chart = False
161 for cpl1 in chart.child_pointer_lists(left_edge):
162 if chart.insert(new_edge, cpl1+(right_edge,)):
163 changed_chart = True
164
165
166 if changed_chart: yield new_edge
167
169 """
170 A specialized version of the top down expand rule that operates on
171 nonterminals whose symbols are C{FeatStructNonterminal}s. Rather
    than simply comparing the nonterminals for equality, they are
173 unified.
174
175 The top down expand rule states that:
176 - [AS{->}S{alpha}*B1S{beta}][i:j]
177 licenses the edge:
178 - [B2S{->}*S{gamma}][j:j]
179 for each grammar production C{B2S{->}S{gamma}}, assuming that B1
180 and B2 can be unified.
181 """
193
194
195
196
197
199 """
200 A specialized version of the completer rule that operates on
201 nonterminals whose symbols are C{FeatStructNonterminal}s. Rather
    than simply comparing the nonterminals for equality, they are
203 unified. See L{CompleterRule} for more information.
204 """
205 _fundamental_rule = FeatureFundamentalRule()
206
212
215 if edge.is_complete() or edge.end()>=chart.num_leaves(): return
216 index = edge.end()
217 leaf = chart.leaf(index)
218 for pos in self._word_to_pos.get(leaf, []):
219 if unify(pos, edge.next_with_bindings(), rename_vars=True):
220 new_leaf_edge = LeafEdge(leaf, index)
221 if chart.insert(new_leaf_edge, ()):
222 yield new_leaf_edge
223 new_pos_edge = FeatureTreeEdge((index, index+1), pos,
224 [leaf], 1)
225 if chart.insert(new_pos_edge, (new_leaf_edge,)):
226 yield new_pos_edge
227
228
230
231
232
233
234
235
236
237
238
240 """
241 A chart parser implementing the Earley parsing algorithm:
242
243 - For each index I{end} in [0, 1, ..., N]:
244 - For each I{edge} s.t. I{edge}.end = I{end}:
245 - If I{edge} is incomplete, and I{edge}.next is not a part
246 of speech:
247 - Apply PredictorRule to I{edge}
248 - If I{edge} is incomplete, and I{edge}.next is a part of
249 speech:
250 - Apply ScannerRule to I{edge}
251 - If I{edge} is complete:
252 - Apply CompleterRule to I{edge}
253 - Return any complete parses in the chart
254
255 C{EarleyChartParser} uses a X{lexicon} to decide whether a leaf
256 has a given part of speech. This lexicon is encoded as a
257 dictionary that maps each word to a list of parts of speech that
258 word can have.
259
260 @ivar _predictor_class, _completer_class, _scanner_class: The
261 classes that are used to implement the three rules used by the
        Earley algorithm.  Replacement rules can be specified by
263 subclasses (such as L{FeatureEarleyChartParser
264 <nltk.parse.featurechar.FeatureEarleyChartParser>}).
265 """
266 _predictor_class = PredictorRule
267 _completer_class = CompleterRule
268 _scanner_class = ScannerRule
269
271 """
272 Create a new Earley chart parser, that uses C{grammar} to
273 parse texts.
274
275 @type grammar: C{cfg.Grammar}
276 @param grammar: The grammar used to parse texts.
277 @type trace: C{int}
278 @param trace: The level of tracing that should be used when
279 parsing a text. C{0} will generate no tracing output;
280 and higher numbers will produce more verbose tracing
281 output.
282 @param chart_class: The class that should be used to create
283 the charts used by this parser.
284 """
285 if isinstance(trace, dict):
286 raise ValueError("Earley parser no longer takes a lexicon "
287 "as a separate parameter; assign the "
288 "lexicon when creating the grammar instead.")
289 self._grammar = grammar
290 self._lexicon = grammar.lexicon()
291 self._trace = trace
292 self._chart_class = chart_class
293
296
298 """@return: The lexicon used by this parser."""
299 return self._lexicon
300
301
302
303 _trace_chart_width = 40
304
306 tokens = list(tokens)
307 self._check_lexicon_coverage(tokens)
308
309 chart = self._chart_class(tokens)
310 grammar = self._grammar
311
312
313 w = max(2, self._trace_chart_width/(chart.num_leaves()+1))
314 if self._trace > 0: print ' '*9, chart.pp_leaves(w)
315
316
317 chart.insert(self._starter_edge(grammar.start()), ())
318
319
320 predictor = self._predictor_class()
321 completer = self._completer_class()
322 scanner = self._scanner_class(self._lexicon)
323
324 for end in range(chart.num_leaves()+1):
325 if self._trace > 1: print 'Processing queue %d' % end
326 for edge in chart.select(end=end):
327 if edge.is_complete():
328 for e in completer.apply_iter(chart, grammar, edge):
329 if self._trace > 0:
330 print 'Completer', chart.pp_edge(e,w)
331 if edge.is_incomplete():
332 for e in predictor.apply_iter(chart, grammar, edge):
333 if self._trace > 1:
334 print 'Predictor', chart.pp_edge(e,w)
335 if edge.is_incomplete():
336 for e in scanner.apply_iter(chart, grammar, edge):
337 if self._trace > 0:
338 print 'Scanner ', chart.pp_edge(e,w)
339
340
341 return self._parses(chart, grammar.start(), tree_class)[:n]
342
343
344
346 """Return a 'starter edge' that expands to the start symbol."""
347 root = cfg.Nonterminal('[INIT]')
348 return TreeEdge((0,0), root, (start_sym,))
349
350
351
352 - def _parses(self, chart, start_sym, tree_class):
353 """Return a list of parses in the given chart."""
354 return chart.parses(start_sym, tree_class=tree_class)
355
357 try: 'x' in self._lexicon
358 except: raise ValueError('ow %r' % self._lexicon)
359 missing = [tok for tok in tokens if tok not in self._lexicon]
360 if missing:
361 missing = ', '.join('%r' % (w,) for w in missing)
362 raise ValueError("Grammar does not cover some of the "
363 "input words: " + missing)
364
366 """
367 A chart parser implementing the Earley parsing algorithm, allowing
368 nonterminals that have features (known as L{FeatStructNonterminal}s).
369 See L{EarleyChartParser} for more details.
370 """
371 _predictor_class = FeaturePredictorRule
372 _completer_class = FeatureCompleterRule
373 _scanner_class = FeatureScannerRule
374 _trace_chart_width = 10
375
379
380 - def _parses(self, chart, start, tree_class):
389
390
391
392
393
395 """
396 A specialized chart that 'instantiates' variables whose names
397 start with '@', by replacing them with unique new variables.
398 In particular, whenever a complete edge is added to the chart, any
399 variables in the edge's C{lhs} whose names start with '@' will be
400 replaced by unique new L{Variable}s.
401 """
405
406 - def insert(self, edge, child_pointer_list):
407 if edge in self._instantiated: return False
408 edge = self.instantiate_edge(edge)
409 return Chart.insert(self, edge, child_pointer_list)
410
428
433
434
435
436
437
438
440 import sys, time
441
442 S = FeatStructNonterminal('S')
443 VP = FeatStructNonterminal('VP')
444 NP = FeatStructNonterminal('NP')
445 PP = FeatStructNonterminal('PP')
446 V = FeatStructNonterminal('V')
447 N = FeatStructNonterminal('N')
448 P = FeatStructNonterminal('P')
449 Name = FeatStructNonterminal('Name')
450 Det = FeatStructNonterminal('Det')
451 DetSg = FeatStructNonterminal('Det[-pl]')
452 DetPl = FeatStructNonterminal('Det[+pl]')
453 NSg = FeatStructNonterminal('N[-pl]')
454 NPl = FeatStructNonterminal('N[+pl]')
455
456
457 grammatical_productions = [
458 cfg.Production(S, (NP, VP)), cfg.Production(PP, (P, NP)),
459 cfg.Production(NP, (NP, PP)),
460 cfg.Production(VP, (VP, PP)), cfg.Production(VP, (V, NP)),
461 cfg.Production(VP, (V,)), cfg.Production(NP, (DetPl, NPl)),
462 cfg.Production(NP, (DetSg, NSg))]
463
464
465 lexical_productions = [
466 cfg.Production(NP, ('John',)), cfg.Production(NP, ('I',)),
467 cfg.Production(Det, ('the',)), cfg.Production(Det, ('my',)),
468 cfg.Production(Det, ('a',)),
469 cfg.Production(NSg, ('dog',)), cfg.Production(NSg, ('cookie',)),
470 cfg.Production(V, ('ate',)), cfg.Production(V, ('saw',)),
471 cfg.Production(P, ('with',)), cfg.Production(P, ('under',)),
472 ]
473
474
475 earley_lexicon = defaultdict(list)
476 for prod in lexical_productions:
477 earley_lexicon[prod.rhs()[0]].append(prod.lhs())
478
479
480 earley_grammar = cfg.Grammar(S, grammatical_productions, earley_lexicon)
481 print earley_grammar
482
483 sent = 'I saw John with a dog with my cookie'
484 print "Sentence:\n", sent
485 tokens = sent.split()
486 t = time.time()
487 cp = FeatureEarleyChartParser(earley_grammar, trace=1)
488 trees = cp.nbest_parse(tokens)
489 print "Time: %s" % (time.time() - t)
490 for tree in trees: print tree
491
493 import profile
494 profile.run('for i in range(1): demo()', '/tmp/profile.out')
495 import pstats
496 p = pstats.Stats('/tmp/profile.out')
497 p.strip_dirs().sort_stats('time', 'cum').print_stats(60)
498 p.strip_dirs().sort_stats('cum', 'time').print_stats(60)
499
500 if __name__ == '__main__':
501 demo()
502 print
503 cp = load_earley('grammars/feat0.fcfg', trace=2)
504 sent = 'Kim likes children'
505 tokens = sent.split()
506 trees = cp.nbest_parse(tokens)
507 for tree in trees:
508 print tree
509