1
2
3
4
5
6
7
8
9 import string
10
11 from nltk import tokenize, cfg
12 from nltk.tree import Tree
13
14 from api import *
15
16
17
18
20 """
21 A simple bottom-up CFG parser that uses two operations, "shift"
22 and "reduce", to find a single parse for a text.
23
24 C{ShiftReduceParser} maintains a stack, which records the
25 structure of a portion of the text. This stack is a list of
26 C{String}s and C{Tree}s that collectively cover a portion of
27 the text. For example, while parsing the sentence "the dog saw
28 the man" with a typical grammar, C{ShiftReduceParser} will produce
29 the following stack, which covers "the dog saw"::
30
31 [(NP: (Det: 'the') (N: 'dog')), (V: 'saw')]
32
33 C{ShiftReduceParser} attempts to extend the stack to cover the
34 entire text, and to combine the stack elements into a single tree,
35 producing a complete parse for the sentence.
36
37 Initially, the stack is empty. It is extended to cover the text,
38 from left to right, by repeatedly applying two operations:
39
40 - X{shift} moves a token from the beginning of the text to the
41 end of the stack.
42 - X{reduce} uses a CFG production to combine the rightmost stack
43 elements into a single C{Tree}.
44
45 Often, more than one operation can be performed on a given stack.
46 In this case, C{ShiftReduceParser} uses the following heuristics
47 to decide which operation to perform:
48
49 - Only shift if no reductions are available.
50 - If multiple reductions are available, then apply the reduction
51 whose CFG production is listed earliest in the grammar.
52
53 Note that these heuristics are not guaranteed to choose an
54 operation that leads to a parse of the text. Also, if multiple
55 parses exists, C{ShiftReduceParser} will return at most one of
56 them.
57
58 @see: C{nltk.cfg}
59 """
61 """
62 Create a new C{ShiftReduceParser}, that uses C{grammar} to
63 parse texts.
64
65 @type grammar: C{Grammar}
66 @param grammar: The grammar used to parse texts.
67 @type trace: C{int}
68 @param trace: The level of tracing that should be used when
69 parsing a text. C{0} will generate no tracing output;
70 and higher numbers will produce more verbose tracing
71 output.
72 """
73 self._grammar = grammar
74 self._trace = trace
75 self._check_grammar()
76
79
108
109 - def _shift(self, stack, remaining_text):
110 """
111 Move a token from the beginning of C{remaining_text} to the
112 end of C{stack}.
113
114 @type stack: C{list} of C{String} and C{Tree}
115 @param stack: A list of C{String}s and C{Tree}s, encoding
116 the structure of the text that has been parsed so far.
117 @type remaining_text: C{list} of C{String}
118 @param remaining_text: The portion of the text that is not yet
119 covered by C{stack}.
120 @rtype: C{None}
121 """
122 stack.append(remaining_text[0])
123 remaining_text.remove(remaining_text[0])
124 if self._trace: self._trace_shift(stack, remaining_text)
125
127 """
128 @rtype: C{boolean}
129 @return: true if the right hand side of a CFG production
130 matches the rightmost elements of the stack. C{rhs}
131 matches C{rightmost_stack} if they are the same length,
132 and each element of C{rhs} matches the corresponding
133 element of C{rightmost_stack}. A nonterminal element of
134 C{rhs} matches any C{Tree} whose node value is equal
135 to the nonterminal's symbol. A terminal element of C{rhs}
136 matches any C{String} whose type is equal to the terminal.
137 @type rhs: C{list} of (terminal and C{Nonterminal})
138 @param rhs: The right hand side of a CFG production.
139 @type rightmost_stack: C{list} of (C{String} and C{Tree})
140 @param rightmost_stack: The rightmost elements of the parser's
141 stack.
142 """
143
144 if len(rightmost_stack) != len(rhs): return 0
145 for i in range(len(rightmost_stack)):
146 if isinstance(rightmost_stack[i], Tree):
147 if not isinstance(rhs[i], cfg.Nonterminal): return 0
148 if rightmost_stack[i].node != rhs[i].symbol(): return 0
149 else:
150 if isinstance(rhs[i], cfg.Nonterminal): return 0
151 if rightmost_stack[i] != rhs[i]: return 0
152 return 1
153
154 - def _reduce(self, stack, remaining_text, production=None):
155 """
156 Find a CFG production whose right hand side matches the
157 rightmost stack elements; and combine those stack elements
158 into a single C{Tree}, with the node specified by the
159 production's left-hand side. If more than one CFG production
160 matches the stack, then use the production that is listed
161 earliest in the grammar. The new C{Tree} replaces the
162 elements in the stack.
163
164 @rtype: C{Production} or C{None}
165 @return: If a reduction is performed, then return the CFG
166 production that the reduction is based on; otherwise,
167 return false.
168 @type stack: C{list} of C{String} and C{Tree}
169 @param stack: A list of C{String}s and C{Tree}s, encoding
170 the structure of the text that has been parsed so far.
171 @type remaining_text: C{list} of C{String}
172 @param remaining_text: The portion of the text that is not yet
173 covered by C{stack}.
174 """
175 if production is None: productions = self._grammar.productions()
176 else: productions = [production]
177
178
179 for production in productions:
180 rhslen = len(production.rhs())
181
182
183 if self._match_rhs(production.rhs(), stack[-rhslen:]):
184
185
186 tree = Tree(production.lhs().symbol(), stack[-rhslen:])
187 stack[-rhslen:] = [tree]
188
189
190 if self._trace:
191 self._trace_reduce(stack, production, remaining_text)
192 return production
193
194
195 return None
196
197 - def trace(self, trace=2):
198 """
199 Set the level of tracing output that should be generated when
200 parsing a text.
201
202 @type trace: C{int}
203 @param trace: The trace level. A trace level of C{0} will
204 generate no tracing output; and higher trace levels will
205 produce more verbose tracing output.
206 @rtype: C{None}
207 """
208
209
210
211 self._trace = trace
212
214 """
215 Print trace output displaying the given stack and text.
216
217 @rtype: C{None}
218 @param marker: A character that is printed to the left of the
219 stack. This is used with trace level 2 to print 'S'
220 before shifted stacks and 'R' before reduced stacks.
221 """
222 str = ' '+marker+' [ '
223 for elt in stack:
224 if isinstance(elt, Tree):
225 str += `cfg.Nonterminal(elt.node)` + ' '
226 else:
227 str += `elt` + ' '
228 str += '* ' + string.join(remaining_text) + ']'
229 print str
230
240
253
255 """
256 Check to make sure that all of the CFG productions are
257 potentially useful. If any productions can never be used,
258 then print a warning.
259
260 @rtype: C{None}
261 """
262 productions = self._grammar.productions()
263
264
265
266 for i in range(len(productions)):
267 for j in range(i+1, len(productions)):
268 rhs1 = productions[i].rhs()
269 rhs2 = productions[j].rhs()
270 if rhs1[:len(rhs2)] == rhs2:
271 print 'Warning: %r will never be used' % productions[i]
272
273
274
275
277 """
278 A C{ShiftReduceParser} that allows you to setp through the parsing
279 process, performing a single operation at a time. It also allows
280 you to change the parser's grammar midway through parsing a text.
281
282 The C{initialize} method is used to start parsing a text.
283 C{shift} performs a single shift operation, and C{reduce} performs
284 a single reduce operation. C{step} will perform a single reduce
285 operation if possible; otherwise, it will perform a single shift
286 operation. C{parses} returns the set of parses that have been
287 found by the parser.
288
289 @ivar _history: A list of C{(stack, remaining_text)} pairs,
290 containing all of the previous states of the parser. This
291 history is used to implement the C{undo} operation.
292 @see: C{nltk.cfg}
293 """
295 self._grammar = grammar
296 self._trace = trace
297 self._stack = None
298 self._remaining_text = None
299 self._history = []
300
307
309 """
310 @return: The parser's stack.
311 @rtype: C{list} of C{String} and C{Tree}
312 """
313 return self._stack
314
315 - def remaining_text(self):
316 """
317 @return: The portion of the text that is not yet covered by the
318 stack.
319 @rtype: C{list} of C{String}
320 """
321 return self._remaining_text
322
324 """
325 Start parsing a given text. This sets the parser's stack to
326 C{[]} and sets its remaining text to C{tokens}.
327 """
328 self._stack = []
329 self._remaining_text = tokens
330 self._history = []
331
333 """
334 Perform a single parsing operation. If a reduction is
335 possible, then perform that reduction, and return the
336 production that it is based on. Otherwise, if a shift is
337 possible, then perform it, and return 1. Otherwise,
338 return 0.
339
340 @return: 0 if no operation was performed; 1 if a shift was
341 performed; and the CFG production used to reduce if a
342 reduction was performed.
343 @rtype: C{Production} or C{boolean}
344 """
345 return self.reduce() or self.shift()
346
348 """
349 Move a token from the beginning of the remaining text to the
350 end of the stack. If there are no more tokens in the
351 remaining text, then do nothing.
352
353 @return: True if the shift operation was successful.
354 @rtype: C{boolean}
355 """
356 if len(self._remaining_text) == 0: return 0
357 self._history.append( (self._stack[:], self._remaining_text[:]) )
358 self._shift(self._stack, self._remaining_text)
359 return 1
360
361 - def reduce(self, production=None):
362 """
363 Use C{production} to combine the rightmost stack elements into
364 a single C{Tree}. If C{production} does not match the
365 rightmost stack elements, then do nothing.
366
367 @return: The production used to reduce the stack, if a
368 reduction was performed. If no reduction was performed,
369 return C{None}.
370
371 @rtype: C{Production} or C{None}
372 """
373 self._history.append( (self._stack[:], self._remaining_text[:]) )
374 return_val = self._reduce(self._stack, self._remaining_text,
375 production)
376
377 if not return_val: self._history.pop()
378 return return_val
379
381 """
382 Return the parser to its state before the most recent
383 shift or reduce operation. Calling C{undo} repeatedly return
384 the parser to successively earlier states. If no shift or
385 reduce operations have been performed, C{undo} will make no
386 changes.
387
388 @return: true if an operation was successfully undone.
389 @rtype: C{boolean}
390 """
391 if len(self._history) == 0: return 0
392 (self._stack, self._remaining_text) = self._history.pop()
393 return 1
394
396 """
397 @return: A list of the productions for which reductions are
398 available for the current parser state.
399 @rtype: C{list} of C{Production}
400 """
401 productions = []
402 for production in self._grammar.productions():
403 rhslen = len(production.rhs())
404 if self._match_rhs(production.rhs(), self._stack[-rhslen:]):
405 productions.append(production)
406 return productions
407
409 """
410 @return: A list of the parses that have been found by this
411 parser so far.
412 @rtype: C{list} of C{Tree}
413 """
414 if len(self._remaining_text) != 0: return []
415 if len(self._stack) != 1: return []
416 if self._stack[0].node != self._grammar.start().symbol():
417 return []
418 return self._stack
419
420
421
423 """
424 Change the grammar used to parse texts.
425
426 @param grammar: The new grammar.
427 @type grammar: C{CFG}
428 """
429 self._grammar = grammar
430
431
432
433
434
436 """
437 A demonstration of the shift-reduce parser.
438 """
439
440 from nltk import parse, cfg
441
442 grammar = cfg.parse_cfg("""
443 S -> NP VP
444 NP -> Det N | Det N PP
445 VP -> V NP | V NP PP
446 PP -> P NP
447 NP -> 'I'
448 N -> 'man' | 'park' | 'telescope' | 'dog'
449 Det -> 'the' | 'a'
450 P -> 'in' | 'with'
451 V -> 'saw'
452 """)
453
454 sent = 'I saw a man in the park'.split()
455
456 parser = parse.ShiftReduceParser(grammar, trace=2)
457 for p in parser.nbest_parse(sent):
458 print p
459
460 if __name__ == '__main__': demo()
461