nltk.parse.sr

1 # Natural Language Toolkit: Shift-Reduce Parser 2 # 3 # Copyright (C) 2001-2008 NLTK Project 4 # Author: Edward Loper <[email protected]> 5 # Steven Bird <[email protected]> 6 # URL: <http://nltk.org> 7 # For license information, see LICENSE.TXT 8 9 import string 10 11 from nltk import tokenize, cfg 12 from nltk.tree import Tree 13 14 from api import * 15 16 ##////////////////////////////////////////////////////// 17 ## Shift/Reduce Parser 18 ##//////////////////////////////////////////////////////

19 -class ShiftReduceParser(ParserI):

20 """ 21 A simple bottom-up CFG parser that uses two operations, "shift" 22 and "reduce", to find a single parse for a text. 23 24 C{ShiftReduceParser} maintains a stack, which records the 25 structure of a portion of the text. This stack is a list of 26 C{String}s and C{Tree}s that collectively cover a portion of 27 the text. For example, while parsing the sentence "the dog saw 28 the man" with a typical grammar, C{ShiftReduceParser} will produce 29 the following stack, which covers "the dog saw":: 30 31 [(NP: (Det: 'the') (N: 'dog')), (V: 'saw')] 32 33 C{ShiftReduceParser} attempts to extend the stack to cover the 34 entire text, and to combine the stack elements into a single tree, 35 producing a complete parse for the sentence. 36 37 Initially, the stack is empty. It is extended to cover the text, 38 from left to right, by repeatedly applying two operations: 39 40 - X{shift} moves a token from the beginning of the text to the 41 end of the stack. 42 - X{reduce} uses a CFG production to combine the rightmost stack 43 elements into a single C{Tree}. 44 45 Often, more than one operation can be performed on a given stack. 46 In this case, C{ShiftReduceParser} uses the following heuristics 47 to decide which operation to perform: 48 49 - Only shift if no reductions are available. 50 - If multiple reductions are available, then apply the reduction 51 whose CFG production is listed earliest in the grammar. 52 53 Note that these heuristics are not guaranteed to choose an 54 operation that leads to a parse of the text. Also, if multiple 55 parses exists, C{ShiftReduceParser} will return at most one of 56 them. 57 58 @see: C{nltk.cfg} 59 """

60 - def __init__(self, grammar, trace=0):

61 """ 62 Create a new C{ShiftReduceParser}, that uses C{grammar} to 63 parse texts. 64 65 @type grammar: C{Grammar} 66 @param grammar: The grammar used to parse texts. 67 @type trace: C{int} 68 @param trace: The level of tracing that should be used when 69 parsing a text. C{0} will generate no tracing output; 70 and higher numbers will produce more verbose tracing 71 output. 72 """ 73 self._grammar = grammar 74 self._trace = trace 75 self._check_grammar()

76

77 - def grammar(self):

78 return self._grammar

79

80 - def parse(self, tokens):

81 tokens = list(tokens) 82 self._grammar.check_coverage(tokens) 83 84 # initialize the stack. 85 stack = [] 86 remaining_text = tokens 87 88 # Trace output. 89 if self._trace: 90 print 'Parsing %r' % string.join(tokens) 91 self._trace_stack(stack, remaining_text) 92 93 # iterate through the text, pushing the token onto 94 # the stack, then reducing the stack. 95 while len(remaining_text) > 0: 96 self._shift(stack, remaining_text) 97 while self._reduce(stack, remaining_text): pass 98 99 # Did we reduce everything? 100 if len(stack) != 1: return None 101 102 # Did we end up with the right category? 103 if stack[0].node != self._grammar.start().symbol(): 104 return None 105 106 # We parsed successfully! 107 return stack[0]

108

109 - def _shift(self, stack, remaining_text):

110 """ 111 Move a token from the beginning of C{remaining_text} to the 112 end of C{stack}. 113 114 @type stack: C{list} of C{String} and C{Tree} 115 @param stack: A list of C{String}s and C{Tree}s, encoding 116 the structure of the text that has been parsed so far. 117 @type remaining_text: C{list} of C{String} 118 @param remaining_text: The portion of the text that is not yet 119 covered by C{stack}. 120 @rtype: C{None} 121 """ 122 stack.append(remaining_text[0]) 123 remaining_text.remove(remaining_text[0]) 124 if self._trace: self._trace_shift(stack, remaining_text)

125

126 - def _match_rhs(self, rhs, rightmost_stack):

127 """ 128 @rtype: C{boolean} 129 @return: true if the right hand side of a CFG production 130 matches the rightmost elements of the stack. C{rhs} 131 matches C{rightmost_stack} if they are the same length, 132 and each element of C{rhs} matches the corresponding 133 element of C{rightmost_stack}. A nonterminal element of 134 C{rhs} matches any C{Tree} whose node value is equal 135 to the nonterminal's symbol. A terminal element of C{rhs} 136 matches any C{String} whose type is equal to the terminal. 137 @type rhs: C{list} of (terminal and C{Nonterminal}) 138 @param rhs: The right hand side of a CFG production. 139 @type rightmost_stack: C{list} of (C{String} and C{Tree}) 140 @param rightmost_stack: The rightmost elements of the parser's 141 stack. 142 """ 143 144 if len(rightmost_stack) != len(rhs): return 0 145 for i in range(len(rightmost_stack)): 146 if isinstance(rightmost_stack[i], Tree): 147 if not isinstance(rhs[i], cfg.Nonterminal): return 0 148 if rightmost_stack[i].node != rhs[i].symbol(): return 0 149 else: 150 if isinstance(rhs[i], cfg.Nonterminal): return 0 151 if rightmost_stack[i] != rhs[i]: return 0 152 return 1

153

154 - def _reduce(self, stack, remaining_text, production=None):

155 """ 156 Find a CFG production whose right hand side matches the 157 rightmost stack elements; and combine those stack elements 158 into a single C{Tree}, with the node specified by the 159 production's left-hand side. If more than one CFG production 160 matches the stack, then use the production that is listed 161 earliest in the grammar. The new C{Tree} replaces the 162 elements in the stack. 163 164 @rtype: C{Production} or C{None} 165 @return: If a reduction is performed, then return the CFG 166 production that the reduction is based on; otherwise, 167 return false. 168 @type stack: C{list} of C{String} and C{Tree} 169 @param stack: A list of C{String}s and C{Tree}s, encoding 170 the structure of the text that has been parsed so far. 171 @type remaining_text: C{list} of C{String} 172 @param remaining_text: The portion of the text that is not yet 173 covered by C{stack}. 174 """ 175 if production is None: productions = self._grammar.productions() 176 else: productions = [production] 177 178 # Try each production, in order. 179 for production in productions: 180 rhslen = len(production.rhs()) 181 182 # check if the RHS of a production matches the top of the stack 183 if self._match_rhs(production.rhs(), stack[-rhslen:]): 184 185 # combine the tree to reflect the reduction 186 tree = Tree(production.lhs().symbol(), stack[-rhslen:]) 187 stack[-rhslen:] = [tree] 188 189 # We reduced something 190 if self._trace: 191 self._trace_reduce(stack, production, remaining_text) 192 return production 193 194 # We didn't reduce anything 195 return None

196

197 - def trace(self, trace=2):

198 """ 199 Set the level of tracing output that should be generated when 200 parsing a text. 201 202 @type trace: C{int} 203 @param trace: The trace level. A trace level of C{0} will 204 generate no tracing output; and higher trace levels will 205 produce more verbose tracing output. 206 @rtype: C{None} 207 """ 208 # 1: just show shifts. 209 # 2: show shifts & reduces 210 # 3: display which tokens & productions are shifed/reduced 211 self._trace = trace

212

213 - def _trace_stack(self, stack, remaining_text, marker=' '):

214 """ 215 Print trace output displaying the given stack and text. 216 217 @rtype: C{None} 218 @param marker: A character that is printed to the left of the 219 stack. This is used with trace level 2 to print 'S' 220 before shifted stacks and 'R' before reduced stacks. 221 """ 222 str = ' '+marker+' [ ' 223 for elt in stack: 224 if isinstance(elt, Tree): 225 str += `cfg.Nonterminal(elt.node)` + ' ' 226 else: 227 str += `elt` + ' ' 228 str += '* ' + string.join(remaining_text) + ']' 229 print str

230

231 - def _trace_shift(self, stack, remaining_text):

232 """ 233 Print trace output displaying that a token has been shifted. 234 235 @rtype: C{None} 236 """ 237 if self._trace > 2: print 'Shift %r:' % stack[-1] 238 if self._trace == 2: self._trace_stack(stack, remaining_text, 'S') 239 elif self._trace > 0: self._trace_stack(stack, remaining_text)

240

241 - def _trace_reduce(self, stack, production, remaining_text):

242 """ 243 Print trace output displaying that C{production} was used to 244 reduce C{stack}. 245 246 @rtype: C{None} 247 """ 248 if self._trace > 2: 249 rhs = string.join(production.rhs()) 250 print 'Reduce %r <- %s' % (production.lhs(), rhs) 251 if self._trace == 2: self._trace_stack(stack, remaining_text, 'R') 252 elif self._trace > 1: self._trace_stack(stack, remaining_text)

253

254 - def _check_grammar(self):

255 """ 256 Check to make sure that all of the CFG productions are 257 potentially useful. If any productions can never be used, 258 then print a warning. 259 260 @rtype: C{None} 261 """ 262 productions = self._grammar.productions() 263 264 # Any production whose RHS is an extension of another production's RHS 265 # will never be used. 266 for i in range(len(productions)): 267 for j in range(i+1, len(productions)): 268 rhs1 = productions[i].rhs() 269 rhs2 = productions[j].rhs() 270 if rhs1[:len(rhs2)] == rhs2: 271 print 'Warning: %r will never be used' % productions[i]

272 273 ##////////////////////////////////////////////////////// 274 ## Stepping Shift/Reduce Parser 275 ##//////////////////////////////////////////////////////

276 -class SteppingShiftReduceParser(ShiftReduceParser):

277 """ 278 A C{ShiftReduceParser} that allows you to setp through the parsing 279 process, performing a single operation at a time. It also allows 280 you to change the parser's grammar midway through parsing a text. 281 282 The C{initialize} method is used to start parsing a text. 283 C{shift} performs a single shift operation, and C{reduce} performs 284 a single reduce operation. C{step} will perform a single reduce 285 operation if possible; otherwise, it will perform a single shift 286 operation. C{parses} returns the set of parses that have been 287 found by the parser. 288 289 @ivar _history: A list of C{(stack, remaining_text)} pairs, 290 containing all of the previous states of the parser. This 291 history is used to implement the C{undo} operation. 292 @see: C{nltk.cfg} 293 """

294 - def __init__(self, grammar, trace=0):

295 self._grammar = grammar 296 self._trace = trace 297 self._stack = None 298 self._remaining_text = None 299 self._history = []

300

301 - def nbest_parse(self, tokens, n=None):

302 tokens = list(tokens) 303 self.initialize(tokens) 304 while self.step(): pass 305 306 return self.parses()[:n]

307

308 - def stack(self):

309 """ 310 @return: The parser's stack. 311 @rtype: C{list} of C{String} and C{Tree} 312 """ 313 return self._stack

314

315 - def remaining_text(self):

316 """ 317 @return: The portion of the text that is not yet covered by the 318 stack. 319 @rtype: C{list} of C{String} 320 """ 321 return self._remaining_text

322

323 - def initialize(self, tokens):

324 """ 325 Start parsing a given text. This sets the parser's stack to 326 C{[]} and sets its remaining text to C{tokens}. 327 """ 328 self._stack = [] 329 self._remaining_text = tokens 330 self._history = []

331

332 - def step(self):

333 """ 334 Perform a single parsing operation. If a reduction is 335 possible, then perform that reduction, and return the 336 production that it is based on. Otherwise, if a shift is 337 possible, then perform it, and return 1. Otherwise, 338 return 0. 339 340 @return: 0 if no operation was performed; 1 if a shift was 341 performed; and the CFG production used to reduce if a 342 reduction was performed. 343 @rtype: C{Production} or C{boolean} 344 """ 345 return self.reduce() or self.shift()

346

347 - def shift(self):

348 """ 349 Move a token from the beginning of the remaining text to the 350 end of the stack. If there are no more tokens in the 351 remaining text, then do nothing. 352 353 @return: True if the shift operation was successful. 354 @rtype: C{boolean} 355 """ 356 if len(self._remaining_text) == 0: return 0 357 self._history.append( (self._stack[:], self._remaining_text[:]) ) 358 self._shift(self._stack, self._remaining_text) 359 return 1

360

361 - def reduce(self, production=None):

362 """ 363 Use C{production} to combine the rightmost stack elements into 364 a single C{Tree}. If C{production} does not match the 365 rightmost stack elements, then do nothing. 366 367 @return: The production used to reduce the stack, if a 368 reduction was performed. If no reduction was performed, 369 return C{None}. 370 371 @rtype: C{Production} or C{None} 372 """ 373 self._history.append( (self._stack[:], self._remaining_text[:]) ) 374 return_val = self._reduce(self._stack, self._remaining_text, 375 production) 376 377 if not return_val: self._history.pop() 378 return return_val

379

380 - def undo(self):

381 """ 382 Return the parser to its state before the most recent 383 shift or reduce operation. Calling C{undo} repeatedly return 384 the parser to successively earlier states. If no shift or 385 reduce operations have been performed, C{undo} will make no 386 changes. 387 388 @return: true if an operation was successfully undone. 389 @rtype: C{boolean} 390 """ 391 if len(self._history) == 0: return 0 392 (self._stack, self._remaining_text) = self._history.pop() 393 return 1

394

395 - def reducible_productions(self):

396 """ 397 @return: A list of the productions for which reductions are 398 available for the current parser state. 399 @rtype: C{list} of C{Production} 400 """ 401 productions = [] 402 for production in self._grammar.productions(): 403 rhslen = len(production.rhs()) 404 if self._match_rhs(production.rhs(), self._stack[-rhslen:]): 405 productions.append(production) 406 return productions

407

408 - def parses(self):

409 """ 410 @return: A list of the parses that have been found by this 411 parser so far. 412 @rtype: C{list} of C{Tree} 413 """ 414 if len(self._remaining_text) != 0: return [] 415 if len(self._stack) != 1: return [] 416 if self._stack[0].node != self._grammar.start().symbol(): 417 return [] 418 return self._stack

419 420 # copied from nltk.parser 421

422 - def set_grammar(self, grammar):

423 """ 424 Change the grammar used to parse texts. 425 426 @param grammar: The new grammar. 427 @type grammar: C{CFG} 428 """ 429 self._grammar = grammar

430 431 ##////////////////////////////////////////////////////// 432 ## Demonstration Code 433 ##////////////////////////////////////////////////////// 434

435 -def demo():

436 """ 437 A demonstration of the shift-reduce parser. 438 """ 439 440 from nltk import parse, cfg 441 442 grammar = cfg.parse_cfg(""" 443 S -> NP VP 444 NP -> Det N | Det N PP 445 VP -> V NP | V NP PP 446 PP -> P NP 447 NP -> 'I' 448 N -> 'man' | 'park' | 'telescope' | 'dog' 449 Det -> 'the' | 'a' 450 P -> 'in' | 'with' 451 V -> 'saw' 452 """) 453 454 sent = 'I saw a man in the park'.split() 455 456 parser = parse.ShiftReduceParser(grammar, trace=2) 457 for p in parser.nbest_parse(sent): 458 print p

459 460 if __name__ == '__main__': demo() 461

Source Code for Module nltk.parse.sr