"""
Classes and interfaces for associating probabilities with tree
structures that represent the internal organization of a text.  The
probabilistic parser module defines C{BottomUpChartParser}.

C{BottomUpChartParser} is an abstract class that implements a
bottom-up chart parser for C{PCFG}s.  It maintains a queue of edges,
and adds them to the chart one at a time.  The ordering of this queue
is based on the probabilities associated with the edges, allowing the
parser to expand more likely edges before less likely ones.  Each
subclass implements a different queue ordering, producing different
search strategies.  Currently the following subclasses are defined:

  - C{InsideChartParser} searches edges in decreasing order of
    their trees' inside probabilities.
  - C{RandomChartParser} searches edges in random order.
  - C{UnsortedChartParser} searches edges in whatever order they are
    added to the queue.
  - C{LongestChartParser} searches edges in decreasing order of their
    location's length.

The C{BottomUpChartParser} constructor has an optional argument
C{beam_size}.  If non-zero, this controls the size of the beam (aka
the edge queue).  This option is most useful with C{InsideChartParser}.
A short usage sketch is given in the comments immediately below.
"""
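# A minimal usage sketch, mirroring the demo() function at the end of this
# module.  The grammar ``cfg.toy_pcfg1`` and the import paths are taken from
# that demo and may differ in other NLTK versions; beam_size=20 is just an
# illustrative value.
#
#     >>> from nltk import cfg
#     >>> from nltk.parse import pchart
#     >>> parser = pchart.InsideChartParser(cfg.toy_pcfg1, beam_size=20)
#     >>> for parse in parser.nbest_parse('I saw John with my telescope'.split()):
#     ...     print parse.prob(), parse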
from nltk.tree import Tree, ProbabilisticTree
from nltk import Nonterminal

from api import *
from chart import Chart, LeafEdge, TreeEdge, AbstractChartRule

class ProbabilisticLeafEdge(LeafEdge):
    def prob(self): return 1.0

# (The definition of ProbabilisticFundamentalRule, which the single-edge
# rule below delegates to, is omitted from this excerpt.)

class SingleEdgeProbabilisticFundamentalRule(AbstractChartRule):
    NUM_EDGES = 1

    _fundamental_rule = ProbabilisticFundamentalRule()

    def apply_iter(self, chart, grammar, edge1):
        fr = self._fundamental_rule
        if edge1.is_incomplete():
            # edge1 still needs a constituent: combine it with complete
            # edges that start where edge1 ends.
            for edge2 in chart.select(start=edge1.end(), is_complete=True,
                                      lhs=edge1.next()):
                for new_edge in fr.apply_iter(chart, grammar, edge1, edge2):
                    yield new_edge
        else:
            # edge1 is complete: combine it with incomplete edges that end
            # where edge1 starts and expect a constituent of edge1's type.
            for edge2 in chart.select(end=edge1.start(), is_complete=False,
                                      next=edge1.lhs()):
                for new_edge in fr.apply_iter(chart, grammar, edge2, edge1):
                    yield new_edge

    def __str__(self): return 'Fundamental Rule'

class BottomUpChartParser(ParserI):
    """
    An abstract bottom-up parser for C{PCFG}s that uses a C{Chart} to
    record partial results.  C{BottomUpChartParser} maintains a
    queue of edges that can be added to the chart.  This queue is
    initialized with edges for each token in the text that is being
    parsed.  C{BottomUpChartParser} inserts these edges into the
    chart one at a time, starting with the most likely edges, and
    proceeding to less likely edges.  For each edge that is added to
    the chart, it may become possible to insert additional edges into
    the chart; these are added to the queue.  This process continues
    until enough complete parses have been generated, or until the
    queue is empty.

    The sorting order for the queue is not specified by
    C{BottomUpChartParser}.  Different sorting orders will result
    in different search strategies.  The sorting order for the queue
    is defined by the method C{sort_queue}; subclasses are required
    to provide a definition for this method.

    @type _grammar: C{PCFG}
    @ivar _grammar: The grammar used to parse sentences.
    @type _trace: C{int}
    @ivar _trace: The level of tracing output that should be generated
        when parsing a text.
    """
    def __init__(self, grammar, beam_size=0, trace=0):
        """
        Create a new C{BottomUpChartParser} that uses C{grammar}
        to parse texts.

        @type grammar: C{PCFG}
        @param grammar: The grammar used to parse texts.
        @type beam_size: C{int}
        @param beam_size: The maximum length for the parser's edge
            queue.  If zero, the queue length is unbounded.
        @type trace: C{int}
        @param trace: The level of tracing that should be used when
            parsing a text.  C{0} will generate no tracing output;
            higher numbers will produce more verbose tracing output.
        """
        self._grammar = grammar
        self.beam_size = beam_size
        self._trace = trace

    def trace(self, trace=2):
        """
        Set the level of tracing output that should be generated when
        parsing a text.

        @type trace: C{int}
        @param trace: The trace level.  A trace level of C{0} will
            generate no tracing output; and higher trace levels will
            produce more verbose tracing output.
        @rtype: C{None}
        """
        self._trace = trace

    def nbest_parse(self, tokens, n=None):
        self._grammar.check_coverage(tokens)
        chart = Chart(list(tokens))
        grammar = self._grammar

        # Chart parser rules.
        bu_init = BottomUpInitRule()
        bu = BottomUpPredictRule()
        fr = SingleEdgeProbabilisticFundamentalRule()

        # The queue of edges that have not yet been added to the chart.
        queue = []

        # Initialize the chart with an edge for each token.
        for e in bu_init.apply_iter(chart, grammar):
            if self._trace > 1:
                print '  %-50s [%s]' % (chart.pp_edge(e, width=2), e.prob())
            queue.append(e)

        while len(queue) > 0:
            # Re-sort the queue.
            self.sort_queue(queue, chart)

            # Prune the queue to the correct size if a beam was defined.
            if self.beam_size:
                self._prune(queue, chart)

            # Get the best edge, and add it to the chart.
            edge = queue.pop()
            if self._trace > 0:
                print '  %-50s [%s]' % (chart.pp_edge(edge, width=2),
                                        edge.prob())

            # Apply all applicable rules to the new edge.
            queue.extend(bu.apply(chart, grammar, edge))
            queue.extend(fr.apply(chart, grammar, edge))

        # Get a list of complete parses.
        parses = chart.parses(grammar.start(), ProbabilisticTree)

        # Assign probabilities to the trees.
        prod_probs = {}
        for prod in grammar.productions():
            prod_probs[prod.lhs(), prod.rhs()] = prod.prob()
        for parse in parses:
            self._setprob(parse, prod_probs)

        # Sort the parses by probability, most probable first.
        parses.sort(lambda a, b: cmp(b.prob(), a.prob()))

        return parses[:n]

    # (The helper _setprob(), used above to assign probabilities to the
    # recovered parse trees, is omitted from this excerpt.)

    def sort_queue(self, queue, chart):
        """
        Sort the given queue of C{Edge}s, placing the edge that should
        be tried first at the end of the queue (C{nbest_parse} pops
        edges off the end of the queue).  This method will be called
        after each C{Edge} is added to the queue.

        @param queue: The queue of C{Edge}s to sort.  Each edge in
            this queue is an edge that could be added to the chart by
            the fundamental rule, but that has not yet been added.
        @type queue: C{list} of C{Edge}
        @param chart: The chart being used to parse the text.  This
            chart can be used to provide extra information for sorting
            the queue.
        @type chart: C{Chart}
        @rtype: C{None}
        """
        raise AssertionError("BottomUpChartParser is an abstract class")

    def _prune(self, queue, chart):
        """Discard items in the queue if the queue is longer than the beam."""
        if len(queue) > self.beam_size:
            split = len(queue) - self.beam_size
            if self._trace > 2:
                for edge in queue[:split]:
                    print '  %-50s [DISCARDED]' % chart.pp_edge(edge, 2)
            del queue[:split]
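    # A worked example of the pruning arithmetic (illustrative numbers only):
    # with beam_size=3 and a sorted queue of 5 edges, split = 5 - 3 = 2, so
    # the two lowest-ranked edges (queue[:2]) are discarded and the three
    # best-ranked edges are kept.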

class InsideChartParser(BottomUpChartParser):
    """
    A bottom-up parser for C{PCFG}s that tries edges in descending
    order of the inside probabilities of their trees.  The X{inside
    probability} of a tree is simply the probability of the entire
    tree, ignoring its context.  In particular, the inside probability
    of a tree generated by production M{p} with children M{c[1]},
    M{c[2]}, ..., M{c[n]} is
    P(M{p})*P(M{c[1]})*P(M{c[2]})*M{...}*P(M{c[n]}); and the inside
    probability of a token is 1 if it is present in the text, and 0 if
    it is absent.

    This sorting order results in a type of lowest-cost-first search
    strategy.
    """
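    # A worked example of the inside probability (illustrative numbers only):
    # if the production NP -> Det N has probability 0.25, and the Det and N
    # subtrees have inside probabilities 0.5 and 0.1, then the NP tree's
    # inside probability is 0.25 * 0.5 * 0.1 = 0.0125.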

    def sort_queue(self, queue, chart):
        """
        Sort the given queue of edges so that edges with higher inside
        probabilities are placed toward the end of the queue (and are
        therefore tried first).

        @param queue: The queue of C{Edge}s to sort.  Each edge in
            this queue is an edge that could be added to the chart by
            the fundamental rule, but that has not yet been added.
        @type queue: C{list} of C{Edge}
        @param chart: The chart being used to parse the text.  This
            chart can be used to provide extra information for sorting
            the queue.
        @type chart: C{Chart}
        @rtype: C{None}
        """
        # Ascending sort by probability; nbest_parse() pops edges off the
        # end of the queue, so the most probable edge is tried first.
        queue.sort(lambda e1, e2: cmp(e1.prob(), e2.prob()))

import random
class RandomChartParser(BottomUpChartParser):
    """
    A bottom-up parser for C{PCFG}s that tries edges in random order.
    This sorting order results in a random search strategy.
    """
    def sort_queue(self, queue, chart):
        # Swap a randomly chosen edge into the last position, where
        # nbest_parse() will pop it next.
        i = random.randint(0, len(queue)-1)
        (queue[-1], queue[i]) = (queue[i], queue[-1])

class UnsortedChartParser(BottomUpChartParser):
    """
    A bottom-up parser for C{PCFG}s that tries edges in whatever order
    they were added to the queue.
    """
    # No sorting is needed: edges are popped in the order they were queued.
    def sort_queue(self, queue, chart): return

class LongestChartParser(BottomUpChartParser):
    """
    A bottom-up parser for C{PCFG}s that tries longer edges before
    shorter ones.  This sorting order results in a type of best-first
    search strategy.
    """
    def sort_queue(self, queue, chart):
        # Ascending sort by edge length; nbest_parse() pops edges off the
        # end of the queue, so the longest edge is tried first.
        queue.sort(lambda e1, e2: cmp(e1.length(), e2.length()))


def demo():
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run; then each parser is run on
    the same demo, and a summary of the results is displayed.
    """
    import sys, time
    from nltk import tokenize, cfg
    from nltk.parse import pchart

    # Two demonstration sentences, each paired with a toy grammar.
    demos = [('I saw John with my telescope', cfg.toy_pcfg1),
             ('the boy saw Jack with Bob under the table with a telescope',
              cfg.toy_pcfg2)]

    # Ask the user which demo they want to use.
    print
    for i in range(len(demos)):
        print '%3s: %s' % (i+1, demos[i][0])
        print '     %r' % demos[i][1]
        print
    print 'Which demo (%d-%d)? ' % (1, len(demos)),
    try:
        snum = int(sys.stdin.readline().strip())-1
        sent, grammar = demos[snum]
    except:
        print 'Bad sentence number'
        return

    tokens = sent.split()

    # The parsers to compare; the last one uses a beam to bound the queue.
    parsers = [
        pchart.InsideChartParser(grammar),
        pchart.RandomChartParser(grammar),
        pchart.UnsortedChartParser(grammar),
        pchart.LongestChartParser(grammar),
        pchart.InsideChartParser(grammar, beam_size=len(tokens)+1)
        ]

    # Run the parsers and record the time, number of parses, and the
    # average parse probability for each one.
    times = []
    average_p = []
    num_parses = []
    all_parses = {}
    for parser in parsers:
        print '\ns: %s\nparser: %s\ngrammar: %s' % (sent, parser, grammar)
        parser.trace(3)
        t = time.time()
        parses = parser.nbest_parse(tokens)
        times.append(time.time()-t)
        if parses: p = reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses)
        else: p = 0
        average_p.append(p)
        num_parses.append(len(parses))
        for p in parses: all_parses[p.freeze()] = 1

    # Print summary statistics.
    print
    print '       Parser      Beam | Time (secs)   # Parses   Average P(parse)'
    print '------------------------+------------------------------------------'
    for i in range(len(parsers)):
        print '%18s %4d |%11.4f%11d%19.14f' % (parsers[i].__class__.__name__,
                                               parsers[i].beam_size,
                                               times[i], num_parses[i],
                                               average_p[i])
    parses = all_parses.keys()
    if parses: p = reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses)
    else: p = 0
    print '------------------------+------------------------------------------'
    print '%18s      |%11s%11d%19.14f' % ('(All Parses)', 'n/a', len(parses), p)

    # Ask the user if we should draw the parses.
    print
    print 'Draw parses (y/n)? ',
    if sys.stdin.readline().strip().lower().startswith('y'):
        from nltk.draw.tree import draw_trees
        print '  please wait...'
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print
    print 'Print parses (y/n)? ',
    if sys.stdin.readline().strip().lower().startswith('y'):
        for parse in parses:
            print parse

if __name__ == '__main__':
    demo()