"""
Classes and interfaces for associating probabilities with tree
structures that represent the internal organization of a text.  The
probabilistic parser module defines C{BottomUpChartParser}.

C{BottomUpChartParser} is an abstract class that implements a
bottom-up chart parser for C{PCFG}s.  It maintains a queue of edges,
and adds them to the chart one at a time.  The ordering of this queue
is based on the probabilities associated with the edges, allowing the
parser to expand more likely edges before less likely ones.  Each
subclass implements a different queue ordering, producing different
search strategies.  Currently the following subclasses are defined:

  - C{InsideChartParser} searches edges in decreasing order of
    their trees' inside probabilities.
  - C{RandomChartParser} searches edges in random order.
  - C{UnsortedChartParser} searches edges in whatever order they are
    added to the queue.
  - C{LongestChartParser} searches edges in decreasing order of their
    location's length.

The C{BottomUpChartParser} constructor has an optional argument
C{beam_size}.  If non-zero, this controls the size of the beam (aka
the edge queue).  This option is most useful with C{InsideChartParser}.
A short usage sketch is given in the comments immediately below.
"""
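# A minimal usage sketch, mirroring the demo() function at the end of this
# module.  The grammar ``cfg.toy_pcfg1`` and the import paths are taken from
# that demo and may differ in other NLTK versions; beam_size=20 is just an
# illustrative value.
#
#     >>> from nltk import cfg
#     >>> from nltk.parse import pchart
#     >>> parser = pchart.InsideChartParser(cfg.toy_pcfg1, beam_size=20)
#     >>> for parse in parser.nbest_parse('I saw John with my telescope'.split()):
#     ...     print parse.prob(), parse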
from nltk.tree import Tree, ProbabilisticTree
from nltk import Nonterminal

from api import *
from chart import Chart, LeafEdge, TreeEdge, AbstractChartRule

class ProbabilisticLeafEdge(LeafEdge):
    def prob(self): return 1.0

# (The definition of ProbabilisticFundamentalRule, which the single-edge
# rule below delegates to, is omitted from this excerpt.)

class SingleEdgeProbabilisticFundamentalRule(AbstractChartRule):
    NUM_EDGES = 1

    _fundamental_rule = ProbabilisticFundamentalRule()

    def apply_iter(self, chart, grammar, edge1):
        fr = self._fundamental_rule
        if edge1.is_incomplete():
            # edge1 still needs a constituent: combine it with complete
            # edges that start where edge1 ends.
            for edge2 in chart.select(start=edge1.end(), is_complete=True,
                                      lhs=edge1.next()):
                for new_edge in fr.apply_iter(chart, grammar, edge1, edge2):
                    yield new_edge
        else:
            # edge1 is complete: combine it with incomplete edges that end
            # where edge1 starts and expect a constituent of edge1's type.
            for edge2 in chart.select(end=edge1.start(), is_complete=False,
                                      next=edge1.lhs()):
                for new_edge in fr.apply_iter(chart, grammar, edge2, edge1):
                    yield new_edge

    def __str__(self): return 'Fundamental Rule'

class BottomUpChartParser(ParserI):
    """
    An abstract bottom-up parser for C{PCFG}s that uses a C{Chart} to
    record partial results.  C{BottomUpChartParser} maintains a
    queue of edges that can be added to the chart.  This queue is
    initialized with edges for each token in the text that is being
    parsed.  C{BottomUpChartParser} inserts these edges into the
    chart one at a time, starting with the most likely edges, and
    proceeding to less likely edges.  For each edge that is added to
    the chart, it may become possible to insert additional edges into
    the chart; these are added to the queue.  This process continues
    until enough complete parses have been generated, or until the
    queue is empty.

    The sorting order for the queue is not specified by
    C{BottomUpChartParser}.  Different sorting orders will result
    in different search strategies.  The sorting order for the queue
    is defined by the method C{sort_queue}; subclasses are required
    to provide a definition for this method.

    @type _grammar: C{PCFG}
    @ivar _grammar: The grammar used to parse sentences.
    @type _trace: C{int}
    @ivar _trace: The level of tracing output that should be generated
        when parsing a text.
    """
    def __init__(self, grammar, beam_size=0, trace=0):
        """
        Create a new C{BottomUpChartParser} that uses C{grammar}
        to parse texts.

        @type grammar: C{PCFG}
        @param grammar: The grammar used to parse texts.
        @type beam_size: C{int}
        @param beam_size: The maximum length for the parser's edge
            queue.  If zero, the queue length is unbounded.
        @type trace: C{int}
        @param trace: The level of tracing that should be used when
            parsing a text.  C{0} will generate no tracing output;
            higher numbers will produce more verbose tracing output.
        """
        self._grammar = grammar
        self.beam_size = beam_size
        self._trace = trace

    def trace(self, trace=2):
        """
        Set the level of tracing output that should be generated when
        parsing a text.

        @type trace: C{int}
        @param trace: The trace level.  A trace level of C{0} will
            generate no tracing output; and higher trace levels will
            produce more verbose tracing output.
        @rtype: C{None}
        """
        self._trace = trace

    def nbest_parse(self, tokens, n=None):
        self._grammar.check_coverage(tokens)
        chart = Chart(list(tokens))
        grammar = self._grammar

        # Chart parser rules.
        bu_init = BottomUpInitRule()
        bu = BottomUpPredictRule()
        fr = SingleEdgeProbabilisticFundamentalRule()

        # The queue of edges that have not yet been added to the chart.
        queue = []

        # Initialize the chart with an edge for each token.
        for e in bu_init.apply_iter(chart, grammar):
            if self._trace > 1:
                print '  %-50s [%s]' % (chart.pp_edge(e, width=2), e.prob())
            queue.append(e)

        while len(queue) > 0:
            # Re-sort the queue.
            self.sort_queue(queue, chart)

            # Prune the queue to the correct size if a beam was defined.
            if self.beam_size:
                self._prune(queue, chart)

            # Get the best edge, and add it to the chart.
            edge = queue.pop()
            if self._trace > 0:
                print '  %-50s [%s]' % (chart.pp_edge(edge, width=2),
                                        edge.prob())

            # Apply all applicable rules to the new edge.
            queue.extend(bu.apply(chart, grammar, edge))
            queue.extend(fr.apply(chart, grammar, edge))

        # Get a list of complete parses.
        parses = chart.parses(grammar.start(), ProbabilisticTree)

        # Assign probabilities to the trees.
        prod_probs = {}
        for prod in grammar.productions():
            prod_probs[prod.lhs(), prod.rhs()] = prod.prob()
        for parse in parses:
            self._setprob(parse, prod_probs)

        # Sort the parses by probability, most probable first.
        parses.sort(lambda a, b: cmp(b.prob(), a.prob()))

        return parses[:n]

    # (The helper _setprob(), used above to assign probabilities to the
    # recovered parse trees, is omitted from this excerpt.)

    def sort_queue(self, queue, chart):
        """
        Sort the given queue of C{Edge}s, placing the edge that should
        be tried first at the end of the queue (C{nbest_parse} pops
        edges off the end of the queue).  This method will be called
        after each C{Edge} is added to the queue.

        @param queue: The queue of C{Edge}s to sort.  Each edge in
            this queue is an edge that could be added to the chart by
            the fundamental rule, but that has not yet been added.
        @type queue: C{list} of C{Edge}
        @param chart: The chart being used to parse the text.  This
            chart can be used to provide extra information for sorting
            the queue.
        @type chart: C{Chart}
        @rtype: C{None}
        """
        raise AssertionError("BottomUpChartParser is an abstract class")

    def _prune(self, queue, chart):
        """Discard items in the queue if the queue is longer than the beam."""
        if len(queue) > self.beam_size:
            split = len(queue) - self.beam_size
            if self._trace > 2:
                for edge in queue[:split]:
                    print '  %-50s [DISCARDED]' % chart.pp_edge(edge, 2)
            del queue[:split]
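    # A worked example of the pruning arithmetic (illustrative numbers only):
    # with beam_size=3 and a sorted queue of 5 edges, split = 5 - 3 = 2, so
    # the two lowest-ranked edges (queue[:2]) are discarded and the three
    # best-ranked edges are kept.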

class InsideChartParser(BottomUpChartParser):
    """
    A bottom-up parser for C{PCFG}s that tries edges in descending
    order of the inside probabilities of their trees.  The X{inside
    probability} of a tree is simply the probability of the entire
    tree, ignoring its context.  In particular, the inside probability
    of a tree generated by production M{p} with children M{c[1]},
    M{c[2]}, ..., M{c[n]} is
    P(M{p})*P(M{c[1]})*P(M{c[2]})*M{...}*P(M{c[n]}); and the inside
    probability of a token is 1 if it is present in the text, and 0 if
    it is absent.

    This sorting order results in a type of lowest-cost-first search
    strategy.
    """
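    # A worked example of the inside probability (illustrative numbers only):
    # if the production NP -> Det N has probability 0.25, and the Det and N
    # subtrees have inside probabilities 0.5 and 0.1, then the NP tree's
    # inside probability is 0.25 * 0.5 * 0.1 = 0.0125.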

    def sort_queue(self, queue, chart):
        """
        Sort the given queue of edges so that edges with higher inside
        probabilities are placed toward the end of the queue (and are
        therefore tried first).

        @param queue: The queue of C{Edge}s to sort.  Each edge in
            this queue is an edge that could be added to the chart by
            the fundamental rule, but that has not yet been added.
        @type queue: C{list} of C{Edge}
        @param chart: The chart being used to parse the text.  This
            chart can be used to provide extra information for sorting
            the queue.
        @type chart: C{Chart}
        @rtype: C{None}
        """
        # Ascending sort by probability; nbest_parse() pops edges off the
        # end of the queue, so the most probable edge is tried first.
        queue.sort(lambda e1, e2: cmp(e1.prob(), e2.prob()))

import random
class RandomChartParser(BottomUpChartParser):
    """
    A bottom-up parser for C{PCFG}s that tries edges in random order.
    This sorting order results in a random search strategy.
    """
    def sort_queue(self, queue, chart):
        # Swap a randomly chosen edge into the last position, where
        # nbest_parse() will pop it next.
        i = random.randint(0, len(queue)-1)
        (queue[-1], queue[i]) = (queue[i], queue[-1])

class UnsortedChartParser(BottomUpChartParser):
    """
    A bottom-up parser for C{PCFG}s that tries edges in whatever order
    they were added to the queue.
    """
    # No sorting is needed: edges are popped in the order they were queued.
    def sort_queue(self, queue, chart): return

class LongestChartParser(BottomUpChartParser):
    """
    A bottom-up parser for C{PCFG}s that tries longer edges before
    shorter ones.  This sorting order results in a type of best-first
    search strategy.
    """
    def sort_queue(self, queue, chart):
        # Ascending sort by edge length; nbest_parse() pops edges off the
        # end of the queue, so the longest edge is tried first.
        queue.sort(lambda e1, e2: cmp(e1.length(), e2.length()))


def demo():
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run; then each parser is run on
    the same demo, and a summary of the results is displayed.
    """
    import sys, time
    from nltk import tokenize, cfg
    from nltk.parse import pchart

    # Two demonstration sentences, each paired with a toy grammar.
    demos = [('I saw John with my telescope', cfg.toy_pcfg1),
             ('the boy saw Jack with Bob under the table with a telescope',
              cfg.toy_pcfg2)]

    # Ask the user which demo they want to use.
    print
    for i in range(len(demos)):
        print '%3s: %s' % (i+1, demos[i][0])
        print '     %r' % demos[i][1]
        print
    print 'Which demo (%d-%d)? ' % (1, len(demos)),
    try:
        snum = int(sys.stdin.readline().strip())-1
        sent, grammar = demos[snum]
    except:
        print 'Bad sentence number'
        return

    tokens = sent.split()

    # The parsers to compare; the last one uses a beam to bound the queue.
    parsers = [
        pchart.InsideChartParser(grammar),
        pchart.RandomChartParser(grammar),
        pchart.UnsortedChartParser(grammar),
        pchart.LongestChartParser(grammar),
        pchart.InsideChartParser(grammar, beam_size=len(tokens)+1)
        ]

    # Run the parsers and record the time, number of parses, and the
    # average parse probability for each one.
    times = []
    average_p = []
    num_parses = []
    all_parses = {}
    for parser in parsers:
        print '\ns: %s\nparser: %s\ngrammar: %s' % (sent, parser, grammar)
        parser.trace(3)
        t = time.time()
        parses = parser.nbest_parse(tokens)
        times.append(time.time()-t)
        if parses: p = reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses)
        else: p = 0
        average_p.append(p)
        num_parses.append(len(parses))
        for p in parses: all_parses[p.freeze()] = 1

    # Print summary statistics.
    print
    print '       Parser      Beam | Time (secs)   # Parses   Average P(parse)'
    print '------------------------+------------------------------------------'
    for i in range(len(parsers)):
        print '%18s %4d |%11.4f%11d%19.14f' % (parsers[i].__class__.__name__,
                                               parsers[i].beam_size,
                                               times[i], num_parses[i],
                                               average_p[i])
    parses = all_parses.keys()
    if parses: p = reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses)
    else: p = 0
    print '------------------------+------------------------------------------'
    print '%18s      |%11s%11d%19.14f' % ('(All Parses)', 'n/a', len(parses), p)

    # Ask the user if we should draw the parses.
    print
    print 'Draw parses (y/n)? ',
    if sys.stdin.readline().strip().lower().startswith('y'):
        from nltk.draw.tree import draw_trees
        print '  please wait...'
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print
    print 'Print parses (y/n)? ',
    if sys.stdin.readline().strip().lower().startswith('y'):
        for parse in parses:
            print parse

if __name__ == '__main__':
    demo()