1
2
3
4
5
6
7
8
9 from nltk.tree import Tree, ProbabilisticTree
10
11 from api import *
12
13
14
15
16
18 """
19 A bottom-up C{PCFG} parser that uses dynamic programming to find
20 the single most likely parse for a text. The C{ViterbiParser} parser
21 parses texts by filling in a X{most likely constituent table}.
22 This table records the most probable tree representation for any
23 given span and node value. In particular, it has an entry for
24 every start index, end index, and node value, recording the most
25 likely subtree that spans from the start index to the end index,
26 and has the given node value.
27
28 The C{ViterbiParser} parser fills in this table incrementally. It starts
29 by filling in all entries for constituents that span one element
30 of text (i.e., entries where the end index is one greater than the
31 start index). After it has filled in all table entries for
32 constituents that span one element of text, it fills in the
33 entries for constitutants that span two elements of text. It
34 continues filling in the entries for constituents spanning larger
35 and larger portions of the text, until the entire table has been
36 filled. Finally, it returns the table entry for a constituent
37 spanning the entire text, whose node value is the grammar's start
38 symbol.
39
40 In order to find the most likely constituent with a given span and
41 node value, the C{ViterbiParser} parser considers all productions that
42 could produce that node value. For each production, it finds all
43 children that collectively cover the span and have the node values
44 specified by the production's right hand side. If the probability
45 of the tree formed by applying the production to the children is
46 greater than the probability of the current entry in the table,
47 then the table is updated with this new tree.
48
49 A pseudo-code description of the algorithm used by
50 C{ViterbiParser} is:
51
52 - Create an empty most likely constituent table, M{MLC}.
53 - For M{width} in 1...len(M{text}):
54 - For M{start} in 1...len(M{text})-M{width}:
55 - For M{prod} in grammar.productions:
56 - For each sequence of subtrees [M{t[1]}, M{t[2]}, ...,
57 M{t[n]}] in M{MLC}, where M{t[i]}.node==M{prod}.rhs[i],
58 and the sequence covers [M{start}:M{start}+M{width}]:
59 - M{old_p} = M{MLC}[M{start}, M{start+width}, M{prod}.lhs]
60 - M{new_p} = P(M{t[1]})*P(M{t[1]})*...*P(M{t[n]})*P(M{prod})
61 - if M{new_p} > M{old_p}:
62 - M{new_tree} = Tree(M{prod}.lhs, M{t[1]}, M{t[2]},
63 ..., M{t[n]})
64 - M{MLC}[M{start}, M{start+width}, M{prod}.lhs]
65 = M{new_tree}
66 - Return M{MLC}[0, len(M{text}), M{start_symbol}]
67
68 @type _grammar: C{pcfg.Grammar}
69 @ivar _grammar: The grammar used to parse sentences.
70 @type _trace: C{int}
71 @ivar _trace: The level of tracing output that should be generated
72 when parsing a text.
73 """
75 """
76 Create a new C{ViterbiParser} parser, that uses {grammar} to
77 parse texts.
78
79 @type grammar: C{pcfg.Grammar}
80 @param grammar: The grammar used to parse texts.
81 @type trace: C{int}
82 @param trace: The level of tracing that should be used when
83 parsing a text. C{0} will generate no tracing output;
84 and higher numbers will produce more verbose tracing
85 output.
86 """
87 self._grammar = grammar
88 self._trace = trace
89
def trace(self, trace=2):
    """
    Set the level of tracing output that should be generated when
    parsing a text.

    @type trace: C{int}
    @param trace: The trace level.  A trace level of C{0} will
        generate no tracing output; and higher trace levels will
        produce more verbose tracing output.
    @rtype: C{None}
    """
    self._trace = trace
def nbest_parse(self, tokens, n=None):
    """
    Find the single most likely parse for C{tokens}, using the
    Viterbi dynamic-programming algorithm.

    @type tokens: C{list} of tokens
    @param tokens: The text to parse.
    @type n: C{int} or C{None}
    @param n: The maximum number of parses to return; C{None}
        means no limit.  (The Viterbi algorithm finds at most one
        parse, so the returned list has length 0 or 1.)
    @rtype: C{list} of C{ProbabilisticTree}
    """
    tokens = list(tokens)
    self._grammar.check_coverage(tokens)

    # The most likely constituent table.  This table specifies the
    # most likely constituent for a given span and type.  It is
    # stored as a dictionary keyed by (start, end, node value),
    # since it is sparse.
    constituents = {}

    # Initialize the table with the individual tokens: each token
    # is the (only) constituent covering its own one-element span.
    if self._trace:
        print('Inserting tokens into the most likely'
              ' constituents table...')
    for index, token in enumerate(tokens):
        constituents[index, index + 1, token] = token
        if self._trace > 1:
            self._trace_lexical_insertion(token, index, len(tokens))

    # Consider each span of length 1, 2, ..., len(tokens); and add
    # any trees that might cover that span to the table.  Shorter
    # spans must be filled in first, since longer constituents are
    # built out of them.
    for length in range(1, len(tokens) + 1):
        if self._trace:
            print('Finding the most likely constituents'
                  ' spanning %d text elements...' % length)
        for start in range(len(tokens) - length + 1):
            span = (start, start + length)
            self._add_constituents_spanning(span, constituents, tokens)

    # Return the tree that spans the entire text and is rooted at
    # the grammar's start symbol, if one was found.  (Guard against
    # the no-parse case: the old code put the lookup default ``[]``
    # straight into the result list, handing callers ``[[]]``.)
    tree = constituents.get((0, len(tokens), self._grammar.start()))
    if tree is None:
        return []
    return [tree][:n]
def _add_constituents_spanning(self, span, constituents, tokens):
    """
    Find any constituents that might cover C{span}, and add them
    to the most likely constituents table.

    @rtype: C{None}
    @type span: C{(int, int)}
    @param span: The section of the text for which we are
        trying to find possible constituents.  The span is
        specified as a pair of integers, where the first integer
        is the index of the first token that should be included in
        the constituent; and the second integer is the index of
        the first token that should not be included in the
        constituent.  I.e., the constituent should cover
        C{M{text}[span[0]:span[1]]}, where C{M{text}} is the text
        that we are parsing.

    @type constituents: C{dictionary} from
        C{(int,int,Nonterminal)} to (C{ProbabilisticToken} or
        C{ProbabilisticTree}).
    @param constituents: The most likely constituents table.  This
        table records the most probable tree representation for
        any given span and node value.  When
        C{_add_constituents_spanning} is called, C{constituents}
        should contain all possible constituents that are shorter
        than C{span}.

    @type tokens: C{list} of tokens
    @param tokens: The text we are parsing.  This is only used for
        trace output.
    """
    # Since some of the grammar productions may be unary, we need to
    # repeatedly expand the table until no new constituents are found.
    changed = True
    while changed:
        changed = False

        # Find all ways instantiations of the grammar productions that
        # cover the span.
        instantiations = self._find_instantiations(span, constituents)

        # For each production instantiation, add a new
        # ProbabilisticTree whose probability is the product of the
        # children's probabilities and the production's probability.
        for (production, children) in instantiations:
            subtrees = [c for c in children if isinstance(c, Tree)]
            p = production.prob()
            for subtree in subtrees:
                p *= subtree.prob()
            node = production.lhs().symbol()
            tree = ProbabilisticTree(node, children, prob=p)

            # If it's a better constituent than what the table
            # currently holds for this (span, lhs), record it.
            c = constituents.get((span[0], span[1], production.lhs()))
            if self._trace > 1:
                if c is None or c != tree:
                    if c is None or c.prob() < tree.prob():
                        print(' Insert:', end=' ')
                    else:
                        print(' Discard:', end=' ')
                    self._trace_production(production, p, span, len(tokens))
            if c is None or c.prob() < tree.prob():
                constituents[span[0], span[1], production.lhs()] = tree
                changed = True
224 """
225 @return: a list of the production instantiations that cover a
226 given span of the text. A X{production instantiation} is
227 a tuple containing a production and a list of children,
228 where the production's right hand side matches the list of
229 children; and the children cover C{span}. @rtype: C{list}
230 of C{pair} of C{Production}, (C{list} of
231 (C{ProbabilisticTree} or token.
232
233 @type span: C{(int, int)}
234 @param span: The section of the text for which we are
235 trying to find production instantiations. The span is
236 specified as a pair of integers, where the first integer
237 is the index of the first token that should be covered by
238 the production instantiation; and the second integer is
239 the index of the first token that should not be covered by
240 the production instantiation.
241 @type constituents: C{dictionary} from
242 C{(int,int,Nonterminal)} to (C{ProbabilisticToken} or
243 C{ProbabilisticTree}).
244 @param constituents: The most likely constituents table. This
245 table records the most probable tree representation for
246 any given span and node value. See the module
247 documentation for more information.
248 """
249 rv = []
250 for production in self._grammar.productions():
251 childlists = self._match_rhs(production.rhs(), span, constituents)
252
253 for childlist in childlists:
254 rv.append( (production, childlist) )
255 return rv
256
258 """
259 @return: a set of all the lists of children that cover C{span}
260 and that match C{rhs}.
261 @rtype: C{list} of (C{list} of C{ProbabilisticTree} or
262 C{Token})
263
264 @type rhs: C{list} of C{Nonterminal} or (any)
265 @param rhs: The list specifying what kinds of children need to
266 cover C{span}. Each nonterminal in C{rhs} specifies
267 that the corresponding child should be a tree whose node
268 value is that nonterminal's symbol. Each terminal in C{rhs}
269 specifies that the corresponding child should be a token
270 whose type is that terminal.
271 @type span: C{(int, int)}
272 @param span: The section of the text for which we are
273 trying to find child lists. The span is specified as a
274 pair of integers, where the first integer is the index of
275 the first token that should be covered by the child list;
276 and the second integer is the index of the first token
277 that should not be covered by the child list.
278 @type constituents: C{dictionary} from
279 C{(int,int,Nonterminal)} to (C{ProbabilisticToken} or
280 C{ProbabilisticTree}).
281 @param constituents: The most likely constituents table. This
282 table records the most probable tree representation for
283 any given span and node value. See the module
284 documentation for more information.
285 """
286 (start, end) = span
287
288
289 if start >= end and rhs == (): return [[]]
290 if start >= end or rhs == (): return []
291
292
293 childlists = []
294 for split in range(start, end+1):
295 l=constituents.get((start,split,rhs[0]))
296 if l is not None:
297 rights = self._match_rhs(rhs[1:], (split,end), constituents)
298 childlists += [[l]+r for r in rights]
299
300 return childlists
301
303 """
304 Print trace output indicating that a given production has been
305 applied at a given location.
306
307 @param production: The production that has been applied
308 @type production: C{Production}
309 @param p: The probability of the tree produced by the production.
310 @type p: C{float}
311 @param span: The span of the production
312 @type span: C{tuple}
313 @rtype: C{None}
314 """
315
316 str = '|' + '.' * span[0]
317 str += '=' * (span[1] - span[0])
318 str += '.' * (width - span[1]) + '| '
319 str += '%s' % production
320 if self._trace > 2: str = '%-40s %12.10f ' % (str, p)
321
322 print str
323
def __repr__(self):
    """Return a concise string representation of this parser."""
    return '<ViterbiParser for %r>' % self._grammar
338 """
339 A demonstration of the probabilistic parsers. The user is
340 prompted to select which demo to run, and how many parses should
341 be found; and then each parser is run on the same demo, and a
342 summary of the results are displayed.
343 """
344 import sys, time
345 import nltk
346 from nltk import tokenize
347 from nltk.parse import ViterbiParser
348
349
350 demos = [('I saw the man with my telescope', nltk.toy_pcfg1),
351 ('the boy saw Jack with Bob under the table with a telescope', nltk.toy_pcfg2)]
352
353
354 print
355 for i in range(len(demos)):
356 print '%3s: %s' % (i+1, demos[i][0])
357 print ' %r' % demos[i][1]
358 print
359 print 'Which demo (%d-%d)? ' % (1, len(demos)),
360 try:
361 snum = int(sys.stdin.readline().strip())-1
362 sent, grammar = demos[snum]
363 except:
364 print 'Bad sentence number'
365 return
366
367
368 tokens = sent.split()
369
370 parser = ViterbiParser(grammar)
371 all_parses = {}
372
373 print '\nsent: %s\nparser: %s\ngrammar: %s' % (sent,parser,grammar)
374 parser.trace(3)
375 t = time.time()
376 parses = parser.nbest_parse(tokens)
377 time = time.time()-t
378 if parses:
379 average = reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses)
380 else:
381 average = 0
382 num_parses = len(parses)
383 for p in parses:
384 all_parses[p.freeze()] = 1
385
386
387 print
388 print 'Time (secs) # Parses Average P(parse)'
389 print '-----------------------------------------'
390 print '%11.4f%11d%19.14f' % (time, num_parses, average)
391 parses = all_parses.keys()
392 if parses:
393 p = reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses)
394 else: p = 0
395 print '------------------------------------------'
396 print '%11s%11d%19.14f' % ('n/a', len(parses), p)
397
398
399 print
400 print 'Draw parses (y/n)? ',
401 if sys.stdin.readline().strip().lower().startswith('y'):
402 from nltk.draw.tree import draw_trees
403 print ' please wait...'
404 draw_trees(*parses)
405
406
407 print
408 print 'Print parses (y/n)? ',
409 if sys.stdin.readline().strip().lower().startswith('y'):
410 for parse in parses:
411 print parse
412
if __name__ == '__main__':
    demo()