nltk.parse.viterbi

18 """ 19 A bottom-up C{PCFG} parser that uses dynamic programming to find 20 the single most likely parse for a text. The C{ViterbiParser} parser 21 parses texts by filling in a X{most likely constituent table}. 22 This table records the most probable tree representation for any 23 given span and node value. In particular, it has an entry for 24 every start index, end index, and node value, recording the most 25 likely subtree that spans from the start index to the end index, 26 and has the given node value. 27 28 The C{ViterbiParser} parser fills in this table incrementally. It starts 29 by filling in all entries for constituents that span one element 30 of text (i.e., entries where the end index is one greater than the 31 start index). After it has filled in all table entries for 32 constituents that span one element of text, it fills in the 33 entries for constitutants that span two elements of text. It 34 continues filling in the entries for constituents spanning larger 35 and larger portions of the text, until the entire table has been 36 filled. Finally, it returns the table entry for a constituent 37 spanning the entire text, whose node value is the grammar's start 38 symbol. 39 40 In order to find the most likely constituent with a given span and 41 node value, the C{ViterbiParser} parser considers all productions that 42 could produce that node value. For each production, it finds all 43 children that collectively cover the span and have the node values 44 specified by the production's right hand side. If the probability 45 of the tree formed by applying the production to the children is 46 greater than the probability of the current entry in the table, 47 then the table is updated with this new tree. 48 49 A pseudo-code description of the algorithm used by 50 C{ViterbiParser} is: 51 52 - Create an empty most likely constituent table, M{MLC}. 53 - For M{width} in 1...len(M{text}): 54 - For M{start} in 1...len(M{text})-M{width}: 55 - For M{prod} in grammar.productions: 56 - For each sequence of subtrees [M{t[1]}, M{t[2]}, ..., 57 M{t[n]}] in M{MLC}, where M{t[i]}.node==M{prod}.rhs[i], 58 and the sequence covers [M{start}:M{start}+M{width}]: 59 - M{old_p} = M{MLC}[M{start}, M{start+width}, M{prod}.lhs] 60 - M{new_p} = P(M{t[1]})*P(M{t[1]})*...*P(M{t[n]})*P(M{prod}) 61 - if M{new_p} > M{old_p}: 62 - M{new_tree} = Tree(M{prod}.lhs, M{t[1]}, M{t[2]}, 63 ..., M{t[n]}) 64 - M{MLC}[M{start}, M{start+width}, M{prod}.lhs] 65 = M{new_tree} 66 - Return M{MLC}[0, len(M{text}), M{start_symbol}] 67 68 @type _grammar: C{pcfg.Grammar} 69 @ivar _grammar: The grammar used to parse sentences. 70 @type _trace: C{int} 71 @ivar _trace: The level of tracing output that should be generated 72 when parsing a text. 73 """

74 - def __init__(self, grammar, trace=0):

75 """ 76 Create a new C{ViterbiParser} parser, that uses {grammar} to 77 parse texts. 78 79 @type grammar: C{pcfg.Grammar} 80 @param grammar: The grammar used to parse texts. 81 @type trace: C{int} 82 @param trace: The level of tracing that should be used when 83 parsing a text. C{0} will generate no tracing output; 84 and higher numbers will produce more verbose tracing 85 output. 86 """ 87 self._grammar = grammar 88 self._trace = trace

89

90 - def grammar(self):

91 return self._grammar

92

93 - def trace(self, trace=2):

94 """ 95 Set the level of tracing output that should be generated when 96 parsing a text. 97 98 @type trace: C{int} 99 @param trace: The trace level. A trace level of C{0} will 100 generate no tracing output; and higher trace levels will 101 produce more verbose tracing output. 102 @rtype: C{None} 103 """ 104 self._trace = trace

105

106 - def nbest_parse(self, tokens, n=None):

107 # Inherit docs from ParserI 108 109 tokens = list(tokens) 110 self._grammar.check_coverage(tokens) 111 112 # The most likely constituent table. This table specifies the 113 # most likely constituent for a given span and type. 114 # Constituents can be either Trees or tokens. For 115 # Trees, the "type" is the Nonterminal for the tree's 116 # root node value. For Tokens, the "type" is the token's 117 # type. The table is stored as a dictionary, since it is 118 # sparse. 119 constituents = {} 120 121 # Initialize the constituents dictionary with the words from 122 # the text. 123 if self._trace: print ('Inserting tokens into the most likely'+ 124 ' constituents table...') 125 for index in range(len(tokens)): 126 token = tokens[index] 127 constituents[index,index+1,token] = token 128 if self._trace > 1: 129 self._trace_lexical_insertion(token, index, len(tokens)) 130 131 # Consider each span of length 1, 2, ..., n; and add any trees 132 # that might cover that span to the constituents dictionary. 133 for length in range(1, len(tokens)+1): 134 if self._trace: 135 print ('Finding the most likely constituents'+ 136 ' spanning %d text elements...' % length) 137 #print constituents 138 for start in range(len(tokens)-length+1): 139 span = (start, start+length) 140 self._add_constituents_spanning(span, constituents, 141 tokens) 142 143 # Find all trees that span the entire text & have the right cat 144 trees = [constituents.get((0, len(tokens), 145 self._grammar.start()), [])] 146 147 # Sort the trees, and return the requested number of them. 148 trees.sort(lambda t1,t2: cmp(t2.prob(), t1.prob())) 149 return trees[:n]

150

151 - def _add_constituents_spanning(self, span, constituents, tokens):

152 """ 153 Find any constituents that might cover C{span}, and add them 154 to the most likely constituents table. 155 156 @rtype: C{None} 157 @type span: C{(int, int)} 158 @param span: The section of the text for which we are 159 trying to find possible constituents. The span is 160 specified as a pair of integers, where the first integer 161 is the index of the first token that should be included in 162 the constituent; and the second integer is the index of 163 the first token that should not be included in the 164 constituent. I.e., the constituent should cover 165 C{M{text}[span[0]:span[1]]}, where C{M{text}} is the text 166 that we are parsing. 167 168 @type constituents: C{dictionary} from 169 C{(int,int,Nonterminal)} to (C{ProbabilisticToken} or 170 C{ProbabilisticTree}). 171 @param constituents: The most likely constituents table. This 172 table records the most probable tree representation for 173 any given span and node value. In particular, 174 C{constituents(M{s},M{e},M{nv})} is the most likely 175 C{ProbabilisticTree} that covers C{M{text}[M{s}:M{e}]} 176 and has a node value C{M{nv}.symbol()}, where C{M{text}} 177 is the text that we are parsing. When 178 C{_add_constituents_spanning} is called, C{constituents} 179 should contain all possible constituents that are shorter 180 than C{span}. 181 182 @type tokens: C{list} of tokens 183 @param tokens: The text we are parsing. This is only used for 184 trace output. 185 """ 186 # Since some of the grammar productions may be unary, we need to 187 # repeatedly try all of the productions until none of them add any 188 # new constituents. 189 changed = 1 190 while changed: 191 changed = 0 192 193 # Find all ways instantiations of the grammar productions that 194 # cover the span. 195 instantiations = self._find_instantiations(span, constituents) 196 197 # For each production instantiation, add a new 198 # ProbabilisticTree whose probability is the product 199 # of the childrens' probabilities and the production's 200 # probability. 201 for (production, children) in instantiations: 202 subtrees = [c for c in children if isinstance(c, Tree)] 203 p = reduce(lambda pr,t:pr*t.prob(), 204 subtrees, production.prob()) 205 node = production.lhs().symbol() 206 tree = ProbabilisticTree(node, children, prob=p) 207 208 # If it's new a constituent, then add it to the 209 # constituents dictionary. 210 c = constituents.get((span[0], span[1], production.lhs()), 211 None) 212 if self._trace > 1: 213 if c is None or c != tree: 214 if c is None or c.prob() < tree.prob(): 215 print ' Insert:', 216 else: 217 print ' Discard:', 218 self._trace_production(production, p, span, len(tokens)) 219 if c is None or c.prob() < tree.prob(): 220 constituents[span[0], span[1], production.lhs()] = tree 221 changed = 1

222

223 - def _find_instantiations(self, span, constituents):

224 """ 225 @return: a list of the production instantiations that cover a 226 given span of the text. A X{production instantiation} is 227 a tuple containing a production and a list of children, 228 where the production's right hand side matches the list of 229 children; and the children cover C{span}. @rtype: C{list} 230 of C{pair} of C{Production}, (C{list} of 231 (C{ProbabilisticTree} or token. 232 233 @type span: C{(int, int)} 234 @param span: The section of the text for which we are 235 trying to find production instantiations. The span is 236 specified as a pair of integers, where the first integer 237 is the index of the first token that should be covered by 238 the production instantiation; and the second integer is 239 the index of the first token that should not be covered by 240 the production instantiation. 241 @type constituents: C{dictionary} from 242 C{(int,int,Nonterminal)} to (C{ProbabilisticToken} or 243 C{ProbabilisticTree}). 244 @param constituents: The most likely constituents table. This 245 table records the most probable tree representation for 246 any given span and node value. See the module 247 documentation for more information. 248 """ 249 rv = [] 250 for production in self._grammar.productions(): 251 childlists = self._match_rhs(production.rhs(), span, constituents) 252 253 for childlist in childlists: 254 rv.append( (production, childlist) ) 255 return rv

256

257 - def _match_rhs(self, rhs, span, constituents):

258 """ 259 @return: a set of all the lists of children that cover C{span} 260 and that match C{rhs}. 261 @rtype: C{list} of (C{list} of C{ProbabilisticTree} or 262 C{Token}) 263 264 @type rhs: C{list} of C{Nonterminal} or (any) 265 @param rhs: The list specifying what kinds of children need to 266 cover C{span}. Each nonterminal in C{rhs} specifies 267 that the corresponding child should be a tree whose node 268 value is that nonterminal's symbol. Each terminal in C{rhs} 269 specifies that the corresponding child should be a token 270 whose type is that terminal. 271 @type span: C{(int, int)} 272 @param span: The section of the text for which we are 273 trying to find child lists. The span is specified as a 274 pair of integers, where the first integer is the index of 275 the first token that should be covered by the child list; 276 and the second integer is the index of the first token 277 that should not be covered by the child list. 278 @type constituents: C{dictionary} from 279 C{(int,int,Nonterminal)} to (C{ProbabilisticToken} or 280 C{ProbabilisticTree}). 281 @param constituents: The most likely constituents table. This 282 table records the most probable tree representation for 283 any given span and node value. See the module 284 documentation for more information. 285 """ 286 (start, end) = span 287 288 # Base case 289 if start >= end and rhs == (): return [[]] 290 if start >= end or rhs == (): return [] 291 292 # Find everything that matches the 1st symbol of the RHS 293 childlists = [] 294 for split in range(start, end+1): 295 l=constituents.get((start,split,rhs[0])) 296 if l is not None: 297 rights = self._match_rhs(rhs[1:], (split,end), constituents) 298 childlists += [[l]+r for r in rights] 299 300 return childlists

301

302 - def _trace_production(self, production, p, span, width):

303 """ 304 Print trace output indicating that a given production has been 305 applied at a given location. 306 307 @param production: The production that has been applied 308 @type production: C{Production} 309 @param p: The probability of the tree produced by the production. 310 @type p: C{float} 311 @param span: The span of the production 312 @type span: C{tuple} 313 @rtype: C{None} 314 """ 315 316 str = '|' + '.' * span[0] 317 str += '=' * (span[1] - span[0]) 318 str += '.' * (width - span[1]) + '| ' 319 str += '%s' % production 320 if self._trace > 2: str = '%-40s %12.10f ' % (str, p) 321 322 print str

323

324 - def _trace_lexical_insertion(self, token, index, width):

325 str = ' Insert: |' + '.' * index + '=' + '.' * (width-index-1) + '| ' 326 str += '%s' % (token,) 327 print str

328

329 - def __repr__(self):

330 return '<ViterbiParser for %r>' % self._grammar

Source Code for Module nltk.parse.viterbi