Package nltk :: Package parse :: Module featurechart
[hide private]
[frames] | [no frames]

Source Code for Module nltk.parse.featurechart

  1  # Natural Language Toolkit: Chart Parser for Feature-Based Grammars 
  2  # 
  3  # Copyright (C) 2001-2008 NLTK Project 
  4  # Author: Rob Speer <[email protected]> 
  5  # URL: <http://nltk.org> 
  6  # For license information, see LICENSE.TXT 
  7  # 
  8  # $Id: featurechart.py 6301 2008-07-28 03:33:31Z DHGarrette $ 
  9   
 10  """ 
 11  Extension of chart parsing implementation to handle grammars with 
 12  feature structures as nodes. 
 13  """ 
 14   
 15  import yaml 
 16   
 17  from nltk.featstruct import FeatStruct, unify, FeatStructParser 
 18  from nltk.sem import logic 
 19  from nltk import cfg, defaultdict 
 20  from nltk.cfg import FeatStructNonterminal 
 21  import nltk.data 
 22   
 23  from api import * 
 24  from chart import * 
 25   
def load_earley(filename, trace=0, cache=False, verbose=False,
                chart_class=Chart):
    """
    Load a grammar from a file and build an Earley feature parser for it.

    The C{trace} level controls how much output is printed while parsing:

      0: No output.
      1: Show edges from the scanner and completer rules (not predictor).
      2: Show all edges as they are added to the chart.
      3: Show all edges, plus the results of successful unifications.
      4: Show all edges, plus the results of all attempted unifications.
      5: Show all edges, plus the results of all attempted unifications,
         including those with cached results.

    If C{verbose} is C{True}, extra diagnostic information about
    grammar-loading is displayed.

    @param filename: Location of the grammar, passed to L{nltk.data.load}.
    @param cache: Whether L{nltk.data.load} should cache the grammar.
    @param chart_class: Chart implementation handed to the parser.
    @return: A L{FeatureEarleyChartParser} built on the loaded grammar.
    """
    loaded_grammar = nltk.data.load(filename, cache=cache, verbose=verbose)
    return FeatureEarleyChartParser(loaded_grammar, trace=trace,
                                    chart_class=chart_class)
49
class FeatureTreeEdge(TreeEdge):
    """
    A specialized tree edge that allows shared variable bindings
    between nonterminals on the left-hand side and right-hand side.

    Each C{FeatureTreeEdge} carries a C{bindings} dictionary mapping
    variables to values.  While the edge is incomplete these bindings
    are simply stored; once the edge becomes complete, the constructor
    applies them to every nonterminal in the edge whose symbol
    implements L{SubstituteBindingsI}, and then discards them.
    """
    def __init__(self, span, lhs, rhs, dot=0, bindings=None):
        """
        Construct a new edge.  If the edge is incomplete
        (C{dot<len(rhs)}), the bindings are stored as-is.  If the edge
        is complete (C{dot==len(rhs)}), the bindings are first applied
        to every nonterminal in C{lhs} and C{rhs} and then cleared.
        See L{TreeEdge} for a description of the other arguments.
        """
        bindings = {} if bindings is None else bindings

        # A complete edge substitutes its bindings in and then drops
        # them: otherwise two complete edges could look distinct merely
        # because they carry different (already-applied) bindings.
        if dot == len(rhs) and bindings:
            lhs = self._bind(lhs, bindings)
            rhs = [self._bind(sym, bindings) for sym in rhs]
            bindings = {}

        TreeEdge.__init__(self, span, lhs, rhs, dot)
        self._bindings = bindings

    def _bind(self, nt, bindings):
        # Only feature-structure nonterminals know how to substitute.
        if isinstance(nt, FeatStructNonterminal):
            return nt.substitute_bindings(bindings)
        return nt

    def next_with_bindings(self):
        """Return the symbol after the dot, with bindings applied."""
        return self._bind(self.next(), self._bindings)

    def bindings(self):
        """
        Return a copy of this edge's bindings dictionary.
        """
        return self._bindings.copy()

    def __str__(self):
        if self.is_complete():
            return TreeEdge.__str__(self)
        pairs = ', '.join('%s: %r' % pair
                          for pair in sorted(self._bindings.items()))
        return '%s {%s}' % (TreeEdge.__str__(self), pairs)

    # Two edges with different bindings are not equal.
    def __cmp__(self, other):
        if self.__class__ != other.__class__:
            return -1
        mine = (self._span, self._lhs, self._rhs,
                self._dot, self._bindings)
        theirs = (other._span, other._lhs, other._rhs,
                  other._dot, other._bindings)
        return cmp(mine, theirs)

    def __hash__(self):
        # NOTE(review): hashes only the *keys* of the bindings, so two
        # edges differing only in binding values collide; still
        # consistent with __cmp__.  Candidate for caching.
        return hash((self._lhs, self._rhs, self._span, self._dot,
                     tuple(sorted(self._bindings))))
120
class FeatureFundamentalRule(FundamentalRule):
    """
    A specialized version of the fundamental rule that operates on
    nonterminals whose symbols are C{FeatStructNonterminal}s.  Rather
    than simply comparing the nonterminals for equality, they are
    unified.  Variable bindings from these unifications are collected
    and stored in the chart using a L{FeatureTreeEdge}.  When a
    complete edge is generated, these bindings are applied to all
    nonterminals in the edge.

    The fundamental rule states that:
      - [AS{->}S{alpha}*B1S{beta}][i:j]
      - [B2S{->}S{gamma}*][j:k]
    licenses the edge:
      - [AS{->}S{alpha}B3*S{beta}][i:j]
    assuming that B1 and B2 can be unified to generate B3.
    """
    def apply_iter(self, chart, grammar, left_edge, right_edge):
        # Guard clauses: the rule only applies to an adjacent
        # incomplete/complete pair of tree edges.
        if left_edge.end() != right_edge.start():
            return
        if not left_edge.is_incomplete():
            return
        if not right_edge.is_complete():
            return
        if not isinstance(left_edge, TreeEdge):
            return
        if not isinstance(right_edge, TreeEdge):
            return

        # Unify B1 (left_edge.next) with B2 (right_edge.lhs) to
        # generate B3 (unified).  bindings() hands back a copy, so the
        # unification may extend it freely.
        bindings = left_edge.bindings()
        unified = unify(left_edge.next(), right_edge.lhs(),
                        bindings, rename_vars=False)
        if unified is None:
            return

        # Build the advanced edge, carrying the collected bindings.
        new_edge = FeatureTreeEdge(
            span=(left_edge.start(), right_edge.end()),
            lhs=left_edge.lhs(), rhs=left_edge.rhs(),
            dot=left_edge.dot() + 1, bindings=bindings)

        # Insert it once per child-pointer list of the left edge; only
        # yield if the chart actually changed.
        inserted = False
        for pointers in chart.child_pointer_lists(left_edge):
            if chart.insert(new_edge, pointers + (right_edge,)):
                inserted = True
        if inserted:
            yield new_edge
167
class FeatureTopDownExpandRule(TopDownExpandRule):
    """
    A specialized version of the top down expand rule that operates on
    nonterminals whose symbols are C{FeatStructNonterminal}s.  Rather
    than simply comparing the nonterminals for equality, they are
    unified.

    The top down expand rule states that:
      - [AS{->}S{alpha}*B1S{beta}][i:j]
    licenses the edge:
      - [B2S{->}*S{gamma}][j:j]
    for each grammar production C{B2S{->}S{gamma}}, assuming that B1
    and B2 can be unified.
    """
    def apply_iter(self, chart, grammar, edge):
        if edge.is_complete():
            return
        for production in grammar.productions():
            # Rename vars during unification so that variables from the
            # two different productions cannot accidentally match.
            if unify(production.lhs(), edge.next_with_bindings(),
                     rename_vars=True):
                hypothesis = FeatureTreeEdge(
                    span=(edge.end(), edge.end()),
                    lhs=production.lhs(),
                    rhs=production.rhs(), dot=0)
                if chart.insert(hypothesis, ()):
                    yield hypothesis
193 194 #//////////////////////////////////////////////////////////// 195 # Earley Parsing Rules 196 #//////////////////////////////////////////////////////////// 197
class FeatureCompleterRule(CompleterRule):
    """
    A specialized version of the completer rule that operates on
    nonterminals whose symbols are C{FeatStructNonterminal}s.  Rather
    than simply comparing the nonterminals for equality, they are
    unified.  See L{CompleterRule} for more information.
    """
    _fundamental_rule = FeatureFundamentalRule()

    def apply_iter(self, chart, grammar, edge1):
        # Combine the complete edge1 with every incomplete edge that
        # ends where edge1 starts, via the feature fundamental rule.
        rule = self._fundamental_rule
        for incomplete in chart.select(end=edge1.start(),
                                       is_complete=False):
            for produced in rule.apply_iter(chart, grammar,
                                            incomplete, edge1):
                yield produced
212
class FeatureScannerRule(ScannerRule):
    """
    A specialized version of the scanner rule that operates on
    nonterminals whose symbols are C{FeatStructNonterminal}s.  Rather
    than simply comparing a leaf's candidate parts of speech for
    equality, they are unified against the edge's expected symbol.
    """
    # Fix: parameter was misspelled 'gramar'; renamed to 'grammar' to
    # match every sibling rule's signature (it is unused in the body,
    # and rules are invoked positionally by the parser).
    def apply_iter(self, chart, grammar, edge):
        """
        Yield new leaf and part-of-speech edges for the word just after
        C{edge}'s end, for each lexicon POS that unifies with the
        symbol C{edge} expects next.
        """
        if edge.is_complete() or edge.end() >= chart.num_leaves():
            return
        index = edge.end()
        leaf = chart.leaf(index)
        for pos in self._word_to_pos.get(leaf, []):
            if unify(pos, edge.next_with_bindings(), rename_vars=True):
                new_leaf_edge = LeafEdge(leaf, index)
                if chart.insert(new_leaf_edge, ()):
                    yield new_leaf_edge
                # A complete POS edge (dot=1) covering just this leaf.
                new_pos_edge = FeatureTreeEdge((index, index + 1), pos,
                                               [leaf], 1)
                if chart.insert(new_pos_edge, (new_leaf_edge,)):
                    yield new_pos_edge
227 228 # This is just another name for TopDownExpandRule:
class FeaturePredictorRule(FeatureTopDownExpandRule):
    """
    The Earley 'predictor' rule: just another name for
    L{FeatureTopDownExpandRule}, under its Earley-algorithm name.
    """
    pass
230 231 #//////////////////////////////////////////////////////////// 232 # Earley Parser 233 #//////////////////////////////////////////////////////////// 234 235 ## Simple Earley Chart Parser, without features 236 ## (defined here because the feature version needs to build on it, but 237 ## chart.py has a simpler way to use the Earley algorithm) 238
239 -class EarleyChartParser(ParserI):
240 """ 241 A chart parser implementing the Earley parsing algorithm: 242 243 - For each index I{end} in [0, 1, ..., N]: 244 - For each I{edge} s.t. I{edge}.end = I{end}: 245 - If I{edge} is incomplete, and I{edge}.next is not a part 246 of speech: 247 - Apply PredictorRule to I{edge} 248 - If I{edge} is incomplete, and I{edge}.next is a part of 249 speech: 250 - Apply ScannerRule to I{edge} 251 - If I{edge} is complete: 252 - Apply CompleterRule to I{edge} 253 - Return any complete parses in the chart 254 255 C{EarleyChartParser} uses a X{lexicon} to decide whether a leaf 256 has a given part of speech. This lexicon is encoded as a 257 dictionary that maps each word to a list of parts of speech that 258 word can have. 259 260 @ivar _predictor_class, _completer_class, _scanner_class: The 261 classes that are used to implement the three rules used by the 262 Earley algorithm, Replacement rules can be specified by 263 subclasses (such as L{FeatureEarleyChartParser 264 <nltk.parse.featurechar.FeatureEarleyChartParser>}). 265 """ 266 _predictor_class = PredictorRule 267 _completer_class = CompleterRule 268 _scanner_class = ScannerRule 269
270 - def __init__(self, grammar, trace=0, chart_class=Chart):
271 """ 272 Create a new Earley chart parser, that uses C{grammar} to 273 parse texts. 274 275 @type grammar: C{cfg.Grammar} 276 @param grammar: The grammar used to parse texts. 277 @type trace: C{int} 278 @param trace: The level of tracing that should be used when 279 parsing a text. C{0} will generate no tracing output; 280 and higher numbers will produce more verbose tracing 281 output. 282 @param chart_class: The class that should be used to create 283 the charts used by this parser. 284 """ 285 if isinstance(trace, dict): 286 raise ValueError("Earley parser no longer takes a lexicon " 287 "as a separate parameter; assign the " 288 "lexicon when creating the grammar instead.") 289 self._grammar = grammar 290 self._lexicon = grammar.lexicon() 291 self._trace = trace 292 self._chart_class = chart_class
293
294 - def grammar(self):
295 return self._grammar
296
297 - def lexicon(self):
298 """@return: The lexicon used by this parser.""" 299 return self._lexicon
300 301 #: The default total width reserved for the chart in trace output. 302 #: The remainder of each line will be used to display edges. 303 _trace_chart_width = 40 304
305 - def nbest_parse(self, tokens, n=None, tree_class=Tree):
306 tokens = list(tokens) 307 self._check_lexicon_coverage(tokens) 308 309 chart = self._chart_class(tokens) 310 grammar = self._grammar 311 312 # Width, for printing trace edges. 313 w = max(2, self._trace_chart_width/(chart.num_leaves()+1)) 314 if self._trace > 0: print ' '*9, chart.pp_leaves(w) 315 316 # Initialize the chart with a special "starter" edge. 317 chart.insert(self._starter_edge(grammar.start()), ()) 318 319 # Create the 3 rules: 320 predictor = self._predictor_class() 321 completer = self._completer_class() 322 scanner = self._scanner_class(self._lexicon) 323 324 for end in range(chart.num_leaves()+1): 325 if self._trace > 1: print 'Processing queue %d' % end 326 for edge in chart.select(end=end): 327 if edge.is_complete(): 328 for e in completer.apply_iter(chart, grammar, edge): 329 if self._trace > 0: 330 print 'Completer', chart.pp_edge(e,w) 331 if edge.is_incomplete(): 332 for e in predictor.apply_iter(chart, grammar, edge): 333 if self._trace > 1: 334 print 'Predictor', chart.pp_edge(e,w) 335 if edge.is_incomplete(): 336 for e in scanner.apply_iter(chart, grammar, edge): 337 if self._trace > 0: 338 print 'Scanner ', chart.pp_edge(e,w) 339 340 # Output a list of complete parses. 341 return self._parses(chart, grammar.start(), tree_class)[:n]
342 343 # This is a separate method because FeatureEarleyChartParser needs 344 # to replace it:
345 - def _starter_edge(self, start_sym):
346 """Return a 'starter edge' that expands to the start symbol.""" 347 root = cfg.Nonterminal('[INIT]') 348 return TreeEdge((0,0), root, (start_sym,))
349 350 # This is a separate method because FeatureEarleyChartParser needs 351 # to replace it:
352 - def _parses(self, chart, start_sym, tree_class):
353 """Return a list of parses in the given chart.""" 354 return chart.parses(start_sym, tree_class=tree_class)
355
356 - def _check_lexicon_coverage(self, tokens):
357 try: 'x' in self._lexicon 358 except: raise ValueError('ow %r' % self._lexicon) 359 missing = [tok for tok in tokens if tok not in self._lexicon] 360 if missing: 361 missing = ', '.join('%r' % (w,) for w in missing) 362 raise ValueError("Grammar does not cover some of the " 363 "input words: " + missing)
364
class FeatureEarleyChartParser(EarleyChartParser):
    """
    A chart parser implementing the Earley parsing algorithm, allowing
    nonterminals that have features (known as L{FeatStructNonterminal}s).
    See L{EarleyChartParser} for more details.
    """
    _predictor_class = FeaturePredictorRule
    _completer_class = FeatureCompleterRule
    _scanner_class = FeatureScannerRule
    _trace_chart_width = 10  # Edges are big, so compress the chart.

    def _starter_edge(self, start):
        """Return a feature starter edge that expands to C{start}."""
        root = FeatStructNonterminal('[*type*="[INIT]"]')
        return FeatureTreeEdge((0,0), root, (start,), 0)

    def _parses(self, chart, start, tree_class):
        """Collect complete parses spanning the whole input."""
        parses = []
        for edge in chart.select(span=(0, chart.num_leaves())):
            # Skip leaves, and edges whose lhs does not unify with the
            # start symbol.
            if isinstance(edge, LeafEdge):
                continue
            if not unify(edge.lhs(), start, rename_vars=True):
                continue
            parses.extend(chart.trees(edge, complete=True,
                                      tree_class=tree_class))
        return parses
389 390 #//////////////////////////////////////////////////////////// 391 # Instantiate Variable Chart 392 #//////////////////////////////////////////////////////////// 393
class InstantiateVarsChart(Chart):
    """
    A specialized chart that 'instantiates' variables whose names
    start with '@', by replacing them with unique new variables.
    In particular, whenever a complete edge is added to the chart, any
    variables in the edge's C{lhs} whose names start with '@' will be
    replaced by unique new L{Variable}s.
    """
    def __init__(self, tokens):
        Chart.__init__(self, tokens)
        # Edges that have already had their '@' vars instantiated.
        self._instantiated = set()

    def insert(self, edge, child_pointer_list):
        if edge in self._instantiated:
            return False
        return Chart.insert(self, self.instantiate_edge(edge),
                            child_pointer_list)

    def instantiate_edge(self, edge):
        # Leaf edges, incomplete edges, and edges already in the chart
        # are passed through untouched.
        if (not isinstance(edge, FeatureTreeEdge) or
            not edge.is_complete() or
            edge in self._edge_to_cpls):
            return edge

        # Nothing to do unless the lhs mentions '@' variables.
        fresh = self.inst_vars(edge)
        if not fresh:
            return edge

        # Instantiate the edge!
        self._instantiated.add(edge)
        new_lhs = edge.lhs().substitute_bindings(fresh)
        return FeatureTreeEdge(edge.span(), new_lhs, edge.rhs(),
                               edge.dot(), edge.bindings())

    def inst_vars(self, edge):
        """Map each '@'-prefixed lhs variable to a unique new one."""
        mapping = {}
        for var in edge.lhs().variables():
            if var.name.startswith('@'):
                mapping[var] = logic.unique_variable()
        return mapping
433 434 #//////////////////////////////////////////////////////////// 435 # Demo 436 #//////////////////////////////////////////////////////////// 437 438 # TODO: update to use grammar parser
def demo():
    """
    Demonstrate L{FeatureEarleyChartParser} on a tiny hand-built
    grammar with singular/plural agreement features, parsing a fixed
    PP-attachment-ambiguous sentence and printing all parses.
    """
    import sys, time

    # Nonterminals; Det/N are split into [-pl]/[+pl] variants so the
    # grammar can enforce number agreement inside NPs.
    S = FeatStructNonterminal('S')
    VP = FeatStructNonterminal('VP')
    NP = FeatStructNonterminal('NP')
    PP = FeatStructNonterminal('PP')
    V = FeatStructNonterminal('V')
    N = FeatStructNonterminal('N')
    P = FeatStructNonterminal('P')
    Name = FeatStructNonterminal('Name')
    Det = FeatStructNonterminal('Det')
    DetSg = FeatStructNonterminal('Det[-pl]')
    DetPl = FeatStructNonterminal('Det[+pl]')
    NSg = FeatStructNonterminal('N[-pl]')
    NPl = FeatStructNonterminal('N[+pl]')

    # Define some grammatical productions.
    grammatical_productions = [
        cfg.Production(S, (NP, VP)), cfg.Production(PP, (P, NP)),
        cfg.Production(NP, (NP, PP)),
        cfg.Production(VP, (VP, PP)), cfg.Production(VP, (V, NP)),
        cfg.Production(VP, (V,)), cfg.Production(NP, (DetPl, NPl)),
        cfg.Production(NP, (DetSg, NSg))]

    # Define some lexical productions.
    lexical_productions = [
        cfg.Production(NP, ('John',)), cfg.Production(NP, ('I',)),
        cfg.Production(Det, ('the',)), cfg.Production(Det, ('my',)),
        cfg.Production(Det, ('a',)),
        cfg.Production(NSg, ('dog',)), cfg.Production(NSg, ('cookie',)),
        cfg.Production(V, ('ate',)), cfg.Production(V, ('saw',)),
        cfg.Production(P, ('with',)), cfg.Production(P, ('under',)),
        ]

    # Build the word -> [POS, ...] lexicon the Earley parser needs
    # from the lexical productions.
    earley_lexicon = defaultdict(list)
    for prod in lexical_productions:
        earley_lexicon[prod.rhs()[0]].append(prod.lhs())
    #print "Lexicon:"
    #print earley_lexicon
    earley_grammar = cfg.Grammar(S, grammatical_productions, earley_lexicon)
    print earley_grammar

    sent = 'I saw John with a dog with my cookie'
    print "Sentence:\n", sent
    tokens = sent.split()
    t = time.time()
    # trace=1: show scanner/completer edges as they are added.
    cp = FeatureEarleyChartParser(earley_grammar, trace=1)
    trees = cp.nbest_parse(tokens)
    print "Time: %s" % (time.time() - t)
    for tree in trees: print tree
491
def run_profile():
    """
    Profile one run of L{demo} and print the 60 hottest entries,
    first ordered by self time, then by cumulative time.
    """
    import profile
    import pstats
    profile.run('for i in range(1): demo()', '/tmp/profile.out')
    stats = pstats.Stats('/tmp/profile.out')
    stats.strip_dirs().sort_stats('time', 'cum').print_stats(60)
    stats.strip_dirs().sort_stats('cum', 'time').print_stats(60)
if __name__ == '__main__':
    # Run the hand-built-grammar demo first...
    demo()
    print
    # ...then parse with a grammar loaded from the data package.
    cp = load_earley('grammars/feat0.fcfg', trace=2)
    sent = 'Kim likes children'
    tokens = sent.split()
    trees = cp.nbest_parse(tokens)
    for tree in trees:
        print tree