Source Code for Module nltk.cfg

# Natural Language Toolkit: Context Free Grammars
#
# Copyright (C) 2001-2008 NLTK Project
# Author: Steven Bird <[email protected]>
#         Edward Loper <[email protected]>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
#

"""
Basic data classes for representing context free grammars.  A
X{grammar} specifies which trees can represent the structure of a
given text.  Each of these trees is called a X{parse tree} for the
text (or simply a X{parse}).  In a X{context free} grammar, the set of
parse trees for any piece of a text can depend only on that piece, and
not on the rest of the text (i.e., the piece's context).  Context free
grammars are often used to find possible syntactic structures for
sentences.  In this context, the leaves of a parse tree are word
tokens; and the node values are phrasal categories, such as C{NP}
and C{VP}.

The L{Grammar} class is used to encode context free grammars.  Each
C{Grammar} consists of a start symbol and a set of productions.  The
X{start symbol} specifies the root node value for parse trees.  For
example, the start symbol for syntactic parsing is usually C{S}.
Start symbols are encoded using the C{Nonterminal} class, which is
discussed below.

A Grammar's X{productions} specify what parent-child relationships a
parse tree can contain.  Each production specifies that a particular
node can be the parent of a particular set of children.  For example,
the production C{<S> -> <NP> <VP>} specifies that an C{S} node can
be the parent of an C{NP} node and a C{VP} node.

Grammar productions are implemented by the C{Production} class.
Each C{Production} consists of a left hand side and a right hand
side.  The X{left hand side} is a C{Nonterminal} that specifies the
node type for a potential parent; and the X{right hand side} is a list
that specifies allowable children for that parent.  This list
consists of C{Nonterminals} and text types: each C{Nonterminal}
indicates that the corresponding child may be a C{TreeToken} with the
specified node type; and each text type indicates that the
corresponding child may be a C{Token} with that type.

The C{Nonterminal} class is used to distinguish node values from leaf
values.  This prevents the grammar from accidentally using a leaf
value (such as the English word "A") as the node of a subtree.  Within
a C{Grammar}, all node values are wrapped in the C{Nonterminal} class.
Note, however, that the trees that are specified by the grammar do
B{not} include these C{Nonterminal} wrappers.

Grammars can also be given a more procedural interpretation.
According to this interpretation, a Grammar specifies any tree
structure M{tree} that can be produced by the following procedure:

    - Set M{tree} to the start symbol
    - Repeat until M{tree} contains no more nonterminal leaves:
      - Choose a production M{prod} whose left hand side
        M{lhs} is a nonterminal leaf of M{tree}.
      - Replace the nonterminal leaf with a subtree, whose node
        value is the value wrapped by the nonterminal M{lhs}, and
        whose children are the right hand side of M{prod}.

The operation of replacing the left hand side (M{lhs}) of a production
with the right hand side (M{rhs}) in a tree (M{tree}) is known as
X{expanding} M{lhs} to M{rhs} in M{tree}.
"""
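The derivation procedure described in the docstring can be made concrete with a short standalone sketch (a hypothetical toy grammar, not part of this module; written for modern Python, unlike the Python 2 source below):

```python
# Toy grammar: maps each nonterminal symbol to a list of right-hand sides.
TOY_GRAMMAR = {
    'S':   [['NP', 'VP']],
    'NP':  [['Det', 'N']],
    'VP':  [['V', 'NP']],
    'Det': [['the']],
    'N':   [['dog']],
    'V':   [['saw']],
}

def expand(symbol, grammar=TOY_GRAMMAR):
    """Expand `symbol` until no nonterminal leaves remain.

    Deterministically picks each symbol's first production; a real
    generator would choose among the alternatives.
    """
    if symbol not in grammar:            # terminal: becomes a leaf token
        return symbol
    rhs = grammar[symbol][0]             # the chosen production's rhs
    return (symbol, [expand(child, grammar) for child in rhs])

def leaves(tree):
    """Collect the leaf tokens of a derived tree, left to right."""
    if isinstance(tree, str):
        return [tree]
    _node, children = tree
    return [leaf for child in children for leaf in leaves(child)]

print(leaves(expand('S')))   # ['the', 'dog', 'saw', 'the', 'dog']
```

Each recursive call performs one "expand M{lhs} to M{rhs}" step from the procedure above.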
import re

from nltk.featstruct import FeatStruct, FeatDict, FeatStructParser, SLASH, TYPE

#################################################################
# Nonterminal
#################################################################

class Nonterminal(object):
    """
    A non-terminal symbol for a context free grammar.  C{Nonterminal}
    is a wrapper class for node values; it is used by
    C{Production}s to distinguish node values from leaf values.
    The node value that is wrapped by a C{Nonterminal} is known as its
    X{symbol}.  Symbols are typically strings representing phrasal
    categories (such as C{"NP"} or C{"VP"}).  However, more complex
    symbol types are sometimes used (e.g., for lexicalized grammars).
    Since symbols are node values, they must be immutable and
    hashable.  Two C{Nonterminal}s are considered equal if their
    symbols are equal.

    @see: L{Grammar}
    @see: L{Production}
    @type _symbol: (any)
    @ivar _symbol: The node value corresponding to this
        C{Nonterminal}.  This value must be immutable and hashable.
    """
    def __init__(self, symbol):
        """
        Construct a new non-terminal from the given symbol.

        @type symbol: (any)
        @param symbol: The node value corresponding to this
            C{Nonterminal}.  This value must be immutable and
            hashable.
        """
        self._symbol = symbol
        self._hash = hash(symbol)

    def symbol(self):
        """
        @return: The node value corresponding to this C{Nonterminal}.
        @rtype: (any)
        """
        return self._symbol

    def __eq__(self, other):
        """
        @return: True if this non-terminal is equal to C{other}.  In
            particular, return true iff C{other} is a C{Nonterminal}
            and this non-terminal's symbol is equal to C{other}'s
            symbol.
        @rtype: C{boolean}
        """
        return (isinstance(other, self.__class__) and
                self._symbol == other._symbol)

    def __ne__(self, other):
        """
        @return: True if this non-terminal is not equal to C{other}.  In
            particular, return true iff C{other} is not a C{Nonterminal}
            or this non-terminal's symbol is not equal to C{other}'s
            symbol.
        @rtype: C{boolean}
        """
        return not (self == other)

    def __cmp__(self, other):
        # Nonterminals have no natural ordering; this only
        # distinguishes equal from unequal.
        if self == other: return 0
        else: return -1

    def __hash__(self):
        return self._hash

    def __repr__(self):
        """
        @return: A string representation for this C{Nonterminal}.
            The string representation for a C{Nonterminal} whose
            symbol is C{M{s}} is C{<M{s}>}.
        @rtype: C{string}
        """
        if isinstance(self._symbol, basestring):
            return '<%s>' % (self._symbol,)
        else:
            return '<%r>' % (self._symbol,)

    def __str__(self):
        """
        @return: A string representation for this C{Nonterminal}.
            The string representation for a C{Nonterminal} whose
            symbol is C{M{s}} is C{M{s}}.
        @rtype: C{string}
        """
        if isinstance(self._symbol, basestring):
            return '%s' % (self._symbol,)
        else:
            return '%r' % (self._symbol,)

    def __div__(self, rhs):
        """
        @return: A new nonterminal whose symbol is C{M{A}/M{B}}, where
            C{M{A}} is the symbol for this nonterminal, and C{M{B}}
            is the symbol for rhs.
        @rtype: L{Nonterminal}
        @param rhs: The nonterminal used to form the right hand side
            of the new nonterminal.
        @type rhs: L{Nonterminal}
        """
        return Nonterminal('%s/%s' % (self._symbol, rhs._symbol))
def nonterminals(symbols):
    """
    Given a string containing a list of symbol names, return a list of
    C{Nonterminals} constructed from those symbols.

    @param symbols: The symbol name string.  This string can be
        delimited by either spaces or commas.
    @type symbols: C{string}
    @return: A list of C{Nonterminals} constructed from the symbol
        names given in C{symbols}.  The C{Nonterminals} are sorted
        in the same order as the symbol names.
    @rtype: C{list} of L{Nonterminal}
    """
    if ',' in symbols: symbol_list = symbols.split(',')
    else: symbol_list = symbols.split()
    return [Nonterminal(s.strip()) for s in symbol_list]
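The comma-or-whitespace splitting performed by `nonterminals()` can be exercised on its own (a standalone sketch of the same logic, independent of the `Nonterminal` class):

```python
def split_symbols(symbols):
    # Treat the string as comma-delimited if it contains any comma;
    # otherwise fall back to whitespace-delimited.
    parts = symbols.split(',') if ',' in symbols else symbols.split()
    return [s.strip() for s in parts]

print(split_symbols('S, NP, VP, PP'))   # ['S', 'NP', 'VP', 'PP']
print(split_symbols('N V P Det'))       # ['N', 'V', 'P', 'Det']
```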

#################################################################
# Production and Grammar
#################################################################

class Production(object):
    """
    A context-free grammar production.  Each production
    expands a single C{Nonterminal} (the X{left-hand side}) to a
    sequence of terminals and C{Nonterminals} (the X{right-hand
    side}).  X{terminals} can be any immutable hashable object that is
    not a C{Nonterminal}.  Typically, terminals are strings
    representing word types, such as C{"dog"} or C{"under"}.

    Abstractly, a Grammar production indicates that the right-hand side is
    a possible X{instantiation} of the left-hand side.  Grammar
    productions are X{context-free}, in the sense that this
    instantiation should not depend on the context of the left-hand
    side or of the right-hand side.

    @see: L{Grammar}
    @see: L{Nonterminal}
    @type _lhs: L{Nonterminal}
    @ivar _lhs: The left-hand side of the production.
    @type _rhs: C{tuple} of (C{Nonterminal} and (terminal))
    @ivar _rhs: The right-hand side of the production.
    """

    def __init__(self, lhs, rhs):
        """
        Construct a new C{Production}.

        @param lhs: The left-hand side of the new C{Production}.
        @type lhs: L{Nonterminal}
        @param rhs: The right-hand side of the new C{Production}.
        @type rhs: sequence of (C{Nonterminal} and (terminal))
        """
        if isinstance(rhs, (str, unicode)):
            raise TypeError('production right hand side should be a list, '
                            'not a string')
        self._lhs = lhs
        self._rhs = tuple(rhs)
        self._hash = hash((self._lhs, self._rhs))

    def lhs(self):
        """
        @return: the left-hand side of this C{Production}.
        @rtype: L{Nonterminal}
        """
        return self._lhs

    def rhs(self):
        """
        @return: the right-hand side of this C{Production}.
        @rtype: sequence of (C{Nonterminal} and (terminal))
        """
        return self._rhs

    def __str__(self):
        """
        @return: A verbose string representation of the
            C{Production}.
        @rtype: C{string}
        """
        result = '%s ->' % (self._lhs,)
        for elt in self._rhs:
            if isinstance(elt, Nonterminal):
                result += ' %s' % (elt,)
            else:
                result += ' %r' % (elt,)
        return result

    def __repr__(self):
        """
        @return: A concise string representation of the
            C{Production}.
        @rtype: C{string}
        """
        return '%s' % self

    def __eq__(self, other):
        """
        @return: true if this C{Production} is equal to C{other}.
        @rtype: C{boolean}
        """
        return (isinstance(other, self.__class__) and
                self._lhs == other._lhs and
                self._rhs == other._rhs)

    def __ne__(self, other):
        return not (self == other)

    def __cmp__(self, other):
        if not isinstance(other, self.__class__): return -1
        return cmp((self._lhs, self._rhs), (other._lhs, other._rhs))

    def __hash__(self):
        """
        @return: A hash value for the C{Production}.
        @rtype: C{int}
        """
        return self._hash

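The contract described above, an immutable (lhs, rhs) pair with value equality and a precomputed hash, can be illustrated with a minimal standalone class (a simplified stand-in, not the NLTK implementation):

```python
class MiniProduction:
    """A stripped-down production: lhs plus a tuple rhs, hashable by value."""
    def __init__(self, lhs, rhs):
        if isinstance(rhs, str):
            raise TypeError('production right hand side should be a list, '
                            'not a string')
        self._lhs, self._rhs = lhs, tuple(rhs)
        self._hash = hash((self._lhs, self._rhs))   # cached, as above

    def __eq__(self, other):
        return (isinstance(other, MiniProduction) and
                (self._lhs, self._rhs) == (other._lhs, other._rhs))

    def __hash__(self):
        return self._hash

p1 = MiniProduction('S', ['NP', 'VP'])
p2 = MiniProduction('S', ('NP', 'VP'))      # any non-string sequence works
assert p1 == p2 and len({p1, p2}) == 1      # equal, and hash-compatible
```

Coercing `rhs` to a tuple is what makes the production safely hashable, so productions can serve as dictionary keys (as `induce_pcfg` relies on below).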
class Grammar(object):
    """
    A context-free grammar.  A Grammar consists of a start state and a set
    of productions.  The set of terminals and nonterminals is
    implicitly specified by the productions.

    If you need efficient key-based access to productions, you
    can use a subclass to implement it.
    """
    def __init__(self, start, productions, lexicon=None):
        """
        Create a new context-free grammar, from the given start state
        and set of C{Production}s.

        @param start: The start symbol
        @type start: L{Nonterminal}
        @param productions: The list of productions that defines the grammar
        @type productions: C{list} of L{Production}
        """
        self._start = start
        self._productions = productions
        self._lexicon = lexicon
        # Index the productions by lhs, and by the first rhs element,
        # for fast lookup in productions().
        self._lhs_index = {}
        self._rhs_index = {}
        for prod in self._productions:
            if prod._lhs not in self._lhs_index:
                self._lhs_index[prod._lhs] = []
            if prod._rhs and prod._rhs[0] not in self._rhs_index:
                self._rhs_index[prod._rhs[0]] = []
            self._lhs_index[prod._lhs].append(prod)
            if prod._rhs:
                self._rhs_index[prod._rhs[0]].append(prod)

    def start(self):
        return self._start

    # tricky to balance readability and efficiency here!
    # can't use set operations as they don't preserve ordering
    def productions(self, lhs=None, rhs=None):
        # no constraints so return everything
        if not lhs and not rhs:
            return self._productions

        # only lhs specified so look up its index
        elif lhs and not rhs:
            return self._lhs_index.get(lhs, [])

        # only rhs specified so look up its index
        elif rhs and not lhs:
            return self._rhs_index.get(rhs, [])

        # intersect
        else:
            return [prod for prod in self._lhs_index.get(lhs, [])
                    if prod in self._rhs_index.get(rhs, [])]

    def lexicon(self):
        return self._lexicon

    def check_coverage(self, tokens):
        """
        Check whether the grammar rules cover the given list of tokens.
        If not, then raise an exception.
        """
        missing = [tok for tok in tokens
                   if len(self.productions(rhs=tok)) == 0]
        if missing:
            missing = ', '.join('%r' % (w,) for w in missing)
            raise ValueError("Grammar does not cover some of the "
                             "input words: %s." % missing)

    # [xx] does this still get used anywhere, or does check_coverage
    # replace it?
    def covers(self, tokens):
        """
        Check whether the grammar rules cover the given list of tokens.

        @param tokens: the given list of tokens.
        @type tokens: a C{list} of C{string} objects.
        @return: True/False
        """
        for token in tokens:
            if len(self.productions(rhs=token)) == 0:
                return False
        return True

    def __repr__(self):
        return '<Grammar with %d productions>' % len(self._productions)

    def __str__(self):
        result = 'Grammar with %d productions' % len(self._productions)
        result += ' (start state = %s)' % self._start
        for production in self._productions:
            result += '\n    %s' % production
        if self._lexicon:
            result += '\n\n  Lexical Entries\n  ==============='
            for word in sorted(self._lexicon):
                result += '\n  %-15s: %s' % (word, self._lexicon[word])
        return result
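The two-way indexing that `__init__` builds and `productions()` consults amounts to a pair of dict-of-list indexes, one keyed by left-hand side and one keyed by the first right-hand-side element (a standalone sketch using bare tuples in place of `Production` objects):

```python
from collections import defaultdict

# Hypothetical productions as (lhs, rhs) tuples.
prods = [('S', ('NP', 'VP')), ('NP', ('Det', 'N')),
         ('Det', ('the',)), ('N', ('dog',))]

lhs_index = defaultdict(list)
rhs_index = defaultdict(list)    # keyed by the FIRST rhs element only
for prod in prods:
    lhs, rhs = prod
    lhs_index[lhs].append(prod)
    if rhs:                      # skip empty right-hand sides
        rhs_index[rhs[0]].append(prod)

print(lhs_index['NP'])    # [('NP', ('Det', 'N'))]
print(rhs_index['the'])   # [('Det', ('the',))]
```

Indexing by the first rhs element is exactly what makes `covers()` and `check_coverage()` cheap: looking a word up in `rhs_index` finds the lexical productions that could produce it.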

#################################################################
# Parsing CFGs
#################################################################

_PARSE_CFG_RE = re.compile(r'''^\s*                # leading whitespace
                              (\w+(?:/\w+)?)\s*    # lhs
                              (?:[-=]+>)\s*        # arrow
                              (?:(                 # rhs:
                                   "[^"]+"         # double-quoted terminal
                                 | '[^']+'         # single-quoted terminal
                                 | \w+(?:/\w+)?    # non-terminal
                                 | \|              # disjunction
                                 )
                                 \s*)              # trailing space
                                 *$''',            # zero or more copies
                           re.VERBOSE)
_SPLIT_CFG_RE = re.compile(r'''(\w+(?:/\w+)?|[-=]+>|"[^"]+"|'[^']+'|\|)''')


def parse_cfg_production(s):
    """
    Returns a list of productions
    """
    # Use _PARSE_CFG_RE to check that it's valid.
    if not _PARSE_CFG_RE.match(s):
        raise ValueError, 'Bad production string'
    # Use _SPLIT_CFG_RE to process it.
    pieces = _SPLIT_CFG_RE.split(s)
    pieces = [p for i,p in enumerate(pieces) if i%2==1]
    lhside = Nonterminal(pieces[0])
    rhsides = [[]]
    found_terminal = found_non_terminal = False
    for piece in pieces[2:]:
        if piece == '|':
            rhsides.append([])                      # Vertical bar
            found_terminal = found_non_terminal = False
        elif piece[0] in ('"', "'"):
            rhsides[-1].append(piece[1:-1])         # Terminal
            found_terminal = True
        else:
            rhsides[-1].append(Nonterminal(piece))  # Nonterminal
            found_non_terminal = True
        if found_terminal and found_non_terminal:
            raise ValueError('Bad right-hand-side: do not mix '
                             'terminals and non-terminals')
    return [Production(lhside, rhside) for rhside in rhsides]

def parse_cfg(s):
    productions = []
    for linenum, line in enumerate(s.split('\n')):
        line = line.strip()
        if line.startswith('#') or line=='': continue
        try: productions += parse_cfg_production(line)
        except ValueError:
            # Report 1-based line numbers, as parse_fcfg does.
            raise ValueError, 'Unable to parse line %s: %s' % (linenum+1, line)
    if len(productions) == 0:
        raise ValueError, 'No productions found!'
    start = productions[0].lhs()
    return Grammar(start, productions)
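The tokenize-then-group strategy of `parse_cfg_production` can be seen in a standalone sketch that reuses the same token pattern as `_SPLIT_CFG_RE` but returns plain strings instead of `Production` objects:

```python
import re

# Same token pattern as _SPLIT_CFG_RE above.
_SPLIT = re.compile(r'''(\w+(?:/\w+)?|[-=]+>|"[^"]+"|'[^']+'|\|)''')

def parse_line(line):
    """Split one CFG line into (lhs, list of alternative rhs lists)."""
    # Captured tokens land at the odd indices of re.split's output.
    tokens = [t for i, t in enumerate(_SPLIT.split(line)) if i % 2 == 1]
    lhs, alternatives = tokens[0], [[]]
    for tok in tokens[2:]:                       # skip the lhs and the arrow
        if tok == '|':
            alternatives.append([])              # start a new alternative
        elif tok[0] in '"\'':
            alternatives[-1].append(tok[1:-1])   # quoted terminal, unquoted
        else:
            alternatives[-1].append(tok)         # nonterminal name
    return lhs, alternatives

print(parse_line("NP -> Det N | 'John'"))
# ('NP', [['Det', 'N'], ['John']])
```

The odd-index trick works because `re.split` with a capturing group alternates separator text (even indices) with captured tokens (odd indices).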

#################################################################
# Weighted Production and Grammar
#################################################################

from nltk.probability import ImmutableProbabilisticMixIn

class WeightedProduction(Production, ImmutableProbabilisticMixIn):
    """
    A probabilistic context free grammar production.
    PCFG C{WeightedProduction}s are essentially just C{Production}s that
    have probabilities associated with them.  These probabilities are
    used to record how likely it is that a given production will
    be used.  In particular, the probability of a C{WeightedProduction}
    records the likelihood that its right-hand side is the correct
    instantiation for any given occurrence of its left-hand side.

    @see: L{Production}
    """
    def __init__(self, lhs, rhs, **prob):
        """
        Construct a new C{WeightedProduction}.

        @param lhs: The left-hand side of the new C{WeightedProduction}.
        @type lhs: L{Nonterminal}
        @param rhs: The right-hand side of the new C{WeightedProduction}.
        @type rhs: sequence of (C{Nonterminal} and (terminal))
        @param prob: Probability parameters of the new C{WeightedProduction}.
        """
        ImmutableProbabilisticMixIn.__init__(self, **prob)
        Production.__init__(self, lhs, rhs)

    def __str__(self):
        return Production.__str__(self) + ' [%s]' % self.prob()

    def __eq__(self, other):
        return (isinstance(other, self.__class__) and
                self._lhs == other._lhs and
                self._rhs == other._rhs and
                self.prob() == other.prob())

    def __ne__(self, other):
        return not (self == other)

    def __hash__(self):
        return hash((self._lhs, self._rhs, self.prob()))

class WeightedGrammar(Grammar):
    """
    A probabilistic context-free grammar.  A Weighted Grammar consists
    of a start state and a set of weighted productions.  The set of
    terminals and nonterminals is implicitly specified by the
    productions.

    PCFG productions should be C{WeightedProduction}s.
    C{WeightedGrammar}s impose the constraint that the set of
    productions with any given left-hand-side must have probabilities
    that sum to 1.

    If you need efficient key-based access to productions, you can use
    a subclass to implement it.

    @type EPSILON: C{float}
    @cvar EPSILON: The acceptable margin of error for checking that
        productions with a given left-hand side have probabilities
        that sum to 1.
    """
    EPSILON = 0.01

    def __init__(self, start, productions):
        """
        Create a new context-free grammar, from the given start state
        and set of C{WeightedProduction}s.

        @param start: The start symbol
        @type start: L{Nonterminal}
        @param productions: The list of productions that defines the grammar
        @type productions: C{list} of C{Production}
        @raise ValueError: if the set of productions with any left-hand-side
            do not have probabilities that sum to a value within
            EPSILON of 1.
        """
        Grammar.__init__(self, start, productions)

        # Make sure that the probabilities sum to one.
        probs = {}
        for production in productions:
            probs[production.lhs()] = (probs.get(production.lhs(), 0) +
                                       production.prob())
        for (lhs, p) in probs.items():
            if not ((1-WeightedGrammar.EPSILON) < p <
                    (1+WeightedGrammar.EPSILON)):
                raise ValueError("Productions for %r do not sum to 1" % lhs)

# Contributed by Nathan Bodenstab <[email protected]>

def induce_pcfg(start, productions):
    """
    Induce a PCFG grammar from a list of productions.

    The probability of a production A -> B C in a PCFG is:

    |                 count(A -> B C)
    |  P(B, C | A) = ---------------       where * is any right hand side
    |                 count(A -> *)

    @param start: The start symbol
    @type start: L{Nonterminal}
    @param productions: The list of productions that defines the grammar
    @type productions: C{list} of L{Production}
    """

    # Production count: the number of times a given production occurs
    pcount = {}

    # LHS-count: counts the number of times a given lhs occurs
    lcount = {}

    for prod in productions:
        lcount[prod.lhs()] = lcount.get(prod.lhs(), 0) + 1
        pcount[prod] = pcount.get(prod, 0) + 1

    prods = [WeightedProduction(p.lhs(), p.rhs(),
                                prob=float(pcount[p]) / lcount[p.lhs()])
             for p in pcount]
    return WeightedGrammar(start, prods)
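The counting formula in `induce_pcfg`'s docstring can be verified on a toy sample (a standalone sketch with (lhs, rhs) tuples standing in for `Production` objects):

```python
from collections import Counter

# Hypothetical observed productions, e.g. read off treebank trees.
observed = [('S', ('NP', 'VP')),
            ('NP', ('Det', 'N')), ('NP', ('Det', 'N')),
            ('NP', ('NP', 'PP'))]

pcount = Counter(observed)                        # count(A -> B C)
lcount = Counter(lhs for lhs, _rhs in observed)   # count(A -> *)

# P(B, C | A) = count(A -> B C) / count(A -> *)
probs = {prod: pcount[prod] / lcount[prod[0]] for prod in pcount}

assert probs[('S', ('NP', 'VP'))] == 1.0
assert abs(probs[('NP', ('Det', 'N'))] - 2/3) < 1e-9
# Probabilities for each lhs sum to 1, as WeightedGrammar requires.
assert abs(sum(p for (lhs, _), p in probs.items() if lhs == 'NP') - 1) < 1e-9
```

Because the estimate is a ratio of counts over the same left-hand side, the per-lhs probabilities always sum to 1, which is why the induced grammar passes `WeightedGrammar`'s EPSILON check.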

#################################################################
# Parsing PCFGs
#################################################################

_PARSE_PCFG_RE = re.compile(r'''^\s*                 # leading whitespace
                               (\w+(?:/\w+)?)\s*     # lhs
                               (?:[-=]+>)\s*         # arrow
                               (?:(                  # rhs:
                                    "[^"]+"          # double-quoted terminal
                                  | '[^']+'          # single-quoted terminal
                                  | \w+(?:/\w+)?     # non-terminal
                                  | \[[01]?\.\d+\]   # probability
                                  | \|               # disjunction
                                  )
                                  \s*)               # trailing space
                                  *$''',             # zero or more copies
                            re.VERBOSE)
_SPLIT_PCFG_RE = re.compile(r'(\w+(?:/\w+)?|\[[01]?\.\d+\]|[-=]+>|"[^"]+"'
                            r"|'[^']+'|\|)")

def parse_pcfg_production(s):
    """
    Returns a list of PCFG productions
    """
    # Use _PARSE_PCFG_RE to check that it's valid.
    if not _PARSE_PCFG_RE.match(s):
        raise ValueError, 'Bad production string'
    # Use _SPLIT_PCFG_RE to process it.
    pieces = _SPLIT_PCFG_RE.split(s)
    pieces = [p for i,p in enumerate(pieces) if i%2==1]
    lhside = Nonterminal(pieces[0])
    rhsides = [[]]
    probabilities = [0.0]
    found_terminal = found_non_terminal = False
    for piece in pieces[2:]:
        if piece == '|':
            rhsides.append([])                      # Vertical bar
            probabilities.append(0.0)
            found_terminal = found_non_terminal = False
        elif piece[0] in ('"', "'"):
            if found_terminal:
                raise ValueError('Bad right-hand-side: do not use '
                                 'a sequence of terminals')
            rhsides[-1].append(piece[1:-1])         # Terminal
            found_terminal = True
        elif piece[0] == '[':
            probabilities[-1] = float(piece[1:-1])  # Probability
        else:
            rhsides[-1].append(Nonterminal(piece))  # Nonterminal
            found_non_terminal = True
        if found_terminal and found_non_terminal:
            raise ValueError('Bad right-hand-side: do not mix '
                             'terminals and non-terminals')
    return [WeightedProduction(lhside, rhside, prob=probability)
            for (rhside, probability) in zip(rhsides, probabilities)]

def parse_pcfg(s):
    productions = []
    for linenum, line in enumerate(s.split('\n')):
        line = line.strip()
        if line.startswith('#') or line=='': continue
        try: productions += parse_pcfg_production(line)
        except ValueError:
            # Report 1-based line numbers, as parse_fcfg does.
            raise ValueError, 'Unable to parse line %s: %s' % (linenum+1, line)
    if len(productions) == 0:
        raise ValueError, 'No productions found!'
    start = productions[0].lhs()
    return WeightedGrammar(start, productions)

#################################################################
# Parsing Feature-based CFGs
#################################################################

def earley_lexicon(productions):
    """
    Convert CFG lexical productions into a dictionary indexed
    by the lexical string.
    """
    from nltk import defaultdict
    lexicon = defaultdict(list)
    for prod in productions:
        lexicon[prod.rhs()[0]].append(prod.lhs())
    return lexicon

class FeatStructNonterminal(FeatDict, Nonterminal):
    """A feature structure that's also a nonterminal.  It acts as its
    own symbol, and automatically freezes itself when hashed."""
    def __hash__(self):
        self.freeze()
        return FeatStruct.__hash__(self)
    def symbol(self):
        return self

def parse_fcfg_production(line, fstruct_parser):
    pos = 0

    # Parse the left-hand side.
    lhs, pos = fstruct_parser.partial_parse(line, pos)

    # Skip over the arrow.
    m = re.compile(r'\s*->\s*').match(line, pos)
    if not m: raise ValueError('Expected an arrow')
    pos = m.end()

    # Parse the right hand side.
    rhsides = [[]]
    while pos < len(line):
        # Quoted string -- add a terminal.
        if line[pos] in "\'\"":
            m = re.compile(r'("[^"]*"|' + r"'[^']+')\s*").match(line, pos)
            if not m: raise ValueError('Unterminated string')
            if rhsides[-1] != []: raise ValueError('Bad right-hand-side')
            rhsides[-1].append(m.group(1)[1:-1])
            pos = m.end()

        # Vertical bar -- start a new rhside.
        elif line[pos] == '|':
            # A lone terminal may not be followed by a disjunction.
            if len(rhsides[-1])==1 and isinstance(rhsides[-1][0], basestring):
                raise ValueError('Bad right-hand-side')
            rhsides.append([])
            pos = re.compile(r'\|\s*').match(line, pos).end()

        # Anything else -- feature structure nonterminal.
        else:
            fstruct, pos = fstruct_parser.partial_parse(line, pos)
            rhsides[-1].append(fstruct)

    return [Production(lhs, rhs) for rhs in rhsides]

def parse_fcfg(input, features=None):
    """
    Return a L{Grammar} built from the grammatical productions in the
    input, with the lexical productions collected into the grammar's
    lexicon.

    @param input: a grammar, either in the form of a string or else
        as a list of strings.
    """
    if features is None:
        features = (SLASH, TYPE)
    fstruct_parser = FeatStructParser(features, FeatStructNonterminal)
    if isinstance(input, str):
        lines = input.split('\n')
    else:
        lines = input

    start = None
    productions = []
    for linenum, line in enumerate(lines):
        line = line.strip()
        if line.startswith('#') or line=='': continue
        if line[0] == '%':
            directive, args = line[1:].split(None, 1)
            if directive == 'start':
                start = fstruct_parser.parse(args)
#             elif directive == 'include':
#                 filename = args.strip('"')
#                 # [XX] This is almost certainly a bug: [XX]
#                 self.apply_file(filename)
        else:
            try:
                # expand out the disjunctions on the RHS
                productions += parse_fcfg_production(line, fstruct_parser)
            except ValueError, e:
                raise ValueError('Unable to parse line %s: %s\n%s' %
                                 (linenum+1, line, e))

    if not productions:
        raise ValueError, 'No productions found!'
    grammatical_productions = [prod for prod in productions if not
        (len(prod.rhs()) == 1 and isinstance(prod.rhs()[0], str))]
    lexical_productions = [prod for prod in productions if
        (len(prod.rhs()) == 1 and isinstance(prod.rhs()[0], str))]
    if not start:
        start = productions[0].lhs()
    lexicon = earley_lexicon(lexical_productions)
    grammar = Grammar(start, grammatical_productions, lexicon)
    return grammar

from nltk.internals import deprecated
@deprecated("Use nltk.cfg.parse_fcfg() instead.")
def parse_featcfg(input):
    return parse_fcfg(input)

#################################################################
# Demonstration
#################################################################

def cfg_demo():
    """
    A demonstration showing how C{Grammar}s can be created and used.
    """

    from nltk import cfg

    # Create some nonterminals
    S, NP, VP, PP = cfg.nonterminals('S, NP, VP, PP')
    N, V, P, Det = cfg.nonterminals('N, V, P, Det')
    VP_slash_NP = VP/NP

    print 'Some nonterminals:', [S, NP, VP, PP, N, V, P, Det, VP/NP]
    print '    S.symbol() =>', `S.symbol()`
    print

    print cfg.Production(S, [NP])

    # Create some Grammar Productions
    grammar = cfg.parse_cfg("""
      S -> NP VP
      PP -> P NP
      NP -> Det N | NP PP
      VP -> V NP | VP PP
      Det -> 'a' | 'the'
      N -> 'dog' | 'cat'
      V -> 'chased' | 'sat'
      P -> 'on' | 'in'
    """)

    print 'A Grammar:', `grammar`
    print '    grammar.start()       =>', `grammar.start()`
    print '    grammar.productions() =>',
    # string.replace(...) is used to line-wrap the output.
    print `grammar.productions()`.replace(',', ',\n'+' '*25)
    print

    print 'Coverage of input words by a grammar:'
    print grammar.covers(['a','dog'])
    print grammar.covers(['a','toy'])

toy_pcfg1 = parse_pcfg("""
    S -> NP VP [1.0]
    NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
    Det -> 'the' [0.8] | 'my' [0.2]
    N -> 'man' [0.5] | 'telescope' [0.5]
    VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
    V -> 'ate' [0.35] | 'saw' [0.65]
    PP -> P NP [1.0]
    P -> 'with' [0.61] | 'under' [0.39]
    """)

toy_pcfg2 = parse_pcfg("""
    S    -> NP VP         [1.0]
    VP   -> V NP          [.59]
    VP   -> V             [.40]
    VP   -> VP PP         [.01]
    NP   -> Det N         [.41]
    NP   -> Name          [.28]
    NP   -> NP PP         [.31]
    PP   -> P NP          [1.0]
    V    -> 'saw'         [.21]
    V    -> 'ate'         [.51]
    V    -> 'ran'         [.28]
    N    -> 'boy'         [.11]
    N    -> 'cookie'      [.12]
    N    -> 'table'       [.13]
    N    -> 'telescope'   [.14]
    N    -> 'hill'        [.5]
    Name -> 'Jack'        [.52]
    Name -> 'Bob'         [.48]
    P    -> 'with'        [.61]
    P    -> 'under'       [.39]
    Det  -> 'the'         [.41]
    Det  -> 'a'           [.31]
    Det  -> 'my'          [.28]
    """)

def pcfg_demo():
    """
    A demonstration showing how PCFG C{Grammar}s can be created and used.
    """

    from nltk.corpus import treebank
    from nltk import cfg, treetransforms
    from nltk.parse import pchart

    pcfg_prods = cfg.toy_pcfg1.productions()

    pcfg_prod = pcfg_prods[2]
    print 'A PCFG production:', `pcfg_prod`
    print '    pcfg_prod.lhs()  =>', `pcfg_prod.lhs()`
    print '    pcfg_prod.rhs()  =>', `pcfg_prod.rhs()`
    print '    pcfg_prod.prob() =>', `pcfg_prod.prob()`
    print

    grammar = cfg.toy_pcfg2
    print 'A PCFG grammar:', `grammar`
    print '    grammar.start()       =>', `grammar.start()`
    print '    grammar.productions() =>',
    # string.replace(...) is used to line-wrap the output.
    print `grammar.productions()`.replace(',', ',\n'+' '*26)
    print

    print 'Coverage of input words by a grammar:'
    print grammar.covers(['a','boy'])
    print grammar.covers(['a','girl'])

    # extract productions from the first two treebank files and induce the PCFG
    print "Induce PCFG grammar from treebank data:"

    productions = []
    for item in treebank.items[:2]:
        for tree in treebank.parsed_sents(item):
            # perform optional tree transformations, e.g.:
            tree.collapse_unary(collapsePOS=False)
            tree.chomsky_normal_form(horzMarkov=2)

            productions += tree.productions()

    S = Nonterminal('S')
    grammar = cfg.induce_pcfg(S, productions)
    print grammar
    print

    print "Parse sentence using induced grammar:"

    parser = pchart.InsideChartParser(grammar)
    parser.trace(3)

    # doesn't work as tokens are different:
    # sent = treebank.tokenized('wsj_0001.mrg')[0]

    sent = treebank.parsed_sents('wsj_0001.mrg')[0].leaves()
    print sent
    for parse in parser.nbest_parse(sent):
        print parse

def fcfg_demo():
    import nltk.data
    g = nltk.data.load('grammars/feat0.fcfg')
    print g
    print

def demo():
    cfg_demo()
    pcfg_demo()
    fcfg_demo()

if __name__ == '__main__':
    demo()

__all__ = ['Grammar', 'ImmutableProbabilisticMixIn', 'Nonterminal',
           'Production', 'WeightedGrammar', 'WeightedProduction',
           'cfg_demo', 'demo', 'induce_pcfg', 'nonterminals', 'parse_cfg',
           'parse_cfg_production', 'parse_pcfg', 'parse_fcfg',
           'parse_pcfg_production', 'pcfg_demo', 'toy_pcfg1', 'toy_pcfg2']