Package nltk :: Package tokenize :: Module sexpr
[hide private]
[frames | no frames]

Source Code for Module nltk.tokenize.sexpr

  1  # Natural Language Toolkit: Tokenizers 
  2  # 
  3  # Copyright (C) 2001-2008 NLTK Project 
  4  # Author: Yoav Goldberg <[email protected]> 
  5  #         Steven Bird <[email protected]> (minor edits) 
  6  # URL: <http://nltk.sourceforge.net> 
  7  # For license information, see LICENSE.TXT 
  8   
  9  """ 
 10  A tokenizer that divides strings into s-expressions.  E.g.: 
 11   
 12      >>> sexpr_tokenize('(a b (c d)) e f (g)') 
 13      ['(a b (c d))', 'e', 'f', '(g)'] 
 14  """ 
 15   
 16  import re 
 17   
 18  from api import * 
 19   
class SExprTokenizer(TokenizerI):
    """
    A tokenizer that divides strings into X{s-expressions}.  An
    s-expression can be either:

      - A parenthesized expression, including any nested parenthesized
        expressions.
      - A sequence of non-whitespace non-parenthesis characters.

    For example, the string C{'(a (b c)) d e (f)'} consists of four
    s-expressions: C{'(a (b c))'}, C{'d'}, C{'e'}, and C{'(f)'}.
    """

    def __init__(self, parens='()', strict=True):
        """
        Construct a new SExpr tokenizer.  By default, the characters
        C{'('} and C{')'} are treated as open and close parentheses;
        but alternative strings may be specified.

        @param parens: A two-element sequence specifying the open and
            close parentheses that should be used to find sexprs.  This
            will typically be either a two-character string, or a list
            of two strings.
        @type parens: C{str} or C{list}
        @param strict: If true, then raise an exception when tokenizing
            an ill-formed sexpr.
        @raise ValueError: If C{parens} does not contain exactly two
            elements.
        """
        if len(parens) != 2:
            raise ValueError('parens must contain exactly two strings')
        self._strict = strict
        self._open_paren = parens[0]
        self._close_paren = parens[1]
        # Both delimiters are escaped so that regexp metacharacters such
        # as '(' or '[' are matched literally.
        self._paren_regexp = re.compile('%s|%s' % (re.escape(parens[0]),
                                                   re.escape(parens[1])))

    def tokenize(self, text):
        """
        Tokenize the text into s-expressions.  For example:

            >>> SExprTokenizer().tokenize('(a b (c d)) e f (g)')
            ['(a b (c d))', 'e', 'f', '(g)']

        Text that follows the last parenthesis is split on whitespace
        just like text between parenthesized expressions:

            >>> SExprTokenizer().tokenize('(a b) c d')
            ['(a b)', 'c', 'd']

        All parentheses are assumed to mark sexprs.  In particular, no
        special processing is done to exclude parentheses that occur
        inside strings, or following backslash characters.

        If the given expression contains non-matching parentheses,
        then the behavior of the tokenizer depends on the C{strict}
        parameter to the constructor.  If C{strict} is C{True}, then
        raise a C{ValueError}.  If C{strict} is C{False}, then any
        unmatched close parentheses will be listed as their own
        s-expression; and the last partial sexpr with unmatched open
        parentheses will be listed as its own sexpr:

            >>> SExprTokenizer(strict=False).tokenize('c) d) e (f (g')
            ['c', ')', 'd', ')', 'e', '(f (g']

        @param text: the string to be tokenized
        @type text: C{str}
        @return: A list of tokens (each of which is an s-expression)
        @rtype: C{list} of C{str}
        @raise ValueError: If C{strict} is true and C{text} contains an
            unmatched open or close parenthesis.
        """
        result = []
        pos = 0      # start of the text that has not been emitted yet
        depth = 0    # current parenthesis nesting level
        for m in self._paren_regexp.finditer(text):
            paren = m.group()
            if depth == 0:
                # Outside any parentheses: everything since the last
                # emitted token is a run of whitespace-separated atoms.
                result += text[pos:m.start()].split()
                pos = m.start()
            if paren == self._open_paren:
                depth += 1
            if paren == self._close_paren:
                if self._strict and depth == 0:
                    raise ValueError('Un-matched close paren at char %d'
                                     % m.start())
                depth = max(0, depth-1)
                if depth == 0:
                    # A top-level expression (or, in non-strict mode, a
                    # stray close paren) has just ended; emit it whole.
                    result.append(text[pos:m.end()])
                    pos = m.end()
        if depth > 0:
            if self._strict:
                raise ValueError('Un-matched open paren at char %d' % pos)
            # Non-strict: keep the unterminated expression as one token.
            result.append(text[pos:])
        else:
            # Trailing text outside any parentheses consists of atoms, so
            # split it on whitespace instead of emitting it as one raw
            # (possibly whitespace-only) token.
            result += text[pos:].split()
        return result
# Module-level convenience function: the bound tokenize() method of a
# default SExprTokenizer (parens '()' and strict=True).
sexpr_tokenize = SExprTokenizer().tokenize
def demo():
    """
    Print a bracket-delimited example string, followed by each
    s-expression that C{SExprTokenizer('[]')} finds in it (one per line).
    """
    example = "d [d [[e] [[f] ss] a a c] d] r [t i]"
    # Single-argument print(...) produces the same output whether it is
    # parsed as a Python 2 print statement or a Python 3 function call.
    print('Input text:')
    print(example)
    print('')
    print('Tokenize s-expressions:')
    for x in SExprTokenizer('[]').tokenize(example):
        print(x)
# Run the demonstration when this module is executed as a script.
if __name__ == '__main__':
    demo()