
Source Code for Module nltk.tokenize.regexp

# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2008 NLTK Project
# Author: Edward Loper <[email protected]>
#         Steven Bird <[email protected]>
#         Trevor Cohn <[email protected]>
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT

"""
Tokenizers that divide strings into substrings using regular
expressions that can match either tokens or separators between tokens.
"""

import re
import sre_constants

from nltk.internals import convert_regexp_to_nongrouping

from api import *

class RegexpTokenizer(TokenizerI):
    """
    A tokenizer that splits a string into substrings using a regular
    expression.  The regular expression can be specified to match
    either tokens or separators between tokens.

    Unlike C{re.findall()} and C{re.split()}, C{RegexpTokenizer} does
    not treat regular expressions that contain grouping parentheses
    specially.
    """
    def __init__(self, pattern, gaps=False, discard_empty=True,
                 flags=re.UNICODE | re.MULTILINE | re.DOTALL):
        """
        Construct a new tokenizer that splits strings using the given
        regular expression C{pattern}.  By default, C{pattern} will be
        used to find tokens; but if C{gaps} is set to C{True}, then
        C{pattern} will be used to find separators between tokens
        instead.

        @type pattern: C{str}
        @param pattern: The pattern used to build this tokenizer.
            This pattern may safely contain grouping parentheses.
        @type gaps: C{bool}
        @param gaps: True if this tokenizer's pattern should be used
            to find separators between tokens; False if this
            tokenizer's pattern should be used to find the tokens
            themselves.
        @type discard_empty: C{bool}
        @param discard_empty: True if any empty tokens (C{''})
            generated by the tokenizer should be discarded.  Empty
            tokens can only be generated if L{_gaps} is true.
        @type flags: C{int}
        @param flags: The regexp flags used to compile this
            tokenizer's pattern.  By default, the following flags are
            used: C{re.UNICODE | re.MULTILINE | re.DOTALL}.
        """
        # If they gave us a regexp object, extract the pattern.
        pattern = getattr(pattern, 'pattern', pattern)

        self._pattern = pattern
        """The pattern used to build this tokenizer."""

        self._gaps = gaps
        """True if this tokenizer's pattern should be used to find
        separators between tokens; False if this tokenizer's pattern
        should be used to find the tokens themselves."""

        self._discard_empty = discard_empty
        """True if any empty tokens (C{''}) generated by the tokenizer
        should be discarded.  Empty tokens can only be generated if
        L{_gaps} is true."""

        self._flags = flags
        """The flags used to compile this tokenizer's pattern."""

        self._regexp = None
        """The compiled regular expression used to tokenize texts."""

        # Remove grouping parentheses -- if the regexp contains any
        # grouping parentheses, then the behavior of re.findall and
        # re.split will change.
        nongrouping_pattern = convert_regexp_to_nongrouping(pattern)

        try:
            self._regexp = re.compile(nongrouping_pattern, flags)
        except re.error, e:
            raise ValueError('Error in regular expression %r: %s' %
                             (pattern, e))

    def tokenize(self, text):
        # If our regexp matches gaps, use re.split:
        if self._gaps:
            if self._discard_empty:
                return [tok for tok in self._regexp.split(text) if tok]
            else:
                return self._regexp.split(text)

        # If our regexp matches tokens, use re.findall:
        else:
            return self._regexp.findall(text)

    def __repr__(self):
        return ('%s(pattern=%r, gaps=%r, discard_empty=%r, flags=%r)' %
                (self.__class__.__name__, self._pattern, self._gaps,
                 self._discard_empty, self._flags))
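
# Illustration (a minimal sketch, not part of the original module source;
# the pattern and input string below are made up): the two modes of
# RegexpTokenizer described in the class docstring, and the point that
# grouping parentheses in the pattern do not alter the output, unlike a
# plain re.split.
#
#     >>> RegexpTokenizer(r'\w+|\$[\d\.]+').tokenize("Pay $2.50 now")
#     ['Pay', '$2.50', 'now']
#     >>> RegexpTokenizer(r'\s+', gaps=True).tokenize("Pay $2.50 now")
#     ['Pay', '$2.50', 'now']
#     >>> RegexpTokenizer(r'(\s+)', gaps=True).tokenize("Pay $2.50 now")
#     ['Pay', '$2.50', 'now']
#     >>> re.split(r'(\s+)', "Pay $2.50 now")   # groups leak into re.split's output
#     ['Pay', ' ', '$2.50', ' ', 'now']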

class BlanklineTokenizer(RegexpTokenizer):
    """
    A tokenizer that divides a string into substrings by treating any
    sequence of blank lines as a separator.  Blank lines are defined
    as lines containing no characters, or containing only space
    (C{' '}) or tab (C{'\t'}) characters.
    """
    def __init__(self):
        RegexpTokenizer.__init__(self, r'\s*\n\s*\n\s*', gaps=True)
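
# Illustration (not part of the original source; the input string is made
# up): BlanklineTokenizer splits on runs of blank lines, so a two-paragraph
# string comes back as one token per paragraph.
#
#     >>> BlanklineTokenizer().tokenize("One fish.\n\nTwo fish.")
#     ['One fish.', 'Two fish.']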

class WordPunctTokenizer(RegexpTokenizer):
    r"""
    A tokenizer that divides a text into sequences of alphabetic and
    non-alphabetic characters.  E.g.:

        >>> WordPunctTokenizer().tokenize("She said 'hello'.")
        ['She', 'said', "'", 'hello', "'."]
    """
    def __init__(self):
        RegexpTokenizer.__init__(self, r'\w+|[^\w\s]+')

class WordTokenizer(RegexpTokenizer):
    """
    A tokenizer that divides a text into sequences of alphabetic
    characters.  Any non-alphabetic characters are discarded.  E.g.:

        >>> WordTokenizer().tokenize("She said 'hello'.")
        ['She', 'said', 'hello']
    """
    def __init__(self):
        RegexpTokenizer.__init__(self, r'\w+')

######################################################################
#{ Tokenization Functions
######################################################################

def regexp_tokenize(text, pattern, gaps=False, discard_empty=True,
                    flags=re.UNICODE | re.MULTILINE | re.DOTALL):
    """
    Split the given text string, based on the given regular expression
    pattern.  See the documentation for L{RegexpTokenizer.tokenize()}
    for descriptions of the arguments.
    """
    tokenizer = RegexpTokenizer(pattern, gaps, discard_empty, flags)
    return tokenizer.tokenize(text)
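
# Illustration (a minimal sketch, not from the original source): the
# convenience function mirrors RegexpTokenizer; the pattern shown here is
# the one WordPunctTokenizer uses.
#
#     >>> regexp_tokenize("She said 'hello'.", pattern=r'\w+|[^\w\s]+')
#     ['She', 'said', "'", 'hello', "'."]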

blankline_tokenize = BlanklineTokenizer().tokenize
wordpunct_tokenize = WordPunctTokenizer().tokenize
word_tokenize = WordTokenizer().tokenize
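
# Illustration (not part of the original source): the module-level
# shortcuts defined above behave exactly like the corresponding tokenizer
# classes, e.g.:
#
#     >>> wordpunct_tokenize("She said 'hello'.")
#     ['She', 'said', "'", 'hello', "'."]
#     >>> word_tokenize("She said 'hello'.")
#     ['She', 'said', 'hello']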