Package nltk :: Package tokenize :: Module regexp
[hide private]
[frames] | no frames]

Source Code for Module nltk.tokenize.regexp

  1  # Natural Language Toolkit: Tokenizers 
  2  # 
  3  # Copyright (C) 2001-2008 NLTK Project 
  4  # Author: Edward Loper <[email protected]> 
  5  #         Steven Bird <[email protected]> 
  6  #         Trevor Cohn <[email protected]> 
  7  # URL: <> 
  8  # For license information, see LICENSE.TXT 
 10  """ 
 11  Tokenizers that divide strings into substrings using regular 
 12  expressions that can match either tokens or separators between tokens. 
 13  """ 
 15  import re 
 16  import sre_constants 
 18  from nltk.internals import convert_regexp_to_nongrouping 
 20  from api import * 
class RegexpTokenizer(TokenizerI):
    """
    A tokenizer that splits a string into substrings using a regular
    expression.  The regular expression can be specified to match
    either tokens or separators between tokens.

    Unlike C{re.findall()} and C{re.split()}, C{RegexpTokenizer} does
    not treat regular expressions that contain grouping parentheses
    specially.
    """
    def __init__(self, pattern, gaps=False, discard_empty=True,
                 flags=re.UNICODE | re.MULTILINE | re.DOTALL):
        """
        Construct a new tokenizer that splits strings using the given
        regular expression C{pattern}.  By default, C{pattern} will be
        used to find tokens; but if C{gaps} is set to C{True}, then
        C{pattern} will be used to find separators between tokens
        instead.

        @type pattern: C{str}
        @param pattern: The pattern used to build this tokenizer.
            This pattern may safely contain grouping parentheses.
            If an object with a C{pattern} attribute (e.g. a compiled
            regexp) is given instead, only that pattern string is
            used; it is recompiled with C{flags}.
        @type gaps: C{bool}
        @param gaps: True if this tokenizer's pattern should be used
            to find separators between tokens; False if this
            tokenizer's pattern should be used to find the tokens
            themselves.
        @type discard_empty: C{bool}
        @param discard_empty: True if any empty tokens (C{''})
            generated by the tokenizer should be discarded.  This
            setting is only consulted when C{gaps} is true.
        @type flags: C{int}
        @param flags: The regexp flags used to compile this
            tokenizer's pattern.  By default, the following flags are
            used: C{re.UNICODE | re.MULTILINE | re.DOTALL}.
        @raise ValueError: If the pattern cannot be compiled as a
            regular expression.
        """
        # If they gave us a regexp object, extract the pattern.
        pattern = getattr(pattern, 'pattern', pattern)

        # The pattern used to build this tokenizer.
        self._pattern = pattern

        # True if this tokenizer's pattern should be used to find
        # separators between tokens; False if it should be used to
        # find the tokens themselves.
        self._gaps = gaps

        # True if empty tokens ('') produced while splitting on gaps
        # should be dropped from the result.
        self._discard_empty = discard_empty

        # The flags used to compile this tokenizer's pattern.
        self._flags = flags

        # The compiled regular expression used to tokenize texts.
        self._regexp = None

        # Remove grouping parentheses -- if the regexp contains any
        # grouping parentheses, then the behavior of re.findall and
        # re.split will change.
        nongrouping_pattern = convert_regexp_to_nongrouping(pattern)

        try:
            self._regexp = re.compile(nongrouping_pattern, flags)
        except re.error as e:
            # Report the pattern as the caller supplied it, not the
            # rewritten non-grouping form.
            raise ValueError('Error in regular expression %r: %s' %
                             (pattern, e))

    def tokenize(self, text):
        """
        Divide C{text} into substrings using this tokenizer's regular
        expression.

        @param text: The string to tokenize.
        @return: The list of tokens.  When C{gaps} is true these are
            the pieces of C{text} between separator matches (with
            empty pieces removed if C{discard_empty} is true);
            otherwise they are the non-overlapping matches of the
            pattern.
        """
        # If our regexp matches tokens, use re.findall:
        if not self._gaps:
            return self._regexp.findall(text)

        # Otherwise our regexp matches gaps, so use re.split:
        pieces = self._regexp.split(text)
        if self._discard_empty:
            return [tok for tok in pieces if tok]
        return pieces

    def __repr__(self):
        """
        Return a constructor-style string showing the pattern, gaps,
        discard_empty and flags settings of this tokenizer.
        """
        return ('%s(pattern=%r, gaps=%r, discard_empty=%r, flags=%r)' %
                (self.__class__.__name__, self._pattern, self._gaps,
                 self._discard_empty, self._flags))
class BlanklineTokenizer(RegexpTokenizer):
    r"""
    A tokenizer that divides a string into substrings by treating any
    sequence of blank lines as a separator.  A separator is matched
    by the pattern C{\s*\n\s*\n\s*}: a run of whitespace that
    contains at least two newline characters.
    """
    def __init__(self):
        # The pattern describes the gaps between tokens, not the
        # tokens themselves, hence gaps=True.
        separator = r'\s*\n\s*\n\s*'
        RegexpTokenizer.__init__(self, separator, gaps=True)
class WordPunctTokenizer(RegexpTokenizer):
    r"""
    A tokenizer that divides a text into runs of word characters
    (C{\w+}) and runs of characters that are neither word characters
    nor whitespace (C{[^\w\s]+}).  Whitespace is dropped.  E.g.:

        >>> WordPunctTokenizer().tokenize("She said 'hello'.")
        ['She', 'said', "'", 'hello', "'."]
    """
    def __init__(self):
        # Tokens are matched directly (gaps keeps its default, False).
        token_pattern = r'\w+|[^\w\s]+'
        RegexpTokenizer.__init__(self, token_pattern)
class WordTokenizer(RegexpTokenizer):
    r"""
    A tokenizer that divides a text into runs of word characters
    (C{\w+}).  Everything else is discarded.  E.g.:

        >>> WordTokenizer().tokenize("She said 'hello'.")
        ['She', 'said', 'hello']
    """
    def __init__(self):
        # Tokens are matched directly (gaps keeps its default, False).
        token_pattern = r'\w+'
        RegexpTokenizer.__init__(self, token_pattern)
######################################################################
#{ Tokenization Functions
######################################################################
def regexp_tokenize(text, pattern, gaps=False, discard_empty=True,
                    flags=re.UNICODE | re.MULTILINE | re.DOTALL):
    """
    Split the given text string, based on the given regular expression
    pattern.  This is a one-shot convenience wrapper: it builds a
    L{RegexpTokenizer} from C{pattern}, C{gaps}, C{discard_empty} and
    C{flags}, and returns the result of its C{tokenize(text)}.  See
    the L{RegexpTokenizer} constructor for descriptions of those
    arguments.
    """
    return RegexpTokenizer(pattern, gaps, discard_empty, flags).tokenize(text)

# Ready-made tokenization functions.  Each name is the bound
# ``tokenize`` method of a single tokenizer instance created once at
# import time, so callers can tokenize without building a tokenizer.
blankline_tokenize = BlanklineTokenizer().tokenize
wordpunct_tokenize = WordPunctTokenizer().tokenize
word_tokenize = WordTokenizer().tokenize
