10 """
11 Tokenizers that divide strings into substrings using regular
12 expressions that can match either tokens or separators between tokens.
13 """
14
15 import re
16 import sre_constants
17
18 from nltk.internals import convert_regexp_to_nongrouping
19
from nltk.tokenize.api import *
21
23 """
24 A tokenizer that splits a string into substrings using a regular
25 expression. The regular expression can be specified to match
26 either tokens or separators between tokens.
27
28 Unlike C{re.findall()} and C{re.split()}, C{RegexpTokenizer} does
29 not treat regular expressions that contain grouping parenthases
30 specially.
31 """
    def __init__(self, pattern, gaps=False, discard_empty=True,
                 flags=re.UNICODE | re.MULTILINE | re.DOTALL):
        """
        Construct a new tokenizer that splits strings using the given
        regular expression C{pattern}.  By default, C{pattern} will be
        used to find tokens; but if C{gaps} is set to C{True}, then
        C{pattern} will be used to find separators between tokens
        instead.

        @type pattern: C{str}
        @param pattern: The pattern used to build this tokenizer.
            This pattern may safely contain grouping parentheses.
        @type gaps: C{bool}
        @param gaps: True if this tokenizer's pattern should be used
            to find separators between tokens; False if this
            tokenizer's pattern should be used to find the tokens
            themselves.
        @type discard_empty: C{bool}
        @param discard_empty: True if any empty tokens (C{''})
            generated by the tokenizer should be discarded.  Empty
            tokens can only be generated if L{_gaps} is true.
        @type flags: C{int}
        @param flags: The regexp flags used to compile this
            tokenizer's pattern.  By default, the following flags are
            used: C{re.UNICODE | re.MULTILINE | re.DOTALL}.
        """
        # If a compiled regexp object was given, use its pattern string.
        pattern = getattr(pattern, 'pattern', pattern)

        self._pattern = pattern
        """The pattern used to build this tokenizer."""

        self._gaps = gaps
        """True if this tokenizer's pattern should be used to find
        separators between tokens; False if this tokenizer's pattern
        should be used to find the tokens themselves."""

        self._discard_empty = discard_empty
        """True if any empty tokens (C{''}) generated by the tokenizer
        should be discarded.  Empty tokens can only be generated if
        L{_gaps} is true."""

        self._flags = flags
        """The flags used to compile this tokenizer's pattern."""

        self._regexp = None
        """The compiled regular expression used to tokenize texts."""

        # Convert any grouping parentheses in the pattern to
        # non-grouping parentheses, so that they do not affect the
        # behavior of re.findall() and re.split().
        nongrouping_pattern = convert_regexp_to_nongrouping(pattern)

        try:
            self._regexp = re.compile(nongrouping_pattern, flags)
        except re.error as e:
            raise ValueError('Error in regular expression %r: %s' %
                             (pattern, e))
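
    # Illustrative note (an assumed example, not from the original source):
    # because the pattern is converted to a non-grouping form above,
    # capturing parentheses do not change what tokenize() returns.  For a
    # hypothetical pattern r'ab(c|d)' applied to the text 'abc abd':
    #
    #   re.findall(r'ab(c|d)', 'abc abd')                 -> ['c', 'd']
    #   RegexpTokenizer(r'ab(c|d)').tokenize('abc abd')   -> ['abc', 'abd']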

    def tokenize(self, text):
        # If the pattern describes gaps, split the text on it;
        # otherwise the pattern describes the tokens themselves.
        if self._gaps:
            if self._discard_empty:
                # Drop any empty strings produced by leading, trailing,
                # or adjacent separators.
                return [tok for tok in self._regexp.split(text) if tok]
            else:
                return self._regexp.split(text)
        else:
            return self._regexp.findall(text)

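    # Illustrative examples (assumed, not from the original source), showing
    # the difference between matching tokens and matching gaps:
    #
    #   >>> RegexpTokenizer(r'\w+').tokenize("Good muffins cost $3.88")
    #   ['Good', 'muffins', 'cost', '3', '88']
    #   >>> RegexpTokenizer(r'\s+', gaps=True).tokenize("Good muffins cost $3.88")
    #   ['Good', 'muffins', 'cost', '$3.88']
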
    def __repr__(self):
        return ('%s(pattern=%r, gaps=%r, discard_empty=%r, flags=%r)' %
                (self.__class__.__name__, self._pattern, self._gaps,
                 self._discard_empty, self._flags))

109 """
110 A tokenizer that divides a string into substrings by treating any
111 sequence of blank lines as a separator. Blank lines are defined
112 as lines containing no characters, or containing only space
113 (C{' '}) or tab (C{'\t'}) characters.
114 """
117
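
    # Illustrative example (assumed, not from the original source), using
    # the gap pattern above:
    #
    #   >>> BlanklineTokenizer().tokenize("One sentence.\n\nAnother one.")
    #   ['One sentence.', 'Another one.']
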
class WordPunctTokenizer(RegexpTokenizer):
    r"""
    A tokenizer that divides a text into sequences of alphabetic and
    non-alphabetic characters.  E.g.:

        >>> WordPunctTokenizer().tokenize("She said 'hello'.")
        ['She', 'said', "'", 'hello', "'."]
    """
    def __init__(self):
        # Match runs of word characters, or runs of non-word,
        # non-space (i.e. punctuation) characters.
        RegexpTokenizer.__init__(self, r'\w+|[^\w\s]+')

130 """
131 A tokenizer that divides a text into sequences of alphabetic
132 characters. Any non-alphabetic characters are discarded. E.g.:
133
134 >>> WordTokenizer().tokenize("She said 'hello'.")
135 ['She', 'said', 'hello']
136 """
139

def regexp_tokenize(text, pattern, gaps=False, discard_empty=True,
                    flags=re.UNICODE | re.MULTILINE | re.DOTALL):
146 """
147 Split the given text string, based on the given regular expression
148 pattern. See the documentation for L{RegexpTokenizer.tokenize()}
149 for descriptions of the arguments.
150 """
    tokenizer = RegexpTokenizer(pattern, gaps, discard_empty, flags)
    return tokenizer.tokenize(text)

blankline_tokenize = BlanklineTokenizer().tokenize
wordpunct_tokenize = WordPunctTokenizer().tokenize
word_tokenize = WordTokenizer().tokenize
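
# A minimal usage sketch (assumed examples, with expected output shown as
# comments), exercising the convenience functions defined above:
#
#   >>> s = "Good muffins cost $3.88\nin New York.\n\nThanks."
#   >>> regexp_tokenize(s, pattern=r'\w+|\$[\d\.]+|\S+')
#   ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.', 'Thanks', '.']
#   >>> blankline_tokenize(s)
#   ['Good muffins cost $3.88\nin New York.', 'Thanks.']
#   >>> wordpunct_tokenize(s)
#   ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.', 'Thanks', '.']
#   >>> word_tokenize(s)
#   ['Good', 'muffins', 'cost', '3', '88', 'in', 'New', 'York', 'Thanks']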