Code Coverage for nltk.tokenize.regexp
"""
Tokenizers that divide strings into substrings using regular
expressions that can match either tokens or separators between tokens.
"""

import re
import sre_constants

from nltk.internals import convert_regexp_to_nongrouping
from api import *


class RegexpTokenizer(TokenizerI):
    """
    A tokenizer that splits a string into substrings using a regular
    expression.  The regular expression can be specified to match
    either tokens or separators between tokens.

    Unlike C{re.findall()} and C{re.split()}, C{RegexpTokenizer} does
    not treat regular expressions that contain grouping parentheses
    specially.
    """
    def __init__(self, pattern, gaps=False, discard_empty=True,
                 flags=re.UNICODE | re.MULTILINE | re.DOTALL):
        """
        Construct a new tokenizer that splits strings using the given
        regular expression C{pattern}.  By default, C{pattern} will be
        used to find tokens; but if C{gaps} is set to C{True}, then
        C{pattern} will be used to find separators between tokens
        instead.

        @type pattern: C{str}
        @param pattern: The pattern used to build this tokenizer.
            This pattern may safely contain grouping parentheses.
        @type gaps: C{bool}
        @param gaps: True if this tokenizer's pattern should be used
            to find separators between tokens; False if this
            tokenizer's pattern should be used to find the tokens
            themselves.
        @type discard_empty: C{bool}
        @param discard_empty: True if any empty tokens (C{''})
            generated by the tokenizer should be discarded.  Empty
            tokens can only be generated if L{_gaps} is true.
        @type flags: C{int}
        @param flags: The regexp flags used to compile this
            tokenizer's pattern.  By default, the following flags are
            used: C{re.UNICODE | re.MULTILINE | re.DOTALL}.
        """
        # If we were given a precompiled regexp object, extract its
        # pattern string, so it can be rewritten below.
        pattern = getattr(pattern, 'pattern', pattern)

        self._pattern = pattern
        """The pattern used to build this tokenizer."""

        self._gaps = gaps
        """True if this tokenizer's pattern should be used to find
        separators between tokens; False if this tokenizer's pattern
        should be used to find the tokens themselves."""

        self._discard_empty = discard_empty
        """True if any empty tokens (C{''}) generated by the tokenizer
        should be discarded.  Empty tokens can only be generated if
        L{_gaps} is true."""

        self._flags = flags
        """The flags used to compile this tokenizer's pattern."""

        self._regexp = None
        """The compiled regular expression used to tokenize texts."""

        # Remove grouping parentheses: if the pattern contained any
        # groups, they would change the behavior of re.findall() and
        # re.split().
        nongrouping_pattern = convert_regexp_to_nongrouping(pattern)
        try:
            self._regexp = re.compile(nongrouping_pattern, flags)
        except re.error, e:
            raise ValueError('Error in regular expression %r: %s' %
                             (pattern, e))

    def tokenize(self, text):
        # If our regexp matches gaps, then re.split() finds the
        # tokens; otherwise, re.findall() finds them directly.
        if self._gaps:
            if self._discard_empty:
                return [tok for tok in self._regexp.split(text) if tok]
            else:
                return self._regexp.split(text)
        else:
            return self._regexp.findall(text)

    def __repr__(self):
        return ('%s(pattern=%r, gaps=%r, discard_empty=%r, flags=%r)' %
                (self.__class__.__name__, self._pattern, self._gaps,
                 self._discard_empty, self._flags))
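
# Illustrative usage (a sketch; expected output assumes the default
# flags above).  The first pattern matches the tokens themselves; the
# second, with gaps=True, matches the whitespace between them.
#
#     >>> tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
#     >>> tokenizer.tokenize("Good muffins cost $3.88 in New York.")
#     ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.']
#     >>> RegexpTokenizer(r'\s+', gaps=True).tokenize("Good muffins cost $3.88")
#     ['Good', 'muffins', 'cost', '$3.88']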


class BlanklineTokenizer(RegexpTokenizer):
    r"""
    A tokenizer that divides a string into substrings by treating any
    sequence of blank lines as a separator.  Blank lines are defined
    as lines containing no characters, or containing only space
    (C{' '}) or tab (C{'\t'}) characters.
    """
    def __init__(self):
        RegexpTokenizer.__init__(self, r'\s*\n\s*\n\s*', gaps=True)
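
# Illustrative usage (a sketch): the pattern matches the gaps, so each
# paragraph separated by a blank line comes back as one token.
#
#     >>> BlanklineTokenizer().tokenize("One fish.\n\nTwo fish.")
#     ['One fish.', 'Two fish.']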


class WordPunctTokenizer(RegexpTokenizer):
    r"""
    A tokenizer that divides a text into sequences of alphabetic and
    non-alphabetic characters.  E.g.:

        >>> WordPunctTokenizer().tokenize("She said 'hello'.")
        ['She', 'said', "'", 'hello', "'."]
    """
    def __init__(self):
        RegexpTokenizer.__init__(self, r'\w+|[^\w\s]+')


class WordTokenizer(RegexpTokenizer):
    r"""
    A tokenizer that divides a text into sequences of word characters
    (matching C{'\w+'}).  Any other characters are discarded.  E.g.:

        >>> WordTokenizer().tokenize("She said 'hello'.")
        ['She', 'said', 'hello']
    """
    def __init__(self):
        RegexpTokenizer.__init__(self, r'\w+')


def regexp_tokenize(text, pattern, gaps=False, discard_empty=True,
                    flags=re.UNICODE | re.MULTILINE | re.DOTALL):
    """
    Split the given text string, based on the given regular expression
    pattern.  See the documentation for L{RegexpTokenizer.tokenize()}
    for descriptions of the arguments.
    """
    tokenizer = RegexpTokenizer(pattern, gaps, discard_empty, flags)
    return tokenizer.tokenize(text)
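
# Illustrative usage (a sketch): regexp_tokenize() is a one-shot
# convenience wrapper around RegexpTokenizer.
#
#     >>> regexp_tokenize("She said 'hello'.", pattern=r"[a-zA-Z']+")
#     ['She', 'said', "'hello'"]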

# Module-level convenience functions, bound to the tokenize() methods
# of singleton instances of the tokenizer classes defined above.
blankline_tokenize = BlanklineTokenizer().tokenize
wordpunct_tokenize = WordPunctTokenizer().tokenize
word_tokenize = WordTokenizer().tokenize
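
# Illustrative usage of the convenience functions (a sketch):
#
#     >>> wordpunct_tokenize("Good muffins cost $3.88.")
#     ['Good', 'muffins', 'cost', '$', '3', '.', '88', '.']
#     >>> word_tokenize("Good muffins cost $3.88.")
#     ['Good', 'muffins', 'cost', '3', '88']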