10 """
11 Tokenizers that divide strings into substrings using regular
12 expressions that can match either tokens or separators between tokens.
13 """
14
15 import re
16 import sre_constants
17
18 from nltk.internals import convert_regexp_to_nongrouping
19
from nltk.tokenize.api import *
21
23 """
24 A tokenizer that splits a string into substrings using a regular
25 expression. The regular expression can be specified to match
26 either tokens or separators between tokens.
27
28 Unlike C{re.findall()} and C{re.split()}, C{RegexpTokenizer} does
29 not treat regular expressions that contain grouping parenthases
30 specially.
31 """
    def __init__(self, pattern, gaps=False, discard_empty=True,
                 flags=re.UNICODE | re.MULTILINE | re.DOTALL):
        """
        Construct a new tokenizer that splits strings using the given
        regular expression C{pattern}.  By default, C{pattern} will be
        used to find tokens; but if C{gaps} is set to C{True}, then
        C{pattern} will be used to find separators between tokens
        instead.

        @type pattern: C{str}
        @param pattern: The pattern used to build this tokenizer.
            This pattern may safely contain grouping parentheses.
        @type gaps: C{bool}
        @param gaps: True if this tokenizer's pattern should be used
            to find separators between tokens; False if this
            tokenizer's pattern should be used to find the tokens
            themselves.
        @type discard_empty: C{bool}
        @param discard_empty: True if any empty tokens (C{''})
            generated by the tokenizer should be discarded.  Empty
            tokens can only be generated if L{_gaps} is true.
        @type flags: C{int}
        @param flags: The regexp flags used to compile this
            tokenizer's pattern.  By default, the following flags are
            used: C{re.UNICODE | re.MULTILINE | re.DOTALL}.
        """
        # If a compiled regexp object was given, use its pattern string.
        pattern = getattr(pattern, 'pattern', pattern)

        self._pattern = pattern
        """The pattern used to build this tokenizer."""

        self._gaps = gaps
        """True if this tokenizer's pattern should be used to find
        separators between tokens; False if this tokenizer's pattern
        should be used to find the tokens themselves."""

        self._discard_empty = discard_empty
        """True if any empty tokens (C{''}) generated by the tokenizer
        should be discarded.  Empty tokens can only be generated if
        L{_gaps} is true."""

        self._flags = flags
        """The flags used to compile this tokenizer's pattern."""

        self._regexp = None
        """The compiled regular expression used to tokenize texts."""

        # Convert any grouping parentheses in the pattern to
        # non-grouping parentheses, so that they do not affect the
        # behavior of re.findall() and re.split().
        nongrouping_pattern = convert_regexp_to_nongrouping(pattern)

        try:
            self._regexp = re.compile(nongrouping_pattern, flags)
        except re.error as e:
            raise ValueError('Error in regular expression %r: %s' %
                             (pattern, e))
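
    # Illustrative note (an assumed example, not from the original source):
    # because the pattern is converted to a non-grouping form above,
    # capturing parentheses do not change what tokenize() returns.  For a
    # hypothetical pattern r'ab(c|d)' applied to the text 'abc abd':
    #
    #   re.findall(r'ab(c|d)', 'abc abd')                 -> ['c', 'd']
    #   RegexpTokenizer(r'ab(c|d)').tokenize('abc abd')   -> ['abc', 'abd']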

    def tokenize(self, text):
        # If the pattern describes gaps, split the text on it;
        # otherwise the pattern describes the tokens themselves.
        if self._gaps:
            if self._discard_empty:
                # Drop any empty strings produced by leading, trailing,
                # or adjacent separators.
                return [tok for tok in self._regexp.split(text) if tok]
            else:
                return self._regexp.split(text)
        else:
            return self._regexp.findall(text)

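    # Illustrative examples (assumed, not from the original source), showing
    # the difference between matching tokens and matching gaps:
    #
    #   >>> RegexpTokenizer(r'\w+').tokenize("Good muffins cost $3.88")
    #   ['Good', 'muffins', 'cost', '3', '88']
    #   >>> RegexpTokenizer(r'\s+', gaps=True).tokenize("Good muffins cost $3.88")
    #   ['Good', 'muffins', 'cost', '$3.88']
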
    def __repr__(self):
        return ('%s(pattern=%r, gaps=%r, discard_empty=%r, flags=%r)' %
                (self.__class__.__name__, self._pattern, self._gaps,
                 self._discard_empty, self._flags))

109 """
110 A tokenizer that divides a string into substrings by treating any
111 sequence of blank lines as a separator. Blank lines are defined
112 as lines containing no characters, or containing only space
113 (C{' '}) or tab (C{'\t'}) characters.
114 """
117
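
    # Illustrative example (assumed, not from the original source), using
    # the gap pattern above:
    #
    #   >>> BlanklineTokenizer().tokenize("One sentence.\n\nAnother one.")
    #   ['One sentence.', 'Another one.']
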
class WordPunctTokenizer(RegexpTokenizer):
    r"""
    A tokenizer that divides a text into sequences of alphabetic and
    non-alphabetic characters.  E.g.:

        >>> WordPunctTokenizer().tokenize("She said 'hello'.")
        ['She', 'said', "'", 'hello', "'."]
    """
    def __init__(self):
        # Match runs of word characters, or runs of non-word,
        # non-space (i.e. punctuation) characters.
        RegexpTokenizer.__init__(self, r'\w+|[^\w\s]+')

130 """
131 A tokenizer that divides a text into sequences of alphabetic
132 characters. Any non-alphabetic characters are discarded. E.g.:
133
134 >>> WordTokenizer().tokenize("She said 'hello'.")
135 ['She', 'said', 'hello']
136 """
139

def regexp_tokenize(text, pattern, gaps=False, discard_empty=True,
                    flags=re.UNICODE | re.MULTILINE | re.DOTALL):
146 """
147 Split the given text string, based on the given regular expression
148 pattern. See the documentation for L{RegexpTokenizer.tokenize()}
149 for descriptions of the arguments.
150 """
    tokenizer = RegexpTokenizer(pattern, gaps, discard_empty, flags)
    return tokenizer.tokenize(text)

blankline_tokenize = BlanklineTokenizer().tokenize
wordpunct_tokenize = WordPunctTokenizer().tokenize
word_tokenize = WordTokenizer().tokenize
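
# A minimal usage sketch (assumed examples, with expected output shown as
# comments), exercising the convenience functions defined above:
#
#   >>> s = "Good muffins cost $3.88\nin New York.\n\nThanks."
#   >>> regexp_tokenize(s, pattern=r'\w+|\$[\d\.]+|\S+')
#   ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.', 'Thanks', '.']
#   >>> blankline_tokenize(s)
#   ['Good muffins cost $3.88\nin New York.', 'Thanks.']
#   >>> wordpunct_tokenize(s)
#   ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.', 'Thanks', '.']
#   >>> word_tokenize(s)
#   ['Good', 'muffins', 'cost', '3', '88', 'in', 'New', 'York', 'Thanks']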