10 """
11 Tokenizers that divide strings into substrings using the string
12 C{split()} method.
13
14 These tokenizers follow the standard L{TokenizerI} interface, and so
15 can be used with any code that expects a tokenizer. For example,
16 these tokenizers can be used to specify the tokenization conventions
17 when building a L{CorpusReader<nltk.corpus.reader.api.CorpusReader>}.
18 But if you are tokenizing a string yourself, consider using string
19 C{split()} method directly instead.
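
For example (an illustrative sketch using the C{WhitespaceTokenizer}
defined below on a sample string):

    >>> tokenizer = WhitespaceTokenizer()
    >>> tokenizer.tokenize('Good muffins cost $3.88 in New York.')
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.']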
20 """

from api import *

class WhitespaceTokenizer(TokenizerI):
    r"""
    A tokenizer that divides a string into substrings by treating any
    sequence of whitespace characters as a separator. Whitespace
    characters are space (C{' '}), tab (C{'\t'}), and newline
    (C{'\n'}). If you are performing the tokenization yourself
    (rather than building a tokenizer to pass to some other piece of
    code), consider using the string C{split()} method instead:

        >>> words = s.split()
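
    For example (illustrative; any run of whitespace is one separator):

        >>> 'two  words\tand\nlines'.split()
        ['two', 'words', 'and', 'lines']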
34 """
37
39 r"""
    A tokenizer that divides a string into substrings by treating any
    single space character as a separator. If you are performing the
    tokenization yourself (rather than building a tokenizer to pass to
    some other piece of code), consider using the string C{split()}
    method instead:

        >>> words = s.split(' ')
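
    Note that, unlike C{WhitespaceTokenizer}, consecutive separators
    produce empty-string tokens (illustrative):

        >>> 'two  spaces'.split(' ')
        ['two', '', 'spaces']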
47 """
50
52 r"""
    A tokenizer that divides a string into substrings by treating any
    single tab character as a separator. If you are performing the
    tokenization yourself (rather than building a tokenizer to pass to
    some other piece of code), consider using the string C{split()}
    method instead:

        >>> words = s.split('\t')
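
    For example (illustrative):

        >>> 'one\ttwo\tthree'.split('\t')
        ['one', 'two', 'three']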
60 """
63
65 r"""
    A tokenizer that divides a string into substrings by treating any
    single newline character as a separator. Handling of blank lines
    may be controlled using a constructor parameter.
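
    For example (an illustrative sketch of the C{blanklines} modes):

        >>> LineTokenizer(blanklines='keep').tokenize('One\n\nTwo\n')
        ['One', '', 'Two', '']
        >>> LineTokenizer(blanklines='discard').tokenize('One\n\nTwo\n')
        ['One', 'Two']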
69 """
    def __init__(self, blanklines='discard'):
71 """
72 @param blanklines: Indicates how blank lines should be
73 handled. Valid values are:
74
75 - C{'discard'}: strip blank lines out of the token list
76 before returning it. A line is considered blank if
77 it contains only whitespace characters.
78 - C{'keep'}: leave all blank lines in the token list.
79 - C{'discard-eof'}: if the string ends with a newline,
80 then do not generate a corresponding token C{''} after
81 that newline.
82 """
        valid_blanklines = ('discard', 'keep', 'discard-eof')
        if blanklines not in valid_blanklines:
            raise ValueError('Blank lines must be one of: %s' %
                             ' '.join(valid_blanklines))

        self._blanklines = blanklines

    def tokenize(self, s):
        lines = s.split('\n')
        # Apply the blanklines policy selected at construction time.
        if self._blanklines == 'discard':
            lines = [l for l in lines if l.rstrip()]
        elif self._blanklines == 'discard-eof':
            if lines and not lines[-1].strip():
                lines.pop()
        return lines