10 """
11 Tokenizers that divide strings into substrings using the string
12 C{split()} method.
13
14 These tokenizers follow the standard L{TokenizerI} interface, and so
15 can be used with any code that expects a tokenizer. For example,
16 these tokenizers can be used to specify the tokenization conventions
17 when building a L{CorpusReader<nltk.corpus.reader.api.CorpusReader>}.
18 But if you are tokenizing a string yourself, consider using string
19 C{split()} method directly instead.
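
For example (an illustrative sketch using the C{WhitespaceTokenizer}
defined below on a sample string):

    >>> tokenizer = WhitespaceTokenizer()
    >>> tokenizer.tokenize('Good muffins cost $3.88 in New York.')
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.']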
20 """

from api import *

class WhitespaceTokenizer(TokenizerI):
    r"""
    A tokenizer that divides a string into substrings by treating any
    sequence of whitespace characters as a separator. Whitespace
    characters are space (C{' '}), tab (C{'\t'}), and newline
    (C{'\n'}). If you are performing the tokenization yourself
    (rather than building a tokenizer to pass to some other piece of
    code), consider using the string C{split()} method instead:

        >>> words = s.split()
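
    For example (illustrative; any run of whitespace is one separator):

        >>> 'two  words\tand\nlines'.split()
        ['two', 'words', 'and', 'lines']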
34 """
37
39 r"""
    A tokenizer that divides a string into substrings by treating any
    single space character as a separator. If you are performing the
    tokenization yourself (rather than building a tokenizer to pass to
    some other piece of code), consider using the string C{split()}
    method instead:

        >>> words = s.split(' ')
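
    Note that, unlike C{WhitespaceTokenizer}, consecutive separators
    produce empty-string tokens (illustrative):

        >>> 'two  spaces'.split(' ')
        ['two', '', 'spaces']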
47 """
50
52 r"""
    A tokenizer that divides a string into substrings by treating any
    single tab character as a separator. If you are performing the
    tokenization yourself (rather than building a tokenizer to pass to
    some other piece of code), consider using the string C{split()}
    method instead:

        >>> words = s.split('\t')
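
    For example (illustrative):

        >>> 'one\ttwo\tthree'.split('\t')
        ['one', 'two', 'three']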
60 """
63
65 r"""
    A tokenizer that divides a string into substrings by treating any
    single newline character as a separator. Handling of blank lines
    may be controlled using a constructor parameter.
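
    For example (an illustrative sketch of the C{blanklines} modes):

        >>> LineTokenizer(blanklines='keep').tokenize('One\n\nTwo\n')
        ['One', '', 'Two', '']
        >>> LineTokenizer(blanklines='discard').tokenize('One\n\nTwo\n')
        ['One', 'Two']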
69 """
    def __init__(self, blanklines='discard'):
71 """
72 @param blanklines: Indicates how blank lines should be
73 handled. Valid values are:
74
75 - C{'discard'}: strip blank lines out of the token list
76 before returning it. A line is considered blank if
77 it contains only whitespace characters.
78 - C{'keep'}: leave all blank lines in the token list.
79 - C{'discard-eof'}: if the string ends with a newline,
80 then do not generate a corresponding token C{''} after
81 that newline.
82 """
        valid_blanklines = ('discard', 'keep', 'discard-eof')
        if blanklines not in valid_blanklines:
            raise ValueError('Blank lines must be one of: %s' %
                             ' '.join(valid_blanklines))

        self._blanklines = blanklines

    def tokenize(self, s):
        lines = s.split('\n')
        # Apply the blanklines policy selected at construction time.
        if self._blanklines == 'discard':
            lines = [l for l in lines if l.rstrip()]
        elif self._blanklines == 'discard-eof':
            if lines and not lines[-1].strip():
                lines.pop()
        return lines