
Source Code for Module nltk.tokenize.simple

# Natural Language Toolkit: Simple Tokenizers
#
# Copyright (C) 2001-2008 NLTK Project
# Author: Edward Loper <[email protected]>
#         Steven Bird <[email protected]>
#         Trevor Cohn <[email protected]>
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT

"""
Tokenizers that divide strings into substrings using the string
C{split()} method.

These tokenizers follow the standard L{TokenizerI} interface, and so
can be used with any code that expects a tokenizer.  For example,
these tokenizers can be used to specify the tokenization conventions
when building a L{CorpusReader<nltk.corpus.reader.api.CorpusReader>}.
But if you are tokenizing a string yourself, consider using the
string C{split()} method directly instead.
"""

from api import *

class WhitespaceTokenizer(TokenizerI):
    r"""
    A tokenizer that divides a string into substrings by treating any
    sequence of whitespace characters as a separator.  Whitespace
    characters are space (C{' '}), tab (C{'\t'}), and newline
    (C{'\n'}).  If you are performing the tokenization yourself
    (rather than building a tokenizer to pass to some other piece of
    code), consider using the string C{split()} method instead:

        >>> words = s.split()
    """
    def tokenize(self, s):
        return s.split()
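A minimal usage sketch (illustrative, not part of the module source above); the sample string is made up, and the import path assumes the class is used from this module, nltk.tokenize.simple:

    >>> from nltk.tokenize.simple import WhitespaceTokenizer
    >>> WhitespaceTokenizer().tokenize("Good muffins cost $3.88\nin New York.")
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.']
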
class SpaceTokenizer(TokenizerI):
    r"""
    A tokenizer that divides a string into substrings by treating any
    single space character as a separator.  If you are performing the
    tokenization yourself (rather than building a tokenizer to pass to
    some other piece of code), consider using the string C{split()}
    method instead:

        >>> words = s.split(' ')
    """
    def tokenize(self, s):
        return s.split(' ')
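A minimal usage sketch (illustrative, not part of the module source above); note that, unlike C{WhitespaceTokenizer}, consecutive spaces yield empty-string tokens because each single space is a separator:

    >>> from nltk.tokenize.simple import SpaceTokenizer
    >>> SpaceTokenizer().tokenize("Good  muffins cost $3.88")
    ['Good', '', 'muffins', 'cost', '$3.88']
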
class TabTokenizer(TokenizerI):
    r"""
    A tokenizer that divides a string into substrings by treating any
    single tab character as a separator.  If you are performing the
    tokenization yourself (rather than building a tokenizer to pass to
    some other piece of code), consider using the string C{split()}
    method instead:

        >>> words = s.split('\t')
    """
    def tokenize(self, s):
        return s.split('\t')
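A minimal usage sketch (illustrative, not part of the module source above), using a made-up tab-separated record:

    >>> from nltk.tokenize.simple import TabTokenizer
    >>> TabTokenizer().tokenize("name\tage\tcity")
    ['name', 'age', 'city']
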
class LineTokenizer(TokenizerI):
    r"""
    A tokenizer that divides a string into substrings by treating any
    single newline character as a separator.  Handling of blank lines
    may be controlled using a constructor parameter.
    """
    def __init__(self, blanklines='discard'):
        """
        @param blanklines: Indicates how blank lines should be
        handled.  Valid values are:

          - C{'discard'}: strip blank lines out of the token list
            before returning it.  A line is considered blank if
            it contains only whitespace characters.
          - C{'keep'}: leave all blank lines in the token list.
          - C{'discard-eof'}: if the string ends with a newline,
            then do not generate a corresponding token C{''} after
            that newline.
        """
        valid_blanklines = ('discard', 'keep', 'discard-eof')
        if blanklines not in valid_blanklines:
            raise ValueError('Blank lines must be one of: %s' %
                             ' '.join(valid_blanklines))

        self._blanklines = blanklines

    def tokenize(self, s):
        lines = s.split('\n')
        # If requested, strip off blank lines.
        if self._blanklines == 'discard':
            lines = [l for l in lines if l.rstrip()]
        elif self._blanklines == 'discard-eof':
            if lines and not lines[-1].strip(): lines.pop()
        return lines
######################################################################
#{ Tokenization Functions
######################################################################

def line_tokenize(text, blanklines='discard'):
    # Convenience function: tokenize the given text into lines using
    # LineTokenizer with the specified blank-line handling.
    return LineTokenizer(blanklines).tokenize(text)
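A minimal usage sketch of the convenience function (illustrative, not part of the module source above); with the default C{'discard'} setting, the trailing blank line is dropped:

    >>> from nltk.tokenize.simple import line_tokenize
    >>> line_tokenize("first line\nsecond line\n")
    ['first line', 'second line']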