
Source Code for Package nltk.tokenize

# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2008 NLTK Project
# Author: Edward Loper <[email protected]>
#         Steven Bird <[email protected]> (minor additions)
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT

"""
Functions for X{tokenizing}, i.e., dividing text strings into
substrings.
"""

from simple import *
from regexp import *
from punkt import *
from sexpr import *
from nltk.internals import deprecated

__all__ = ['WhitespaceTokenizer', 'SpaceTokenizer', 'TabTokenizer',
           'LineTokenizer', 'RegexpTokenizer', 'BlanklineTokenizer',
           'WordPunctTokenizer', 'WordTokenizer', 'blankline_tokenize',
           'wordpunct_tokenize', 'regexp_tokenize', 'word_tokenize',
           'SExprTokenizer', 'sexpr_tokenize', 'line_tokenize',
           'PunktWordTokenizer', 'punkt_word_tokenize',
           'PunktSentenceTokenizer',
           ]

######################################################################
#{ Deprecated since 0.8
######################################################################

@deprecated("Use nltk.blankline_tokenize() or "
            "nltk.BlanklineTokenizer instead.")
def blankline(text):
    return BlanklineTokenizer().tokenize(text)

@deprecated("Use nltk.wordpunct_tokenize() or "
            "nltk.WordPunctTokenizer instead.")
def wordpunct(text):
    return WordPunctTokenizer().tokenize(text)

@deprecated("Use str.split() or nltk.WhitespaceTokenizer instead.")
def whitespace(text):
    return WhitespaceTokenizer().tokenize(text)

@deprecated("Use nltk.word_tokenize() or "
            "nltk.WordTokenizer instead.")
def word(text):
    return WordTokenizer().tokenize(text)

@deprecated("Use nltk.line_tokenize() or "
            "nltk.LineTokenizer instead.")
def line(text):
    return LineTokenizer().tokenize(text)

#}
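
The tokenizers re-exported by this module share a common interface: each tokenizer class provides a tokenize(text) method returning a list of strings, and most have a corresponding module-level convenience function. A minimal doctest-style sketch of that interface follows; the sample sentence and the expected output shown are illustrative, assuming the 0.9-era API exported above:

    >>> from nltk.tokenize import WhitespaceTokenizer, wordpunct_tokenize
    >>> text = "Good muffins cost $3.88 in New York."
    >>> WhitespaceTokenizer().tokenize(text)
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.']
    >>> wordpunct_tokenize(text)
    ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.']

The deprecated wrappers in the block above still work during the transition: the @deprecated decorator from nltk.internals wraps each function so that a call emits a DeprecationWarning naming the replacement, then delegates to the corresponding tokenizer class.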