Package nltk :: Package tokenize :: Module api
[hide private]
[frames] | [no frames]

Source Code for Module nltk.tokenize.api

 1  # Natural Language Toolkit: Tokenizer Interface 
 2  # 
 3  # Copyright (C) 2001-2008 NLTK Project 
 4  # Author: Edward Loper <[email protected]> 
 5  # URL: <http://nltk.org> 
 6  # For license information, see LICENSE.TXT 
 7   
 8  """ 
 9  Tokenizer Interface 
10  """ 
11  from nltk.internals import overridden 
12   
class TokenizerI(object):
    """
    A processing interface for I{tokenizing} a string, or dividing it
    into a list of substrings.

    Subclasses must define:
      - either L{tokenize()} or L{batch_tokenize()} (or both)
    """

    def tokenize(self, s):
        """
        Divide the given string into a list of substrings.

        @param s: the string to tokenize
        @type s: C{str}
        @return: C{list} of C{str}
        @raise NotImplementedError: if neither L{tokenize()} nor
            L{batch_tokenize()} has been overridden by the subclass.
        """
        # If the subclass supplied batch_tokenize() instead of
        # tokenize(), delegate to it with a singleton list.
        if overridden(self.batch_tokenize):
            return self.batch_tokenize([s])[0]
        else:
            # Neither method was overridden -- the interface is unimplemented.
            raise NotImplementedError()

    def batch_tokenize(self, strings):
        """
        Apply L{self.tokenize()} to each element of C{strings}.  I.e.:

            >>> return [self.tokenize(s) for s in strings]

        @param strings: the strings to tokenize
        @type strings: C{list} of C{str}
        @rtype: C{list} of C{list} of C{str}
        """
        return [self.tokenize(s) for s in strings]