Package nltk :: Package corpus :: Package reader :: Module string_category
[hide private]
[frames] | no frames]

Source Code for Module nltk.corpus.reader.string_category

 1  # Natural Language Toolkit: String Category Corpus Reader 
 2  # 
 3  # Copyright (C) 2001-2008 NLTK Project 
 4  # Author: Steven Bird <[email protected]> 
 5  #         Edward Loper <[email protected]> 
 6  # URL: <http://nltk.org> 
 7  # For license information, see LICENSE.TXT 
 8   
 9  """ 
10  Read tuples from a corpus consisting of categorized strings. 
11  For example, from the question classification corpus: 
12   
13  NUM:dist How far is it from Denver to Aspen ? 
14  LOC:city What county is Modesto , California in ? 
15  HUM:desc Who was Galileo ? 
16  DESC:def What is an atom ? 
17  NUM:date When did Hawaii become a state ? 
18  """        
19   
20  # based on PPAttachmentCorpusReader 
21   
22  from util import * 
23  from api import * 
24  import os 
25   
26  # [xx] Should the order of the tuple be reversed -- in most other places 
27  # in nltk, we use the form (data, tag) -- e.g., tagged words and 
28  # labeled texts for classifiers. 
class StringCategoryCorpusReader(CorpusReader):
    """
    Corpus reader for files made of categorized strings, one per line.
    Each non-blank line is split once on the delimiter, giving a tuple
    whose first element is the text before the delimiter and whose
    second element is the remainder of the line.
    """

    def __init__(self, root, files, delimiter=' ', encoding=None):
        """
        @param root: The root directory for this corpus.
        @param files: A list or regexp specifying the files in this corpus.
        @param delimiter: Field delimiter
        @param encoding: Passed unchanged to C{CorpusReader.__init__}.
        """
        CorpusReader.__init__(self, root, files, encoding)
        self._delimiter = delimiter

    def tuples(self, files=None):
        """
        @param files: The files to read: a list of file names, a single
            file name, or C{None} (the default) for every file in the
            corpus.  These are the same conventions as L{raw}.
        @return: the concatenation of one C{StreamBackedCorpusView} per
            file, each producing the tuples built by
            L{_read_tuple_block}.
        """
        return concat([StreamBackedCorpusView(filename,
                                              self._read_tuple_block,
                                              encoding=enc)
                       for (filename, enc)
                       in self.abspaths(self._resolve_files(files))])

    def raw(self, files=None):
        """
        @param files: The files to read: a list of file names, a single
            file name, or C{None} (the default) for every file in the
            corpus.
        @return: the text contents of the given files, as a single string.
        """
        return concat([self._read_contents(f)
                       for f in self._resolve_files(files)])

    def _resolve_files(self, files):
        """
        Normalize a C{files} argument so that callers can always iterate
        over it.

        @return: C{self._files} when C{files} is C{None}; a one-element
            list when C{files} is a single string; otherwise C{files}
            itself, unchanged.
        """
        if files is None:
            return self._files
        if isinstance(files, basestring):
            return [files]
        return files

    def _read_contents(self, f):
        """
        @return: everything C{self.open(f)} yields from C{read()}.
        """
        stream = self.open(f)
        # Close explicitly instead of relying on garbage collection, so
        # that reading many files does not accumulate open handles.
        # try/finally (not 'with') keeps this valid on old Python 2.
        try:
            return stream.read()
        finally:
            stream.close()

    def _read_tuple_block(self, stream):
        """
        Block reader that consumes exactly one line from C{stream}.

        @return: a one-element list holding the stripped line split at
            the first occurrence of the delimiter (a line without the
            delimiter gives a 1-tuple), or an empty list when the
            stripped line is empty (blank line or end of stream).
        """
        line = stream.readline().strip()
        if not line:
            return []
        # maxsplit=1: only the first delimiter separates the fields, so
        # the second field may itself contain the delimiter.
        return [tuple(line.split(self._delimiter, 1))]
58