1
2
3
4
5
6
7
8
9 """
10 Read tuples from a corpus consisting of categorized strings.
11 For example, from the question classification corpus:
12
13 NUM:dist How far is it from Denver to Aspen ?
14 LOC:city What county is Modesto , California in ?
15 HUM:desc Who was Galileo ?
16 DESC:def What is an atom ?
17 NUM:date When did Hawaii become a state ?
18 """
19
20
21
22 from util import *
23 from api import *
24 import os
25
26
27
28
def __init__(self, root, files, delimiter=' ', encoding=None):
    """
    Set up a reader over a corpus of categorized strings.

    @param root: The root directory for this corpus.
    @param files: A list or regexp specifying the files in this corpus.
    @param delimiter: Field delimiter; kept on the instance as
        C{self._delimiter}.
    @param encoding: Forwarded unchanged to C{CorpusReader.__init__}.
    """
    # The base class receives root, files and encoding; only the
    # delimiter is handled by this subclass.
    CorpusReader.__init__(self, root, files, encoding)
    self._delimiter = delimiter
38
43
def raw(self, files=None):
    """
    @param files: A single file name, a list of file names, or
        C{None} to use every file in this corpus (C{self._files}).
    @return: the text contents of the given files, as a single string.
    """
    if files is None:
        files = self._files
    elif isinstance(files, basestring):
        # A lone file name is treated as a one-element list.
        files = [files]

    contents = []
    for f in files:
        stream = self.open(f)
        # Close each stream as soon as it has been read, even if the
        # read fails, so that file handles are not leaked.
        # NOTE(review): assumes the object returned by self.open()
        # provides close(), as file-like objects do -- confirm.
        try:
            contents.append(stream.read())
        finally:
            stream.close()
    return concat(contents)
51
53 line = stream.readline().strip()
54 if line:
55 return [tuple(line.split(self._delimiter, 1))]
56 else:
57 return []
58