1
2
3
4
5
6
7
8
9 """
10 Indian Language POS-Tagged Corpus
11 Collected by A Kumaran, Microsoft Research, India
12 Distributed with permission
13
14 Contents:
15 - Bangla: IIT Kharagpur
16 - Hindi: Microsoft Research India
17 - Marathi: IIT Bombay
18 - Telugu: IIIT Hyderabad
19 """
20
21 from nltk.corpus.reader.util import *
22 from nltk.corpus.reader.api import *
23 from nltk import tokenize
24 import codecs
25 from nltk.internals import deprecated
26 from nltk.tag.util import str2tuple
27
29 """
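30 POS-tagged text, one sentence per line, written as whitespace-separated word_TAG tokens. Lines starting with '<' are skipped; blank lines yield no words.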
31 """
32 - def words(self, files=None):
36
38 if simplify_tags:
39 tag_mapping_function = self._tag_mapping_function
40 else:
41 tag_mapping_function = None
42 return concat([IndianCorpusView(filename, enc,
43 True, False, tag_mapping_function)
44 for (filename, enc) in self.abspaths(files, True)])
45
46 - def sents(self, files=None):
50
52 if simplify_tags:
53 tag_mapping_function = self._tag_mapping_function
54 else:
55 tag_mapping_function = None
56 return concat([IndianCorpusView(filename, enc,
57 True, True, tag_mapping_function)
58 for (filename, enc) in self.abspaths(files, True)])
59
60 - def raw(self, files=None):
64
65
66 @deprecated("Use .raw() or .words() or .tagged_words() instead.")
67 - def read(self, items, format='tagged'):
72 @deprecated("Use .words() instead.")
75 @deprecated("Use .tagged_words() instead.")
78
79
81 - def __init__(self, corpus_file, encoding, tagged,
82 group_by_sent, tag_mapping_function=None):
87
89 line = stream.readline()
90 if line.startswith('<'):
91 return []
92 sent = [str2tuple(word, sep='_') for word in line.split()]
93 if self._tag_mapping_function:
94 sent = [(w, self._tag_mapping_function(t)) for (w,t) in sent]
95 if not self._tagged: sent = [w for (w,t) in sent]
96 if self._group_by_sent:
97 return [sent]
98 else:
99 return sent
100