#
# Copyright (C) 2001-2017 NLTK Project
# Author: Masato Hagiwara <[email protected]>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
# For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html
from __future__ import print_function
import sys
from six import string_types
from nltk.corpus.reader import util
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
class ChasenCorpusReader(CorpusReader):
    """
    Corpus reader for files in ChaSen output format.

    Each accessor builds one ``ChasenCorpusView`` per file and concatenates
    them; the accessors differ only in whether tokens carry their tag string
    and in how tokens are grouped (flat, by sentence, or by paragraph).
    """

    def __init__(self, root, fileids, encoding='utf8', sent_splitter=None):
        """
        :param root: Root location of the corpus, passed to ``CorpusReader``.
        :param fileids: File identifiers (or pattern), passed to
            ``CorpusReader``.
        :param encoding: Encoding of the corpus files, passed to
            ``CorpusReader``.
        :param sent_splitter: Optional callable handed to every
            ``ChasenCorpusView``; it is called with a ``(word, tag)`` tuple
            and a true result ends the current sentence after that token.
        """
        self._sent_splitter = sent_splitter
        CorpusReader.__init__(self, root, fileids, encoding)

    def raw(self, fileids=None):
        """
        Return the concatenated, unparsed contents of the given files.

        :param fileids: A single file identifier, a list of them, or None
            for every file in the corpus.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]

        contents = []
        for f in fileids:
            stream = self.open(f)
            # Close each stream explicitly so that reading many files does
            # not leave their handles open until garbage collection.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def _views(self, fileids, tagged, group_by_sent, group_by_para):
        """Concatenate one ``ChasenCorpusView`` per file with the given flags."""
        return concat([ChasenCorpusView(path, enc,
                                        tagged, group_by_sent, group_by_para,
                                        self._sent_splitter)
                       for (path, enc) in self.abspaths(fileids, True)])

    def words(self, fileids=None):
        """Return the words of the given files as one flat sequence."""
        return self._views(fileids, False, False, False)

    def tagged_words(self, fileids=None):
        """
        Return the tokens of the given files as a flat sequence of
        ``(word, tag)`` tuples, where ``tag`` is the tab-joined remainder
        of the token's line.
        """
        return self._views(fileids, True, False, False)

    def sents(self, fileids=None):
        """Return the given files as a sequence of sentences (lists of words)."""
        return self._views(fileids, False, True, False)

    def tagged_sents(self, fileids=None):
        """
        Return the given files as a sequence of sentences, each a list of
        ``(word, tag)`` tuples.
        """
        return self._views(fileids, True, True, False)

    def paras(self, fileids=None):
        """
        Return the given files as a sequence of paragraphs, each a list of
        sentences, each a list of words.
        """
        return self._views(fileids, False, True, True)

    def tagged_paras(self, fileids=None):
        """
        Return the given files as a sequence of paragraphs, each a list of
        sentences, each a list of ``(word, tag)`` tuples.
        """
        return self._views(fileids, True, True, True)
class ChasenCorpusView(StreamBackedCorpusView):
    """
    Corpus view used by ``ChasenCorpusReader``.

    It plays the same role as ``TaggedCorpusView`` but with the word and
    sentence tokenization built in: every line is one token whose fields are
    separated by tabs, and a line reading ``EOS`` closes a sentence.
    """

    def __init__(self, corpus_file, encoding,
                 tagged, group_by_sent, group_by_para, sent_splitter=None):
        """
        :param corpus_file: File to read, passed to ``StreamBackedCorpusView``.
        :param encoding: Encoding of the file, passed to
            ``StreamBackedCorpusView``.
        :param tagged: If true, tokens are ``(word, tag)`` tuples; otherwise
            just the word.
        :param group_by_sent: If true, tokens are grouped into one list per
            sentence.
        :param group_by_para: If true, each paragraph is kept as its own list.
        :param sent_splitter: Optional callable taking a ``(word, tag)``
            tuple; a true result ends the sentence after that token.
        """
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        self._sent_splitter = sent_splitter
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def _close_sentence(self, tokens, para):
        """Add the finished sentence ``tokens`` to ``para`` per the view flags."""
        if self._tagged:
            finished = tokens
        else:
            # Drop the tag half of every (word, tag) pair.
            finished = [word for (word, _tag) in tokens]

        if self._group_by_sent:
            para.append(finished)
        else:
            para.extend(finished)

    def read_block(self, stream):
        """
        Read the next chunk returned by ``read_regexp_block`` (start pattern
        ``.``, end pattern ``^EOS\\n``) and parse it into tokens.

        :param stream: Stream positioned at the start of the next block.
        :return: A list of paragraphs when grouping by paragraph; otherwise
            the paragraph contents (sentences or tokens) spliced together.
        """
        result = []
        for chunk in read_regexp_block(stream, r".", r"^EOS\n"):
            para = []
            pending = []

            for row in chunk.splitlines():
                at_eos = row.strip() == 'EOS'
                fields = row.split('\t')
                # First field is the surface form; the rest is kept as one
                # tab-joined tag string.
                token = (fields[0], '\t'.join(fields[1:]))

                if not at_eos:
                    pending.append(token)

                # The custom splitter is only consulted on non-EOS lines.
                if at_eos or (self._sent_splitter and self._sent_splitter(token)):
                    self._close_sentence(pending, para)
                    pending = []

            # Tokens left over after the last line form a final sentence.
            if len(pending) > 0:
                self._close_sentence(pending, para)

            if self._group_by_para:
                result.append(para)
            else:
                result.extend(para)

        return result
def demo():
    """
    Print a sample of the ``jeita`` corpus: a slash-joined run of words,
    then three tagged sentences with one ``word/field`` pair per line,
    where ``field`` is the third tab-separated field of the tag string.
    """
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader(
        'jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')

    sample_words = jeita.words()[22100:22140]
    print('/'.join(sample_words))

    rendered_sents = []
    for sent in jeita.tagged_sents()[2170:2173]:
        token_lines = ["%s/%s" % (w[0], w[1].split('\t')[2]) for w in sent]
        rendered_sents.append('\n'.join(token_lines))
    print('\nEOS\n'.join(rendered_sents))
def test():
    """
    Smoke test: the tag half of the first tagged word in the ``jeita``
    corpus must be a string.
    """
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader(
        'jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')

    first_token = jeita.tagged_words()[0]
    assert isinstance(first_token[1], string_types)
# When executed as a script, show the demo output and then run the smoke test.
if __name__ == '__main__':
    demo()
    test()