Source code for nltk.corpus.reader.chasen

#
# Copyright (C) 2001-2017 NLTK Project
# Author: Masato Hagiwara <[email protected]>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

# For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html
from __future__ import print_function

import sys

from six import string_types

from nltk.corpus.reader import util

from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *

class ChasenCorpusReader(CorpusReader):
    """
    Corpus reader that exposes its files through ``ChasenCorpusView``.

    Every accessor other than ``raw()`` builds one ``ChasenCorpusView`` per
    file and concatenates them; the accessors differ only in whether
    tokens keep their tag and whether they are grouped by sentence and/or
    by paragraph.
    """

    def __init__(self, root, fileids, encoding='utf8', sent_splitter=None):
        """
        :param root: Forwarded unchanged to ``CorpusReader.__init__``.
        :param fileids: Forwarded unchanged to ``CorpusReader.__init__``.
        :param encoding: Forwarded unchanged to ``CorpusReader.__init__``.
        :param sent_splitter: Optional callable handed to every
            ``ChasenCorpusView`` this reader creates.  The view calls it
            with a ``(word, tag)`` tuple and ends the current sentence
            when the result is true.
        """
        self._sent_splitter = sent_splitter
        CorpusReader.__init__(self, root, fileids, encoding)

    def raw(self, fileids=None):
        """
        Return the concatenated contents of the given files.

        :param fileids: ``None`` for every file of this reader, a single
            file id string, or an iterable of file ids.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]

        contents = []
        for fileid in fileids:
            stream = self.open(fileid)
            # Close each stream explicitly instead of leaving the handle
            # open until the garbage collector gets around to it.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def _concat_views(self, fileids, tagged, group_by_sent, group_by_para):
        """Build one ``ChasenCorpusView`` per file and concatenate them."""
        return concat([
            ChasenCorpusView(path, enc, tagged, group_by_sent,
                             group_by_para, self._sent_splitter)
            for (path, enc) in self.abspaths(fileids, True)
        ])

    def words(self, fileids=None):
        """Return the given files as a flat sequence of words."""
        return self._concat_views(fileids, False, False, False)

    def tagged_words(self, fileids=None):
        """Return the given files as a flat sequence of ``(word, tag)``
        tuples."""
        return self._concat_views(fileids, True, False, False)

    def sents(self, fileids=None):
        """Return the given files as a sequence of sentences, each a list
        of words."""
        return self._concat_views(fileids, False, True, False)

    def tagged_sents(self, fileids=None):
        """Return the given files as a sequence of sentences, each a list
        of ``(word, tag)`` tuples."""
        return self._concat_views(fileids, True, True, False)

    def paras(self, fileids=None):
        """Return the given files as a sequence of paragraphs, each a list
        of sentences, each a list of words."""
        return self._concat_views(fileids, False, True, True)

    def tagged_paras(self, fileids=None):
        """Return the given files as a sequence of paragraphs, each a list
        of sentences, each a list of ``(word, tag)`` tuples."""
        return self._concat_views(fileids, True, True, True)
class ChasenCorpusView(StreamBackedCorpusView):
    """
    Corpus view used by ``ChasenCorpusReader``.

    It plays the same role as ``TaggedCorpusView`` but the word and
    sentence tokenization is built in: one token per line, tab-separated
    cells, and ``EOS`` lines as sentence terminators.
    """

    def __init__(self, corpus_file, encoding,
                 tagged, group_by_sent, group_by_para, sent_splitter=None):
        """
        :param corpus_file: Forwarded to ``StreamBackedCorpusView.__init__``.
        :param encoding: Forwarded to ``StreamBackedCorpusView.__init__``.
        :param tagged: If true, tokens are ``(word, tag)`` tuples;
            otherwise only the word is kept.
        :param group_by_sent: If true, each sentence is emitted as its
            own list instead of being flattened.
        :param group_by_para: If true, each paragraph is emitted as its
            own list instead of being flattened.
        :param sent_splitter: Optional callable taking a ``(word, tag)``
            tuple; a true result ends the current sentence after that
            token.
        """
        self._sent_splitter = sent_splitter
        self._group_by_para = group_by_para
        self._group_by_sent = group_by_sent
        self._tagged = tagged
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """Read one paragraph from ``stream`` and return it as a block."""

        def close_sentence(tokens, para):
            # Drop the tags if an untagged view was requested, then add
            # the sentence to the paragraph either as a unit or flattened.
            if not self._tagged:
                tokens = [word for (word, _tag) in tokens]
            if self._group_by_sent:
                para.append(tokens)
            else:
                para.extend(tokens)

        block = []
        for para_text in read_regexp_block(stream, r".", r"^EOS\n"):
            para = []
            pending = []
            for row in para_text.splitlines():
                fields = row.split('\t')
                # First cell is the word; the remaining cells, re-joined
                # with tabs, form the tag.
                token = (fields[0], '\t'.join(fields[1:]))

                if row.strip() == 'EOS':
                    # An EOS line is never stored as a token and always
                    # closes the sentence, even when it is empty.
                    boundary = True
                else:
                    pending.append(token)
                    boundary = bool(self._sent_splitter
                                    and self._sent_splitter(token))

                if boundary:
                    close_sentence(pending, para)
                    pending = []

            # Tokens left over after the last boundary form a final
            # sentence; an empty remainder adds nothing.
            if pending:
                close_sentence(pending, para)

            if self._group_by_para:
                block.append(para)
            else:
                block.extend(para)

        return block
def demo():
    """Print a slice of words and a few tagged sentences from the
    ``jeita`` corpus."""
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader(
        'jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')

    sample_words = jeita.words()[22100:22140]
    print('/'.join(sample_words))

    rendered_sents = []
    for sent in jeita.tagged_sents()[2170:2173]:
        # Show each token as word/<third tab-separated field of its tag>.
        rows = ["%s/%s" % (w[0], w[1].split('\t')[2]) for w in sent]
        rendered_sents.append('\n'.join(rows))
    print('\nEOS\n'.join(rendered_sents))
def test():
    """Check that the tag of the first tagged word in the ``jeita``
    corpus is a string."""
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader(
        'jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')

    first_token = jeita.tagged_words()[0]
    first_tag = first_token[1]
    assert isinstance(first_tag, string_types)
# When executed as a script, run the demo first and then the smoke test.
if __name__ == '__main__':
    demo()
    test()