# Source code for nltk.corpus.reader.toolbox

# Natural Language Toolkit: Toolbox Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Greg Aumann <[email protected]>
#         Stuart Robinson <[email protected]>
#         Steven Bird <[email protected]>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
Module for reading, writing and manipulating
Toolbox databases and settings files.
"""

from nltk.toolbox import ToolboxData
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *


class ToolboxCorpusReader(CorpusReader):
    """Corpus reader that exposes Toolbox files through ``ToolboxData``.

    Each accessor resolves ``fileids`` via the inherited ``CorpusReader``
    machinery (``abspaths`` / ``open``) and combines the per-file results
    with ``concat``.
    """

    def xml(self, fileids, key=None):
        """Parse each file with ``ToolboxData.parse`` and combine the results.

        :param fileids: file identifier(s), resolved by ``self.abspaths``.
        :param key: forwarded to ``ToolboxData.parse`` as ``key=``.
        :return: ``concat`` of the per-file parse results.
        """
        return concat(
            [
                ToolboxData(path, enc).parse(key=key)
                for (path, enc) in self.abspaths(fileids, include_encoding=True)
            ]
        )

    def fields(
        self,
        fileids,
        strip=True,
        unwrap=True,
        encoding='utf8',
        errors='strict',
        unicode_fields=None,
    ):
        """Return the fields of every file, combined with ``concat``.

        ``strip``, ``unwrap``, ``encoding``, ``errors`` and
        ``unicode_fields`` are forwarded positionally, unchanged, to
        ``ToolboxData.fields``.

        :param fileids: file identifier(s), resolved by ``self.abspaths``.
        :return: ``concat`` of one list per file; ``entries()`` and
            ``words()`` consume the items as ``(marker, contents)`` pairs.
        """
        return concat(
            [
                list(
                    ToolboxData(path, enc).fields(
                        strip, unwrap, encoding, errors, unicode_fields
                    )
                )
                for (path, enc) in self.abspaths(fileids, include_encoding=True)
            ]
        )

    # should probably be done lazily:
    def entries(self, fileids, **kwargs):
        """Group the fields of ``fileids`` into entries.

        A field whose marker equals ``key`` starts a new entry; every
        following field is attached to the most recently started entry.

        :param fileids: file identifier(s), passed to ``self.fields``.
        :param kwargs: optional ``key`` (marker that starts an entry,
            default ``'lx'``); all remaining keyword arguments are
            forwarded to ``self.fields``.
        :return: list of ``(key_contents, [(marker, contents), ...])``
            tuples, one per entry, in file order.
        """
        key = kwargs.pop('key', 'lx')  # 'lx' is the default key in MDF
        entries = []
        for marker, contents in self.fields(fileids, **kwargs):
            if marker == key:
                entries.append((contents, []))
            elif entries:
                entries[-1][-1].append((marker, contents))
            # Fields that appear before the first ``key`` marker belong to
            # no entry and are deliberately skipped.
        return entries

    def words(self, fileids, key='lx'):
        """Return the contents of every field whose marker equals ``key``.

        :param fileids: file identifier(s), passed to ``self.fields``.
        :param key: marker to select (default ``'lx'``).
        :return: list of the matching fields' contents, in file order.
        """
        return [
            contents
            for marker, contents in self.fields(fileids)
            if marker == key
        ]

    def raw(self, fileids):
        """Return the unparsed contents of the given files.

        :param fileids: ``None`` to read every file in ``self._fileids``,
            a single file identifier string, or an iterable of identifiers.
        :return: ``concat`` of the contents read from each file.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]

        contents = []
        for fileid in fileids:
            stream = self.open(fileid)
            try:
                contents.append(stream.read())
            finally:
                # Close each stream as soon as it has been read rather than
                # leaving the handle open until garbage collection.
                stream.close()
        return concat(contents)
def demo():
    """Placeholder demonstration hook; it intentionally does nothing."""
    return None
# Run the demo only when this module is executed as a script.
if __name__ == '__main__':
    demo()