Code Coverage for nltk.corpus.reader.senseval
Untested Functions
|
Partially Tested Functions
|
"""
Read from the Senseval 2 Corpus.
SENSEVAL [http://www.senseval.org/]
Evaluation exercises for Word Sense Disambiguation.
Organized by ACL-SIGLEX [http://www.siglex.org/]
Prepared by Ted Pedersen <tpederse@umn.edu>, University of Minnesota,
http://www.d.umn.edu/~tpederse/data.html
Distributed with permission.
The NLTK version of the Senseval 2 files uses well-formed XML.
Each instance of the ambiguous words "hard", "interest", "line", and "serve"
is tagged with a sense identifier, and supplied with context.
"""
from nltk.corpus.reader.util import *
from api import *
from nltk.tokenize import *
import os, re, xml.sax
from xmldocs import XMLCorpusReader
from nltk.etree import ElementTree
from nltk.internals import deprecated
class SensevalInstance(object):
    """
    One sense-tagged occurrence of an ambiguous word.

    @ivar word: the lexical item this instance belongs to.
    @ivar position: index into C{context} of the head word (C{None} when
        no head was located).
    @ivar context: the sequence of tokens surrounding the head word.
    @ivar senses: tuple of sense identifiers assigned to this occurrence.
    """

    def __init__(self, word, position, context, senses):
        """
        Store the given fields; C{senses} may be any iterable and is
        frozen into a tuple.
        """
        self.word = word
        self.senses = tuple(senses)
        self.position, self.context = position, context

    def __repr__(self):
        """
        @return: a constructor-like rendering listing all four fields.
        """
        template = ('SensevalInstance(word=%r, position=%r, '
                    'context=%r, senses=%r)')
        fields = (self.word, self.position, self.context, self.senses)
        return template % fields
class SensevalCorpusReader(CorpusReader):
    """
    Corpus reader for Senseval files.  Offers the files either as a
    sequence of parsed instances or as raw text.
    """

    def instances(self, files=None):
        """
        @return: the concatenation of one L{SensevalCorpusView} per
            requested file.
        """
        views = []
        for (path, encoding) in self.abspaths(files, True):
            views.append(SensevalCorpusView(path, encoding))
        return concat(views)

    def raw(self, files=None):
        """
        @return: the text contents of the given files, as a single string.
        """
        # Normalise the argument: None means every file of the corpus,
        # and a single name is wrapped into a one-element list.
        if files is None:
            files = self._files
        elif isinstance(files, basestring):
            files = [files]

        contents = []
        for name in files:
            contents.append(self.open(name).read())
        return concat(contents)

    def _entry(self, tree):
        """
        Collect C{(sense, context)} pairs from a parsed tree.

        For every C{instance} under every C{lexelt} of C{tree}, the sense
        is the C{senseid} attribute of the instance's first child, and the
        context is the list of C{(text, pos)} pairs taken from the
        children of the instance's second child.

        @return: list of C{(sense, context)} tuples, in document order.
        """
        return [
            (inst[0].attrib['senseid'],
             [(w.text, w.attrib['pos']) for w in inst[1]])
            for lexelt in tree.findall('lexelt')
            for inst in lexelt.findall('instance')
        ]

    @deprecated("Use .instances() or .raw() instead.")
    def read(self, items, format='listed'):
        """
        Deprecated dispatcher: C{'listed'} maps to L{instances} and
        C{'raw'} maps to L{raw}.

        @raise ValueError: for any other C{format}.
        """
        if format == 'listed':
            result = self.instances(items)
        elif format == 'raw':
            result = self.raw(items)
        else:
            raise ValueError('bad format %r' % format)
        return result

    @deprecated("Use .instances() instead.")
    def listed(self, items):
        """
        Deprecated alias for L{instances}.
        """
        return self.instances(items)
class SensevalCorpusView(StreamBackedCorpusView):
    """
    Corpus view that reads a Senseval file one C{<instance>} element at a
    time and turns each one into a L{SensevalInstance}.

    While reading, the view records the name of every C{<lexelt>} line it
    passes together with the stream position just after that line, so that
    a block read from any later position can be attributed to its lexical
    element.
    """

    def __init__(self, filename, encoding):
        """
        @param filename: path of the Senseval file, passed on to the base
            class.
        @param encoding: encoding of the file, passed on to the base class.
        """
        StreamBackedCorpusView.__init__(self, filename, encoding=encoding)

        self._word_tokenizer = WhitespaceTokenizer()
        # Parallel lists: _lexelt_starts[i] is the stream position right
        # after the i-th <lexelt ...> line and _lexelts[i] is its item
        # name.  Entry 0 stands for "before any lexelt has been seen".
        self._lexelt_starts = [0]
        self._lexelts = [None]

    def read_block(self, stream):
        """
        Read the next C{<instance>} element from C{stream}.

        @param stream: the stream to read from; must provide C{readline()}
            and C{tell()}.
        @return: a one-element list holding the parsed
            L{SensevalInstance}, or an empty list at end of stream.
        @raise AssertionError: if the markup violates the expected layout
            (e.g. a C{<lexelt>} line without an C{item} attribute, or the
            stream ending in the middle of an instance).
        """
        # Work out which lexelt the current position belongs to: the last
        # recorded start that is not past the current position.
        lexelt_num = bisect.bisect_right(self._lexelt_starts,
                                         stream.tell()) - 1
        lexelt = self._lexelts[lexelt_num]

        instance_lines = []
        in_instance = False
        while True:
            line = stream.readline()
            if line == '':
                # End of stream: there must be no half-read instance.
                assert instance_lines == []
                return []

            stripped = line.lstrip()

            # Start of a lexical element?
            if stripped.startswith('<lexelt'):
                lexelt_num += 1
                m = re.search('item=("[^"]+"|\'[^\']+\')', line)
                assert m is not None  # <lexelt> has no 'item=...'
                lexelt = m.group(1)[1:-1]  # drop the surrounding quotes
                if lexelt_num < len(self._lexelts):
                    # Already seen on an earlier pass; it must not change.
                    assert lexelt == self._lexelts[lexelt_num]
                else:
                    self._lexelts.append(lexelt)
                    self._lexelt_starts.append(stream.tell())

            # Start of an instance?
            if stripped.startswith('<instance'):
                assert instance_lines == []
                in_instance = True

            # Body of an instance (including its first and last line).
            if in_instance:
                instance_lines.append(line)

            # End of an instance: repair the markup, parse it, return it.
            if stripped.startswith('</instance'):
                xml_block = '\n'.join(instance_lines)
                xml_block = _fixXML(xml_block)
                inst = ElementTree.fromstring(xml_block)
                return [self._parse_instance(inst, lexelt)]

    def _parse_instance(self, instance, lexelt):
        """
        Convert one parsed C{<instance>} element into a
        L{SensevalInstance}.

        The resulting context list mixes two kinds of items: plain strings
        for whitespace-separated untagged text, and C{(text, pos)} tuples
        for C{<wf>} elements.  The position is the index in that list at
        which the C{<head>} element's content was placed.

        @param instance: the parsed C{<instance>} element.
        @param lexelt: name of the lexical element the instance belongs to.
        @return: the corresponding L{SensevalInstance}.
        @raise AssertionError: on unexpected tags, on a second C{<head>},
            or on a C{<head>} that has neither or both of text and a
            single child element.
        """
        tokenize = self._word_tokenizer.tokenize
        senses = []
        context = []
        position = None
        for child in instance:
            if child.tag == 'answer':
                senses.append(child.attrib['senseid'])
            elif child.tag == 'context':
                # An element's text is None when it starts directly with a
                # child element, so guard it the same way tails are guarded.
                if child.text:
                    context += tokenize(child.text)
                for cword in child:
                    if cword.tag == 'compound':
                        # NOTE(review): only the first child of a
                        # <compound> is examined; its other children and
                        # the compound's own tail are not added to the
                        # context -- confirm this matches the data.
                        cword = cword[0]

                    if cword.tag == 'head':
                        # Some sanity checks:
                        assert position is None, 'head specified twice'
                        # Treat a missing text node like empty text.
                        head_text = (cword.text or '').strip()
                        assert head_text or len(cword) == 1
                        assert not (head_text and len(cword) == 1)
                        # Record the position of the head:
                        position = len(context)
                        # Add on the head word itself:
                        if head_text:
                            context.append(head_text)
                        elif cword[0].tag == 'wf':
                            context.append((cword[0].text,
                                            cword[0].attrib['pos']))
                            if cword[0].tail:
                                context += tokenize(cword[0].tail)
                        else:
                            assert False, 'expected CDATA or wf in <head>'
                    elif cword.tag == 'wf':
                        context.append((cword.text, cword.attrib['pos']))
                    elif cword.tag == 's':
                        pass  # Sentence boundary marker.
                    else:
                        # Single-argument call: emits the same text whether
                        # print is a statement or a function.
                        print('ACK %s' % cword.tag)
                        assert False, 'expected CDATA or <wf> or <head>'
                    if cword.tail:
                        context += tokenize(cword.tail)
            else:
                assert False, 'unexpected tag %s' % child.tag
        return SensevalInstance(lexelt, position, context, senses)
def _fixXML(text):
"""
Fix the various issues with Senseval pseudo-XML.
"""
text = re.sub(r'<([~\^])>', r'\1', text)
text = re.sub(r'(\s+)\&(\s+)', r'\1&\2', text)
text = re.sub(r'"""', '\'"\'', text)
text = re.sub(r'(<[^<]*snum=)([^">]+)>', r'\1"\2"/>', text)
text = re.sub(r'<\&frasl>\s*<p[^>]*>', 'FRASL', text)
text = re.sub(r'<\&I[^>]*>', '', text)
text = re.sub(r'<{([^}]+)}>', r'\1', text)
text = re.sub(r'<(@|/?p)>', r'', text)
text = re.sub(r'<&\w+ \.>', r'', text)
text = re.sub(r'<!DOCTYPE[^>]*>', r'', text)
text = re.sub(r'<\[\/?[^>]+\]*>', r'', text)
text = re.sub(r'<(\&\w+;)>', r'\1', text)
text = re.sub(r'&(?!amp|gt|lt|apos|quot)', r'', text)
text = re.sub(r'[ \t]*([^<>\s]+?)[ \t]*<p="([^"]*"?)"/>',
r' <wf pos="\2">\1</wf>', text)
text = re.sub(r'\s*"\s*<p=\'"\'/>', " <wf pos='\"'>\"</wf>", text)
return text