1
2
3
4
5
6
7
8
9 """
10 Read from the Senseval 2 Corpus.
11
12 SENSEVAL [http://www.senseval.org/]
13 Evaluation exercises for Word Sense Disambiguation.
14 Organized by ACL-SIGLEX [http://www.siglex.org/]
15
16 Prepared by Ted Pedersen <[email protected]>, University of Minnesota,
17 http://www.d.umn.edu/~tpederse/data.html
18 Distributed with permission.
19
20 The NLTK version of the Senseval 2 files uses well-formed XML.
21 Each instance of the ambiguous words "hard", "interest", "line", and "serve"
22 is tagged with a sense identifier, and supplied with context.
23 """
24
25 from nltk.corpus.reader.util import *
26 from api import *
27 from nltk.tokenize import *
28 import os, re, xml.sax
29 from xmldocs import XMLCorpusReader
30 from nltk.etree import ElementTree
31 from nltk.internals import deprecated
32
34 - def __init__(self, word, position, context, senses):
40 return ('SensevalInstance(word=%r, position=%r, '
41 'context=%r, senses=%r)' %
42 (self.word, self.position, self.context, self.senses))
43
48
def raw(self, files=None):
    """
    @return: the text contents of the given files, as a single string.
    """
    # Normalize the argument: a single filename becomes a one-item
    # list, and None means "every file in the corpus".
    if isinstance(files, basestring):
        files = [files]
    elif files is None:
        files = self._files
    contents = []
    for f in files:
        contents.append(self.open(f).read())
    return concat(contents)
56
57 - def _entry(self, tree):
58 elts = []
59 for lexelt in tree.findall('lexelt'):
60 for inst in lexelt.findall('instance'):
61 sense = inst[0].attrib['senseid']
62 context = [(w.text, w.attrib['pos'])
63 for w in inst[1]]
64 elts.append( (sense, context) )
65 return elts
66
67
68 @deprecated("Use .instances() or .raw() instead.")
69 - def read(self, items, format='listed'):
73 @deprecated("Use .instances() instead.")
76
77
85
87
def read_block(self, stream):
    """
    Read one <instance> block from *stream* and return it as a
    one-element list of parsed instances, or [] at end of file.
    """
    # NOTE(review): the 'def' header was missing from this copy of
    # the file; the signature follows the StreamBackedCorpusView
    # read_block API and the body's use of 'stream' -- confirm.
    # Figure out which lexical element (<lexelt>) the stream is
    # currently positioned in, from the cached start offsets.
    lexelt_num = bisect.bisect_right(self._lexelt_starts, stream.tell())-1
    lexelt = self._lexelts[lexelt_num]

    instance_lines = []
    in_instance = False
    while True:
        line = stream.readline()
        if line == '':
            # End of file: there must be no half-read instance left.
            assert instance_lines == []
            return []

        # Start of a lexical element?
        if line.lstrip().startswith('<lexelt'):
            lexelt_num += 1
            m = re.search('item=("[^"]+"|\'[^\']+\')', line)
            assert m is not None  # every <lexelt> carries item="..."
            lexelt = m.group(1)[1:-1]
            if lexelt_num < len(self._lexelts):
                assert lexelt == self._lexelts[lexelt_num]
            else:
                # First time we reach this lexelt: cache its name and
                # the stream offset where it starts.
                self._lexelts.append(lexelt)
                self._lexelt_starts.append(stream.tell())

        # Start of an instance?
        if line.lstrip().startswith('<instance'):
            assert instance_lines == []
            in_instance = True

        # Body of an instance?
        if in_instance:
            instance_lines.append(line)

        # End of an instance?
        if line.lstrip().startswith('</instance'):
            xml_block = '\n'.join(instance_lines)
            xml_block = _fixXML(xml_block)
            inst = ElementTree.fromstring(xml_block)
            return [self._parse_instance(inst, lexelt)]
126
128 senses = []
129 context = []
130 position = None
131 for child in instance:
132 if child.tag == 'answer':
133 senses.append(child.attrib['senseid'])
134 elif child.tag == 'context':
135 context += self._word_tokenizer.tokenize(child.text)
136 for cword in child:
137 if cword.tag == 'compound':
138 cword = cword[0]
139
140 if cword.tag == 'head':
141
142 assert position is None, 'head specified twice'
143 assert cword.text.strip() or len(cword)==1
144 assert not (cword.text.strip() and len(cword)==1)
145
146 position = len(context)
147
148 if cword.text.strip():
149 context.append(cword.text.strip())
150 elif cword[0].tag == 'wf':
151 context.append((cword[0].text,
152 cword[0].attrib['pos']))
153 if cword[0].tail:
154 context += self._word_tokenizer.tokenize(
155 cword[0].tail)
156 else:
157 assert False, 'expected CDATA or wf in <head>'
158 elif cword.tag == 'wf':
159 context.append((cword.text, cword.attrib['pos']))
160 elif cword.tag == 's':
161 pass
162
163 else:
164 print 'ACK', cword.tag
165 assert False, 'expected CDATA or <wf> or <head>'
166 if cword.tail:
167 context += self._word_tokenizer.tokenize(cword.tail)
168 else:
169 assert False, 'unexpected tag %s' % child.tag
170 return SensevalInstance(lexelt, position, context, senses)
171
173 """
174 Fix the various issues with Senseval pseudo-XML.
175 """
176
177 text = re.sub(r'<([~\^])>', r'\1', text)
178
179 text = re.sub(r'(\s+)\&(\s+)', r'\1&\2', text)
180
181 text = re.sub(r'"""', '\'"\'', text)
182
183 text = re.sub(r'(<[^<]*snum=)([^">]+)>', r'\1"\2"/>', text)
184
185 text = re.sub(r'<\&frasl>\s*<p[^>]*>', 'FRASL', text)
186
187 text = re.sub(r'<\&I[^>]*>', '', text)
188
189 text = re.sub(r'<{([^}]+)}>', r'\1', text)
190
191 text = re.sub(r'<(@|/?p)>', r'', text)
192
193 text = re.sub(r'<&\w+ \.>', r'', text)
194
195 text = re.sub(r'<!DOCTYPE[^>]*>', r'', text)
196
197 text = re.sub(r'<\[\/?[^>]+\]*>', r'', text)
198
199 text = re.sub(r'<(\&\w+;)>', r'\1', text)
200
201 text = re.sub(r'&(?!amp|gt|lt|apos|quot)', r'', text)
202
203 text = re.sub(r'[ \t]*([^<>\s]+?)[ \t]*<p="([^"]*"?)"/>',
204 r' <wf pos="\2">\1</wf>', text)
205 text = re.sub(r'\s*"\s*<p=\'"\'/>', " <wf pos='\"'>\"</wf>", text)
206 return text
207