Package nltk :: Package corpus :: Package reader :: Module senseval
[hide private]
[frames] | [no frames]

Source Code for Module nltk.corpus.reader.senseval

  1  # Natural Language Toolkit: Senseval 2 Corpus Reader 
  2  # 
  3  # Copyright (C) 2001-2008 NLTK Project 
  4  # Author: Trevor Cohn <[email protected]> 
  5  #         Steven Bird <[email protected]> (modifications) 
  6  # URL: <http://nltk.org> 
  7  # For license information, see LICENSE.TXT 
  8   
  9  """ 
 10  Read from the Senseval 2 Corpus. 
 11   
 12  SENSEVAL [http://www.senseval.org/] 
 13  Evaluation exercises for Word Sense Disambiguation. 
 14  Organized by ACL-SIGLEX [http://www.siglex.org/] 
 15   
 16  Prepared by Ted Pedersen <[email protected]>, University of Minnesota, 
 17  http://www.d.umn.edu/~tpederse/data.html 
 18  Distributed with permission. 
 19   
 20  The NLTK version of the Senseval 2 files uses well-formed XML. 
 21  Each instance of the ambiguous words "hard", "interest", "line", and "serve" 
 22  is tagged with a sense identifier, and supplied with context. 
 23  """        
 24   
 25  from nltk.corpus.reader.util import * 
 26  from api import * 
 27  from nltk.tokenize import * 
 28  import os, re, xml.sax 
 29  from xmldocs import XMLCorpusReader 
 30  from nltk.etree import ElementTree 
 31  from nltk.internals import deprecated 
 32   
class SensevalInstance(object):
    """
    A single occurrence of an ambiguous word from the Senseval 2
    corpus, together with its sense label(s) and surrounding context.
    """

    def __init__(self, word, position, context, senses):
        self.word = word
        self.position = position
        self.context = context
        # Senses are stored as an immutable tuple.
        self.senses = tuple(senses)

    def __repr__(self):
        fields = (self.word, self.position, self.context, self.senses)
        return ('SensevalInstance(word=%r, position=%r, '
                'context=%r, senses=%r)' % fields)
43
class SensevalCorpusReader(CorpusReader):
    """
    Reader for the Senseval 2 corpus files.  Each file contains
    instances of one or more ambiguous lexical elements ("lexelts"),
    each tagged with sense identifiers and supplied with context.
    """

    def instances(self, files=None):
        """
        Return all C{SensevalInstance}s in the given files (default:
        every file in the corpus), concatenated into one lazy
        stream-backed sequence.
        """
        return concat([SensevalCorpusView(filename, enc)
                       for (filename, enc) in self.abspaths(files, True)])

    def raw(self, files=None):
        """
        @return: the text contents of the given files, as a single string.
        """
        if files is None: files = self._files
        elif isinstance(files, basestring): files = [files]
        return concat([self.open(f).read() for f in files])

    def _entry(self, tree):
        # Extract (senseid, tagged-context) pairs from a parsed
        # ElementTree of <lexelt>/<instance> elements.
        # NOTE(review): this helper is not called anywhere in the visible
        # module -- presumably legacy; confirm before relying on it.
        elts = []
        for lexelt in tree.findall('lexelt'):
            for inst in lexelt.findall('instance'):
                # inst[0] is the <answer> element; inst[1] is the
                # <context> element containing <wf> word-form children.
                sense = inst[0].attrib['senseid']
                context = [(w.text, w.attrib['pos'])
                           for w in inst[1]]
                elts.append( (sense, context) )
        return elts

    #{ Deprecated since 0.8
    @deprecated("Use .instances() or .raw() instead.")
    def read(self, items, format='listed'):
        # Legacy dispatch shim: map the old format keyword onto the
        # new accessor methods.
        if format == 'listed': return self.instances(items)
        if format == 'raw': return self.raw(items)
        raise ValueError('bad format %r' % format)
    @deprecated("Use .instances() instead.")
    def listed(self, items):
        return self.instances(items)
    #}
class SensevalCorpusView(StreamBackedCorpusView):
    """
    A stream-backed corpus view that reads one C{SensevalInstance} per
    block from a Senseval 2 pseudo-XML file.  It tracks the stream
    positions where each <lexelt> element starts, so that re-reads
    from an arbitrary position can recover which lexical element the
    current instance belongs to.
    """

    def __init__(self, filename, encoding):
        StreamBackedCorpusView.__init__(self, filename, encoding=encoding)

        self._word_tokenizer = WhitespaceTokenizer()
        self._lexelt_starts = [0] # list of streampos
        self._lexelts = [None] # list of lexelt names

    def read_block(self, stream):
        """
        Read the next <instance>...</instance> element from the stream
        and return it as a one-element list of C{SensevalInstance}
        (or [] at end of file).
        """
        # Decide which lexical element we're in.
        # NOTE(review): `bisect` and `re` are not imported by name in
        # this module; presumably they arrive via
        # `from nltk.corpus.reader.util import *` -- confirm.
        lexelt_num = bisect.bisect_right(self._lexelt_starts, stream.tell())-1
        lexelt = self._lexelts[lexelt_num]

        instance_lines = []
        in_instance = False
        while True:
            line = stream.readline()
            # End of file: must not be in the middle of an instance.
            if line == '':
                assert instance_lines == []
                return []

            # Start of a lexical element?
            if line.lstrip().startswith('<lexelt'):
                lexelt_num += 1
                m = re.search('item=("[^"]+"|\'[^\']+\')', line)
                assert m is not None # <lexelt> has no 'item=...'
                lexelt = m.group(1)[1:-1]
                # If we've seen this lexelt on an earlier pass the cached
                # name must match; otherwise record it and its position.
                if lexelt_num < len(self._lexelts):
                    assert lexelt == self._lexelts[lexelt_num]
                else:
                    self._lexelts.append(lexelt)
                    self._lexelt_starts.append(stream.tell())

            # Start of an instance?
            if line.lstrip().startswith('<instance'):
                assert instance_lines == []
                in_instance = True

            # Body of an instance?
            if in_instance:
                instance_lines.append(line)

            # End of an instance?  Repair the pseudo-XML and parse it.
            if line.lstrip().startswith('</instance'):
                xml_block = '\n'.join(instance_lines)
                xml_block = _fixXML(xml_block)
                inst = ElementTree.fromstring(xml_block)
                return [self._parse_instance(inst, lexelt)]

    def _parse_instance(self, instance, lexelt):
        """
        Convert a parsed <instance> element into a C{SensevalInstance}:
        collect sense ids from <answer> children and build the token
        context (with the head word's position) from <context>.
        """
        senses = []
        context = []
        position = None
        for child in instance:
            if child.tag == 'answer':
                senses.append(child.attrib['senseid'])
            elif child.tag == 'context':
                # Leading untagged text becomes plain string tokens.
                context += self._word_tokenizer.tokenize(child.text)
                for cword in child:
                    if cword.tag == 'compound':
                        cword = cword[0] # is this ok to do?

                    if cword.tag == 'head':
                        # Some sanity checks: the head holds either
                        # bare text or exactly one child, never both.
                        assert position is None, 'head specified twice'
                        assert cword.text.strip() or len(cword)==1
                        assert not (cword.text.strip() and len(cword)==1)
                        # Record the position of the head:
                        position = len(context)
                        # Add on the head word itself:
                        if cword.text.strip():
                            context.append(cword.text.strip())
                        elif cword[0].tag == 'wf':
                            context.append((cword[0].text,
                                            cword[0].attrib['pos']))
                            if cword[0].tail:
                                context += self._word_tokenizer.tokenize(
                                    cword[0].tail)
                        else:
                            assert False, 'expected CDATA or wf in <head>'
                    elif cword.tag == 'wf':
                        # A tagged word-form: (text, pos) pair.
                        context.append((cword.text, cword.attrib['pos']))
                    elif cword.tag == 's':
                        pass # Sentence boundary marker.

                    else:
                        print 'ACK', cword.tag
                        assert False, 'expected CDATA or <wf> or <head>'
                    # Trailing untagged text after any element becomes
                    # plain string tokens.
                    if cword.tail:
                        context += self._word_tokenizer.tokenize(cword.tail)
            else:
                assert False, 'unexpected tag %s' % child.tag
        return SensevalInstance(lexelt, position, context, senses)
171
172 -def _fixXML(text):
173 """ 174 Fix the various issues with Senseval pseudo-XML. 175 """ 176 # <~> or <^> => ~ or ^ 177 text = re.sub(r'<([~\^])>', r'\1', text) 178 # fix lone & 179 text = re.sub(r'(\s+)\&(\s+)', r'\1&amp;\2', text) 180 # fix """ 181 text = re.sub(r'"""', '\'"\'', text) 182 # fix <s snum=dd> => <s snum="dd"/> 183 text = re.sub(r'(<[^<]*snum=)([^">]+)>', r'\1"\2"/>', text) 184 # fix foreign word tag 185 text = re.sub(r'<\&frasl>\s*<p[^>]*>', 'FRASL', text) 186 # remove <&I .> 187 text = re.sub(r'<\&I[^>]*>', '', text) 188 # fix <{word}> 189 text = re.sub(r'<{([^}]+)}>', r'\1', text) 190 # remove <@>, <p>, </p> 191 text = re.sub(r'<(@|/?p)>', r'', text) 192 # remove <&M .> and <&T .> and <&Ms .> 193 text = re.sub(r'<&\w+ \.>', r'', text) 194 # remove <!DOCTYPE... > lines 195 text = re.sub(r'<!DOCTYPE[^>]*>', r'', text) 196 # remove <[hi]> and <[/p]> etc 197 text = re.sub(r'<\[\/?[^>]+\]*>', r'', text) 198 # take the thing out of the brackets: <&hellip;> 199 text = re.sub(r'<(\&\w+;)>', r'\1', text) 200 # and remove the & for those patterns that aren't regular XML 201 text = re.sub(r'&(?!amp|gt|lt|apos|quot)', r'', text) 202 # fix 'abc <p="foo"/>' style tags - now <wf pos="foo">abc</wf> 203 text = re.sub(r'[ \t]*([^<>\s]+?)[ \t]*<p="([^"]*"?)"/>', 204 r' <wf pos="\2">\1</wf>', text) 205 text = re.sub(r'\s*"\s*<p=\'"\'/>', " <wf pos='\"'>\"</wf>", text) 206 return text
207