1
2
3
4
5
6
7
8
9 """
10 Read from the Senseval 2 Corpus.
11
12 SENSEVAL [http://www.senseval.org/]
13 Evaluation exercises for Word Sense Disambiguation.
14 Organized by ACL-SIGLEX [http://www.siglex.org/]
15
16 Prepared by Ted Pedersen <[email protected]>, University of Minnesota,
17 http://www.d.umn.edu/~tpederse/data.html
18 Distributed with permission.
19
20 The NLTK version of the Senseval 2 files uses well-formed XML.
21 Each instance of the ambiguous words "hard", "interest", "line", and "serve"
22 is tagged with a sense identifier, and supplied with context.
23 """
24
25 from nltk.corpus.reader.util import *
26 from api import *
27 from nltk.tokenize import *
28 import os, re, xml.sax
29 from xmldocs import XMLCorpusReader
30 from nltk.etree import ElementTree
31 from nltk.internals import deprecated
32
34 - def __init__(self, word, position, context, senses):
40 return ('SensevalInstance(word=%r, position=%r, '
41 'context=%r, senses=%r)' %
42 (self.word, self.position, self.context, self.senses))
43
48
def raw(self, files=None):
    """
    @return: the text contents of the given files, as a single string.
    """
    # Normalize the argument: a single filename becomes a one-item
    # list, and None means "every file in the corpus".
    if isinstance(files, basestring):
        files = [files]
    elif files is None:
        files = self._files
    contents = []
    for f in files:
        contents.append(self.open(f).read())
    return concat(contents)
56
57 - def _entry(self, tree):
58 elts = []
59 for lexelt in tree.findall('lexelt'):
60 for inst in lexelt.findall('instance'):
61 sense = inst[0].attrib['senseid']
62 context = [(w.text, w.attrib['pos'])
63 for w in inst[1]]
64 elts.append( (sense, context) )
65 return elts
66
67
68 @deprecated("Use .instances() or .raw() instead.")
69 - def read(self, items, format='listed'):
73 @deprecated("Use .instances() instead.")
76
77
85
87
def read_block(self, stream):
    """
    Read one <instance> block from *stream* and return it as a
    one-element list of parsed instances, or [] at end of file.
    """
    # NOTE(review): the 'def' header was missing from this copy of
    # the file; the signature follows the StreamBackedCorpusView
    # read_block API and the body's use of 'stream' -- confirm.
    # Figure out which lexical element (<lexelt>) the stream is
    # currently positioned in, from the cached start offsets.
    lexelt_num = bisect.bisect_right(self._lexelt_starts, stream.tell())-1
    lexelt = self._lexelts[lexelt_num]

    instance_lines = []
    in_instance = False
    while True:
        line = stream.readline()
        if line == '':
            # End of file: there must be no half-read instance left.
            assert instance_lines == []
            return []

        # Start of a lexical element?
        if line.lstrip().startswith('<lexelt'):
            lexelt_num += 1
            m = re.search('item=("[^"]+"|\'[^\']+\')', line)
            assert m is not None  # every <lexelt> carries item="..."
            lexelt = m.group(1)[1:-1]
            if lexelt_num < len(self._lexelts):
                assert lexelt == self._lexelts[lexelt_num]
            else:
                # First time we reach this lexelt: cache its name and
                # the stream offset where it starts.
                self._lexelts.append(lexelt)
                self._lexelt_starts.append(stream.tell())

        # Start of an instance?
        if line.lstrip().startswith('<instance'):
            assert instance_lines == []
            in_instance = True

        # Body of an instance?
        if in_instance:
            instance_lines.append(line)

        # End of an instance?
        if line.lstrip().startswith('</instance'):
            xml_block = '\n'.join(instance_lines)
            xml_block = _fixXML(xml_block)
            inst = ElementTree.fromstring(xml_block)
            return [self._parse_instance(inst, lexelt)]
126
128 senses = []
129 context = []
130 position = None
131 for child in instance:
132 if child.tag == 'answer':
133 senses.append(child.attrib['senseid'])
134 elif child.tag == 'context':
135 context += self._word_tokenizer.tokenize(child.text)
136 for cword in child:
137 if cword.tag == 'compound':
138 cword = cword[0]
139
140 if cword.tag == 'head':
141
142 assert position is None, 'head specified twice'
143 assert cword.text.strip() or len(cword)==1
144 assert not (cword.text.strip() and len(cword)==1)
145
146 position = len(context)
147
148 if cword.text.strip():
149 context.append(cword.text.strip())
150 elif cword[0].tag == 'wf':
151 context.append((cword[0].text,
152 cword[0].attrib['pos']))
153 if cword[0].tail:
154 context += self._word_tokenizer.tokenize(
155 cword[0].tail)
156 else:
157 assert False, 'expected CDATA or wf in <head>'
158 elif cword.tag == 'wf':
159 context.append((cword.text, cword.attrib['pos']))
160 elif cword.tag == 's':
161 pass
162
163 else:
164 print 'ACK', cword.tag
165 assert False, 'expected CDATA or <wf> or <head>'
166 if cword.tail:
167 context += self._word_tokenizer.tokenize(cword.tail)
168 else:
169 assert False, 'unexpected tag %s' % child.tag
170 return SensevalInstance(lexelt, position, context, senses)
171
173 """
174 Fix the various issues with Senseval pseudo-XML.
175 """
176
177 text = re.sub(r'<([~\^])>', r'\1', text)
178
179 text = re.sub(r'(\s+)\&(\s+)', r'\1&\2', text)
180
181 text = re.sub(r'"""', '\'"\'', text)
182
183 text = re.sub(r'(<[^<]*snum=)([^">]+)>', r'\1"\2"/>', text)
184
185 text = re.sub(r'<\&frasl>\s*<p[^>]*>', 'FRASL', text)
186
187 text = re.sub(r'<\&I[^>]*>', '', text)
188
189 text = re.sub(r'<{([^}]+)}>', r'\1', text)
190
191 text = re.sub(r'<(@|/?p)>', r'', text)
192
193 text = re.sub(r'<&\w+ \.>', r'', text)
194
195 text = re.sub(r'<!DOCTYPE[^>]*>', r'', text)
196
197 text = re.sub(r'<\[\/?[^>]+\]*>', r'', text)
198
199 text = re.sub(r'<(\&\w+;)>', r'\1', text)
200
201 text = re.sub(r'&(?!amp|gt|lt|apos|quot)', r'', text)
202
203 text = re.sub(r'[ \t]*([^<>\s]+?)[ \t]*<p="([^"]*"?)"/>',
204 r' <wf pos="\2">\1</wf>', text)
205 text = re.sub(r'\s*"\s*<p=\'"\'/>', " <wf pos='\"'>\"</wf>", text)
206 return text
207