1
2
3
4
5
6
7
8 from nltk.compat import *
9 from nltk.corpus.reader.util import *
10 from nltk.corpus.reader.api import *
11 from nltk.corpus.reader.xmldocs import *
12 import re, textwrap
13
15
16
def __init__(self, root, files, wrap_etree=False):
    """
    Create a verbnet corpus reader and build its lookup indexes.

    The three arguments are forwarded unchanged to
    C{XMLCorpusReader.__init__}.  The index tables start out empty
    and are filled by L{_quick_index()} before this method returns.
    """
    XMLCorpusReader.__init__(self, root, files, wrap_etree)

    # verb lemma string -> list of verbnet class identifiers.
    self._lemma_to_class = defaultdict(list)

    # wordnet identifier string -> list of verbnet class identifiers.
    self._wordnet_to_class = defaultdict(list)

    # class identifier -> file identifier.  The keys form the complete
    # list of all classes and subclasses.
    self._class_to_fileid = dict()

    # short class identifier -> long class identifier.
    self._shortid_to_longid = dict()

    # Populate the tables above; must run after they all exist.
    self._quick_index()
39
# Matches (and decomposes) a long identifier: group 1 is the leading
# name (no '-' or '.' characters), group 2 is the trailing run of
# digits, '+', '.' and '-' that follows the first hyphen.
_LONGID_RE = re.compile(r'([^\-\.]*)-([\d+.\-]+)$')

# Matches a short identifier: nothing but digits, '+', '.' and '-'.
_SHORTID_RE = re.compile(r'[\d+.\-]+$')

# Used by _quick_index() to scan raw file text without parsing xml.
# Groups 1 and 2 capture a MEMBER's name (minus an optional leading
# '?') and its wn attribute; group 3 captures a VNSUBCLASS ID.
_INDEX_RE = re.compile(r'<MEMBER name="\??([^"]+)" wn="([^"]*)"/?>|'
                       r'<VNSUBCLASS ID="([^"]+)"/?>')
50
def lemmas(self, classid=None):
    """
    Return a list of verb lemmas.

    With no argument, the result is the sorted list of every lemma
    in the lemma index.  When C{classid} is given, the result is the
    C{name} attribute of each C{MEMBERS/MEMBER} element of that
    class, in document order.
    """
    if classid is not None:
        class_elt = self.vnclass(classid)
        names = []
        for member_elt in class_elt.findall('MEMBERS/MEMBER'):
            names.append(member_elt.get('name'))
        return names

    return sorted(self._lemma_to_class.keys())
63
65 """
66 Return a list of all wordnet identifiers that appear in any
67 class, or in C{classid} if specified.
68 """
69 if classid is None:
70 return sorted(self._wordnet_to_class.keys())
71 else:
72
73 vnclass = self.vnclass(classid)
74 return sum([member.get('wn','').split() for member in
75 vnclass.findall('MEMBERS/MEMBER')], [])
76
def classids(self, lemma=None, wordnetid=None, fileid=None, classid=None):
    """
    Return a list of verbnet class identifiers.

    At most one selector may be given; with none, every known class
    and subclass identifier is returned in sorted order.

    @param lemma: Return only the classes that contain this lemma
        as a member.
    @param wordnetid: Return only the classes that contain this
        wordnet identifier as a member.
    @param fileid: Return only the classes (and subclasses) defined
        by this file.
    @param classid: Return only the direct subclasses of this
        verbnet class.
    @raise ValueError: If more than one selector is specified.
    """
    selectors = (lemma, wordnetid, fileid, classid)
    if sum(1 for value in selectors if value is not None) > 1:
        raise ValueError('Specify at most one of: lemma, wordnetid, '
                         'fileid, classid')

    if fileid is not None:
        return [c for (c, f) in self._class_to_fileid.items()
                if f == fileid]

    # The two member indexes are defaultdicts: subscripting an unknown
    # key would insert it (so it would later show up in the key
    # listing), and would hand out the live internal list.  Use get()
    # and return a copy instead.
    if lemma is not None:
        return list(self._lemma_to_class.get(lemma, ()))
    if wordnetid is not None:
        return list(self._wordnet_to_class.get(wordnetid, ()))

    if classid is not None:
        xmltree = self.vnclass(classid)
        return [subclass.get('ID') for subclass in
                xmltree.findall('SUBCLASSES/VNSUBCLASS')]

    return sorted(self._class_to_fileid.keys())
106
def vnclass(self, fileid_or_classid):
    """
    Return an ElementTree containing the xml for the specified
    verbnet class.

    @param fileid_or_classid: An identifier specifying which class
        should be returned.  Can be a file identifier (such as
        C{'put-9.1.xml'}), or a verbnet class identifier (such as
        C{'put-9.1'}) or a short verbnet class identifier (such as
        C{'9.1'}).
    @return: The root element of the file when a file identifier or
        a top-level class identifier is given; otherwise the
        matching C{VNSUBCLASS} element.
    @raise ValueError: If the identifier is not a known class.
    @raise AssertionError: If the class is indexed but no element
        with that ID exists in its file.
    """
    # File identifier: the whole parsed document is the answer.
    if fileid_or_classid in self._files:
        return self.xml(fileid_or_classid)

    # Class identifier: normalise it once, then locate its file.
    classid = self.longid(fileid_or_classid)
    if classid not in self._class_to_fileid:
        raise ValueError('Unknown identifier %s' % fileid_or_classid)

    # classid was just verified to be a key, so look it up directly
    # rather than normalising it a second time.
    tree = self.xml(self._class_to_fileid[classid])
    if classid == tree.get('ID'):
        return tree
    for subclass in tree.findall('.//VNSUBCLASS'):
        if classid == subclass.get('ID'):
            return subclass

    # The index says the class lives in this file, so reaching here
    # means index and file disagree.  Raise explicitly: a bare
    # assert statement is stripped under -O and the method would
    # silently return None.
    raise AssertionError('class %r is indexed but was not found in '
                         'its file' % classid)
138
def files(self, vnclass_ids=None):
    """
    Return a list of the files that make up this corpus.

    @param vnclass_ids: Optional verbnet class identifier, or
        iterable of identifiers.  When given, the result holds the
        file for each identifier, in the order supplied, instead of
        the full file list.
    """
    if vnclass_ids is None:
        return self._files

    # Treat a lone identifier as a one-element collection so both
    # remaining cases share a single lookup expression.
    if isinstance(vnclass_ids, basestring):
        vnclass_ids = [vnclass_ids]

    return [self._class_to_fileid[self.longid(ident)]
            for ident in vnclass_ids]
152
153
154
155
156
158 """
159 Initialize the indexes L{_lemma_to_class},
160 L{_wordnet_to_class}, and L{_class_to_fileid} by scanning
161 through the corpus files. This is fast with cElementTree
162 (<0.1 secs), but quite slow (>10 secs) with the python
163 implementation of ElementTree.
164 """
165 for fileid in self._files:
166 self._index_helper(self.xml(fileid), fileid)
167
179
181 """
182 Initialize the indexes L{_lemma_to_class},
183 L{_wordnet_to_class}, and L{_class_to_fileid} by scanning
184 through the corpus files. This doesn't do proper xml parsing,
185 but is good enough to find everything in the standard verbnet
186 corpus -- and it runs about 30 times faster than xml parsing
187 (with the python ElementTree; only 2-3 times faster with
188 cElementTree).
189 """
190
191
192 for fileid in self._files:
193 vnclass = fileid[:-4]
194 self._class_to_fileid[vnclass] = fileid
195 self._shortid_to_longid[self.shortid(vnclass)] = vnclass
196 for m in self._INDEX_RE.finditer(self.open(fileid).read()):
197 groups = m.groups()
198 if groups[0] is not None:
199 self._lemma_to_class[groups[0]].append(vnclass)
200 for wn in groups[1].split():
201 self._wordnet_to_class[wn].append(vnclass)
202 elif groups[2] is not None:
203 self._class_to_fileid[groups[2]] = fileid
204 vnclass = groups[2]
205 self._shortid_to_longid[self.shortid(vnclass)] = vnclass
206 else:
207 assert False, 'unexpected match condition'
208
209
210
211
212
214 """Given a short verbnet class identifier (eg '37.10'), map it
215 to a long id (eg 'confess-37.10'). If C{shortid} is already a
216 long id, then return it as-is"""
217 if self._LONGID_RE.match(shortid):
218 return shortid
219 elif not self._SHORTID_RE.match(shortid):
220 raise ValueError('vnclass identifier %r not found' % shortid)
221 try:
222 return self._shortid_to_longid[shortid]
223 except KeyError:
224 raise ValueError('vnclass identifier %r not found' % shortid)
225
227 """Given a long verbnet class identifier (eg 'confess-37.10'),
228 map it to a short id (eg '37.10'). If C{longid} is already a
229 short id, then return it as-is."""
230 if self._SHORTID_RE.match(longid):
231 return longid
232 m = self._LONGID_RE.match(longid)
233 if m:
234 return m.group(2)
235 else:
236 raise ValueError('vnclass identifier %r not found' % longid)
237
238
239
240
241
243 """
244 Return a string containing a pretty-printed representation of
245 the given verbnet class.
246
247 @param vnclass: A verbnet class identifier; or an ElementTree
248 containing the xml contents of a verbnet class.
249 """
250 if isinstance(vnclass, basestring):
251 vnclass = self.vnclass(vnclass)
252
253 s = vnclass.get('ID') + '\n'
254 s += self.pprint_subclasses(vnclass, indent=' ') + '\n'
255 s += self.pprint_members(vnclass, indent=' ') + '\n'
256 s += ' Thematic roles:\n'
257 s += self.pprint_themroles(vnclass, indent=' ') + '\n'
258 s += ' Frames:\n'
259 s += '\n'.join(self.pprint_frame(vnframe, indent=' ')
260 for vnframe in vnclass.findall('FRAMES/FRAME'))
261 return s
262
264 """
265 Return a string containing a pretty-printed representation of
266 the given verbnet class's subclasses.
267
268 @param vnclass: A verbnet class identifier; or an ElementTree
269 containing the xml contents of a verbnet class.
270 """
271 if isinstance(vnclass, basestring):
272 vnclass = self.vnclass(vnclass)
273
274 subclasses = [subclass.get('ID') for subclass in
275 vnclass.findall('SUBCLASSES/VNSUBCLASS')]
276 if not subclasses: subclasses = ['(none)']
277 s = 'Subclasses: ' + ' '.join(subclasses)
278 return textwrap.fill(s, 70, initial_indent=indent,
279 subsequent_indent=indent+' ')
280
282 """
283 Return a string containing a pretty-printed representation of
284 the given verbnet class's member verbs.
285
286 @param vnclass: A verbnet class identifier; or an ElementTree
287 containing the xml contents of a verbnet class.
288 """
289 if isinstance(vnclass, basestring):
290 vnclass = self.vnclass(vnclass)
291
292 members = [member.get('name') for member in
293 vnclass.findall('MEMBERS/MEMBER')]
294 if not members: members = ['(none)']
295 s = 'Members: ' + ' '.join(members)
296 return textwrap.fill(s, 70, initial_indent=indent,
297 subsequent_indent=indent+' ')
298
300 """
301 Return a string containing a pretty-printed representation of
302 the given verbnet class's thematic roles.
303
304 @param vnclass: A verbnet class identifier; or an ElementTree
305 containing the xml contents of a verbnet class.
306 """
307 if isinstance(vnclass, basestring):
308 vnclass = self.vnclass(vnclass)
309
310 pieces = []
311 for themrole in vnclass.findall('THEMROLES/THEMROLE'):
312 piece = indent + '* ' + themrole.get('type')
313 modifiers = ['%(Value)s%(type)s' % restr.attrib
314 for restr in themrole.findall('SELRESTRS/SELRESTR')]
315 if modifiers:
316 piece += '[%s]' % ' '.join(modifiers)
317 pieces.append(piece)
318
319 return '\n'.join(pieces)
320
322 """
323 Return a string containing a pretty-printed representation of
324 the given verbnet frame.
325
326 @param vnframe: An ElementTree containing the xml contents of
327 a verbnet frame.
328 """
329 s = self.pprint_description(vnframe, indent) + '\n'
330 s += self.pprint_syntax(vnframe, indent+' Syntax: ') + '\n'
331 s += indent + ' Semantics:\n'
332 s += self.pprint_semantics(vnframe, indent+' ')
333 return s
334
336 """
337 Return a string containing a pretty-printed representation of
338 the given verbnet frame description.
339
340 @param vnframe: An ElementTree containing the xml contents of
341 a verbnet frame.
342 """
343 descr = vnframe.find('DESCRIPTION')
344 s = indent + descr.attrib['primary']
345 if descr.get('secondary', ''):
346 s += ' (%s)' % descr.get('secondary')
347 return s
348
350 """
351 Return a string containing a pretty-printed representation of
352 the given verbnet frame syntax.
353
354 @param vnframe: An ElementTree containing the xml contents of
355 a verbnet frame.
356 """
357 pieces = []
358 for elt in vnframe.find('SYNTAX'):
359 piece = elt.tag
360 modifiers = []
361 if 'value' in elt.attrib:
362 modifiers.append(elt.get('value'))
363 modifiers += ['%(Value)s%(type)s' % restr.attrib
364 for restr in (elt.findall('SELRESTRS/SELRESTR') +
365 elt.findall('SYNRESTRS/SYNRESTR'))]
366 if modifiers:
367 piece += '[%s]' % ' '.join(modifiers)
368 pieces.append(piece)
369
370 return indent + ' '.join(pieces)
371
373 """
374 Return a string containing a pretty-printed representation of
375 the given verbnet frame semantics.
376
377 @param vnframe: An ElementTree containing the xml contents of
378 a verbnet frame.
379 """
380 pieces = []
381 for pred in vnframe.findall('SEMANTICS/PRED'):
382 args = [arg.get('value') for arg in pred.findall('ARGS/ARG')]
383 pieces.append('%s(%s)' % (pred.get('value'), ', '.join(args)))
384 return '\n'.join(['%s* %s' % (indent, piece) for piece in pieces])
385