Package nltk :: Package corpus :: Package reader :: Module verbnet
[hide private]
[frames | no frames]

Source Code for Module nltk.corpus.reader.verbnet

  1  # Natural Language Toolkit: Verbnet Corpus Reader 
  2  # 
  3  # Copyright (C) 2001-2008 NLTK Project 
  4  # Author: Edward Loper <[email protected]> 
  5  # URL: <http://nltk.org> 
  6  # For license information, see LICENSE.TXT 
  7   
  8  from nltk.compat import * 
  9  from nltk.corpus.reader.util import * 
 10  from nltk.corpus.reader.api import * 
 11  from nltk.corpus.reader.xmldocs import * 
 12  import re, textwrap 
 13   
class VerbnetCorpusReader(XMLCorpusReader):
    """
    Corpus reader for a collection of verbnet class files stored as XML.

    When the reader is constructed it scans every corpus file once and
    builds in-memory indexes that map verb lemmas and wordnet
    identifiers to verbnet class identifiers, class identifiers to the
    file that defines them, and short class identifiers (eg C{'9.1'})
    to long ones (eg C{'put-9.1'}).  The query methods (L{lemmas()},
    L{wordnetids()}, L{classids()}, L{files()}) answer from those
    indexes; L{vnclass()} parses the relevant file and returns the
    matching xml element; the C{pprint*} methods render such elements
    as plain text.
    """

    # No unicode encoding param, since the data files are all XML.
    def __init__(self, root, files, wrap_etree=False):
        """
        Create the reader and build its lookup indexes.

        @param root: Passed through unchanged to
            C{XMLCorpusReader.__init__}.
        @param files: Passed through unchanged to
            C{XMLCorpusReader.__init__}.
        @param wrap_etree: Passed through unchanged to
            C{XMLCorpusReader.__init__}.
        """
        XMLCorpusReader.__init__(self, root, files, wrap_etree)

        self._lemma_to_class = defaultdict(list)
        """A dictionary mapping from verb lemma strings to lists of
        verbnet class identifiers."""

        self._wordnet_to_class = defaultdict(list)
        """A dictionary mapping from wordnet identifier strings to
        lists of verbnet class identifiers."""

        self._class_to_fileid = {}
        """A dictionary mapping from class identifiers to
        corresponding file identifiers.  The keys of this dictionary
        provide a complete list of all classes and subclasses."""

        # Maps short class identifiers (eg '9.1') to the long
        # identifier (eg 'put-9.1') recorded while indexing.
        self._shortid_to_longid = {}

        # Initialize the dictionaries.  Use the quick (regexp-based)
        # method instead of the slow (xml-based) method, because it
        # runs 2-30 times faster.
        self._quick_index()

    _LONGID_RE = re.compile(r'([^\-\.]*)-([\d+.\-]+)$')
    """Regular expression that matches (and decomposes) longids"""

    _SHORTID_RE = re.compile(r'[\d+.\-]+$')
    """Regular expression that matches shortids"""

    _INDEX_RE = re.compile(r'<MEMBER name="\??([^"]+)" wn="([^"]*)"/?>|'
                           r'<VNSUBCLASS ID="([^"]+)"/?>')
    """Regular expression used by L{_quick_index()} to quickly scan the
    corpus for basic information.  Group 1 is a member's name, group 2
    is that member's C{wn} attribute, and group 3 is a subclass id."""

    def lemmas(self, classid=None):
        """
        Return a list of all verb lemmas that appear in any class, or
        in the C{classid} if specified.

        @param classid: An identifier accepted by L{vnclass()}, or
            C{None} to list the lemmas of the whole corpus (sorted).
        """
        if classid is None:
            return sorted(self._lemma_to_class.keys())
        else:
            # [xx] should this include subclass members?
            vnclass = self.vnclass(classid)
            return [member.get('name') for member in
                    vnclass.findall('MEMBERS/MEMBER')]

    def wordnetids(self, classid=None):
        """
        Return a list of all wordnet identifiers that appear in any
        class, or in C{classid} if specified.

        @param classid: An identifier accepted by L{vnclass()}, or
            C{None} to list the identifiers of the whole corpus
            (sorted).
        """
        if classid is None:
            return sorted(self._wordnet_to_class.keys())
        else:
            # [xx] should this include subclass members?
            vnclass = self.vnclass(classid)
            # Flatten in one pass; repeatedly concatenating lists with
            # sum(..., []) copies the accumulated result every time.
            return [wn
                    for member in vnclass.findall('MEMBERS/MEMBER')
                    for wn in member.get('wn', '').split()]

    def classids(self, lemma=None, wordnetid=None, fileid=None, classid=None):
        """
        Return a list of the verbnet class identifiers.  If a file
        identifier is specified, then return only the verbnet class
        identifiers for classes (and subclasses) defined by that file.
        If a lemma is specified, then return only verbnet class
        identifiers for classes that contain that lemma as a member.
        If a wordnetid is specified, then return only identifiers for
        classes that contain that wordnetid as a member.  If a classid
        is specified, then return only identifiers for subclasses of
        the specified verbnet class.

        An unknown lemma or wordnetid yields an empty list.  The
        returned list is always a fresh object, so callers may modify
        it freely.

        @raise ValueError: If more than one of the four filters is
            given.
        """
        given = [x for x in (lemma, wordnetid, fileid, classid)
                 if x is not None]
        if len(given) > 1:
            raise ValueError('Specify at most one of: lemma, wordnetid, '
                             'fileid, classid')
        if fileid is not None:
            return [c for (c, f) in self._class_to_fileid.items()
                    if f == fileid]
        elif lemma is not None:
            # Use .get() rather than indexing: indexing a defaultdict
            # with an unknown key would add that key to the index.
            return list(self._lemma_to_class.get(lemma, ()))
        elif wordnetid is not None:
            return list(self._wordnet_to_class.get(wordnetid, ()))
        elif classid is not None:
            xmltree = self.vnclass(classid)
            return [subclass.get('ID') for subclass in
                    xmltree.findall('SUBCLASSES/VNSUBCLASS')]
        else:
            return sorted(self._class_to_fileid.keys())

    def vnclass(self, fileid_or_classid):
        """
        Return an ElementTree containing the xml for the specified
        verbnet class.

        @param fileid_or_classid: An identifier specifying which class
            should be returned.  Can be a file identifier (such as
            C{'put-9.1.xml'}), or a verbnet class identifier (such as
            C{'put-9.1'}) or a short verbnet class identifier (such as
            C{'9.1'}).
        @raise ValueError: If the identifier is not a known file, class
            or short class identifier.
        @raise AssertionError: If the index lists the class but no
            element with that C{ID} is found in the indexed file.
        """
        # File identifier: just return the xml.
        if fileid_or_classid in self._files:
            return self.xml(fileid_or_classid)

        # Class identifier: get the xml, and find the right elt.
        classid = self.longid(fileid_or_classid)
        if classid not in self._class_to_fileid:
            raise ValueError('Unknown identifier %s' % fileid_or_classid)

        fileid = self._class_to_fileid[classid]
        tree = self.xml(fileid)
        if classid == tree.get('ID'):
            return tree
        for subclass in tree.findall('.//VNSUBCLASS'):
            if classid == subclass.get('ID'):
                return subclass

        # We saw it while indexing, so this should be unreachable.
        # Raise explicitly: a bare ``assert`` is removed under -O and
        # the method would then fall through and return None.
        raise AssertionError('class %r is indexed under %r but was not '
                             'found there' % (classid, fileid))

    def files(self, vnclass_ids=None):
        """
        Return a list of files that make up this corpus.  If
        C{vnclass_ids} is specified, then return the files that make
        up the specified verbnet class(es).

        @param vnclass_ids: A single class identifier (long or short),
            an iterable of such identifiers, or C{None} for all files.
        @raise ValueError: If an identifier is rejected by L{longid()}.
        @raise KeyError: If a well-formed long identifier is not in
            the index.
        """
        if vnclass_ids is None:
            return self._files
        elif isinstance(vnclass_ids, basestring):
            return [self._class_to_fileid[self.longid(vnclass_ids)]]
        else:
            return [self._class_to_fileid[self.longid(vnclass_id)]
                    for vnclass_id in vnclass_ids]

    ######################################################################
    #{ Index Initialization
    ######################################################################

    def _index(self):
        """
        Initialize the indexes L{_lemma_to_class},
        L{_wordnet_to_class}, and L{_class_to_fileid} by scanning
        through the corpus files.  This is fast with cElementTree
        (<0.1 secs), but quite slow (>10 secs) with the python
        implementation of ElementTree.
        """
        for fileid in self._files:
            self._index_helper(self.xml(fileid), fileid)

    def _index_helper(self, xmltree, fileid):
        """
        Helper for L{_index()}: record the class rooted at C{xmltree}
        (its id, members and wordnet ids), then recurse into each of
        its direct subclasses.
        """
        vnclass = xmltree.get('ID')
        self._class_to_fileid[vnclass] = fileid
        self._shortid_to_longid[self.shortid(vnclass)] = vnclass
        for member in xmltree.findall('MEMBERS/MEMBER'):
            self._lemma_to_class[member.get('name')].append(vnclass)
            for wn in member.get('wn', '').split():
                self._wordnet_to_class[wn].append(vnclass)
        for subclass in xmltree.findall('SUBCLASSES/VNSUBCLASS'):
            self._index_helper(subclass, fileid)

    def _quick_index(self):
        """
        Initialize the indexes L{_lemma_to_class},
        L{_wordnet_to_class}, L{_class_to_fileid} and
        L{_shortid_to_longid} by scanning through the corpus files.
        This doesn't do proper xml parsing, but is good enough to find
        everything in the standard verbnet corpus -- and it runs about
        30 times faster than xml parsing (with the python ElementTree;
        only 2-3 times faster with cElementTree).
        """
        # nb: if we got rid of wordnet_to_class, this would run 2-3
        # times faster.
        for fileid in self._files:
            vnclass = fileid[:-4]  # strip the '.xml'
            self._class_to_fileid[vnclass] = fileid
            self._shortid_to_longid[self.shortid(vnclass)] = vnclass

            # Read the whole file, making sure the stream is closed
            # even if the read fails.
            stream = self.open(fileid)
            try:
                contents = stream.read()
            finally:
                stream.close()

            for m in self._INDEX_RE.finditer(contents):
                groups = m.groups()
                if groups[0] is not None:
                    # <MEMBER>: belongs to the most recently seen class.
                    self._lemma_to_class[groups[0]].append(vnclass)
                    for wn in groups[1].split():
                        self._wordnet_to_class[wn].append(vnclass)
                elif groups[2] is not None:
                    # <VNSUBCLASS>: becomes the current class.
                    self._class_to_fileid[groups[2]] = fileid
                    vnclass = groups[2]  # for <MEMBER> elts.
                    self._shortid_to_longid[self.shortid(vnclass)] = vnclass
                else:
                    # Explicit raise so the check survives -O.
                    raise AssertionError('unexpected match condition')

    ######################################################################
    #{ Identifier conversion
    ######################################################################

    def longid(self, shortid):
        """Given a short verbnet class identifier (eg '37.10'), map it
        to a long id (eg 'confess-37.10').  If C{shortid} is already a
        long id, then return it as-is.

        @raise ValueError: If C{shortid} is neither a long id nor a
            short id known to the index."""
        if self._LONGID_RE.match(shortid):
            return shortid  # it's already a longid.
        elif not self._SHORTID_RE.match(shortid):
            raise ValueError('vnclass identifier %r not found' % shortid)
        try:
            return self._shortid_to_longid[shortid]
        except KeyError:
            raise ValueError('vnclass identifier %r not found' % shortid)

    def shortid(self, longid):
        """Given a long verbnet class identifier (eg 'confess-37.10'),
        map it to a short id (eg '37.10').  If C{longid} is already a
        short id, then return it as-is.

        @raise ValueError: If C{longid} matches neither identifier
            pattern."""
        if self._SHORTID_RE.match(longid):
            return longid  # it's already a shortid.
        m = self._LONGID_RE.match(longid)
        if m:
            return m.group(2)
        else:
            raise ValueError('vnclass identifier %r not found' % longid)

    ######################################################################
    #{ Pretty Printing
    ######################################################################

    def pprint(self, vnclass):
        """
        Return a string containing a pretty-printed representation of
        the given verbnet class.

        @param vnclass: A verbnet class identifier; or an ElementTree
            containing the xml contents of a verbnet class.
        """
        if isinstance(vnclass, basestring):
            vnclass = self.vnclass(vnclass)

        # NOTE(review): the listing this was taken from appears to have
        # had runs of whitespace collapsed; confirm the widths of the
        # indent strings in the pprint methods against upstream.
        s = vnclass.get('ID') + '\n'
        s += self.pprint_subclasses(vnclass, indent=' ') + '\n'
        s += self.pprint_members(vnclass, indent=' ') + '\n'
        s += ' Thematic roles:\n'
        s += self.pprint_themroles(vnclass, indent=' ') + '\n'
        s += ' Frames:\n'
        s += '\n'.join(self.pprint_frame(vnframe, indent=' ')
                       for vnframe in vnclass.findall('FRAMES/FRAME'))
        return s

    def pprint_subclasses(self, vnclass, indent=''):
        """
        Return a string containing a pretty-printed representation of
        the given verbnet class's subclasses.

        @param vnclass: A verbnet class identifier; or an ElementTree
            containing the xml contents of a verbnet class.
        @param indent: Prefix for the first output line; continuation
            lines get this prefix plus extra padding.
        """
        if isinstance(vnclass, basestring):
            vnclass = self.vnclass(vnclass)

        subclasses = [subclass.get('ID') for subclass in
                      vnclass.findall('SUBCLASSES/VNSUBCLASS')]
        if not subclasses:
            subclasses = ['(none)']
        s = 'Subclasses: ' + ' '.join(subclasses)
        return textwrap.fill(s, 70, initial_indent=indent,
                             subsequent_indent=indent+' ')

    def pprint_members(self, vnclass, indent=''):
        """
        Return a string containing a pretty-printed representation of
        the given verbnet class's member verbs.

        @param vnclass: A verbnet class identifier; or an ElementTree
            containing the xml contents of a verbnet class.
        @param indent: Prefix for the first output line; continuation
            lines get this prefix plus extra padding.
        """
        if isinstance(vnclass, basestring):
            vnclass = self.vnclass(vnclass)

        members = [member.get('name') for member in
                   vnclass.findall('MEMBERS/MEMBER')]
        if not members:
            members = ['(none)']
        s = 'Members: ' + ' '.join(members)
        return textwrap.fill(s, 70, initial_indent=indent,
                             subsequent_indent=indent+' ')

    def pprint_themroles(self, vnclass, indent=''):
        """
        Return a string containing a pretty-printed representation of
        the given verbnet class's thematic roles.

        Each role is written on its own line as C{* type}, followed by
        its selectional restrictions in square brackets when it has
        any.

        @param vnclass: A verbnet class identifier; or an ElementTree
            containing the xml contents of a verbnet class.
        @param indent: Prefix for every output line.
        """
        if isinstance(vnclass, basestring):
            vnclass = self.vnclass(vnclass)

        pieces = []
        for themrole in vnclass.findall('THEMROLES/THEMROLE'):
            piece = indent + '* ' + themrole.get('type')
            modifiers = ['%(Value)s%(type)s' % restr.attrib
                         for restr in themrole.findall('SELRESTRS/SELRESTR')]
            if modifiers:
                piece += '[%s]' % ' '.join(modifiers)
            pieces.append(piece)

        return '\n'.join(pieces)

    def pprint_frame(self, vnframe, indent=''):
        """
        Return a string containing a pretty-printed representation of
        the given verbnet frame: its description, then its syntax,
        then its semantics.

        @param vnframe: An ElementTree containing the xml contents of
            a verbnet frame.
        @param indent: Prefix for every output line.
        """
        s = self.pprint_description(vnframe, indent) + '\n'
        s += self.pprint_syntax(vnframe, indent+' Syntax: ') + '\n'
        s += indent + ' Semantics:\n'
        s += self.pprint_semantics(vnframe, indent+' ')
        return s

    def pprint_description(self, vnframe, indent=''):
        """
        Return a string containing a pretty-printed representation of
        the given verbnet frame description: the C{primary} attribute,
        followed by the C{secondary} attribute in parentheses when it
        is present and non-empty.

        @param vnframe: An ElementTree containing the xml contents of
            a verbnet frame.
        @param indent: Prefix for the output line.
        """
        descr = vnframe.find('DESCRIPTION')
        s = indent + descr.attrib['primary']
        if descr.get('secondary', ''):
            s += ' (%s)' % descr.get('secondary')
        return s

    def pprint_syntax(self, vnframe, indent=''):
        """
        Return a string containing a pretty-printed representation of
        the given verbnet frame syntax.

        Each child of the frame's C{SYNTAX} element is written as its
        tag name, followed in square brackets by its C{value}
        attribute (if any) and its selectional and syntactic
        restrictions (if any).

        @param vnframe: An ElementTree containing the xml contents of
            a verbnet frame.
        @param indent: Prefix for the output line.
        """
        pieces = []
        for elt in vnframe.find('SYNTAX'):
            piece = elt.tag
            modifiers = []
            if 'value' in elt.attrib:
                modifiers.append(elt.get('value'))
            modifiers += ['%(Value)s%(type)s' % restr.attrib
                          for restr in (elt.findall('SELRESTRS/SELRESTR') +
                                        elt.findall('SYNRESTRS/SYNRESTR'))]
            if modifiers:
                piece += '[%s]' % ' '.join(modifiers)
            pieces.append(piece)

        return indent + ' '.join(pieces)

    def pprint_semantics(self, vnframe, indent=''):
        """
        Return a string containing a pretty-printed representation of
        the given verbnet frame semantics, one C{* pred(arg, ...)}
        line per predicate.

        @param vnframe: An ElementTree containing the xml contents of
            a verbnet frame.
        @param indent: Prefix for every output line.
        """
        pieces = []
        for pred in vnframe.findall('SEMANTICS/PRED'):
            args = [arg.get('value') for arg in pred.findall('ARGS/ARG')]
            pieces.append('%s(%s)' % (pred.get('value'), ', '.join(args)))
        return '\n'.join(['%s* %s' % (indent, piece) for piece in pieces])
385