Package nltk :: Package sem :: Module relextract
[hide private]
[frames] | no frames]

Source Code for Module nltk.sem.relextract

  1  # Natural Language Toolkit: Relation Extraction 
  2  # 
  3  # Copyright (C) 2001-2008 NLTK Project 
  4  # Author: Ewan Klein <[email protected]> 
  5  # URL: <http://nltk.org> 
  6  # For license information, see LICENSE.TXT 
  7   
  8  """ 
  9  Code for extracting relational triples from the ieer and conll2002 corpora. 
 10   
 11  Relations are stored internally as dictionaries ('reldicts').  
 12   
 13  The two serialization outputs are I{rtuple} and I{clause}.  
 14     - An I{rtuple} is a tuple of the form C{(subj, filler, obj)},  
 15       where C{subj} and C{obj} are pairs of Named Entity mentions, and C{filler} is the string of words    
 16       occurring between C{subj} and C{obj} (with no intervening NEs). Strings are printed via C{repr()} to 
 17       circumvent locale variations in rendering utf-8 encoded strings. 
 18     - A I{clause} is an atom of the form C{relsym(subjsym, objsym)},  
 19       where the relation, subject and object have been canonicalized to single strings. 
 20   
 21  """ 
 22   
 23  # todo: get a more general solution to canonicalized symbols for clauses -- maybe use xmlcharrefs? 
 24   
 25  from nltk import defaultdict 
 26   
 27  from string import join 
 28  import re 
 29  import htmlentitydefs 
 30  from itertools import ifilter 
 31   
 32  # Dictionary that associates corpora with NE classes 
 33  NE_CLASSES = { 
 34      'ieer': ['LOCATION', 'ORGANIZATION', 'PERSON', 'DURATION',  
 35              'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE'], 
 36      'conll2002': ['LOC', 'PER', 'ORG'] 
 37      } 
 38   
 39  # Allow abbreviated class labels                    
 40  short2long = dict(LOC = 'LOCATION', ORG = 'ORGANIZATION', PER = 'PERSON') 
 41  long2short = dict(LOCATION ='LOC', ORGANIZATION = 'ORG', PERSON = 'PER') 
 42   
 43   
def _expand(type):
    """
    Map an abbreviated NE class name onto its full-length form.

    Names that have no entry in C{short2long} are handed back unchanged.

    @type type: C{str}
    @rtype: C{str}
    """
    return short2long.get(type, type)
54
def class_abbrev(type):
    """
    Map a full-length NE class name onto its abbreviation.

    Names that have no entry in C{long2short} are handed back unchanged.

    @type type: C{str}
    @rtype: C{str}
    """
    return long2short.get(type, type)
65 66
def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tagged tuples into tag strings or just words.

    Plain strings are joined directly.  If that fails with a C{TypeError}
    the items are treated as tuples: with C{untag} set only the first
    element of each tuple is kept, otherwise each tuple is rendered with
    C{nltk.tag.tuple2str}.

    @param lst: the items to join
    @type lst: C{list}
    @param sep: the separator placed between items
    @type sep: C{str}
    @param untag: if C{True}, omit the tag from tagged input strings.
    @type untag: C{bool}
    @rtype: C{str}
    """
    try:
        # str.join replaces the deprecated string.join() function.
        return sep.join(lst)
    except TypeError:
        # The items are not plain strings.
        if untag:
            return sep.join([tup[0] for tup in lst])
        from nltk.tag import tuple2str
        return sep.join([tuple2str(tup) for tup in lst])
81
def descape_entity(m, defs=htmlentitydefs.entitydefs):
    """
    Translate one entity to its ISO Latin value.
    Inspired by example from effbot.org

    Intended as the replacement callback of a substitution, e.g.
    C{re.compile("&(\\w+?);").sub(descape_entity, 'mcglashan_&amp;_sarrail')}.

    @param m: a match object whose group 1 is the entity name and whose
        group 0 is the complete entity reference
    @param defs: a mapping from entity names to replacement text
    @return: the replacement text, or the matched text itself when the
        entity name is not a key of C{defs}
    """
    try:
        replacement = defs[m.group(1)]
    except KeyError:
        # Unknown entity: use as is.
        replacement = m.group(0)
    return replacement
100
def list2sym(lst):
    """
    Convert a list of strings into a canonical symbol.

    The items are joined with underscores (any tags are dropped), the
    result is lower-cased, entity references of the form C{&name;} are
    replaced via L{descape_entity}, and full stops are removed.

    @type lst: C{list}
    @return: the canonical symbol
    @rtype: C{str}
    """
    sym = _join(lst, '_', untag=True).lower()
    # Raw string so the backslash in '\w' reaches the regex engine
    # untouched instead of being read as a string escape.
    entity = re.compile(r"&(\w+?);")
    sym = entity.sub(descape_entity, sym)
    return sym.replace('.', '')
114
def mk_pairs(tree):
    """
    Group a chunk structure into a list of pairs of the form (list(str), L{Tree})

    Each pair couples the terminals seen since the previous subtree
    (possibly none) with the subtree that follows them, which makes it easy
    to build (L{Tree}, string, L{Tree}) triples afterwards.  Terminals that
    come after the last subtree do not end up in any pair.

    @param tree: a chunk tree
    @return: a list of pairs (list(C{str}), L{Tree})
    @rtype: C{list} of C{list}
    """
    from nltk import Tree

    pairs = []
    words = []
    for dtr in tree:
        if isinstance(dtr, Tree):
            # A subtree closes the current pair; start collecting afresh.
            pairs.append([words, dtr])
            words = []
        else:
            words.append(dtr)
    return pairs
142 143
def mk_reldicts(pairs, window=5, trace=0):
    """
    Converts the pairs generated by L{mk_pairs} into a 'reldict': a dictionary which
    stores information about the subject and object NEs plus the filler between them.
    Additionally, a left and right context of length =< window are captured (within
    a given input sentence).

    A reldict is built for every run of three consecutive pairs: the first
    supplies the left context and the subject, the second the filler and the
    object, and the third the right context.  Fewer than three pairs
    therefore yield an empty result.

    @param pairs: a list of pairs of list(str) and L{Tree}, as generated by L{mk_pairs}
    @type pairs: C{list}
    @param window: a threshold for the number of items to include in the left and right context
    @type window: C{int}
    @param trace: if true, print the subject and object class of every reldict built
    @return: 'relation' dictionaries whose keys are 'lcon', 'subjclass', 'subjtext', 'subjsym',
        'filler', 'objclass', 'objtext', 'objsym' and 'rcon'
    @rtype: C{list} of C{defaultdict}
    """
    result = []
    # Walk a three-pair window over the input instead of re-slicing the
    # whole list on every step.
    for first, second, third in zip(pairs, pairs[1:], pairs[2:]):
        subj = first[1]
        obj = second[1]
        reldict = defaultdict(str)
        reldict['lcon'] = _join(first[0][-window:])
        reldict['subjclass'] = subj.node
        reldict['subjtext'] = _join(subj.leaves())
        reldict['subjsym'] = list2sym(subj.leaves())
        reldict['filler'] = _join(second[0])
        reldict['objclass'] = obj.node
        reldict['objtext'] = _join(obj.leaves())
        reldict['objsym'] = list2sym(obj.leaves())
        reldict['rcon'] = _join(third[0][:window])
        if trace:
            print("(rel(%s, %s)" % (reldict['subjclass'], reldict['objclass']))
        result.append(reldict)
    return result
174
def _check_ne_class(neclass, corpus, role):
    """
    Return C{neclass} in the form used by C{corpus}, expanding an abbreviated
    label if necessary; raise C{ValueError} if it is not recognized.
    An empty or C{None} value is passed through unchecked.
    """
    if not neclass or neclass in NE_CLASSES[corpus]:
        return neclass
    expanded = _expand(neclass)
    if expanded in NE_CLASSES[corpus]:
        return expanded
    raise ValueError("your value for the %s type has not been recognized: %s"
                     % (role, neclass))


def extract_rels(subjclass, objclass, doc, corpus='ieer', pattern=None, window=10):
    """
    Filter the output of L{mk_reldicts} according to specified NE classes and a filler pattern.

    The parameters C{subjclass} and C{objclass} can be used to restrict the
    Named Entities to particular types (any of 'LOCATION', 'ORGANIZATION',
    'PERSON', 'DURATION', 'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE').
    Abbreviated labels are expanded when the corpus uses the long form.

    @param subjclass: the class of the subject Named Entity.
    @type subjclass: C{string}
    @param objclass: the class of the object Named Entity.
    @type objclass: C{string}
    @param doc: input document
    @type doc: C{ieer} document or a list of chunk trees
    @param corpus: name of the corpus to take as input; possible values are
        'ieer' and 'conll2002'
    @type corpus: C{string}
    @param pattern: a regular expression for filtering the fillers of
        retrieved triples; the filler must match from its start.  If
        C{None}, fillers are not filtered by content.
    @type pattern: C{SRE_Pattern}
    @param window: filters out fillers which exceed this threshold
        (counted in whitespace-separated tokens)
    @type window: C{int}
    @return: see L{mk_reldicts}
    @rtype: C{list} of C{defaultdict}
    @raise ValueError: if the corpus or one of the NE classes is not recognized
    """
    # Validate the corpus first so that an unknown name is reported as such
    # rather than surfacing as a KeyError during the class checks.
    if corpus not in NE_CLASSES:
        raise ValueError("corpus type not recognized")

    subjclass = _check_ne_class(subjclass, corpus, 'subject')
    objclass = _check_ne_class(objclass, corpus, 'object')

    if corpus == 'ieer':
        pairs = mk_pairs(doc.text) + mk_pairs(doc.headline)
    elif corpus == 'conll2002':
        pairs = mk_pairs(doc)
    else:
        # A corpus listed in NE_CLASSES but without a reader branch here.
        raise ValueError("corpus type not recognized")

    reldicts = mk_reldicts(pairs)

    def relfilter(rel):
        return (rel['subjclass'] == subjclass and
                len(rel['filler'].split()) <= window and
                # pattern defaults to None, which means "any filler".
                (pattern is None or pattern.match(rel['filler'])) and
                rel['objclass'] == objclass)

    return [rel for rel in reldicts if relfilter(rel)]
227 228
def show_raw_rtuple(reldict, lcon=False, rcon=False):
    """
    Format the reldict as an rtuple and return the resulting string.

    @param reldict: a relation dictionary
    @type reldict: C{defaultdict}
    @param lcon: if true, prefix the left context
    @param rcon: if true, append the right context
    @rtype: C{str}
    """
    template = '[%s: %r] %r [%s: %r]'
    values = [
        class_abbrev(reldict['subjclass']),
        reldict['subjtext'],
        reldict['filler'],
        class_abbrev(reldict['objclass']),
        reldict['objtext'],
    ]
    if lcon:
        values.insert(0, reldict['lcon'])
        template = '...%r)' + template
    if rcon:
        values.append(reldict['rcon'])
        template += '(%r...'
    return template % tuple(values)
245
def show_clause(reldict, relsym):
    """
    Format the relation in clausal form, C{relsym(subjsym, objsym)}, and
    return the resulting string.

    @param reldict: a relation dictionary
    @type reldict: C{defaultdict}
    @param relsym: a label for the relation
    @type relsym: C{str}
    @rtype: C{str}
    """
    subj = reldict['subjsym']
    obj = reldict['objsym']
    return "%s(%r, %r)" % (relsym, subj, obj)
256 257 258 ####################################################### 259 # Demos of relation extraction with regular expressions 260 ####################################################### 261 262 ############################################ 263 # Example of in(ORG, LOC) 264 ############################################
def in_demo(trace=0):
    """
    Print C{in(ORG, LOC)} clauses extracted from the IEER corpus.

    A filler qualifies when it contains the word "in" and that word is not
    followed later in the filler by a word ending in "ing".

    @param trace: if true, print each document's C{docno} as a header
        before its clauses
    """
    from nltk.corpus import ieer

    IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')

    print("")
    print("IEER: in(ORG, LOC) -- just the clauses:")
    print("=" * 45)

    docs = (doc for fileid in ieer.files() for doc in ieer.parsed_docs(fileid))
    for doc in docs:
        if trace:
            print(doc.docno)
            print("=" * 15)
        for rel in extract_rels('ORG', 'LOC', doc, pattern=IN):
            print(show_clause(rel, relsym='IN'))


############################################
# Example of has_role(PER, ORG)
############################################

def roles_demo(trace=0):
    """
    Print C{has_role(PER, ORG)} rtuples extracted from the IEER corpus.

    @param trace: if true, print each document's C{docno} as a header and
        include the left and right context in every rtuple
    """
    from nltk.corpus import ieer

    # Raw string so that '\s' reaches the regex engine untouched.
    # NOTE(review): the outer group closes after "librarian", so only the
    # roles up to that point are wrapped in ".* ... .*"; "manager" through
    # "writer" must start the filler to match.  Likewise "the?" makes only
    # the final "e" optional.  Both look unintended but are left unchanged
    # here because fixing them alters the demo output -- confirm.
    roles = r"""
    (.*(                   # assorted roles
    analyst|
    chair(wo)?man|
    commissioner|
    counsel|
    director|
    economist|
    editor|
    executive|
    foreman|
    governor|
    head|
    lawyer|
    leader|
    librarian).*)|
    manager|
    partner|
    president|
    producer|
    professor|
    researcher|
    spokes(wo)?man|
    writer|
    ,\sof\sthe?\s*         # "X, of (the) Y"
    """
    ROLES = re.compile(roles, re.VERBOSE)

    print("")
    print("IEER: has_role(PER, ORG) -- raw rtuples:")
    print("=" * 45)

    # Context is shown exactly when tracing is on.
    show_context = bool(trace)
    for fileid in ieer.files():
        for doc in ieer.parsed_docs(fileid):
            if trace:
                print(doc.docno)
                print("=" * 15)
            for rel in extract_rels('PER', 'ORG', doc, pattern=ROLES):
                print(show_raw_rtuple(rel, lcon=show_context, rcon=show_context))
332 333 334 ############################################## 335 ### Show what's in the IEER Headlines 336 ############################################## 337 338
def ieer_headlines():
    """
    Print the document number and headline of the first 20 IEER documents.
    """
    from itertools import islice

    from nltk.corpus import ieer

    print("IEER: First 20 Headlines")
    print("=" * 45)

    docs = (doc for fileid in ieer.files() for doc in ieer.parsed_docs(fileid))
    # Keep each headline together with its own document so that the docno
    # printed belongs to that headline, and stop reading after 20 documents.
    for doc in islice(docs, 20):
        print("")
        print("%s:\n%s" % (doc.docno, doc.headline))


#############################################
## Dutch CONLL2002: take_on_role(PER, ORG)
#############################################

def conllned(trace=1):
    """
    Find the copula+'van' relation ('of') in the Dutch tagged training corpus
    from CoNLL 2002.

    @param trace: if true, include the left and right context in every
        rtuple printed
    """
    from nltk.corpus import conll2002

    vnv = """
    (
    is/V|
    was/V|
    werd/V|
    wordt/V
    )
    .*
    van/Prep
    """
    VAN = re.compile(vnv, re.VERBOSE)

    print("")
    print("Dutch CoNLL2002: van(PER, ORG) -- raw rtuples with context:")
    print("=" * 45)

    show_context = bool(trace)
    for doc in conll2002.chunked_sents('ned.train'):
        for rel in extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN):
            print(show_raw_rtuple(rel, lcon=show_context, rcon=show_context))
387 388 ############################################# 389 ## Spanish CONLL2002: (PER, ORG) 390 ############################################# 391
def conllesp():
    """
    Print the first 10 C{de(ORG, LOC)} clauses found in the Spanish tagged
    training corpus from CoNLL 2002.
    """
    from nltk.corpus import conll2002

    de = """
    .*
    (
    de/SP|
    del/SP
    )
    """
    DE = re.compile(de, re.VERBOSE)

    print("")
    print("Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:")
    print("=" * 45)

    rels = []
    for doc in conll2002.chunked_sents('esp.train'):
        rels.extend(extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern=DE))
    for r in rels[:10]:
        print(show_clause(r, relsym='DE'))
    print("")


# Run every demo in turn when the module is executed as a script.
if __name__ == '__main__':
    in_demo(trace=0)
    roles_demo(trace=0)
    conllned()
    conllesp()
    ieer_headlines()