# Code Coverage for nltk.sem.relextract
# (coverage-report residue: "Untested Functions" | "Partially Tested Functions")
"""
Code for extracting relational triples from the ieer and conll2002 corpora.
Relations are stored internally as dictionaries ('reldicts').
The two serialization outputs are I{rtuple} and I{clause}.
- An I{rtuple} is a tuple of the form C{(subj, filler, obj)},
where C{subj} and C{obj} are pairs of Named Entity mentions, and C{filler} is the string of words
occurring between C{subj} and C{obj} (with no intervening NEs). Strings are printed via C{repr()} to
circumvent locale variations in rendering utf-8 encoded strings.
- A I{clause} is an atom of the form C{relsym(subjsym, objsym)},
where the relation, subject and object have been canonicalized to single strings.
"""
from nltk import defaultdict
from string import join
import re
import htmlentitydefs
from itertools import ifilter
# Named Entity classes recognized in each supported corpus.
NE_CLASSES = {
    'ieer': ['LOCATION', 'ORGANIZATION', 'PERSON', 'DURATION',
             'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE'],
    'conll2002': ['LOC', 'PER', 'ORG']
    }
# Mappings between abbreviated and long NE class names (used by
# _expand() and class_abbrev() below).
short2long = dict(LOC = 'LOCATION', ORG = 'ORGANIZATION', PER = 'PERSON')
long2short = dict(LOCATION ='LOC', ORGANIZATION = 'ORG', PERSON = 'PER')
def _expand(type):
    """
    Expand an abbreviated NE class name (e.g. 'LOC' -> 'LOCATION').
    Names with no known expansion are returned unchanged.
    @type type: C{str}
    @rtype: C{str}
    """
    return short2long.get(type, type)
def class_abbrev(type):
    """
    Abbreviate an NE class name (e.g. 'LOCATION' -> 'LOC').
    Names with no known abbreviation are returned unchanged.
    @type type: C{str}
    @rtype: C{str}
    """
    return long2short.get(type, type)
def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tagged tuples into tag strings or just words.
    @param lst: a list of strings, or of (word, tag) tuples
    @type lst: C{list}
    @param sep: separator placed between joined items
    @param untag: if C{True}, omit the tag from tagged input strings.
    @rtype: C{str}
    """
    try:
        # Plain strings join directly.
        return join(lst, sep=sep)
    except TypeError:
        # A TypeError signals that the items are (word, tag) tuples.
        if untag:
            words = [tup[0] for tup in lst]
            return join(words, sep=sep)
        from nltk.tag import tuple2str
        tagged = [tuple2str(tup) for tup in lst]
        return join(tagged, sep=sep)
def descape_entity(m, defs=htmlentitydefs.entitydefs):
    """
    Translate one entity to its ISO Latin value.
    Inspired by example from effbot.org

    @param m: a regex match object whose group 1 is an entity name (e.g. 'amp')
    @return: the entity's character if it is a known entity, otherwise the
    original matched text unchanged
    """
    # Unknown entities pass through untouched.
    return defs.get(m.group(1), m.group(0))
def list2sym(lst):
    """
    Convert a list of (tagged) strings into a canonical symbol: tags are
    stripped, words joined with '_', the result lowercased, HTML entities
    descaped and full stops removed.
    @type lst: C{list}
    @return: a Unicode string without whitespace
    @rtype: C{unicode}
    """
    # Raw string for the regex: '\w' should not depend on ordinary-string
    # escape handling (non-raw '\w' is a deprecated escape in later Pythons).
    ENT = re.compile(r"&(\w+?);")
    sym = _join(lst, '_', untag=True)
    sym = sym.lower()
    sym = ENT.sub(descape_entity, sym)
    sym = sym.replace('.', '')
    return sym
def mk_pairs(tree):
    """
    Group a chunk structure into a list of pairs of the form (list(str), L{Tree}).

    In order to facilitate the construction of (L{Tree}, string, L{Tree}) triples, this
    identifies pairs whose first member is a list (possibly empty) of terminal
    strings, and whose second member is a L{Tree} of the form (NE_label, terminals).
    @param tree: a chunk tree
    @return: a list of pairs (list(C{str}), L{Tree})
    @rtype: C{list} of C{tuple}
    """
    from nltk import Tree
    pairs = []
    words = []  # terminals accumulated since the last NE subtree
    for dtr in tree:
        if isinstance(dtr, Tree):
            # An NE subtree closes off the current (words, NE) pair.
            pairs.append([words, dtr])
            words = []
        else:
            words.append(dtr)
    # NOTE: trailing terminals after the final NE subtree are dropped,
    # matching the original behaviour.
    return pairs
def mk_reldicts(pairs, window=5, trace=0):
    """
    Converts the pairs generated by L{mk_pairs} into a 'reldict': a dictionary which
    stores information about the subject and object NEs plus the filler between them.
    Additionally, a left and right context of length =< window are captured (within
    a given input sentence).
    @param pairs: a pair of list(str) and L{Tree}, as generated by L{mk_pairs}
    @param window: a threshold for the number of items to include in the left and right context
    @type window: C{int}
    @param trace: if nonzero, print a trace line for each relation found
    @type trace: C{int}
    @return: 'relation' dictionaries whose keys are 'lcon', 'subjclass', 'subjtext',
    'subjsym', 'filler', 'objclass', 'objtext', 'objsym' and 'rcon'
    @rtype: C{list} of C{defaultdict}
    """
    result = []
    # Each reldict needs a subject pair, an object pair, and a following
    # pair supplying right context -- hence at least three pairs must remain.
    while len(pairs) > 2:
        reldict = defaultdict(str)
        # Left context: at most `window` tokens preceding the subject NE.
        reldict['lcon'] = _join(pairs[0][0][-window:])
        # NE label of the subject chunk (pre-NLTK-3 Tree API: `.node`).
        reldict['subjclass'] = pairs[0][1].node
        reldict['subjtext'] = _join(pairs[0][1].leaves())
        reldict['subjsym'] = list2sym(pairs[0][1].leaves())
        # Filler: the words between the subject and object NEs.
        reldict['filler'] = _join(pairs[1][0])
        reldict['objclass'] = pairs[1][1].node
        reldict['objtext'] = _join(pairs[1][1].leaves())
        reldict['objsym'] = list2sym(pairs[1][1].leaves())
        # Right context: at most `window` tokens following the object NE.
        reldict['rcon'] = _join(pairs[2][0][:window])
        if trace:
            # NOTE(review): the opening paren in this trace string looks
            # unbalanced -- preserved as-is since it is runtime output.
            print "(rel(%s, %s)" % (reldict['subjclass'], reldict['objclass'])
        result.append(reldict)
        # Slide forward by one pair so every adjacent NE pair is considered.
        pairs = pairs[1:]
    return result
def extract_rels(subjclass, objclass, doc, corpus='ieer', pattern=None, window=10):
    """
    Filter the output of L{mk_reldicts} according to specified NE classes and a filler pattern.

    The parameters C{subjclass} and C{objclass} can be used to restrict the
    Named Entities to particular types (any of 'LOCATION', 'ORGANIZATION',
    'PERSON', 'DURATION', 'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE').
    @param subjclass: the class of the subject Named Entity.
    @type subjclass: C{string}
    @param objclass: the class of the object Named Entity.
    @type objclass: C{string}
    @param doc: input document
    @type doc: C{ieer} document or a list of chunk trees
    @param corpus: name of the corpus to take as input; possible values are
    'ieer' and 'conll2002'
    @type corpus: C{string}
    @param pattern: a regular expression for filtering the fillers of
    retrieved triples; if C{None}, no filler-pattern filtering is applied.
    @type pattern: C{SRE_Pattern} or C{None}
    @param window: filters out fillers which exceed this threshold
    @type window: C{int}
    @return: see L{mk_reldicts}
    @rtype: C{list} of C{defaultdict}
    @raise ValueError: if an NE class or the corpus name is not recognized
    """
    # Accept abbreviated class names (e.g. 'ORG') by expanding to long form.
    if subjclass and subjclass not in NE_CLASSES[corpus]:
        if _expand(subjclass) in NE_CLASSES[corpus]:
            subjclass = _expand(subjclass)
        else:
            # Call form of raise works in both Python 2 and 3
            # (was the Py2-only 'raise E, msg' statement).
            raise ValueError("your value for the subject type has not been recognized: %s" % subjclass)
    if objclass and objclass not in NE_CLASSES[corpus]:
        if _expand(objclass) in NE_CLASSES[corpus]:
            objclass = _expand(objclass)
        else:
            raise ValueError("your value for the object type has not been recognized: %s" % objclass)
    if corpus == 'ieer':
        # ieer documents carry separate text and headline chunk trees.
        pairs = mk_pairs(doc.text) + mk_pairs(doc.headline)
    elif corpus == 'conll2002':
        pairs = mk_pairs(doc)
    else:
        raise ValueError("corpus type not recognized")
    reldicts = mk_reldicts(pairs)
    # BUGFIX: the documented default pattern=None used to crash on
    # pattern.match(); a None pattern now means "accept any filler".
    relfilter = lambda x: (x['subjclass'] == subjclass and
                           len(x['filler'].split()) <= window and
                           (pattern is None or pattern.match(x['filler'])) and
                           x['objclass'] == objclass)
    return filter(relfilter, reldicts)
def show_raw_rtuple(reldict, lcon=False, rcon=False):
    """
    Pretty print the reldict as an rtuple of the form
    C{[subjclass: subjtext] filler [objclass: objtext]}, optionally flanked
    by the left and right context.
    @param reldict: a relation dictionary
    @type reldict: C{defaultdict}
    @param lcon: if C{True}, prepend the left context
    @param rcon: if C{True}, append the right context
    @rtype: C{str}
    """
    fmt = '[%s: %r] %r [%s: %r]'
    values = [class_abbrev(reldict['subjclass']), reldict['subjtext'],
              reldict['filler'],
              class_abbrev(reldict['objclass']), reldict['objtext']]
    if lcon:
        fmt = '...%r)' + fmt
        values.insert(0, reldict['lcon'])
    if rcon:
        fmt = fmt + '(%r...'
        values.append(reldict['rcon'])
    return fmt % tuple(values)
def show_clause(reldict, relsym):
    """
    Render the relation in clausal form C{relsym(subjsym, objsym)}.
    @param reldict: a relation dictionary
    @type reldict: C{defaultdict}
    @param relsym: a label for the relation
    @type relsym: C{str}
    @rtype: C{str}
    """
    return "%s(%r, %r)" % (relsym, reldict['subjsym'], reldict['objsym'])
def in_demo(trace=0):
    """
    Demo: find in(ORG, LOC) relations in the IEER corpus and print them
    as clauses.
    @param trace: if nonzero, print each document number as it is processed
    @type trace: C{int}
    """
    from nltk.corpus import ieer
    # Match fillers containing 'in', rejecting gerund contexts such as
    # 'producing' via the negative lookahead.
    IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')
    print
    print "IEER: in(ORG, LOC) -- just the clauses:"
    print "=" * 45
    for file in ieer.files():
        for doc in ieer.parsed_docs(file):
            if trace:
                print doc.docno
                print "=" * 15
            for rel in extract_rels('ORG', 'LOC', doc, pattern=IN):
                print show_clause(rel, relsym='IN')
def roles_demo(trace=0):
    """
    Demo: find has_role(PER, ORG) relations in the IEER corpus and print
    them as raw rtuples.
    @param trace: if nonzero, print document numbers and include the
    left/right context in the output
    @type trace: C{int}
    """
    from nltk.corpus import ieer
    # Verbose-mode regex enumerating job titles that signal a PER-ORG role
    # relation; the final alternative matches "X, of (the) Y".
    roles = """
    (.*(                   # assorted roles
    analyst|
    chair(wo)?man|
    commissioner|
    counsel|
    director|
    economist|
    editor|
    executive|
    foreman|
    governor|
    head|
    lawyer|
    leader|
    librarian).*)|
    manager|
    partner|
    president|
    producer|
    professor|
    researcher|
    spokes(wo)?man|
    writer|
    ,\sof\sthe?\s*         # "X, of (the) Y"
    """
    ROLES = re.compile(roles, re.VERBOSE)
    print
    print "IEER: has_role(PER, ORG) -- raw rtuples:"
    print "=" * 45
    for file in ieer.files():
        for doc in ieer.parsed_docs(file):
            lcon = rcon = False
            if trace:
                print doc.docno
                print "=" * 15
                # With tracing on, also show context around each rtuple.
                lcon = rcon = True
            for rel in extract_rels('PER', 'ORG', doc, pattern=ROLES):
                print show_raw_rtuple(rel, lcon=lcon, rcon=rcon)
def ieer_headlines():
from nltk.corpus import ieer
from nltk import Tree
print "IEER: Frist 20 Headlines"
print "=" * 45
trees = [doc.headline for file in ieer.files() for doc in ieer.parsed_docs(file)]
for tree in trees[:20]:
print
print "%s:\n%s" % (doc.docno, tree)
def conllned(trace=1):
    """
    Find the copula+'van' relation ('of') in the Dutch tagged training corpus
    from CoNLL 2002.
    @param trace: if nonzero, include left/right context in the output
    @type trace: C{int}
    """
    from nltk.corpus import conll2002
    # Verbose-mode regex over word/tag strings: a copular verb form followed
    # (eventually) by the preposition 'van'.
    vnv = """
    (
    is/V|
    was/V|
    werd/V|
    wordt/V
    )
    .*
    van/Prep
    """
    VAN = re.compile(vnv, re.VERBOSE)
    print
    print "Dutch CoNLL2002: van(PER, ORG) -- raw rtuples with context:"
    print "=" * 45
    for doc in conll2002.chunked_sents('ned.train'):
        lcon = rcon = False
        if trace:
            lcon = rcon = True
        for rel in extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN):
            print show_raw_rtuple(rel, lcon=lcon, rcon=rcon)
def conllesp():
    """
    Demo: find de(ORG, LOC) relations in the Spanish CoNLL 2002 training
    corpus and print the first ten as clauses.
    """
    from nltk.corpus import conll2002
    # Verbose-mode regex over word/tag strings: filler ending in the
    # preposition 'de' or 'del'.
    de = """
    .*
    (
    de/SP|
    del/SP
    )
    """
    DE = re.compile(de, re.VERBOSE)
    print
    print "Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:"
    print "=" * 45
    rels = [rel for doc in conll2002.chunked_sents('esp.train')
            for rel in extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern = DE)]
    for r in rels[:10]: print show_clause(r, relsym='DE')
    print
if __name__ == '__main__':
    # Run every demo in turn; each writes its results to stdout.
    # BUGFIX: removed a stray bare 'ieer' expression that raised NameError
    # at runtime ('ieer' is only imported inside the demo functions).
    in_demo(trace=0)
    roles_demo(trace=0)
    conllned()
    conllesp()
    ieer_headlines()