1
2
3
4
5
6
7
"""
Code for extracting relational triples from the ieer and conll2002 corpora.

Relations are stored internally as dictionaries ('reldicts').

The two serialization outputs are I{rtuple} and I{clause}.
   - An I{rtuple} is a tuple of the form C{(subj, filler, obj)},
     where C{subj} and C{obj} are pairs of Named Entity mentions, and C{filler} is the string of words
     occurring between C{subj} and C{obj} (with no intervening NEs). Strings are printed via C{repr()} to
     circumvent locale variations in rendering utf-8 encoded strings.
   - A I{clause} is an atom of the form C{relsym(subjsym, objsym)},
     where the relation, subject and object have been canonicalized to single strings.

"""
22
23
24
25 from nltk import defaultdict
26
27 from string import join
28 import re
29 import htmlentitydefs
30 from itertools import ifilter
31
32
# Named Entity classes that are recognized for each supported corpus.
NE_CLASSES = {
    'ieer': ['LOCATION', 'ORGANIZATION', 'PERSON', 'DURATION',
             'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE'],
    'conll2002': ['LOC', 'PER', 'ORG']
    }

# Short and long forms of the NE class names that have both spellings.
short2long = dict(LOC='LOCATION', ORG='ORGANIZATION', PER='PERSON')
long2short = dict(LOCATION='LOC', ORGANIZATION='ORG', PERSON='PER')


def _expand(type):
    """
    Expand an NE class name.

    @param type: an NE class name, possibly abbreviated (e.g. C{'LOC'})
    @type type: C{str}
    @return: the long form listed in C{short2long}, or C{type} itself if it
        has no entry there
    @rtype: C{str}
    """
    # The parameter shadows the builtin 'type'; the name is the one documented
    # for this function, so it is kept for callers that pass it by keyword.
    return short2long.get(type, type)


def class_abbrev(type):
    """
    Abbreviate an NE class name.

    @param type: an NE class name, possibly in long form (e.g. C{'LOCATION'})
    @type type: C{str}
    @return: the short form listed in C{long2short}, or C{type} itself if it
        has no entry there
    @rtype: C{str}
    """
    return long2short.get(type, type)
65
66
def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tagged tuples into tag strings or just words.

    @param lst: the items to join; either plain strings or tagged tuples
    @type lst: C{list}
    @param sep: the separator placed between the joined items
    @type sep: C{str}
    @param untag: if C{True}, omit the tag from tagged input strings.
    @type untag: C{bool}
    @rtype: C{str}
    """
    try:
        # str.join replaces string.join(), which exists only in Python 2.
        return sep.join(lst)
    except TypeError:
        # The items are not plain strings, so treat them as tagged tuples.
        if untag:
            return sep.join([tup[0] for tup in lst])
        from nltk.tag import tuple2str
        return sep.join([tuple2str(tup) for tup in lst])
81
try:
    # Python 2 location of the entity table.
    from htmlentitydefs import entitydefs as _ENTITYDEFS
except ImportError:
    # Python 3 moved the same table to html.entities.
    from html.entities import entitydefs as _ENTITYDEFS


def descape_entity(m, defs=_ENTITYDEFS):
    """
    Translate one entity to its ISO Latin value.
    Inspired by example from effbot.org

    Intended for use as the replacement callback of a pattern whose first
    group captures the entity name, e.g. C{re.compile(r"&(\\w+?);")}.

    @param m: a match whose group 1 is the entity name (without C{&} and C{;})
    @param defs: mapping from entity names to their replacement strings
    @type defs: C{dict}
    @return: the replacement for the entity, or the whole matched text
        unchanged if the entity name is not in C{defs}
    @rtype: C{str}
    """
    # NOTE(review): the 'def' line was missing from the reviewed copy; the
    # default for 'defs' is assumed to be the standard entity table -- confirm.
    try:
        return defs[m.group(1)]
    except KeyError:
        # Unknown entity: leave the text as it is.
        return m.group(0)
100
def list2sym(lst):
    """
    Convert a list of strings into a canonical symbol.

    The items are joined with underscores (tags, if any, are dropped), the
    result is lower-cased, named character entities such as C{&amp;} are
    replaced by their values, and full stops are removed.

    @param lst: the words (or tagged words) making up the symbol
    @type lst: C{list}
    @return: the canonicalized symbol
    @rtype: C{str}
    """
    sym = _join(lst, '_', untag=True).lower()
    # Raw string: '\w' in a plain literal is an invalid escape sequence in
    # current Python versions. re.sub caches the compiled pattern.
    sym = re.sub(r"&(\w+?);", descape_entity, sym)
    return sym.replace('.', '')
114
def mk_pairs(tree):
    """
    Group a chunk structure into a list of pairs of the form (list(str), L{Tree})

    In order to facilitate the construction of (L{Tree}, string, L{Tree}) triples, this
    identifies pairs whose first member is a list (possibly empty) of terminal
    strings, and whose second member is a L{Tree} of the form (NE_label, terminals).

    Each pair holds the terminals that precede a subtree together with that
    subtree. Terminals that follow the last subtree are not returned.

    @param tree: a chunk tree
    @return: a list of two-item lists C{[list of terminals, Tree]}
    @rtype: C{list} of C{list}
    """
    from nltk import Tree

    pairs = []
    pending = []  # terminals seen since the previous subtree
    for dtr in tree:
        if isinstance(dtr, Tree):
            # A subtree closes the current pair.
            pairs.append([pending, dtr])
            pending = []
        else:
            pending.append(dtr)
    return pairs
142
143
def mk_reldicts(pairs, window=5, trace=0):
    """
    Converts the pairs generated by L{mk_pairs} into a 'reldict': a dictionary which
    stores information about the subject and object NEs plus the filler between them.
    Additionally, a left and right context of length <= window are captured (within
    a given input sentence).

    @param pairs: a list of pairs of list(str) and L{Tree}, as generated by L{mk_pairs}
    @param window: a threshold for the number of items to include in the left and right context
    @type window: C{int}
    @param trace: if true, print the subject and object class of every reldict built
    @return: 'relation' dictionaries whose keys are 'lcon', 'subjclass', 'subjtext', 'subjsym',
        'filler', 'objclass', 'objtext', 'objsym' and 'rcon'
    @rtype: C{list} of C{defaultdict}
    """
    # NOTE(review): the 'def' line was missing from the reviewed copy; the
    # defaults for 'window' and 'trace' are assumed -- confirm.
    result = []
    # Every reldict needs a subject pair, an object pair and one further pair
    # that supplies the right context, so iteration stops two short of the end.
    # NOTE(review): as a consequence no reldict is produced for the last two
    # NEs of the input; behaviour kept as found.
    for i in range(len(pairs) - 2):
        subj_pair, obj_pair, next_pair = pairs[i], pairs[i + 1], pairs[i + 2]
        subj_tree, obj_tree = subj_pair[1], obj_pair[1]
        subj_leaves = subj_tree.leaves()
        obj_leaves = obj_tree.leaves()

        reldict = defaultdict(str)
        reldict['lcon'] = _join(subj_pair[0][-window:])
        reldict['subjclass'] = subj_tree.node
        reldict['subjtext'] = _join(subj_leaves)
        reldict['subjsym'] = list2sym(subj_leaves)
        reldict['filler'] = _join(obj_pair[0])
        reldict['objclass'] = obj_tree.node
        reldict['objtext'] = _join(obj_leaves)
        reldict['objsym'] = list2sym(obj_leaves)
        reldict['rcon'] = _join(next_pair[0][:window])
        if trace:
            print("(rel(%s, %s)" % (reldict['subjclass'], reldict['objclass']))
        result.append(reldict)
    return result
174
def _check_ne_class(neclass, corpus, role):
    """
    Return C{neclass} in the spelling used by C{corpus}.

    An empty or C{None} value is returned unchanged; an abbreviated name is
    expanded if the expanded form belongs to the corpus.

    @param role: C{'subject'} or C{'object'}, used in the error message
    @raise ValueError: if the class is not known for the corpus
    """
    known = NE_CLASSES[corpus]
    if not neclass or neclass in known:
        return neclass
    expanded = _expand(neclass)
    if expanded in known:
        return expanded
    raise ValueError("your value for the %s type has not been recognized: %s"
                     % (role, neclass))


def extract_rels(subjclass, objclass, doc, corpus='ieer', pattern=None, window=10):
    """
    Filter the output of L{mk_reldicts} according to specified NE classes and a filler pattern.

    The parameters C{subjclass} and C{objclass} can be used to restrict the
    Named Entities to particular types (any of 'LOCATION', 'ORGANIZATION',
    'PERSON', 'DURATION', 'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE').

    @param subjclass: the class of the subject Named Entity.
    @type subjclass: C{string}
    @param objclass: the class of the object Named Entity.
    @type objclass: C{string}
    @param doc: input document
    @type doc: C{ieer} document or a list of chunk trees
    @param corpus: name of the corpus to take as input; possible values are
        'ieer' and 'conll2002'
    @type corpus: C{string}
    @param pattern: a regular expression for filtering the fillers of
        retrieved triples; it is applied with C{match}, i.e. anchored at the
        start of the filler. If C{None}, fillers are not filtered by pattern.
    @type pattern: C{SRE_Pattern}
    @param window: filters out fillers which exceed this threshold
        (counted in whitespace-separated tokens)
    @type window: C{int}
    @return: see L{mk_reldicts}
    @rtype: C{list} of C{defaultdict}
    @raise ValueError: if C{corpus}, C{subjclass} or C{objclass} is not recognized
    """
    # NOTE(review): the 'def' line was missing from the reviewed copy; the
    # defaults for 'pattern' and 'window' are assumed -- confirm.

    # Validate the corpus first: looking an unknown corpus up in NE_CLASSES
    # would otherwise fail with KeyError before the intended ValueError.
    if corpus not in ('ieer', 'conll2002'):
        raise ValueError("corpus type not recognized")

    subjclass = _check_ne_class(subjclass, corpus, 'subject')
    objclass = _check_ne_class(objclass, corpus, 'object')

    if corpus == 'ieer':
        pairs = mk_pairs(doc.text) + mk_pairs(doc.headline)
    else:
        pairs = mk_pairs(doc)

    # A list (not a lazy filter object) so callers can index and slice it.
    return [rel for rel in mk_reldicts(pairs)
            if rel['subjclass'] == subjclass
            and len(rel['filler'].split()) <= window
            and (pattern is None or pattern.match(rel['filler']))
            and rel['objclass'] == objclass]
227
228
def show_raw_rtuple(reldict, lcon=False, rcon=False):
    """
    Pretty print the reldict as an rtuple.

    @param reldict: a relation dictionary
    @type reldict: C{defaultdict}
    @param lcon: if true, prefix the output with the left context
    @param rcon: if true, append the right context to the output
    @return: the formatted rtuple; text fields are rendered with C{repr()}
    @rtype: C{str}
    """
    # NOTE(review): the 'def' line was missing from the reviewed copy; the
    # defaults for 'lcon' and 'rcon' are assumed -- confirm.
    items = [class_abbrev(reldict['subjclass']), reldict['subjtext'],
             reldict['filler'],
             class_abbrev(reldict['objclass']), reldict['objtext']]
    template = '[%s: %r] %r [%s: %r]'
    if lcon:
        items.insert(0, reldict['lcon'])
        template = '...%r)' + template
    if rcon:
        items.append(reldict['rcon'])
        template = template + '(%r...'
    return template % tuple(items)
245
def show_clause(reldict, relsym):
    """
    Print the relation in clausal form.

    @param reldict: a relation dictionary; its 'subjsym' and 'objsym' entries are used
    @type reldict: C{defaultdict}
    @param relsym: a label for the relation
    @type relsym: C{str}
    @return: a string of the form C{relsym(subjsym, objsym)}, with both
        symbols rendered via C{repr()}
    @rtype: C{str}
    """
    return "%s(%r, %r)" % (relsym, reldict['subjsym'], reldict['objsym'])
256
257
258
259
260
261
262
263
264
266
def in_demo(trace=0):
    """
    Print, in clausal form, the C{in(ORG, LOC)} relations found in the ieer corpus.

    @param trace: if true, print each document's C{docno} and a separator
        line before that document's clauses
    """
    # NOTE(review): the 'def' line was missing from the reviewed copy; the
    # default for 'trace' is assumed -- confirm.
    from nltk.corpus import ieer

    # A filler containing the word 'in'; the lookahead rejects fillers where
    # 'in' is followed by a word ending in 'ing'.
    IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')

    print("")
    print("IEER: in(ORG, LOC) -- just the clauses:")
    print("=" * 45)

    for fileid in ieer.files():
        for doc in ieer.parsed_docs(fileid):
            if trace:
                print(doc.docno)
                print("=" * 15)
            for rel in extract_rels('ORG', 'LOC', doc, pattern=IN):
                print(show_clause(rel, relsym='IN'))
282
283
284
285
286
287
288
def roles_demo(trace=0):
    """
    Print, as raw rtuples, the C{has_role(PER, ORG)} relations found in the ieer corpus.

    @param trace: if true, print each document's C{docno} and a separator
        line, and include the left and right context in every rtuple
    """
    # NOTE(review): the 'def' line was missing from the reviewed copy; the
    # default for 'trace' is assumed -- confirm.
    from nltk.corpus import ieer

    # Raw string so that '\s' reaches the regex engine untouched.
    # NOTE(review): the alternatives from 'manager' onwards sit outside the
    # '.*( ... ).*' group; confirm that this grouping is intended.
    roles = r"""
    (.*(                   # assorted roles
    analyst|
    chair(wo)?man|
    commissioner|
    counsel|
    director|
    economist|
    editor|
    executive|
    foreman|
    governor|
    head|
    lawyer|
    leader|
    librarian).*)|
    manager|
    partner|
    president|
    producer|
    professor|
    researcher|
    spokes(wo)?man|
    writer|
    ,\sof\sthe?\s*      # "X, of (the) Y"
    """
    ROLES = re.compile(roles, re.VERBOSE)

    print("")
    print("IEER: has_role(PER, ORG) -- raw rtuples:")
    print("=" * 45)

    for fileid in ieer.files():
        for doc in ieer.parsed_docs(fileid):
            # Context is shown only when tracing.
            lcon = rcon = False
            if trace:
                print(doc.docno)
                print("=" * 15)
                lcon = rcon = True
            for rel in extract_rels('PER', 'ORG', doc, pattern=ROLES):
                print(show_raw_rtuple(rel, lcon=lcon, rcon=rcon))
332
333
334
335
336
337
338
351
352
353
354
355
356
357
def conllned(trace=1):
    """
    Find the copula+'van' relation ('of') in the Dutch tagged training corpus
    from CoNLL 2002.

    @param trace: if true, include the left and right context in every
        printed rtuple
    """
    # NOTE(review): the 'def' line was missing from the reviewed copy; the
    # default for 'trace' is assumed -- confirm.
    from nltk.corpus import conll2002

    # A copula verb, followed anywhere later in the filler by 'van'.
    vnv = """
    (
    is/V|
    was/V|
    werd/V|
    wordt/V
    )
    .*
    van/Prep
    """
    VAN = re.compile(vnv, re.VERBOSE)

    print("")
    print("Dutch CoNLL2002: van(PER, ORG) -- raw rtuples with context:")
    print("=" * 45)
    for doc in conll2002.chunked_sents('ned.train'):
        lcon = rcon = False
        if trace:
            lcon = rcon = True
        for rel in extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN):
            print(show_raw_rtuple(rel, lcon=lcon, rcon=rcon))
387
388
389
390
391
def conllesp():
    """
    Print, in clausal form, the first 10 C{de(ORG, LOC)} relations found in
    the Spanish tagged training corpus from CoNLL 2002.
    """
    from nltk.corpus import conll2002

    # A filler that contains the preposition 'de' or 'del'.
    de = """
    .*
    (
    de/SP|
    del/SP
    )
    """
    DE = re.compile(de, re.VERBOSE)

    print("")
    print("Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:")
    print("=" * 45)
    rels = [rel for doc in conll2002.chunked_sents('esp.train')
            for rel in extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern=DE)]
    for r in rels[:10]:
        print(show_clause(r, relsym='DE'))
    print("")
411
412
413
if __name__ == '__main__':
    # Run every demo in turn. The bare 'ieer' expression that stood here was
    # dropped: the name is only imported inside the demo functions, so
    # evaluating it at module level raises NameError.
    in_demo(trace=0)
    roles_demo(trace=0)
    conllned()
    conllesp()
    # NOTE(review): ieer_headlines is not defined in the part of the file
    # that was reviewed; confirm it exists before relying on this call.
    ieer_headlines()
421