1
2
3
4
5
6
7
8
9 """
10 Read CoNLL-style chunk files.
11 """
12
13 from nltk.corpus.reader.util import *
14 from nltk.corpus.reader.api import *
15 from nltk import chunk, tree, Tree
16 import os, codecs
17 from nltk.internals import deprecated
18 from nltk import Tree, LazyMap, LazyConcatenation
19 import textwrap
20
22 """
23 A corpus reader for CoNLL-style files. These files consist of a
24 series of sentences, separated by blank lines. Each sentence is
25 encoded using a table (or I{grid}) of values, where each line
26 corresponds to a single word, and each column corresponds to an
27 annotation type. The set of columns used by CoNLL-style files can
28 vary from corpus to corpus; the C{ConllCorpusReader} constructor
29 therefore takes an argument, C{columntypes}, which is used to
30 specify the columns that are used by a given corpus.
31
32 @todo: Add support for reading from corpora where different
33 parallel files contain different columns.
34 @todo: Possibly add caching of the grid corpus view? This would
35 allow the same grid view to be used by different data access
36 methods (eg words() and parsed_sents() could both share the
37 same grid corpus view object).
38 @todo: Better support for -DOCSTART-. Currently, we just ignore
39 it, but it could be used to define methods that retrieve a
40 document at a time (eg parsed_documents()).
41 """
42
43
44
45
46
#/////////////////////////////////////////////////////////////////
# Column types
#/////////////////////////////////////////////////////////////////

# Identifiers for the annotation types that a column of a CoNLL-style
# file may contain.  These are the values that may appear in the
# C{columntypes} argument to the constructor.
WORDS = 'words'    #: column contains the words themselves
POS = 'pos'        #: column contains part-of-speech tags
TREE = 'tree'      #: column contains parse-tree fragments
CHUNK = 'chunk'    #: column contains IOB chunk tags
NE = 'ne'          #: column contains IOB named-entity tags
SRL = 'srl'        #: column contains semantic role labels
IGNORE = 'ignore'  #: column should be skipped

#: The tuple of all valid column-type identifiers.
COLUMN_TYPES = (WORDS, POS, TREE, CHUNK, NE, SRL, IGNORE)
57
58
59
60
61
def __init__(self, root, files, columntypes,
             chunk_types=None, top_node='S', pos_in_tree=False,
             srl_includes_roleset=True, encoding=None,
             tree_class=Tree):
    """
    Construct a new CoNLL-style corpus reader.

    @param root: The root directory for this corpus.
    @param files: A list or regexp specifying the files in this corpus.
    @param columntypes: The annotation type of each column; every
        entry must be one of C{COLUMN_TYPES}.
    @param chunk_types: If non-None, the chunk types to keep when
        building chunk structures (converted to a tuple).
    @param top_node: The node label used for the top of parse trees.
    @param pos_in_tree: Whether POS tags should be left inside parse
        trees rather than paired with words.
    @param srl_includes_roleset: Whether the SRL columns include a
        roleset column before the predicate column.
    @param encoding: The file encoding (passed to C{CorpusReader}).
    @param tree_class: The class used to build parse trees.
    @raise ValueError: If C{columntypes} contains an unrecognized
        column type.
    """
    for columntype in columntypes:
        if columntype not in self.COLUMN_TYPES:
            # Bug fix: the original referenced the undefined name
            # 'columntyp' here, raising NameError instead of the
            # intended ValueError.
            raise ValueError('Bad column type %r' % columntype)
    if chunk_types is not None: chunk_types = tuple(chunk_types)
    self._chunk_types = chunk_types
    # Map each column type to the index of the column that holds it.
    self._colmap = dict((c, i) for (i, c) in enumerate(columntypes))
    self._pos_in_tree = pos_in_tree
    self._top_node = top_node
    self._srl_includes_roleset = srl_includes_roleset
    self._tree_class = tree_class
    CorpusReader.__init__(self, root, files, encoding)
77
78
79
80
81
82 - def raw(self, files=None):
86
87 - def words(self, files=None):
90
91 - def sents(self, files=None):
94
99
103
109 return LazyConcatenation(LazyMap(get_chunked_words,
110 self._grids(files)))
111
117 return LazyMap(get_chunked_words, self._grids(files))
118
124 return LazyMap(get_parsed_sent, self._grids(files))
125
129
130 - def srl_instances(self, files=None, pos_in_tree=None, flatten=True):
135 result = LazyMap(get_srl_instances, self._grids(files))
136 if flatten: result = LazyConcatenation(result)
137 return result
138
149
151 """
152 @return: a list of lists of word/tag/IOB tuples
153 @rtype: C{list} of C{list}
154 @param files: the list of files that make up this corpus
155 @type files: C{None} or C{str} or C{list}
156 """
157 self._require(self.WORDS, self.POS, self.CHUNK)
158 return LazyMap(self._get_iob_words, self._grids(files))
159
160
161
162
163
164 - def _grids(self, files=None):
171
173 grids = []
174 for block in read_blankline_block(stream):
175 block = block.strip()
176 if not block: continue
177
178 grid = [line.split() for line in block.split('\n')]
179
180
181
182 if grid[0][self._colmap.get('words', 0)] == '-DOCSTART-':
183 del grid[0]
184
185
186 for row in grid:
187 if len(row) != len(grid[0]):
188 raise ValueError('Inconsistent number of columns:\n%s'
189 % block)
190 grids.append(grid)
191 return grids
192
193
194
195
196
197
198
201
205
210
212
213 words = self._get_column(grid, self._colmap['words'])
214 pos_tags = self._get_column(grid, self._colmap['pos'])
215 chunk_tags = self._get_column(grid, self._colmap['chunk'])
216
217 stack = [Tree(self._top_node, [])]
218
219 for (word, pos_tag, chunk_tag) in zip(words, pos_tags, chunk_tags):
220 if chunk_tag == 'O':
221 state, chunk_type = 'O', ''
222 else:
223 (state, chunk_type) = chunk_tag.split('-')
224
225 if chunk_types is not None and chunk_type not in chunk_types:
226 state = 'O'
227
228 if state == 'I' and chunk_type != stack[-1].node:
229 state = 'B'
230
231 if state in 'BO' and len(stack) == 2:
232 stack.pop()
233
234 if state == 'B':
235 new_chunk = Tree(chunk_type, [])
236 stack[-1].append(new_chunk)
237 stack.append(new_chunk)
238
239 stack[-1].append((word, pos_tag))
240
241 return stack[0]
242
244 words = self._get_column(grid, self._colmap['words'])
245 pos_tags = self._get_column(grid, self._colmap['pos'])
246 parse_tags = self._get_column(grid, self._colmap['tree'])
247
248 treestr = ''
249 for (word, pos_tag, parse_tag) in zip(words, pos_tags, parse_tags):
250 if word == '(': word = '-LRB-'
251 if word == ')': word = '-RRB-'
252 if pos_tag == '(': pos_tag = '-LRB-'
253 if pos_tag == ')': pos_tag = '-RRB-'
254 (left, right) = parse_tag.split('*')
255 right = right.count(')')*')'
256 treestr += '%s (%s %s) %s' % (left, pos_tag, word, right)
257 try:
258 tree = self._tree_class.parse(treestr)
259 except (ValueError, IndexError):
260 tree = self._tree_class.parse('(%s %s)' %
261 (self._top_node, treestr))
262
263 if not pos_in_tree:
264 for subtree in tree.subtrees():
265 for i, child in enumerate(subtree):
266 if (isinstance(child, nltk.Tree) and len(child)==1 and
267 isinstance(child[0], basestring)):
268 subtree[i] = (child[0], child.node)
269
270 return tree
271
273 """
274 @return: a list of lists of C{((start, end), tag)} tuples
275 """
276 if self._srl_includes_roleset:
277 predicates = self._get_column(grid, self._colmap['srl']+1)
278 start_col = self._colmap['srl']+2
279 else:
280 predicates = self._get_column(grid, self._colmap['srl'])
281 start_col = self._colmap['srl']+1
282
283
284
285 num_preds = len([p for p in predicates if p != '-'])
286
287 spanlists = []
288 for i in range(num_preds):
289 col = self._get_column(grid, start_col+i)
290 spanlist = []
291 stack = []
292 for wordnum, srl_tag in enumerate(col):
293 (left, right) = srl_tag.split('*')
294 for tag in left.split('('):
295 if tag:
296 stack.append((tag, wordnum))
297 for i in range(right.count(')')):
298 (tag, start) = stack.pop()
299 spanlist.append( ((start, wordnum+1), tag) )
300 spanlists.append(spanlist)
301
302 return spanlists
303
305 tree = self._get_parsed_sent(grid, pos_in_tree)
306 spanlists = self._get_srl_spans(grid)
307 if self._srl_includes_roleset:
308 predicates = self._get_column(grid, self._colmap['srl']+1)
309 rolesets = self._get_column(grid, self._colmap['srl'])
310 else:
311 predicates = self._get_column(grid, self._colmap['srl'])
312 rolesets = [None] * len(predicates)
313
314 instances = ConllSRLInstanceList(tree)
315 for wordnum, predicate in enumerate(predicates):
316 if predicate == '-': continue
317
318
319
320 for spanlist in spanlists:
321 for (start, end), tag in spanlist:
322 if wordnum in range(start,end) and tag in ('V', 'C-V'):
323 break
324 else: continue
325 break
326 else:
327 raise ValueError('No srl column found for %r' % predicate)
328 instances.append(ConllSRLInstance(tree, wordnum, predicate,
329 rolesets[wordnum], spanlist))
330
331 return instances
332
333
334
335
336
338 for columntype in columntypes:
339 if columntype not in self._colmap:
340 raise ValueError('This corpus does not contain a %s '
341 'column.' % columntype)
342
343 @staticmethod
346
347
348
349
350
351 @deprecated("Use .raw() or .words() or .tagged_words() or "
352 ".chunked_sents() instead.")
353 - def read(self, items, format='chunked', chunk_types=None):
359 @deprecated("Use .chunked_sents() instead.")
360 - def chunked(self, items, chunk_types=None):
362 @deprecated("Use .words() instead.")
365 @deprecated("Use .tagged_words() instead.")
368
369
371 """
372 An SRL instance from a CoNLL corpus, which identifies and
373 provides labels for the arguments of a single verb.
374 """
375
376
377 - def __init__(self, tree, verb_head, verb_stem, roleset, tagged_spans):
378 self.verb = []
379 """A list of the word indices of the words that compose the
380 verb whose arguments are identified by this instance.
381 This will contain multiple word indices when multi-word
382 verbs are used (e.g. 'turn on')."""
383
384 self.verb_head = verb_head
385 """The word index of the head word of the verb whose arguments
386 are identified by this instance. E.g., for a sentence that
387 uses the verb 'turn on,' C{verb_head} will be the word index
388 of the word 'turn'."""
389
390 self.verb_stem = verb_stem
391
392 self.roleset = roleset
393
394 self.arguments = []
395 """A list of C{(argspan, argid)} tuples, specifying the location
396 and type for each of the arguments identified by this
397 instance. C{argspan} is a tuple C{start, end}, indicating
398 that the argument consists of the C{words[start:end]}."""
399
400 self.tagged_spans = tagged_spans
401 """A list of C{(span, id)} tuples, specifying the location and
402 type for each of the arguments, as well as the verb pieces,
403 that make up this instance."""
404
405 self.tree = tree
406 """The parse tree for the sentence containing this instance."""
407
408 self.words = tree.leaves()
409 """A list of the words in the sentence containing this
410 instance."""
411
412
413 for (start, end), tag in tagged_spans:
414 if tag in ('V', 'C-V'):
415 self.verb += range(start, end)
416 else:
417 self.arguments.append( ((start, end), tag) )
418
420 plural = len(self.arguments)!=1 and 's' or ''
421 return '<ConllSRLInstance for %r with %d argument%s>' % (
422 (self.verb_stem, len(self.arguments), plural))
423
425 verbstr = ' '.join(self.words[i][0] for i in self.verb)
426 hdr = 'SRL for %r (stem=%r):\n' % (verbstr, self.verb_stem)
427 s = ''
428 for i, word in enumerate(self.words):
429 if isinstance(word, tuple): word = word[0]
430 for (start, end), argid in self.arguments:
431 if i == start: s += '[%s ' % argid
432 if i == end: s += '] '
433 if i in self.verb: word = '<<%s>>' % word
434 s += word + ' '
435 return hdr + textwrap.fill(s.replace(' ]', ']'),
436 initial_indent=' ',
437 subsequent_indent=' ')
438
440 """
441 Set of instances for a single sentence
442 """
443 - def __init__(self, tree, instances=()):
446
449
def pprint(self, include_tree=False):
    """
    Return a CoNLL-style string rendering of this instance list: one
    row per word, with a predicate column plus one argument column
    per instance.  If C{include_tree} is true, word, POS, and
    flattened-syntax columns are prepended to each row.

    @raise ValueError: If any instance's tree differs from this
        list's tree.
    """
    # Sanity check: trees should be the same
    for inst in self:
        if inst.tree != self.tree:
            raise ValueError('Tree mismatch!')

    # Bug fix: the word list is needed whether or not the tree
    # columns are requested; previously it was only computed inside
    # the include_tree branch, so pprint() raised NameError by
    # default.
    words = self.tree.leaves()

    # If desired, compute the tree-derived columns (POS tags and
    # flattened syntax brackets).
    if include_tree:
        pos = [None] * len(words)
        synt = ['*'] * len(words)
        self._tree2conll(self.tree, 0, words, pos, synt)

    s = ''
    for i in range(len(words)):
        # Optional tree columns: word, POS, syntax.
        if include_tree:
            s += '%-20s ' % words[i]
            s += '%-8s ' % pos[i]
            s += '%15s*%-8s ' % tuple(synt[i].split('*'))

        # Predicate column: the verb stem of whichever instance has
        # its head at this word, or '-' if none does.
        for inst in self:
            if i == inst.verb_head:
                s += '%-20s ' % inst.verb_stem
                break
        else:
            s += '%-20s ' % '-'

        # One argument column per instance, using bracketed span
        # notation, e.g. '(ARG0*' ... '*)'.
        for inst in self:
            argstr = '*'
            for (start, end), argid in inst.tagged_spans:
                if i==start: argstr = '(%s%s' % (argid, argstr)
                if i==(end-1): argstr += ')'
            s += '%-12s ' % argstr
        s += '\n'
    return s
487
def _tree2conll(self, tree, wordnum, words, pos, synt):
    """
    Flatten C{tree} into CoNLL-style parallel columns, filling in the
    C{pos} and C{synt} lists (in place) for the leaves starting at
    index C{wordnum}.

    @return: the word index just past the last leaf of C{tree}.
    """
    assert isinstance(tree, Tree)
    if len(tree) == 1 and isinstance(tree[0], basestring):
        # Preterminal whose leaf is a bare word: the node label is
        # the word's POS tag.
        pos[wordnum] = tree.node
        assert words[wordnum] == tree[0]
        return wordnum+1
    elif len(tree) == 1 and isinstance(tree[0], tuple):
        # Leaf is a (word, tag) pair.  Bug fix: this previously read
        # 'pos[wordnum], pos[wordnum] = tree[0]', assigning both
        # elements to the same slot and discarding the word.
        assert len(tree[0]) == 2
        (words[wordnum], pos[wordnum]) = tree[0]
        return wordnum+1
    else:
        # Nonterminal: open a bracket at its first leaf and close it
        # after its last leaf.
        synt[wordnum] = '(%s%s' % (tree.node, synt[wordnum])
        for child in tree:
            wordnum = self._tree2conll(child, wordnum, words,
                                       pos, synt)
        synt[wordnum-1] += ')'
        return wordnum
505
507 """
508 A ConllCorpusReader whose data file contains three columns: words,
509 pos, and chunk.
510 """
511 - def __init__(self, root, files, chunk_types, encoding=None):
515