
Source Code for Module nltk.corpus.reader.conll

# Natural Language Toolkit: CoNLL Corpus Reader
#
# Copyright (C) 2001-2008 NLTK Project
# Author: Steven Bird <[email protected]>
#         Edward Loper <[email protected]>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT

"""
Read CoNLL-style chunk files.
"""

from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
from nltk import chunk, tree, Tree, LazyMap, LazyConcatenation
from nltk.internals import deprecated
import os, codecs
import textwrap

class ConllCorpusReader(CorpusReader):
    """
    A corpus reader for CoNLL-style files.  These files consist of a
    series of sentences, separated by blank lines.  Each sentence is
    encoded using a table (or I{grid}) of values, where each line
    corresponds to a single word, and each column corresponds to an
    annotation type.  The set of columns used by CoNLL-style files can
    vary from corpus to corpus; the C{ConllCorpusReader} constructor
    therefore takes an argument, C{columntypes}, which is used to
    specify the columns that are used by a given corpus.

    @todo: Add support for reading from corpora where different
        parallel files contain different columns.
    @todo: Possibly add caching of the grid corpus view?  This would
        allow the same grid view to be used by different data access
        methods (e.g., words() and parsed_sents() could both share
        the same grid corpus view object).
    @todo: Better support for -DOCSTART-.  Currently, we just ignore
        it, but it could be used to define methods that retrieve a
        document at a time (e.g., parsed_documents()).
    """

    #/////////////////////////////////////////////////////////////////
    # Column Types
    #/////////////////////////////////////////////////////////////////

    WORDS = 'words'    #: column type for words
    POS = 'pos'        #: column type for part-of-speech tags
    TREE = 'tree'      #: column type for parse trees
    CHUNK = 'chunk'    #: column type for chunk structures
    NE = 'ne'          #: column type for named entities
    SRL = 'srl'        #: column type for semantic role labels
    IGNORE = 'ignore'  #: column type for columns that should be ignored

    #: A list of all column types supported by the conll corpus reader.
    COLUMN_TYPES = (WORDS, POS, TREE, CHUNK, NE, SRL, IGNORE)
    #/////////////////////////////////////////////////////////////////
    # Constructor
    #/////////////////////////////////////////////////////////////////

    def __init__(self, root, files, columntypes,
                 chunk_types=None, top_node='S', pos_in_tree=False,
                 srl_includes_roleset=True, encoding=None,
                 tree_class=Tree):
        for columntype in columntypes:
            if columntype not in self.COLUMN_TYPES:
                raise ValueError('Bad column type %r' % columntype)
        if chunk_types is not None: chunk_types = tuple(chunk_types)
        self._chunk_types = chunk_types
        self._colmap = dict((c,i) for (i,c) in enumerate(columntypes))
        self._pos_in_tree = pos_in_tree
        self._top_node = top_node # for chunks
        self._srl_includes_roleset = srl_includes_roleset
        self._tree_class = tree_class
        CorpusReader.__init__(self, root, files, encoding)

    #/////////////////////////////////////////////////////////////////
    # Data Access Methods
    #/////////////////////////////////////////////////////////////////

    def raw(self, files=None):
        if files is None: files = self._files
        elif isinstance(files, basestring): files = [files]
        return concat([self.open(f).read() for f in files])

    def words(self, files=None):
        self._require(self.WORDS)
        return LazyConcatenation(LazyMap(self._get_words,
                                         self._grids(files)))

    def sents(self, files=None):
        self._require(self.WORDS)
        return LazyMap(self._get_words, self._grids(files))

    def tagged_words(self, files=None):
        self._require(self.WORDS, self.POS)
        return LazyConcatenation(LazyMap(self._get_tagged_words,
                                         self._grids(files)))

    def tagged_sents(self, files=None):
        self._require(self.WORDS, self.POS)
        return LazyMap(self._get_tagged_words, self._grids(files))

    def chunked_words(self, files=None, chunk_types=None):
        self._require(self.WORDS, self.POS, self.CHUNK)
        if chunk_types is None: chunk_types = self._chunk_types
        def get_chunked_words(grid): # capture chunk_types as local var
            return self._get_chunked_words(grid, chunk_types)
        return LazyConcatenation(LazyMap(get_chunked_words,
                                         self._grids(files)))

    def chunked_sents(self, files=None, chunk_types=None):
        self._require(self.WORDS, self.POS, self.CHUNK)
        if chunk_types is None: chunk_types = self._chunk_types
        def get_chunked_words(grid): # capture chunk_types as local var
            return self._get_chunked_words(grid, chunk_types)
        return LazyMap(get_chunked_words, self._grids(files))

    def parsed_sents(self, files=None, pos_in_tree=None):
        self._require(self.WORDS, self.POS, self.TREE)
        if pos_in_tree is None: pos_in_tree = self._pos_in_tree
        def get_parsed_sent(grid): # capture pos_in_tree as local var
            return self._get_parsed_sent(grid, pos_in_tree)
        return LazyMap(get_parsed_sent, self._grids(files))

    def srl_spans(self, files=None):
        self._require(self.SRL)
        return LazyMap(self._get_srl_spans, self._grids(files))

    def srl_instances(self, files=None, pos_in_tree=None, flatten=True):
        self._require(self.WORDS, self.POS, self.TREE, self.SRL)
        if pos_in_tree is None: pos_in_tree = self._pos_in_tree
        def get_srl_instances(grid): # capture pos_in_tree as local var
            return self._get_srl_instances(grid, pos_in_tree)
        result = LazyMap(get_srl_instances, self._grids(files))
        if flatten: result = LazyConcatenation(result)
        return result

    def iob_words(self, files=None):
        """
        @return: a list of word/tag/IOB tuples
        @rtype: C{list} of C{tuple}
        @param files: the list of files that make up this corpus
        @type files: C{None} or C{str} or C{list}
        """
        self._require(self.WORDS, self.POS, self.CHUNK)
        return LazyConcatenation(LazyMap(self._get_iob_words,
                                         self._grids(files)))

    def iob_sents(self, files=None):
        """
        @return: a list of lists of word/tag/IOB tuples
        @rtype: C{list} of C{list}
        @param files: the list of files that make up this corpus
        @type files: C{None} or C{str} or C{list}
        """
        self._require(self.WORDS, self.POS, self.CHUNK)
        return LazyMap(self._get_iob_words, self._grids(files))

    #/////////////////////////////////////////////////////////////////
    # Grid Reading
    #/////////////////////////////////////////////////////////////////

    def _grids(self, files=None):
        # n.b.: we could cache the object returned here (keyed on
        # files), which would let us reuse the same corpus view for
        # different things (e.g., srl and parse trees).
        return concat([StreamBackedCorpusView(filename,
                                              self._read_grid_block,
                                              encoding=enc)
                       for (filename, enc) in self.abspaths(files, True)])

    def _read_grid_block(self, stream):
        grids = []
        for block in read_blankline_block(stream):
            block = block.strip()
            if not block: continue

            grid = [line.split() for line in block.split('\n')]

            # If there's a docstart row, then discard it.  ([xx]
            # eventually it would be good to actually use it.)
            if grid[0][self._colmap.get('words', 0)] == '-DOCSTART-':
                del grid[0]

            # Check that the grid is consistent.
            for row in grid:
                if len(row) != len(grid[0]):
                    raise ValueError('Inconsistent number of columns:\n%s'
                                     % block)
            grids.append(grid)
        return grids

    #/////////////////////////////////////////////////////////////////
    # Transforms
    #/////////////////////////////////////////////////////////////////
    # Given a grid, transform it into some representation (e.g.,
    # a list of words or a parse tree).

    def _get_words(self, grid):
        return self._get_column(grid, self._colmap['words'])

    def _get_tagged_words(self, grid):
        return zip(self._get_column(grid, self._colmap['words']),
                   self._get_column(grid, self._colmap['pos']))

    def _get_iob_words(self, grid):
        return zip(self._get_column(grid, self._colmap['words']),
                   self._get_column(grid, self._colmap['pos']),
                   self._get_column(grid, self._colmap['chunk']))

    def _get_chunked_words(self, grid, chunk_types):
        # n.b.: this method is very similar to conllstr2tree.
        words = self._get_column(grid, self._colmap['words'])
        pos_tags = self._get_column(grid, self._colmap['pos'])
        chunk_tags = self._get_column(grid, self._colmap['chunk'])

        stack = [Tree(self._top_node, [])]

        for (word, pos_tag, chunk_tag) in zip(words, pos_tags, chunk_tags):
            if chunk_tag == 'O':
                state, chunk_type = 'O', ''
            else:
                (state, chunk_type) = chunk_tag.split('-')
            # If it's a chunk we don't care about, treat it as O.
            if chunk_types is not None and chunk_type not in chunk_types:
                state = 'O'
            # Treat a mismatching I like a B.
            if state == 'I' and chunk_type != stack[-1].node:
                state = 'B'
            # For B or O: close any open chunks.
            if state in 'BO' and len(stack) == 2:
                stack.pop()
            # For B: start a new chunk.
            if state == 'B':
                new_chunk = Tree(chunk_type, [])
                stack[-1].append(new_chunk)
                stack.append(new_chunk)
            # Add the word token.
            stack[-1].append((word, pos_tag))

        return stack[0]

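    # For exposition (not part of the original source): in the 'tree'
    # column, each row carries the piece of the parse that opens or
    # closes at that word, with '*' standing in for the (pos word)
    # leaf; e.g., for "Not this year .":
    #
    #     word   pos  tree
    #     Not    RB   (S(NP*
    #     this   DT   *
    #     year   NN   *)
    #     .      .    *)
    #
    # _get_parsed_sent() splices '(pos_tag word)' in place of each '*'
    # and parses the concatenated string.
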
    def _get_parsed_sent(self, grid, pos_in_tree):
        words = self._get_column(grid, self._colmap['words'])
        pos_tags = self._get_column(grid, self._colmap['pos'])
        parse_tags = self._get_column(grid, self._colmap['tree'])

        treestr = ''
        for (word, pos_tag, parse_tag) in zip(words, pos_tags, parse_tags):
            if word == '(': word = '-LRB-'
            if word == ')': word = '-RRB-'
            if pos_tag == '(': pos_tag = '-LRB-'
            if pos_tag == ')': pos_tag = '-RRB-'
            (left, right) = parse_tag.split('*')
            right = right.count(')')*')' # only keep ')'.
            treestr += '%s (%s %s) %s' % (left, pos_tag, word, right)
        try:
            tree = self._tree_class.parse(treestr)
        except (ValueError, IndexError):
            tree = self._tree_class.parse('(%s %s)' %
                                          (self._top_node, treestr))

        if not pos_in_tree:
            for subtree in tree.subtrees():
                for i, child in enumerate(subtree):
                    if (isinstance(child, Tree) and len(child)==1 and
                        isinstance(child[0], basestring)):
                        subtree[i] = (child[0], child.node)

        return tree

    def _get_srl_spans(self, grid):
        """
        @return: a list of lists of C{((start, end), tag)} tuples.
        """
        if self._srl_includes_roleset:
            predicates = self._get_column(grid, self._colmap['srl']+1)
            start_col = self._colmap['srl']+2
        else:
            predicates = self._get_column(grid, self._colmap['srl'])
            start_col = self._colmap['srl']+1

        # Count how many predicates there are.  This tells us how many
        # columns to expect for SRL data.
        num_preds = len([p for p in predicates if p != '-'])

        spanlists = []
        for i in range(num_preds):
            col = self._get_column(grid, start_col+i)
            spanlist = []
            stack = []
            for wordnum, srl_tag in enumerate(col):
                (left, right) = srl_tag.split('*')
                for tag in left.split('('):
                    if tag:
                        stack.append((tag, wordnum))
                for _ in range(right.count(')')):
                    (tag, start) = stack.pop()
                    spanlist.append( ((start, wordnum+1), tag) )
            spanlists.append(spanlist)

        return spanlists

    def _get_srl_instances(self, grid, pos_in_tree):
        tree = self._get_parsed_sent(grid, pos_in_tree)
        spanlists = self._get_srl_spans(grid)
        if self._srl_includes_roleset:
            predicates = self._get_column(grid, self._colmap['srl']+1)
            rolesets = self._get_column(grid, self._colmap['srl'])
        else:
            predicates = self._get_column(grid, self._colmap['srl'])
            rolesets = [None] * len(predicates)

        instances = ConllSRLInstanceList(tree)
        for wordnum, predicate in enumerate(predicates):
            if predicate == '-': continue
            # Decide which spanlist to use.  Don't assume that they're
            # sorted in the same order as the predicates (even though
            # they usually are).
            for spanlist in spanlists:
                for (start, end), tag in spanlist:
                    if wordnum in range(start, end) and tag in ('V', 'C-V'):
                        break
                else: continue
                break
            else:
                raise ValueError('No srl column found for %r' % predicate)
            instances.append(ConllSRLInstance(tree, wordnum, predicate,
                                              rolesets[wordnum], spanlist))

        return instances

    #/////////////////////////////////////////////////////////////////
    # Helper Methods
    #/////////////////////////////////////////////////////////////////

    def _require(self, *columntypes):
        for columntype in columntypes:
            if columntype not in self._colmap:
                raise ValueError('This corpus does not contain a %s '
                                 'column.' % columntype)

    @staticmethod
    def _get_column(grid, column_index):
        return [grid[i][column_index] for i in range(len(grid))]

    #/////////////////////////////////////////////////////////////////
    #{ Deprecated since 0.8
    #/////////////////////////////////////////////////////////////////
    @deprecated("Use .raw() or .words() or .tagged_words() or "
                ".chunked_sents() instead.")
    def read(self, items, format='chunked', chunk_types=None):
        if format == 'chunked': return self.chunked_sents(items, chunk_types)
        if format == 'raw': return self.raw(items)
        if format == 'tokenized': return self.words(items)
        if format == 'tagged': return self.tagged_words(items)
        raise ValueError('bad format %r' % format)
    @deprecated("Use .chunked_sents() instead.")
    def chunked(self, items, chunk_types=None):
        return self.chunked_sents(items, chunk_types)
    @deprecated("Use .words() instead.")
    def tokenized(self, items):
        return self.words(items)
    @deprecated("Use .tagged_words() instead.")
    def tagged(self, items):
        return self.tagged_words(items)
    #}

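
#/////////////////////////////////////////////////////////////////
# Usage Sketch
#/////////////////////////////////////////////////////////////////
# An illustrative sketch, not part of the original module.  The
# directory 'corpora/chunking' and file 'train.txt' are hypothetical;
# point them at any three-column (word, pos, IOB-chunk) CoNLL-style
# file to run it.

def _demo_chunk_reader():
    reader = ConllCorpusReader('corpora/chunking', ['train.txt'],
                               ('words', 'pos', 'chunk'),
                               chunk_types=('NP', 'VP', 'PP'))
    print reader.words()[:10]        # flat list of tokens
    print reader.tagged_sents()[0]   # [(word, pos), ...]
    print reader.iob_sents()[0]      # [(word, pos, chunk-tag), ...]
    print reader.chunked_sents()[0]  # a Tree rooted at the 'S' top node
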
class ConllSRLInstance(object):
    """
    An SRL instance from a CoNLL corpus, which identifies and
    provides labels for the arguments of a single verb.
    """
    # [xx] add inst.core_arguments, inst.argm_arguments?

    def __init__(self, tree, verb_head, verb_stem, roleset, tagged_spans):
        self.verb = []
        """A list of the word indices of the words that compose the
           verb whose arguments are identified by this instance.
           This will contain multiple word indices when multi-word
           verbs are used (e.g., 'turn on')."""

        self.verb_head = verb_head
        """The word index of the head word of the verb whose arguments
           are identified by this instance.  E.g., for a sentence that
           uses the verb 'turn on,' C{verb_head} will be the word index
           of the word 'turn'."""

        self.verb_stem = verb_stem

        self.roleset = roleset

        self.arguments = []
        """A list of C{(argspan, argid)} tuples, specifying the location
           and type for each of the arguments identified by this
           instance.  C{argspan} is a tuple C{(start, end)}, indicating
           that the argument consists of the C{words[start:end]}."""

        self.tagged_spans = tagged_spans
        """A list of C{(span, id)} tuples, specifying the location and
           type for each of the arguments, as well as the verb pieces,
           that make up this instance."""

        self.tree = tree
        """The parse tree for the sentence containing this instance."""

        self.words = tree.leaves()
        """A list of the words in the sentence containing this
           instance."""

        # Fill in the self.verb and self.arguments values.
        for (start, end), tag in tagged_spans:
            if tag in ('V', 'C-V'):
                self.verb += range(start, end)
            else:
                self.arguments.append( ((start, end), tag) )

    def __repr__(self):
        plural = len(self.arguments) != 1 and 's' or ''
        return '<ConllSRLInstance for %r with %d argument%s>' % (
            self.verb_stem, len(self.arguments), plural)

    def pprint(self):
        verbstr = ' '.join(self.words[i][0] for i in self.verb)
        hdr = 'SRL for %r (stem=%r):\n' % (verbstr, self.verb_stem)
        s = ''
        for i, word in enumerate(self.words):
            if isinstance(word, tuple): word = word[0]
            for (start, end), argid in self.arguments:
                if i == start: s += '[%s ' % argid
                if i == end: s += '] '
            if i in self.verb: word = '<<%s>>' % word
            s += word + ' '
        return hdr + textwrap.fill(s.replace(' ]', ']'),
                                   initial_indent='    ',
                                   subsequent_indent='    ')

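
# Illustrative sketch (not part of the original source): how a list of
# tagged spans populates a ConllSRLInstance.  The parse tree, span
# indices, and roleset id below are made up for demonstration.

def _demo_srl_instance():
    t = Tree.parse('(S (NP (PRP He)) (VP (VBD turned) (PRT (RP on)) '
                   '(NP (DT the) (NN light))))')
    spans = [((0, 1), 'ARG0'), ((1, 3), 'V'), ((3, 5), 'ARG1')]
    inst = ConllSRLInstance(t, 1, 'turn', 'turn.01', spans)
    print repr(inst)      # <ConllSRLInstance for 'turn' with 2 arguments>
    print inst.verb       # [1, 2] -- the multi-word verb 'turned on'
    print inst.arguments  # [((0, 1), 'ARG0'), ((3, 5), 'ARG1')]
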
class ConllSRLInstanceList(list):
    """
    Set of instances for a single sentence.
    """
    def __init__(self, tree, instances=()):
        self.tree = tree
        list.__init__(self, instances)

    def __str__(self):
        return self.pprint()

    def pprint(self, include_tree=False):
        # Sanity check: trees should be the same
        for inst in self:
            if inst.tree != self.tree:
                raise ValueError('Tree mismatch!')

        words = self.tree.leaves()

        # If desired, add trees:
        if include_tree:
            pos = [None] * len(words)
            synt = ['*'] * len(words)
            self._tree2conll(self.tree, 0, words, pos, synt)

        s = ''
        for i in range(len(words)):
            # optional tree columns
            if include_tree:
                s += '%-20s ' % words[i]
                s += '%-8s ' % pos[i]
                s += '%15s*%-8s ' % tuple(synt[i].split('*'))

            # verb head column
            for inst in self:
                if i == inst.verb_head:
                    s += '%-20s ' % inst.verb_stem
                    break
            else:
                s += '%-20s ' % '-'
            # Remaining columns: self
            for inst in self:
                argstr = '*'
                for (start, end), argid in inst.tagged_spans:
                    if i == start: argstr = '(%s%s' % (argid, argstr)
                    if i == (end-1): argstr += ')'
                s += '%-12s ' % argstr
            s += '\n'
        return s

    def _tree2conll(self, tree, wordnum, words, pos, synt):
        assert isinstance(tree, Tree)
        if len(tree) == 1 and isinstance(tree[0], basestring):
            pos[wordnum] = tree.node
            assert words[wordnum] == tree[0]
            return wordnum+1
        elif len(tree) == 1 and isinstance(tree[0], tuple):
            assert len(tree[0]) == 2
            (words[wordnum], pos[wordnum]) = tree[0]
            return wordnum+1
        else:
            synt[wordnum] = '(%s%s' % (tree.node, synt[wordnum])
            for child in tree:
                wordnum = self._tree2conll(child, wordnum, words,
                                           pos, synt)
            synt[wordnum-1] += ')'
            return wordnum

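
# Sketch (not part of the original source) of the span decoding used by
# ConllCorpusReader._get_srl_spans(), applied to a single made-up SRL
# column.  A tag opens with '(TAG*' and closes with '*)'; a stack pairs
# each opening tag with the word index where its span ends.

def _demo_decode_srl_column():
    col = ['(ARG0*', '*)', '(V*)', '(ARG1*', '*', '*)']
    spanlist, stack = [], []
    for wordnum, srl_tag in enumerate(col):
        (left, right) = srl_tag.split('*')
        for tag in left.split('('):
            if tag: stack.append((tag, wordnum))
        for _ in range(right.count(')')):
            (tag, start) = stack.pop()
            spanlist.append(((start, wordnum+1), tag))
    print spanlist  # [((0, 2), 'ARG0'), ((2, 3), 'V'), ((3, 6), 'ARG1')]
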
class ConllChunkCorpusReader(ConllCorpusReader):
    """
    A ConllCorpusReader whose data file contains three columns: words,
    pos, and chunk.
    """
    def __init__(self, root, files, chunk_types, encoding=None):
        ConllCorpusReader.__init__(
            self, root, files, ('words', 'pos', 'chunk'),
            chunk_types=chunk_types, encoding=encoding)
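
# Minimal sketch (not part of the original source): ConllChunkCorpusReader
# is just ConllCorpusReader pre-configured for the common three-column
# layout.  The root directory and file name below are hypothetical.

def _demo_conll_chunk_reader():
    reader = ConllChunkCorpusReader('corpora/conll2000', ['test.txt'],
                                    chunk_types=('NP', 'VP', 'PP'))
    # chunked_words() flattens across sentences; chunked_sents() keeps
    # one Tree per sentence.
    for tree in reader.chunked_sents()[:2]:
        print tree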