Source Code for Module nltk.corpus.reader.util

# Natural Language Toolkit: Corpus Reader Utilities
#
# Copyright (C) 2001-2008 NLTK Project
# Author: Steven Bird <[email protected]>
#         Edward Loper <[email protected]>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT

import os, sys, bisect, re, tempfile
try: import cPickle as pickle
except ImportError: import pickle
from itertools import islice
from nltk.corpus.reader.api import CorpusReader
from nltk import tokenize
from nltk.etree import ElementTree
from nltk.internals import deprecated, slice_bounds
from nltk.util import AbstractLazySequence, LazySubsequence, LazyConcatenation
# SeekableUnicodeStreamReader is used by StreamBackedCorpusView._open().
from nltk.data import (PathPointer, FileSystemPathPointer,
                       ZipFilePathPointer, SeekableUnicodeStreamReader)

######################################################################
#{ Corpus View
######################################################################

class StreamBackedCorpusView(AbstractLazySequence):
    """
    A 'view' of a corpus file, which acts like a sequence of tokens:
    it can be accessed by index, iterated over, etc.  However, the
    tokens are only constructed as-needed -- the entire corpus is
    never stored in memory at once.

    The constructor to C{StreamBackedCorpusView} takes two arguments:
    a corpus filename (specified as a string or as a L{PathPointer});
    and a block reader.  A X{block reader} is a function that reads
    zero or more tokens from a stream, and returns them as a list.  A
    very simple example of a block reader is:

        >>> def simple_block_reader(stream):
        ...     return stream.readline().split()

    This simple block reader reads a single line at a time, and
    returns a single token (consisting of a string) for each
    whitespace-separated substring on the line.

    When deciding how to define the block reader for a given
    corpus, careful consideration should be given to the size of
    blocks handled by the block reader.  Smaller block sizes will
    increase the memory requirements of the corpus view's internal
    data structures (by 2 integers per block).  On the other hand,
    larger block sizes may decrease performance for random access to
    the corpus.  (But note that larger block sizes will I{not}
    decrease performance for iteration.)

    Internally, C{CorpusView} maintains a partial mapping from token
    index to file position, with one entry per block.  When a token
    with a given index M{i} is requested, the C{CorpusView} constructs
    it as follows:

      1. First, it searches the toknum/filepos mapping for the token
         index closest to (but less than or equal to) M{i}.

      2. Then, starting at the file position corresponding to that
         index, it reads one block at a time using the block reader
         until it reaches the requested token.

    The toknum/filepos mapping is created lazily: it is initially
    empty, but every time a new block is read, the block's
    initial token is added to the mapping.  (Thus, the toknum/filepos
    map has one entry per block.)

    In order to increase efficiency for random access patterns that
    have high degrees of locality, the corpus view may cache one or
    more blocks.

    @note: Each C{CorpusView} object internally maintains an open file
        object for its underlying corpus file.  This file should be
        automatically closed when the C{CorpusView} is garbage collected,
        but if you wish to close it manually, use the L{close()}
        method.  If you access a C{CorpusView}'s items after it has been
        closed, the file object will be automatically re-opened.

    @warning: If the contents of the file are modified during the
        lifetime of the C{CorpusView}, then the C{CorpusView}'s behavior
        is undefined.

    @warning: If a unicode encoding is specified when constructing a
        C{CorpusView}, then the block reader may only call
        C{stream.seek()} with offsets that have been returned by
        C{stream.tell()}; in particular, calling C{stream.seek()} with
        relative offsets, or with offsets based on string lengths, may
        lead to incorrect behavior.

    @ivar _block_reader: The function used to read
        a single block from the underlying file stream.
    @ivar _toknum: A list containing the token index of each block
        that has been processed.  In particular, C{_toknum[i]} is the
        token index of the first token in block C{i}.  Together
        with L{_filepos}, this forms a partial mapping between token
        indices and file positions.
    @ivar _filepos: A list containing the file position of each block
        that has been processed.  In particular, C{_filepos[i]} is the
        file position of the first character in block C{i}.  Together
        with L{_toknum}, this forms a partial mapping between token
        indices and file positions.
    @ivar _stream: The stream used to access the underlying corpus file.
    @ivar _len: The total number of tokens in the corpus, if known;
        or C{None}, if the number of tokens is not yet known.
    @ivar _eofpos: The character position of the last character in the
        file.  This is calculated when the corpus view is initialized,
        and is used to decide when the end of file has been reached.
    @ivar _cache: A cache of the most recently read block.  It
        is encoded as a tuple (start_toknum, end_toknum, tokens), where
        start_toknum is the token index of the first token in the block;
        end_toknum is the token index of the first token not in the
        block; and tokens is a list of the tokens in the block.
    """
    def __init__(self, filename, block_reader=None, startpos=0,
                 encoding=None):
        """
        Create a new corpus view, based on the file C{filename}, and
        read with C{block_reader}.  See the class documentation
        for more information.

        @param filename: The path to the file that is read by this
            corpus view.  C{filename} can either be a string or a
            L{PathPointer}.

        @param startpos: The file position at which the view will
            start reading.  This can be used to skip over preface
            sections.

        @param encoding: The unicode encoding that should be used to
            read the file's contents.  If no encoding is specified,
            then the file's contents will be read as a non-unicode
            string (i.e., a C{str}).
        """
        if block_reader:
            self.read_block = block_reader
        # Initialize our toknum/filepos mapping.
        self._toknum = [0]
        self._filepos = [startpos]
        self._encoding = encoding
        # We don't know our length (number of tokens) yet.
        self._len = None

        self._filename = filename
        self._stream = None

        self._current_toknum = None
        """This variable is set to the index of the next token that
           will be read, immediately before L{self.read_block()} is
           called.  This is provided for the benefit of the block
           reader, which under rare circumstances may need to know
           the current token number."""

        self._current_blocknum = None
        """This variable is set to the index of the next block that
           will be read, immediately before L{self.read_block()} is
           called.  This is provided for the benefit of the block
           reader, which under rare circumstances may need to know
           the current block number."""

        # Find the length of the file.
        try:
            if isinstance(self._filename, PathPointer):
                self._eofpos = self._filename.file_size()
            else:
                self._eofpos = os.stat(self._filename).st_size
        except Exception, exc:
            raise ValueError('Unable to open or access %r -- %s' %
                             (filename, exc))

        # Maintain a cache of the most recently read block, to
        # increase efficiency of random access.
        self._cache = (-1, -1, None)

    filename = property(lambda self: self._filename, doc="""
        The filename of the file that is accessed by this view.

        @type: C{str} or L{PathPointer}""")

    def read_block(self, stream):
        """
        Read a block from the input stream.

        @return: a block of tokens from the input stream
        @rtype: list of any
        @param stream: an input stream
        @type stream: stream
        """
        raise NotImplementedError('Abstract Method')

    def _open(self):
        """
        Open the file stream associated with this corpus view.  This
        will be performed automatically if any value is read from the
        view while its file stream is closed.
        """
        if isinstance(self._filename, PathPointer):
            self._stream = self._filename.open(self._encoding)
        elif self._encoding:
            self._stream = SeekableUnicodeStreamReader(
                open(self._filename, 'rb'), self._encoding)
        else:
            self._stream = open(self._filename, 'rb')

    def close(self):
        """
        Close the file stream associated with this corpus view.  This
        can be useful if you are worried about running out of file
        handles (although the stream should automatically be closed
        upon garbage collection of the corpus view).  If the corpus
        view is accessed after it is closed, it will be automatically
        re-opened.
        """
        if self._stream is not None:
            self._stream.close()
        self._stream = None

    def __len__(self):
        if self._len is None:
            # iterate_from() sets self._len when it reaches the end
            # of the file:
            for tok in self.iterate_from(self._toknum[-1]): pass
        return self._len

    def __getitem__(self, i):
        if isinstance(i, slice):
            start, stop = slice_bounds(self, i)
            # Check if it's in the cache.
            offset = self._cache[0]
            if offset <= start and stop <= self._cache[1]:
                return self._cache[2][start-offset:stop-offset]
            # Construct & return the result.
            return LazySubsequence(self, start, stop)
        else:
            # Handle negative indices
            if i < 0: i += len(self)
            if i < 0: raise IndexError('index out of range')
            # Check if it's in the cache.
            offset = self._cache[0]
            if offset <= i < self._cache[1]:
                return self._cache[2][i-offset]
            # Use iterate_from to extract it.
            try:
                return self.iterate_from(i).next()
            except StopIteration:
                raise IndexError('index out of range')

    # If we wanted to be thread-safe, then this method would need to
    # do some locking.
    def iterate_from(self, start_tok):
        # Decide where in the file we should start.  If `start` is in
        # our mapping, then we can jump straight to the correct block;
        # otherwise, start at the last block we've processed.
        if start_tok < self._toknum[-1]:
            block_index = bisect.bisect_right(self._toknum, start_tok)-1
            toknum = self._toknum[block_index]
            filepos = self._filepos[block_index]
        else:
            block_index = len(self._toknum)-1
            toknum = self._toknum[-1]
            filepos = self._filepos[-1]

        # Open the stream, if it's not open already.
        if self._stream is None:
            self._open()

        # Each iteration through this loop, we read a single block
        # from the stream.
        while filepos < self._eofpos:
            # Read the next block.
            self._stream.seek(filepos)
            self._current_toknum = toknum
            self._current_blocknum = block_index
            tokens = self.read_block(self._stream)
            assert isinstance(tokens, (tuple, list)), (
                'block reader %s() should return list or tuple.' %
                self.read_block.__name__)
            num_toks = len(tokens)
            new_filepos = self._stream.tell()
            assert new_filepos > filepos, (
                'block reader %s() should consume at least 1 byte (from %d)' %
                (self.read_block.__name__, filepos))

            # Update our cache.
            self._cache = (toknum, toknum+num_toks, list(tokens))

            # Update our mapping.
            assert toknum <= self._toknum[-1]
            if num_toks > 0:
                block_index += 1
                if toknum == self._toknum[-1]:
                    assert new_filepos > self._filepos[-1] # monotonic!
                    self._filepos.append(new_filepos)
                    self._toknum.append(toknum+num_toks)
                else:
                    # Check for consistency:
                    assert new_filepos == self._filepos[block_index], (
                        'inconsistent block reader (num chars read)')
                    assert toknum+num_toks == self._toknum[block_index], (
                        'inconsistent block reader (num tokens returned)')

            # Generate the tokens in this block (but skip any tokens
            # before start_tok).  Note that between yields, our state
            # may be modified.
            for tok in tokens[max(0, start_tok-toknum):]:
                yield tok
            # If we're at the end of the file, then we're done.
            # Set our length and terminate the generator.
            assert new_filepos <= self._eofpos
            if new_filepos == self._eofpos:
                self._len = toknum + num_toks
                break
            # Update our indices
            toknum += num_toks
            filepos = new_filepos

        # If we reach this point, then we should know our length.
        assert self._len is not None

    # Use concat for these, so we can use a ConcatenatedCorpusView
    # when possible.
    def __add__(self, other):
        return concat([self, other])
    def __radd__(self, other):
        return concat([other, self])
    def __mul__(self, count):
        return concat([self] * count)
    def __rmul__(self, count):
        return concat([self] * count)

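
# Illustrative sketch (hypothetical; not used anywhere by the library):
# how a StreamBackedCorpusView might be driven by a simple line-based
# block reader.  The demo function, file contents, and file name are
# made up for illustration; nothing here runs unless called explicitly.

def _demo_stream_backed_corpus_view():
    # Write a tiny corpus file to a temporary location.
    fd, path = tempfile.mkstemp('.txt', 'nltk-demo-')
    os.write(fd, 'the cat sat\non the mat\n')
    os.close(fd)

    def line_block_reader(stream):
        # One block per line; one token per whitespace-separated word.
        return stream.readline().split()

    view = StreamBackedCorpusView(path, line_block_reader)
    assert view[0] == 'the'     # tokens are constructed lazily, on demand
    assert len(view) == 6       # forces a full pass to count the tokens
    view.close()
    os.remove(path)
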
class ConcatenatedCorpusView(AbstractLazySequence):
    """
    A 'view' of a corpus file that joins together one or more
    L{StreamBackedCorpusViews<StreamBackedCorpusView>}.  At most
    one file handle is left open at any time.
    """
    def __init__(self, corpus_views):
        self._pieces = corpus_views
        """A list of the corpus subviews that make up this
        concatenation."""

        self._offsets = [0]
        """A list of offsets, indicating the index at which each
        subview begins.  In particular::
            offsets[i] = sum([len(p) for p in pieces[:i]])"""

        self._open_piece = None
        """The most recently accessed corpus subview (or C{None}).
        Before a new subview is accessed, this subview will be closed."""

    def __len__(self):
        if len(self._offsets) <= len(self._pieces):
            # Iterate to the end of the corpus.
            for tok in self.iterate_from(self._offsets[-1]): pass

        return self._offsets[-1]

    def close(self):
        for piece in self._pieces:
            piece.close()

    def iterate_from(self, start_tok):
        piecenum = bisect.bisect_right(self._offsets, start_tok)-1

        while piecenum < len(self._pieces):
            offset = self._offsets[piecenum]
            piece = self._pieces[piecenum]

            # If we've got another piece open, close it first.
            if self._open_piece is not piece:
                if self._open_piece is not None:
                    self._open_piece.close()
                self._open_piece = piece

            # Get everything we can from this piece.
            for tok in piece.iterate_from(max(0, start_tok-offset)):
                yield tok

            # Update the offset table.
            if piecenum+1 == len(self._offsets):
                self._offsets.append(self._offsets[-1] + len(piece))

            # Move on to the next piece.
            piecenum += 1

def concat(docs):
    """
    Concatenate together the contents of multiple documents from a
    single corpus, using an appropriate concatenation function.  This
    utility function is used by corpus readers when the user requests
    more than one document at a time.
    """
    if len(docs) == 1:
        return docs[0]
    if len(docs) == 0:
        raise ValueError('concat() expects at least one object!')

    types = set([d.__class__ for d in docs])

    # If they're all strings, use string concatenation.
    if types.issubset([str, unicode, basestring]):
        return reduce((lambda a,b:a+b), docs, '')

    # If they're all corpus views, then use ConcatenatedCorpusView.
    for typ in types:
        if not issubclass(typ, (StreamBackedCorpusView,
                                ConcatenatedCorpusView)):
            break
    else:
        return ConcatenatedCorpusView(docs)

    # If they're all lazy sequences, use a lazy concatenation
    for typ in types:
        if not issubclass(typ, AbstractLazySequence):
            break
    else:
        return LazyConcatenation(docs)

    # Otherwise, see what we can do:
    if len(types) == 1:
        typ = list(types)[0]

        if issubclass(typ, list):
            return reduce((lambda a,b:a+b), docs, [])

        if issubclass(typ, tuple):
            return reduce((lambda a,b:a+b), docs, ())

        if ElementTree.iselement(typ):
            xmltree = ElementTree.Element('documents')
            for doc in docs: xmltree.append(doc)
            return xmltree

    # No method found!
    raise ValueError("Don't know how to concatenate types: %r" % types)
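
# Illustrative sketch (hypothetical demo, not part of the public API):
# concat() on plain Python values.  Lists and strings are simply added
# together; a single document is returned unchanged; corpus views would
# instead be wrapped in a ConcatenatedCorpusView.

def _demo_concat():
    assert concat([['a', 'b'], ['c']]) == ['a', 'b', 'c']
    assert concat(['abc']) == 'abc'
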

######################################################################
#{ Corpus View for Pickled Sequences
######################################################################

class PickleCorpusView(StreamBackedCorpusView):
    """
    A stream backed corpus view for corpus files that consist of
    sequences of serialized Python objects (serialized using
    C{pickle.dump}).  One use case for this class is to store the
    result of running feature detection on a corpus to disk.  This can
    be useful when performing feature detection is expensive (so we
    don't want to repeat it), but the corpus is too large to store in
    memory.  The following example illustrates this technique:

        >>> feature_corpus = LazyMap(detect_features, corpus)
        >>> PickleCorpusView.write(feature_corpus, some_filename)
        >>> pcv = PickleCorpusView(some_filename)
    """
    BLOCK_SIZE = 100
    PROTOCOL = -1

    def __init__(self, filename, delete_on_gc=False):
        """
        Create a new corpus view that reads the pickle corpus
        C{filename}.

        @param delete_on_gc: If true, then C{filename} will be deleted
            whenever this object gets garbage-collected.
        """
        self._delete_on_gc = delete_on_gc
        StreamBackedCorpusView.__init__(self, filename)

    def read_block(self, stream):
        result = []
        for i in range(self.BLOCK_SIZE):
            try: result.append(pickle.load(stream))
            except EOFError: break
        return result

    def __del__(self):
        """
        If C{delete_on_gc} was set to true when this
        C{PickleCorpusView} was created, then delete the corpus view's
        filename.  (This method is called whenever a
        C{PickleCorpusView} is garbage-collected.)
        """
        if getattr(self, '_delete_on_gc', False):
            if os.path.exists(self._filename):
                try: os.remove(self._filename)
                except (OSError, IOError): pass
        self.__dict__.clear() # make the garbage collector's job easier

    @classmethod
    def write(cls, sequence, output_file):
        if isinstance(output_file, basestring):
            output_file = open(output_file, 'wb')
        for item in sequence:
            pickle.dump(item, output_file, cls.PROTOCOL)

    @classmethod
    def cache_to_tempfile(cls, sequence, delete_on_gc=True):
        """
        Write the given sequence to a temporary file as a pickle
        corpus; and then return a C{PickleCorpusView} view for that
        temporary corpus file.

        @param delete_on_gc: If true, then the temporary file will be
            deleted whenever this object gets garbage-collected.
        """
        try:
            fd, output_file_name = tempfile.mkstemp('.pcv', 'nltk-')
            output_file = os.fdopen(fd, 'wb')
            cls.write(sequence, output_file)
            output_file.close()
            return PickleCorpusView(output_file_name, delete_on_gc)
        except (OSError, IOError), e:
            raise ValueError('Error while creating temp file: %s' % e)
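
# Illustrative sketch (hypothetical demo): cache a sequence to a
# temporary pickle corpus and read it back lazily.  The data values are
# made up for illustration.

def _demo_pickle_corpus_view():
    data = [{'word': 'cat'}, {'word': 'sat'}]
    view = PickleCorpusView.cache_to_tempfile(data)
    assert len(view) == 2
    assert view[0] == {'word': 'cat'}
    view.close()
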


######################################################################
#{ Block Readers
######################################################################

def read_whitespace_block(stream):
    toks = []
    for i in range(20): # Read 20 lines at a time.
        toks.extend(stream.readline().split())
    return toks

def read_wordpunct_block(stream):
    toks = []
    for i in range(20): # Read 20 lines at a time.
        # wordpunct_tokenize is expected to be provided by nltk.tokenize.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks

def read_line_block(stream):
    toks = []
    for i in range(20):
        line = stream.readline()
        if not line: return toks
        toks.append(line.replace('\n', ''))
    return toks

def read_blankline_block(stream):
    s = ''
    while True:
        line = stream.readline()
        # End of file:
        if not line:
            if s: return [s]
            else: return []
        # Blank line:
        elif line and not line.strip():
            if s: return [s]
        # Other line:
        else:
            s += line
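
# Illustrative sketch (hypothetical demo): read_blankline_block()
# returns one blank-line-delimited paragraph per call.  A Python 2
# StringIO stands in for a corpus file stream.

def _demo_read_blankline_block():
    from StringIO import StringIO
    stream = StringIO('First para,\nstill first.\n\nSecond para.\n')
    assert read_blankline_block(stream) == ['First para,\nstill first.\n']
    assert read_blankline_block(stream) == ['Second para.\n']
    assert read_blankline_block(stream) == []
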

def read_regexp_block(stream, start_re, end_re=None):
    """
    Read a sequence of tokens from a stream, where tokens begin with
    lines that match C{start_re}.  If C{end_re} is specified, then
    tokens end with lines that match C{end_re}; otherwise, tokens end
    whenever the next line matching C{start_re} or EOF is found.
    """
    # Scan until we find a line matching the start regexp.
    while True:
        line = stream.readline()
        if not line: return [] # end of file.
        if re.match(start_re, line): break

    # Scan until we find another line matching the regexp, or EOF.
    lines = [line]
    while True:
        oldpos = stream.tell()
        line = stream.readline()
        # End of file:
        if not line:
            return [''.join(lines)]
        # End of token:
        if end_re is not None and re.match(end_re, line):
            return [''.join(lines)]
        # Start of new token: backup to just before it starts, and
        # return the token we've already collected.
        if end_re is None and re.match(start_re, line):
            stream.seek(oldpos)
            return [''.join(lines)]
        # Anything else is part of the token.
        lines.append(line)
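
# Illustrative sketch (hypothetical demo): read_regexp_block() with a
# start pattern only, so each token runs from one '.START' line up to
# (but not including) the next one.

def _demo_read_regexp_block():
    from StringIO import StringIO
    stream = StringIO('.START\nfirst doc\n.START\nsecond doc\n')
    assert read_regexp_block(stream, r'\.START') == ['.START\nfirst doc\n']
    assert read_regexp_block(stream, r'\.START') == ['.START\nsecond doc\n']
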

def read_sexpr_block(stream, block_size=16384, comment_char=None):
    """
    Read a sequence of s-expressions from the stream, and leave the
    stream's file position at the end of the last complete s-expression
    read.  This function will always return at least one s-expression,
    unless there are no more s-expressions in the file.

    If the file ends in the middle of an s-expression, then that
    incomplete s-expression is returned when the end of the file is
    reached.

    @param block_size: The default block size for reading.  If an
        s-expression is longer than one block, then more than one
        block will be read.
    @param comment_char: A character that marks comments.  Any lines
        that begin with this character will be stripped out.
        (If spaces or tabs precede the comment character, then the
        line will not be stripped.)
    """
    start = stream.tell()
    block = stream.read(block_size)
    encoding = getattr(stream, 'encoding', None)
    assert encoding is not None or isinstance(block, str)
    if encoding not in (None, 'utf-8'):
        import warnings
        warnings.warn('Parsing may fail, depending on the properties '
                      'of the %s encoding!' % encoding)
        # (e.g., the utf-16 encoding does not work because it insists
        # on adding BOMs to the beginning of encoded strings.)

    if comment_char:
        COMMENT = re.compile('(?m)^%s.*$' % re.escape(comment_char))
    while True:
        try:
            # If we're stripping comments, then make sure our block ends
            # on a line boundary; and then replace any comments with
            # space characters.  (We can't just strip them out -- that
            # would make our offset wrong.)
            if comment_char:
                block += stream.readline()
                block = re.sub(COMMENT, _sub_space, block)
            # Read the block.
            tokens, offset = _parse_sexpr_block(block)
            # Skip whitespace
            offset = re.compile(r'\s*').search(block, offset).end()

            # Move to the end position.
            if encoding is None:
                stream.seek(start+offset)
            else:
                stream.seek(start+len(block[:offset].encode(encoding)))

            # Return the list of tokens we processed
            return tokens
        except ValueError, e:
            if e.args[0] == 'Block too small':
                next_block = stream.read(block_size)
                if next_block:
                    block += next_block
                    continue
                else:
                    # The file ended mid-sexpr -- return what we got.
                    return [block.strip()]
            else: raise

def _sub_space(m):
    """Helper function: given a regexp match, return a string of
    spaces that's the same length as the matched string."""
    return ' '*(m.end()-m.start())

def _parse_sexpr_block(block):
    tokens = []
    start = end = 0

    while end < len(block):
        m = re.compile(r'\S').search(block, end)
        if not m:
            return tokens, end

        start = m.start()

        # Case 1: sexpr is not parenthesized.
        if m.group() != '(':
            m2 = re.compile(r'[\s(]').search(block, start)
            if m2:
                end = m2.start()
            else:
                if tokens: return tokens, end
                raise ValueError('Block too small')

        # Case 2: parenthesized sexpr.
        else:
            nesting = 0
            for m in re.compile(r'[()]').finditer(block, start):
                if m.group()=='(': nesting += 1
                else: nesting -= 1
                if nesting == 0:
                    end = m.end()
                    break
            else:
                if tokens: return tokens, end
                raise ValueError('Block too small')

        tokens.append(block[start:end])

    return tokens, end
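
# Illustrative sketch (hypothetical demo): read_sexpr_block() returns
# every complete s-expression in the block; a trailing unparenthesized
# item that might continue past the block is held back until the stream
# is known to be exhausted, at which point it is returned as-is.

def _demo_read_sexpr_block():
    from StringIO import StringIO
    stream = StringIO('(a b) (c (d e)) trailing')
    assert read_sexpr_block(stream) == ['(a b)', '(c (d e))']
    assert read_sexpr_block(stream) == ['trailing']
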

######################################################################
#{ Treebank readers
######################################################################

#[xx] is it worth it to factor this out?
class SyntaxCorpusReader(CorpusReader):
    """
    An abstract base class for reading corpora consisting of
    syntactically parsed text.  Subclasses should define:

      - L{__init__}, which specifies the location of the corpus
        and a method for detecting the sentence blocks in corpus files.
      - L{_read_block}, which reads a block from the input stream.
      - L{_word}, which takes a block and returns a list of list of words.
      - L{_tag}, which takes a block and returns a list of list of tagged
        words.
      - L{_parse}, which takes a block and returns a list of parsed
        sentences.
    """
    def _parse(self, s):
        raise AssertionError('Abstract method')
    def _word(self, s):
        raise AssertionError('Abstract method')
    def _tag(self, s, simplify_tags=False):
        raise AssertionError('Abstract method')
    def _read_block(self, stream):
        raise AssertionError('Abstract method')

    def raw(self, files=None):
        if files is None: files = self._files
        elif isinstance(files, basestring): files = [files]
        return concat([self.open(f).read() for f in files])

    def parsed_sents(self, files=None):
        reader = self._read_parsed_sent_block
        return concat([StreamBackedCorpusView(filename, reader, encoding=enc)
                       for filename, enc in self.abspaths(files, True)])

    def tagged_sents(self, files=None, simplify_tags=False):
        def reader(stream):
            return self._read_tagged_sent_block(stream, simplify_tags)
        return concat([StreamBackedCorpusView(filename, reader, encoding=enc)
                       for filename, enc in self.abspaths(files, True)])

    def sents(self, files=None):
        reader = self._read_sent_block
        return concat([StreamBackedCorpusView(filename, reader, encoding=enc)
                       for filename, enc in self.abspaths(files, True)])

    def tagged_words(self, files=None, simplify_tags=False):
        def reader(stream):
            return self._read_tagged_word_block(stream, simplify_tags)
        return concat([StreamBackedCorpusView(filename, reader, encoding=enc)
                       for filename, enc in self.abspaths(files, True)])

    def words(self, files=None):
        return concat([StreamBackedCorpusView(filename,
                                              self._read_word_block,
                                              encoding=enc)
                       for filename, enc in self.abspaths(files, True)])

    #------------------------------------------------------------
    #{ Block Readers

    def _read_word_block(self, stream):
        return sum(self._read_sent_block(stream), [])

    def _read_tagged_word_block(self, stream, simplify_tags=False):
        return sum(self._read_tagged_sent_block(stream, simplify_tags), [])

    def _read_sent_block(self, stream):
        return filter(None, [self._word(t) for t in self._read_block(stream)])

    def _read_tagged_sent_block(self, stream, simplify_tags=False):
        return filter(None, [self._tag(t, simplify_tags)
                             for t in self._read_block(stream)])

    def _read_parsed_sent_block(self, stream):
        return filter(None, [self._parse(t) for t in self._read_block(stream)])

    #} End of Block Readers
    #------------------------------------------------------------

    #{ Deprecated since 0.8
    @deprecated("Use .raw() or .sents() or .tagged_sents() or "
                ".parsed_sents() instead.")
    def read(self, items=None, format='parsed'):
        if format == 'parsed': return self.parsed_sents(items)
        if format == 'raw': return self.raw(items)
        if format == 'tokenized': return self.sents(items)
        if format == 'tagged': return self.tagged_sents(items)
        raise ValueError('bad format %r' % format)
    @deprecated("Use .parsed_sents() instead.")
    def parsed(self, items=None):
        return self.parsed_sents(items)
    @deprecated("Use .sents() instead.")
    def tokenized(self, items=None):
        return self.sents(items)
    @deprecated("Use .tagged_sents() instead.")
    def tagged(self, items=None):
        return self.tagged_sents(items)
    #}

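
# Illustrative sketch (hypothetical; not part of NLTK): a minimal
# SyntaxCorpusReader subclass for an assumed corpus format with one
# sentence per line and slash-separated word/TAG tokens (e.g.
# "The/DT cat/NN").  The class name and format are made up; a real
# subclass would also implement _parse() to return parse trees.
# Typical (hypothetical) use: _DemoSlashTagCorpusReader(root, files),
# then .words(), .sents(), or .tagged_sents().

class _DemoSlashTagCorpusReader(SyntaxCorpusReader):
    def _read_block(self, stream):
        # One block of up to 20 sentences, one sentence per line.
        return read_line_block(stream)
    def _word(self, s):
        # 'The/DT cat/NN' -> ['The', 'cat']
        return [w.rsplit('/', 1)[0] for w in s.split()]
    def _tag(self, s, simplify_tags=False):
        # 'The/DT cat/NN' -> [('The', 'DT'), ('cat', 'NN')]
        return [tuple(w.rsplit('/', 1)) for w in s.split()]
    def _parse(self, s):
        # No parse trees in this format; parsed_sents() would be empty.
        return None
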

######################################################################
#{ Finding Corpus Items
######################################################################

def find_corpus_files(root, regexp):
    if not isinstance(root, PathPointer):
        raise TypeError('find_corpus_files: expected a PathPointer')
    regexp += '$'

    # Find files in a zipfile: scan the zipfile's namelist.  Filter
    # out entries that end in '/' -- they're directories.
    if isinstance(root, ZipFilePathPointer):
        files = [name[len(root.entry):] for name in root.zipfile.namelist()
                 if not name.endswith('/')]
        items = [name for name in files if re.match(regexp, name)]
        return tuple(sorted(items))

    # Find files in a directory: use os.walk to search all
    # subdirectories, and match paths against the regexp.
    elif isinstance(root, FileSystemPathPointer):
        items = []
        for dirname, subdirs, filenames in os.walk(root.path):
            prefix = ''.join('%s/' % p for p in _path_from(root.path, dirname))
            items += [prefix+filename for filename in filenames
                      if re.match(regexp, prefix+filename)]
            # Don't visit svn directories:
            if '.svn' in subdirs: subdirs.remove('.svn')
        return tuple(sorted(items))

    else:
        raise AssertionError("Don't know how to handle %r" % root)

def _path_from(parent, child):
    if os.path.split(parent)[1] == '':
        parent = os.path.split(parent)[0]
    path = []
    while parent != child:
        child, dirname = os.path.split(child)
        path.insert(0, dirname)
        assert os.path.split(child)[0] != child
    return path

######################################################################
#{ Paragraph structure in Treebank files
######################################################################

def tagged_treebank_para_block_reader(stream):
    # Read the next paragraph.
    para = ''
    while True:
        line = stream.readline()
        # End of paragraph:
        if re.match('======+\s*$', line):
            if para.strip(): return [para]
        # End of file:
        elif line == '':
            if para.strip(): return [para]
            else: return []
        # Content line:
        else:
            para += line

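
# Illustrative sketch (hypothetical demo): paragraphs separated by
# '======'-style divider lines, as in the tagged Treebank files.  Each
# call returns the next paragraph (or [] at end of file).

def _demo_tagged_treebank_para_block_reader():
    from StringIO import StringIO
    stream = StringIO('======\n'
                      'The/DT cat/NN sat/VBD ./.\n'
                      '======\n'
                      'It/PRP purred/VBD ./.\n')
    assert tagged_treebank_para_block_reader(stream) == \
        ['The/DT cat/NN sat/VBD ./.\n']
    assert tagged_treebank_para_block_reader(stream) == \
        ['It/PRP purred/VBD ./.\n']
    assert tagged_treebank_para_block_reader(stream) == []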