import os, sys, bisect, re, tempfile
try: import cPickle as pickle
except ImportError: import pickle
from itertools import islice
from nltk.corpus.reader.api import CorpusReader
from nltk import tokenize
from nltk.tokenize import wordpunct_tokenize
from nltk.etree import ElementTree
from nltk.internals import deprecated, slice_bounds
from nltk.util import AbstractLazySequence, LazySubsequence, LazyConcatenation
from nltk.data import PathPointer, FileSystemPathPointer, ZipFilePathPointer
from nltk.data import SeekableUnicodeStreamReader


class StreamBackedCorpusView(AbstractLazySequence):
25 """
26 A 'view' of a corpus file, which acts like a sequence of tokens:
27 it can be accessed by index, iterated over, etc. However, the
28 tokens are only constructed as-needed -- the entire corpus is
29 never stored in memory at once.
30
31 The constructor to C{StreamBackedCorpusView} takes two arguments:
32 a corpus filename (specified as a string or as a L{PathPointer});
33 and a block reader. A X{block reader} is a function that reads
34 zero or more tokens from a stream, and returns them as a list. A
35 very simple example of a block reader is:
36
37 >>> def simple_block_reader(stream):
38 ... return stream.readline().split()
39
40 This simple block reader reads a single line at a time, and
41 returns a single token (consisting of a string) for each
42 whitespace-separated substring on the line.
43
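    For illustration, a view built from this block reader over a
    hypothetical plain-text file (the filename C{'corpus.txt'} is just
    a placeholder for this sketch) behaves like an ordinary sequence,
    while only ever reading one block at a time:

        >>> view = StreamBackedCorpusView('corpus.txt', simple_block_reader)
        >>> view[5]               # reads blocks until token 5 is reached
        >>> tokens = list(view)   # iterates lazily over the whole file
        >>> len(view)             # requires one full pass over the file
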
    When deciding how to define the block reader for a given
    corpus, careful consideration should be given to the size of
    blocks handled by the block reader.  Smaller block sizes will
    increase the memory requirements of the corpus view's internal
    data structures (by 2 integers per block).  On the other hand,
    larger block sizes may decrease performance for random access to
    the corpus.  (But note that larger block sizes will I{not}
    decrease performance for iteration.)

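    As a rough sketch of a coarser-grained reader (the choice of 20
    lines per block is arbitrary, not a requirement), several lines
    can be folded into a single block:

        >>> def twenty_line_block_reader(stream):
        ...     toks = []
        ...     for i in range(20):
        ...         toks.extend(stream.readline().split())
        ...     return toks
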
    Internally, C{CorpusView} maintains a partial mapping from token
    index to file position, with one entry per block.  When a token
    with a given index M{i} is requested, the C{CorpusView} constructs
    it as follows:

      1. First, it searches the toknum/filepos mapping for the token
         index closest to (but less than or equal to) M{i}.

      2. Then, starting at the file position corresponding to that
         index, it reads one block at a time using the block reader
         until it reaches the requested token.

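    Step 1 is a binary search over the (sorted) C{_toknum} list; a
    minimal sketch of that lookup, using made-up block boundaries, is:

        >>> import bisect
        >>> toknum = [0, 20, 40, 60]             # first token index of each block
        >>> bisect.bisect_right(toknum, 45) - 1  # block containing token 45
        2
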
    The toknum/filepos mapping is created lazily: it is initially
    empty, but every time a new block is read, the block's
    initial token is added to the mapping.  (Thus, the toknum/filepos
    map has one entry per block.)

    In order to increase efficiency for random access patterns that
    have high degrees of locality, the corpus view may cache one or
    more blocks.

    @note: Each C{CorpusView} object internally maintains an open file
        object for its underlying corpus file.  This file should be
        automatically closed when the C{CorpusView} is garbage collected,
        but if you wish to close it manually, use the L{close()}
        method.  If you access a C{CorpusView}'s items after it has been
        closed, the file object will be automatically re-opened.

    @warning: If the contents of the file are modified during the
        lifetime of the C{CorpusView}, then the C{CorpusView}'s behavior
        is undefined.

    @warning: If a unicode encoding is specified when constructing a
        C{CorpusView}, then the block reader may only call
        C{stream.seek()} with offsets that have been returned by
        C{stream.tell()}; in particular, calling C{stream.seek()} with
        relative offsets, or with offsets based on string lengths, may
        lead to incorrect behavior.

    @ivar _block_reader: The function used to read
        a single block from the underlying file stream.
    @ivar _toknum: A list containing the token index of each block
        that has been processed.  In particular, C{_toknum[i]} is the
        token index of the first token in block C{i}.  Together
        with L{_filepos}, this forms a partial mapping between token
        indices and file positions.
    @ivar _filepos: A list containing the file position of each block
        that has been processed.  In particular, C{_filepos[i]} is the
        file position of the first character in block C{i}.  Together
        with L{_toknum}, this forms a partial mapping between token
        indices and file positions.
    @ivar _stream: The stream used to access the underlying corpus file.
    @ivar _len: The total number of tokens in the corpus, if known;
        or C{None}, if the number of tokens is not yet known.
    @ivar _eofpos: The character position of the last character in the
        file.  This is calculated when the corpus view is initialized,
        and is used to decide when the end of file has been reached.
    @ivar _cache: A cache of the most recently read block.  It
        is encoded as a tuple (start_toknum, end_toknum, tokens), where
        start_toknum is the token index of the first token in the block;
        end_toknum is the token index of the first token not in the
        block; and tokens is a list of the tokens in the block.
    """
    def __init__(self, filename, block_reader=None, startpos=0,
                 encoding=None):
        """
        Create a new corpus view, based on the file C{filename}, and
        read with C{block_reader}.  See the class documentation
        for more information.

        @param filename: The path to the file that is read by this
            corpus view.  C{filename} can either be a string or a
            L{PathPointer}.

        @param startpos: The file position at which the view will
            start reading.  This can be used to skip over preface
            sections.

        @param encoding: The unicode encoding that should be used to
            read the file's contents.  If no encoding is specified,
            then the file's contents will be read as a non-unicode
            string (i.e., a C{str}).
        """
        if block_reader:
            self.read_block = block_reader

        # Initialize our toknum/filepos mapping.
        self._toknum = [0]
        self._filepos = [startpos]
        self._encoding = encoding
        # We don't know our length (number of tokens) yet.
        self._len = None

        self._filename = filename
        self._stream = None

        self._current_toknum = None
        """This variable is set to the index of the next token that
        will be read, immediately before L{self.read_block()} is
        called.  This is provided for the benefit of the block
        reader, which under rare circumstances may need to know
        the current token number."""

        self._current_blocknum = None
        """This variable is set to the index of the next block that
        will be read, immediately before L{self.read_block()} is
        called.  This is provided for the benefit of the block
        reader, which under rare circumstances may need to know
        the current block number."""

        # Find the size of the underlying file, so we can tell when
        # the end of file has been reached.
        try:
            if isinstance(self._filename, PathPointer):
                self._eofpos = self._filename.file_size()
            else:
                self._eofpos = os.stat(self._filename).st_size
        except Exception, exc:
            raise ValueError('Unable to open or access %r -- %s' %
                             (filename, exc))

        # Maintain a cache of the most recently read block, to
        # increase efficiency of random access.
        self._cache = (-1, -1, None)

    filename = property(lambda self: self._filename, doc="""
        The filename of the file that is accessed by this view.

        @type: C{str} or L{PathPointer}""")

    def read_block(self, stream):
        """
        Read a block from the input stream.

        @return: a block of tokens from the input stream
        @rtype: list of any
        @param stream: an input stream
        @type stream: stream
        """
        raise NotImplementedError('Abstract Method')

193 """
194 Open the file stream associated with this corpus view. This
195 will be called performed if any value is read from the view
196 while its file stream is closed.
197 """
198 if isinstance(self._filename, PathPointer):
199 self._stream = self._filename.open(self._encoding)
200 elif self._encoding:
201 self._stream = SeekableUnicodeStreamReader(
202 open(self._filename, 'rb'), self._encoding)
203 else:
204 self._stream = open(self._filename, 'rb')
205
207 """
208 Close the file stream associated with this corpus view. This
209 can be useful if you are worried about running out of file
210 handles (although the stream should automatically be closed
211 upon garbage collection of the corpus view). If the corpus
212 view is accessed after it is closed, it will be automatically
213 re-opened.
214 """
215 if self._stream is not None:
216 self._stream.close()
217 self._stream = None
218
    def __len__(self):
        if self._len is None:
            # iterate_from() sets self._len when it reaches the end
            # of the file:
            for tok in self.iterate_from(self._toknum[-1]): pass
        return self._len

    # If we wanted to be thread-safe, then this method would need to
    # do some locking.
    def iterate_from(self, start_tok):
        # Decide where in the file we should start.  If start_tok falls
        # within the range covered by the toknum/filepos mapping, use
        # bisect to find the closest mapped block; otherwise, start at
        # the last block that has been read so far.
        if start_tok < self._toknum[-1]:
            block_index = bisect.bisect_right(self._toknum, start_tok)-1
            toknum = self._toknum[block_index]
            filepos = self._filepos[block_index]
        else:
            block_index = len(self._toknum)-1
            toknum = self._toknum[-1]
            filepos = self._filepos[-1]

        # Open the stream, if it's not open already.
        if self._stream is None:
            self._open()

        # Each iteration of this loop reads a single block from the
        # stream, using the block reader.
        while filepos < self._eofpos:
            # Read the next block.
            self._stream.seek(filepos)
            self._current_toknum = toknum
            self._current_blocknum = block_index
            tokens = self.read_block(self._stream)
            assert isinstance(tokens, (tuple, list)), (
                'block reader %s() should return list or tuple.' %
                self.read_block.__name__)
            num_toks = len(tokens)
            new_filepos = self._stream.tell()
            assert new_filepos > filepos, (
                'block reader %s() should consume at least 1 byte '
                '(filepos=%d)' %
                (self.read_block.__name__, filepos))

            # Update our cache.
            self._cache = (toknum, toknum+num_toks, list(tokens))

            # Update our toknum/filepos mapping.
            assert toknum <= self._toknum[-1]
            if num_toks > 0:
                block_index += 1
                if toknum == self._toknum[-1]:
                    assert new_filepos > self._filepos[-1]
                    self._filepos.append(new_filepos)
                    self._toknum.append(toknum+num_toks)
                else:
                    # Check for consistency with any previous read of
                    # the same block:
                    assert new_filepos == self._filepos[block_index], (
                        'inconsistent block reader (num chars read)')
                    assert toknum+num_toks == self._toknum[block_index], (
                        'inconsistent block reader (num tokens returned)')

            # Generate the tokens in this block, skipping any tokens
            # that come before start_tok.
            for tok in tokens[max(0, start_tok-toknum):]:
                yield tok

            # If we're at the end of the file, then record our length
            # and stop.
            assert new_filepos <= self._eofpos
            if new_filepos == self._eofpos:
                self._len = toknum + num_toks
                break

            # Update our indices for the next block.
            toknum += num_toks
            filepos = new_filepos

        # If we reach this point, then we should know our length.
        assert self._len is not None

    def __add__(self, other):
        return concat([self, other])
    def __radd__(self, other):
        return concat([other, self])


class ConcatenatedCorpusView(AbstractLazySequence):
    """
    A 'view' of a corpus file that joins together one or more
    L{StreamBackedCorpusViews<StreamBackedCorpusView>}.  At most
    one file handle is left open at any time.
    """
    def __init__(self, corpus_views):
        self._pieces = corpus_views
        """A list of the corpus subviews that make up this
        concatenation."""

        self._offsets = [0]
        """A list of offsets, indicating the index at which each
        subview begins.  In particular::
            offsets[i] = sum([len(p) for p in pieces[:i]])"""

        self._open_piece = None
        """The most recently accessed corpus subview (or C{None}).
        Before a new subview is accessed, this subview will be closed."""

    def __len__(self):
        if len(self._offsets) <= len(self._pieces):
            # Iterate to the end of the corpus; this fills in the
            # offset table for any pieces we haven't visited yet.
            for tok in self.iterate_from(self._offsets[-1]): pass

        return self._offsets[-1]

    def close(self):
        for piece in self._pieces:
            piece.close()

    def iterate_from(self, start_tok):
        piecenum = bisect.bisect_right(self._offsets, start_tok)-1

        while piecenum < len(self._pieces):
            offset = self._offsets[piecenum]
            piece = self._pieces[piecenum]

            # If we've got another piece open, close it first.
            if self._open_piece is not piece:
                if self._open_piece is not None:
                    self._open_piece.close()
                self._open_piece = piece

            # Get everything we can from this piece.
            for tok in piece.iterate_from(max(0, start_tok-offset)):
                yield tok

            # Update the offset table.
            if piecenum+1 == len(self._offsets):
                self._offsets.append(self._offsets[-1] + len(piece))

            # Move on to the next piece.
            piecenum += 1


def concat(docs):
    """
    Concatenate together the contents of multiple documents from a
    single corpus, using an appropriate concatenation function.  This
    utility function is used by corpus readers when the user requests
    more than one document at a time.
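
    For instance, a rough sketch of its behavior on plain Python
    values (strings and lists concatenate; a single document is
    returned unchanged):

        >>> concat(['a b', ' c'])
        'a b c'
        >>> concat([[1, 2], [3]])
        [1, 2, 3]
        >>> concat([[1, 2]])
        [1, 2]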
394 """
    if len(docs) == 1:
        return docs[0]
    if len(docs) == 0:
        raise ValueError('concat() expects at least one object!')

    types = set([d.__class__ for d in docs])

    # If the documents are all strings, concatenate with ''.
    if types.issubset([str, unicode, basestring]):
        return reduce((lambda a,b:a+b), docs, '')

    # Are the documents all corpus views?  If so, concatenate them
    # lazily with a ConcatenatedCorpusView.
    for typ in types:
        if not issubclass(typ, (StreamBackedCorpusView,
                                ConcatenatedCorpusView)):
            break
    else:
        return ConcatenatedCorpusView(docs)

    # Are the documents all lazy sequences?  If so, concatenate them
    # lazily with a LazyConcatenation.
    for typ in types:
        if not issubclass(typ, AbstractLazySequence):
            break
    else:
        return LazyConcatenation(docs)

    # Otherwise, see what we can do with a single concrete type.
    if len(types) == 1:
        typ = list(types)[0]

        if issubclass(typ, list):
            return reduce((lambda a,b:a+b), docs, [])

        if issubclass(typ, tuple):
            return reduce((lambda a,b:a+b), docs, ())

        # ElementTree elements: wrap them all in a single parent
        # element.  (iselement() must be checked on an instance,
        # not on the type object.)
        if ElementTree.iselement(docs[0]):
            xmltree = ElementTree.Element('documents')
            for doc in docs: xmltree.append(doc)
            return xmltree

    # No concatenation method found:
    raise ValueError("Don't know how to concatenate types: %r" % types)


class PickleCorpusView(StreamBackedCorpusView):
    """
    A stream backed corpus view for corpus files that consist of
    sequences of serialized Python objects (serialized using
    C{pickle.dump}).  One use case for this class is to store the
    result of running feature detection on a corpus to disk.  This can
    be useful when performing feature detection is expensive (so we
    don't want to repeat it); but the corpus is too large to store in
    memory.  The following example illustrates this technique:

        >>> feature_corpus = LazyMap(detect_features, corpus)
        >>> PickleCorpusView.write(feature_corpus, some_filename)
        >>> pcv = PickleCorpusView(some_filename)
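
    As a more self-contained sketch (the temporary filename below is
    created only for illustration), any picklable sequence can be
    written out and then viewed lazily:

        >>> import tempfile
        >>> fd, fname = tempfile.mkstemp('.pcv')
        >>> PickleCorpusView.write(['tok1', 'tok2', 'tok3'], fname)
        >>> list(PickleCorpusView(fname))
        ['tok1', 'tok2', 'tok3']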
456 """
    BLOCK_SIZE = 100
    PROTOCOL = -1

    def __init__(self, filename, delete_on_gc=False):
        """
        Create a new corpus view that reads the pickle corpus
        C{filename}.

        @param delete_on_gc: If true, then C{filename} will be deleted
            whenever this object gets garbage-collected.
        """
        self._delete_on_gc = delete_on_gc
        StreamBackedCorpusView.__init__(self, filename)

    def read_block(self, stream):
        result = []
        for i in range(self.BLOCK_SIZE):
            try: result.append(pickle.load(stream))
            except EOFError: break
        return result

    def __del__(self):
        """
        If C{delete_on_gc} was set to true when this
        C{PickleCorpusView} was created, then delete the corpus view's
        filename.  (This method is called whenever a
        C{PickleCorpusView} is garbage-collected.)
        """
        if getattr(self, '_delete_on_gc', False):
            if os.path.exists(self._filename):
                try: os.remove(self._filename)
                except (OSError, IOError): pass
        self.__dict__.clear() # make the garbage collector's job easier

    @classmethod
    def write(cls, sequence, output_file):
        if isinstance(output_file, basestring):
            output_file = open(output_file, 'wb')
        for item in sequence:
            pickle.dump(item, output_file, cls.PROTOCOL)

    @classmethod
    def cache_to_tempfile(cls, sequence, delete_on_gc=True):
        """
        Write the given sequence to a temporary file as a pickle
        corpus; and then return a C{PickleCorpusView} view for that
        temporary corpus file.

        @param delete_on_gc: If true, then the temporary file will be
            deleted whenever this object gets garbage-collected.
        """
        try:
            fd, output_file_name = tempfile.mkstemp('.pcv', 'nltk-')
            output_file = os.fdopen(fd, 'wb')
            cls.write(sequence, output_file)
            output_file.close()
            return PickleCorpusView(output_file_name, delete_on_gc)
        except (OSError, IOError), e:
            raise ValueError('Error while creating temp file: %s' % e)


def read_whitespace_block(stream):
    toks = []
    for i in range(20): # Read 20 lines at a time.
        toks.extend(stream.readline().split())
    return toks

def read_wordpunct_block(stream):
    toks = []
    for i in range(20): # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks

def read_blankline_block(stream):
    s = ''
    while True:
        line = stream.readline()
        # End of file:
        if not line:
            if s: return [s]
            else: return []
        # Blank line:
        elif line and not line.strip():
            if s: return [s]
        # Other line:
        else:
            s += line

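# A minimal, illustrative sketch (not used by the library itself): write a
# small file with blank-line-separated paragraphs and view it with
# read_blankline_block as the block reader.  The helper name and the sample
# text are assumptions made purely for demonstration.
def _demo_read_blankline_block():
    fd, path = tempfile.mkstemp('.txt')
    os.write(fd, 'first paragraph\nstill first\n\nsecond paragraph\n')
    os.close(fd)
    view = StreamBackedCorpusView(path, read_blankline_block)
    # Each token yielded by the view is one paragraph string.
    paragraphs = list(view)
    assert len(paragraphs) == 2
    view.close()
    os.remove(path)
    return paragraphs
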
559 """
560 Read a sequence of tokens from a stream, where tokens begin with
561 lines that match C{start_re}. If C{end_re} is specified, then
562 tokens end with lines that match C{end_re}; otherwise, tokens end
563 whenever the next line matching C{start_re} or EOF is found.
564 """
565
566 while True:
567 line = stream.readline()
568 if not line: return []
569 if re.match(start_re, line): break
570
571
572 lines = [line]
573 while True:
574 oldpos = stream.tell()
575 line = stream.readline()
576
577 if not line:
578 return [''.join(lines)]
579
580 if end_re is not None and re.match(end_re, line):
581 return [''.join(lines)]
582
583
584 if end_re is None and re.match(start_re, line):
585 stream.seek(oldpos)
586 return [''.join(lines)]
587
588 lines.append(line)
589
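# An illustrative sketch only: records in a hypothetical file start with an
# "ID" line, so read_regexp_block can carve the file into one token per
# record.  The regexp and sample data are assumptions made for the demo.
def _demo_read_regexp_block():
    from StringIO import StringIO
    stream = StringIO("ID 1\nsome text\nID 2\nmore text\n")
    records = []
    while True:
        block = read_regexp_block(stream, start_re=r'ID \d')
        if not block: break
        records.extend(block)
    assert records == ['ID 1\nsome text\n', 'ID 2\nmore text\n']
    return records
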
591 """
592 Read a sequence of s-expressions from the stream, and leave the
593 stream's file position at the end the last complete s-expression
594 read. This function will always return at least one s-expression,
595 unless there are no more s-expressions in the file.
596
597 If the file ends in in the middle of an s-expression, then that
598 incomplete s-expression is returned when the end of the file is
599 reached.
600
601 @param block_size: The default block size for reading. If an
602 s-expression is longer than one block, then more than one
603 block will be read.
604 @param comment_char: A character that marks comments. Any lines
605 that begin with this character will be stripped out.
606 (If spaces or tabs preceed the comment character, then the
607 line will not be stripped.)
608 """
609 start = stream.tell()
610 block = stream.read(block_size)
611 encoding = getattr(stream, 'encoding', None)
612 assert encoding is not None or isinstance(block, str)
613 if encoding not in (None, 'utf-8'):
614 import warnings
615 warnings.warn('Parsing may fail, depending on the properties '
616 'of the %s encoding!' % encoding)
617
618
619
620 if comment_char:
621 COMMENT = re.compile('(?m)^%s.*$' % re.escape(comment_char))
622 while True:
623 try:
624
625
626
627
628 if comment_char:
629 block += stream.readline()
630 block = re.sub(COMMENT, _sub_space, block)
631
632 tokens, offset = _parse_sexpr_block(block)
633
634 offset = re.compile(r'\s*').search(block, offset).end()
635
636
637 if encoding is None:
638 stream.seek(start+offset)
639 else:
640 stream.seek(start+len(block[:offset].encode(encoding)))
641
642
643 return tokens
644 except ValueError, e:
645 if e.args[0] == 'Block too small':
646 next_block = stream.read(block_size)
647 if next_block:
648 block += next_block
649 continue
650 else:
651
652 return [block.strip()]
653 else: raise
654
656 """Helper function: given a regexp match, return a string of
657 spaces that's the same length as the matched string."""
658 return ' '*(m.end()-m.start())
659
def _parse_sexpr_block(block):
    tokens = []
    start = end = 0

    while end < len(block):
        # Scan for the start of the next token.
        m = re.compile(r'\S').search(block, end)
        if not m:
            return tokens, end

        start = m.start()

        # Case 1: the token is not parenthesized.
        if m.group() != '(':
            m2 = re.compile(r'[\s(]').search(block, start)
            if m2:
                end = m2.start()
            else:
                if tokens: return tokens, end
                raise ValueError('Block too small')

        # Case 2: the token is a parenthesized s-expression.
        else:
            nesting = 0
            for m in re.compile(r'[()]').finditer(block, start):
                if m.group()=='(': nesting += 1
                else: nesting -= 1
                if nesting == 0:
                    end = m.end()
                    break
            else:
                if tokens: return tokens, end
                raise ValueError('Block too small')

        tokens.append(block[start:end])

    return tokens, end
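
# A small, illustrative sketch (function name and sample data are
# assumptions): read_sexpr_block pulls complete s-expressions off a seekable
# stream, leaving the file position just after the last complete one.
def _demo_read_sexpr_block():
    from StringIO import StringIO
    stream = StringIO('(a (b c)) (d e) atom (f')
    # The tiny block size forces the reader to keep growing the block
    # until at least one complete s-expression fits.
    first = read_sexpr_block(stream, block_size=5)
    assert first == ['(a (b c))']
    rest = read_sexpr_block(stream)
    return first, rest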


class SyntaxCorpusReader(CorpusReader):
    """
    An abstract base class for reading corpora consisting of
    syntactically parsed text.  Subclasses should define:

      - L{__init__}, which specifies the location of the corpus
        and a method for detecting the sentence blocks in corpus files.
      - L{_read_block}, which reads a block from the input stream.
      - L{_word}, which takes a block and returns a list of list of words.
      - L{_tag}, which takes a block and returns a list of list of tagged
        words.
      - L{_parse}, which takes a block and returns a list of parsed
        sentences.
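
    As a rough sketch only (the reader name, block format, and trivial
    method bodies below are assumptions, not part of this module), a
    minimal subclass might look like::

        class MyParaCorpusReader(SyntaxCorpusReader):
            def _read_block(self, stream):
                return read_blankline_block(stream)
            def _word(self, block):
                return block.split()
            def _tag(self, block, simplify_tags=False):
                return [(tok, None) for tok in block.split()]
            def _parse(self, block):
                return None   # parsing omitted in this sketch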
715 """
    def _read_block(self, stream):
        raise AssertionError('Abstract method')
    def _word(self, s):
        raise AssertionError('Abstract method')
    def _tag(self, s):
        raise AssertionError('Abstract method')
    def _parse(self, s):
        raise AssertionError('Abstract method')

    def raw(self, files=None):
        return concat([open(filename).read()
                       for filename in self.abspaths(files)])

    def parsed_sents(self, files=None):
        reader = self._read_parsed_sent_block
        return concat([StreamBackedCorpusView(filename, reader, encoding=enc)
                       for filename, enc in self.abspaths(files, True)])

    def tagged_sents(self, files=None, simplify_tags=False):
        def reader(stream):
            return self._read_tagged_sent_block(stream, simplify_tags)
        return concat([StreamBackedCorpusView(filename, reader, encoding=enc)
                       for filename, enc in self.abspaths(files, True)])

    def sents(self, files=None):
        reader = self._read_sent_block
        return concat([StreamBackedCorpusView(filename, reader, encoding=enc)
                       for filename, enc in self.abspaths(files, True)])

    def words(self, files=None):
        reader = self._read_word_block
        return concat([StreamBackedCorpusView(filename, reader, encoding=enc)
                       for filename, enc in self.abspaths(files, True)])

    # Block readers used by the corpus views above.

    def _read_word_block(self, stream):
        return sum(self._read_sent_block(stream), [])

    def _read_sent_block(self, stream):
        return filter(None, [self._word(t)
                             for t in self._read_block(stream)])

    def _read_tagged_sent_block(self, stream, simplify_tags=False):
        return filter(None, [self._tag(t, simplify_tags)
                             for t in self._read_block(stream)])

    def _read_parsed_sent_block(self, stream):
        return filter(None, [self._parse(t)
                             for t in self._read_block(stream)])

    @deprecated("Use .raw() or .sents() or .tagged_sents() or "
                ".parsed_sents() instead.")
    def read(self, items=None, format='parsed'):
        if format == 'parsed': return self.parsed_sents(items)
        if format == 'raw': return self.raw(items)
        if format == 'tokenized': return self.sents(items)
        if format == 'tagged': return self.tagged_sents(items)
        raise ValueError('bad format %r' % format)
    @deprecated("Use .parsed_sents() instead.")
    def parsed(self, items=None):
        return self.parsed_sents(items)
    @deprecated("Use .sents() instead.")
    def tokenized(self, items=None):
        return self.sents(items)
    @deprecated("Use .tagged_sents() instead.")
    def tagged(self, items=None):
        return self.tagged_sents(items)


def tagged_treebank_para_block_reader(stream):
    # Read the next paragraph.
    para = ''
    while True:
        line = stream.readline()
        # End of paragraph:
        if re.match('======+\s*$', line):
            if para.strip(): return [para]
        # End of file:
        elif line == '':
            if para.strip(): return [para]
            else: return []
        # Content line:
        else:
            para += line