Package nltk :: Module util
[hide private]
[frames] | no frames]

Source Code for Module nltk.util

  1  # Natural Language Toolkit: Utility functions 
  2  # 
  3  # Copyright (C) 2001-2008 NLTK Project 
  4  # Author: Steven Bird <[email protected]> 
  5  # URL: <http://nltk.org> 
  6  # For license information, see LICENSE.TXT 
  7   
  8  import locale 
  9  import re 
 10  import types 
 11  import textwrap 
 12  import pydoc 
 13  import bisect 
 14   
 15  from itertools import islice 
 16  from pprint import pprint 
 17  from collections import defaultdict 
 18  from UserDict import UserDict 
 19   
 20  from nltk.internals import Deprecated, slice_bounds 
 21   
 22  ###################################################################### 
 23  # Short usage message 
 24  ###################################################################### 
 25   
26 -def usage(obj, selfname='self'):
27 import inspect 28 str(obj) # In case it's lazy, this will load it. 29 30 if not isinstance(obj, (types.TypeType, types.ClassType)): 31 obj = obj.__class__ 32 33 print '%s supports the following operations:' % obj.__name__ 34 for (name, method) in sorted(pydoc.allmethods(obj).items()): 35 if name.startswith('_'): continue 36 if getattr(method, '__deprecated__', False): continue 37 38 args, varargs, varkw, defaults = inspect.getargspec(method) 39 if (args and args[0]=='self' and 40 (defaults is None or len(args)>len(defaults))): 41 args = args[1:] 42 name = '%s.%s' % (selfname, name) 43 argspec = inspect.formatargspec( 44 args, varargs, varkw, defaults) 45 print textwrap.fill('%s%s' % (name, argspec), 46 initial_indent=' - ', 47 subsequent_indent=' '*(len(name)+5))
48 49 ########################################################################## 50 # PRETTY PRINTING 51 ########################################################################## 52
53 -def pr(data, start=0, end=None):
54 """ 55 Pretty print a sequence of data items 56 57 @param data: the data stream to print 58 @type data: C{sequence} or C{iterator} 59 @param start: the start position 60 @type start: C{int} 61 @param end: the end position 62 @type end: C{int} 63 """ 64 pprint(list(islice(data, start, end)))
65 84 85 ########################################################################## 86 # MINIMAL SETS 87 ########################################################################## 88
89 -class MinimalSet(object):
90 """ 91 Find contexts where more than one possible target value can 92 appear. E.g. if targets are word-initial letters, and contexts 93 are the remainders of words, then we would like to find cases like 94 "fat" vs "cat", and "training" vs "draining". If targets are 95 parts-of-speech and contexts are words, then we would like to find 96 cases like wind (noun) 'air in rapid motion', vs wind (verb) 97 'coil, wrap'. 98 """
99 - def __init__(self, parameters=None):
100 """ 101 Create a new minimal set. 102 103 @param parameters: The (context, target, display) tuples for the item 104 @type parameters: C{list} of C{tuple} of C{string} 105 """ 106 self._targets = set() # the contrastive information 107 self._contexts = set() # what we are controlling for 108 self._seen = {} # to record what we have seen 109 self._displays = {} # what we will display 110 111 if parameters: 112 for context, target, display in parameters: 113 self.add(context, target, display)
114
115 - def add(self, context, target, display):
116 """ 117 Add a new item to the minimal set, having the specified 118 context, target, and display form. 119 120 @param context: The context in which the item of interest appears 121 @type context: C{string} 122 @param target: The item of interest 123 @type target: C{string} 124 @param display: The information to be reported for each item 125 @type display: C{string} 126 """ 127 # Store the set of targets that occurred in this context 128 if context not in self._seen: 129 self._seen[context] = set() 130 self._seen[context].add(target) 131 132 # Keep track of which contexts and targets we have seen 133 self._contexts.add(context) 134 self._targets.add(target) 135 136 # For a given context and target, store the display form 137 self._displays[(context, target)] = display
138
139 - def contexts(self, minimum=2):
140 """ 141 Determine which contexts occurred with enough distinct targets. 142 143 @param minimum: the minimum number of distinct target forms 144 @type minimum: C{int} 145 @rtype C{list} 146 """ 147 return [c for c in self._contexts if len(self._seen[c]) >= minimum]
148
149 - def display(self, context, target, default=""):
150 if (context, target) not in self._displays: 151 return self._displays[(context, target)] 152 else: 153 return default
154
155 - def display_all(self, context):
156 result = [] 157 for target in self._targets: 158 x = self.display(context, target) 159 if x: result.append(x) 160 return result
161
162 - def targets(self):
163 return self._targets
164 165 166 ###################################################################### 167 ## Regexp display (thanks to David Mertz) 168 ###################################################################### 169
170 -def re_show(regexp, string, left="{", right="}"):
171 """ 172 Search C{string} for substrings matching C{regexp} and wrap 173 the matches with braces. This is convenient for learning about 174 regular expressions. 175 176 @param regexp: The regular expression. 177 @type regexp: C{string} 178 @param string: The string being matched. 179 @type string: C{string} 180 @param left: The left delimiter (printed before the matched substring) 181 @type left: C{string} 182 @param right: The right delimiter (printed after the matched substring) 183 @type right: C{string} 184 @rtype: C{string} 185 @return: A string with markers surrounding the matched substrings. 186 """ 187 print re.compile(regexp, re.M).sub(left + r"\g<0>" + right, string.rstrip())
188 189 190 ########################################################################## 191 # READ FROM FILE OR STRING 192 ########################################################################## 193 194 # recipe from David Mertz
195 -def filestring(f):
196 if hasattr(f, 'read'): 197 return f.read() 198 elif isinstance(f, basestring): 199 return open(f).read() 200 else: 201 raise ValueError, "Must be called with a filename or file-like object"
202 203 ########################################################################## 204 # Breadth-First Search 205 ########################################################################## 206
207 -def breadth_first(tree, children=iter, depth=-1, queue=None):
208 """Traverse the nodes of a tree in breadth-first order. 209 (No need to check for cycles.) 210 The first argument should be the tree root; 211 children should be a function taking as argument a tree node 212 and returning an iterator of the node's children. 213 """ 214 if queue == None: 215 queue = [] 216 queue.append(tree) 217 218 while queue: 219 node = queue.pop(0) 220 yield node 221 if depth != 0: 222 try: 223 queue += children(node) 224 depth -= 1 225 except: 226 pass
227 228 ########################################################################## 229 # Guess Character Encoding 230 ########################################################################## 231 232 # adapted from io.py in the docutils extension module (http://docutils.sourceforge.net) 233 # http://www.pyzine.com/Issue008/Section_Articles/article_Encodings.html 234
235 -def guess_encoding(data):
236 """ 237 Given a byte string, attempt to decode it. 238 Tries the standard 'UTF8' and 'latin-1' encodings, 239 Plus several gathered from locale information. 240 241 The calling program *must* first call:: 242 243 locale.setlocale(locale.LC_ALL, '') 244 245 If successful it returns C{(decoded_unicode, successful_encoding)}. 246 If unsuccessful it raises a C{UnicodeError}. 247 """ 248 successful_encoding = None 249 # we make 'utf-8' the first encoding 250 encodings = ['utf-8'] 251 # 252 # next we add anything we can learn from the locale 253 try: 254 encodings.append(locale.nl_langinfo(locale.CODESET)) 255 except AttributeError: 256 pass 257 try: 258 encodings.append(locale.getlocale()[1]) 259 except (AttributeError, IndexError): 260 pass 261 try: 262 encodings.append(locale.getdefaultlocale()[1]) 263 except (AttributeError, IndexError): 264 pass 265 # 266 # we try 'latin-1' last 267 encodings.append('latin-1') 268 for enc in encodings: 269 # some of the locale calls 270 # may have returned None 271 if not enc: 272 continue 273 try: 274 decoded = unicode(data, enc) 275 successful_encoding = enc 276 277 except (UnicodeError, LookupError): 278 pass 279 else: 280 break 281 if not successful_encoding: 282 raise UnicodeError( 283 'Unable to decode input data. Tried the following encodings: %s.' 284 % ', '.join([repr(enc) for enc in encodings if enc])) 285 else: 286 return (decoded, successful_encoding)
287 288 289 ########################################################################## 290 # Invert a dictionary 291 ########################################################################## 292
293 -def invert_dict(d):
294 from nltk import defaultdict 295 inverted_dict = defaultdict(list) 296 for key in d: 297 for term in d[key]: 298 inverted_dict[term].append(key) 299 return inverted_dict
300 301 302 ########################################################################## 303 # HTML Cleaning 304 ########################################################################## 305 306 from HTMLParser import HTMLParser 307 skip = ['script', 'style'] # non-nesting tags to skip 308
309 -class HTMLCleaner(HTMLParser):
310 - def __init__(self):
311 self.reset() 312 self.fed = [] 313 self._flag = True
314 - def handle_data(self, d):
315 if self._flag: 316 self.fed.append(d)
317 - def handle_starttag(self, tag, attrs):
318 if tag in skip: 319 self._flag = False
320 - def handle_endtag(self, tag):
321 if tag in skip: 322 self._flag = True
323 - def clean_text(self):
324 return ''.join(self.fed)
325
326 -def clean_html(html):
327 """ 328 Remove HTML markup from the given string. 329 330 @param html: the HTML string to be cleaned 331 @type html: C{string} 332 @rtype: C{string} 333 """ 334 335 cleaner = HTMLCleaner() 336 cleaner.feed(html) 337 return cleaner.clean_text()
338
339 -def clean_url(url):
340 from urllib import urlopen 341 html = urlopen(url).read() 342 return clean_html(html)
343 344 ########################################################################## 345 # Ngram iteration 346 ########################################################################## 347
348 -def ngram(sequence, n):
349 """ 350 A utility that produces a sequence of ngrams from a sequence of items. 351 For example: 352 353 >>> ngram([1,2,3,4,5], 3) 354 [(1, 2, 3), (2, 3, 4), (3, 4, 5)] 355 356 Use ingram for an iterator version of this function. 357 358 @param sequence: the source data to be converted into ngrams 359 @type sequence: C{sequence} or C{iterator} 360 @param n: the degree of the ngram 361 @type n: C{int} 362 @return: The ngrams 363 @rtype: C{list} of C{tuple}s 364 """ 365 366 count = max(0, len(list(sequence)) - n + 1) 367 return [tuple(sequence[i:i+n]) for i in range(count)]
368
369 -def ingram(sequence, n):
370 """ 371 A utility that produces an iterator over ngrams generated from a sequence of items. 372 373 For example: 374 375 >>> list(ingram([1,2,3,4,5], 3)) 376 [(1, 2, 3), (2, 3, 4), (3, 4, 5)] 377 378 Use ngram for a list version of this function. 379 380 @param sequence: the source data to be converted into ngrams 381 @type sequence: C{sequence} or C{iterator} 382 @param n: the degree of the ngram 383 @type n: C{int} 384 @return: The ngrams 385 @rtype: C{iterator} of C{tuple}s 386 """ 387 388 sequence = iter(sequence) 389 history = [] 390 while n > 1: 391 history.append(sequence.next()) 392 n -= 1 393 for item in sequence: 394 history.append(item) 395 yield tuple(history) 396 del history[0]
397 398 ########################################################################## 399 # Ordered Dictionary 400 ########################################################################## 401
402 -class OrderedDict(UserDict):
403 - def __init__(self, data=None, **kwargs):
404 self._keys = self.keys(data, kwargs.get('keys')) 405 self._default_factory = kwargs.get('default_factory') 406 UserDict.__init__(self, data)
407
408 - def __delitem__(self, key):
409 UserDict.__delitem__(self, key) 410 self._keys.remove(key)
411
412 - def __getitem__(self, key):
413 try: 414 return UserDict.__getitem__(self, key) 415 except KeyError: 416 return self.__missing__(key)
417
418 - def __iter__(self):
419 return (key for key in self.keys())
420
421 - def __missing__(self, key):
422 if not self._default_factory and key not in self._keys: 423 raise KeyError() 424 else: 425 return self._default_factory()
426
427 - def __setitem__(self, key, item):
428 UserDict.__setitem__(self, key, item) 429 if key not in self._keys: 430 self._keys.append(key)
431
432 - def clear(self):
433 UserDict.clear(self) 434 self._keys.clear()
435
436 - def copy(self):
437 dict = UserDict.copy(self) 438 dict._keys = self._keys 439 return dict
440
441 - def items(self):
442 return zip(self.keys(), self.values())
443
444 - def keys(self, data=None, keys=None):
445 if data: 446 if keys: 447 assert isinstance(keys, list) 448 assert len(data) == len(keys) 449 return keys 450 else: 451 assert isinstance(data, dict) or \ 452 isinstance(data, OrderedDict) or \ 453 isinstance(data, list) 454 if isinstance(data, dict) or isinstance(data, OrderedDict): 455 return data.keys() 456 elif isinstance(data, list): 457 return [key for (key, value) in data] 458 elif '_keys' in self.__dict__: 459 return self._keys 460 else: 461 return []
462
463 - def popitem(self):
464 if self._keys: 465 key = self._keys.pop() 466 value = self[key] 467 del self[key] 468 return (key, value) 469 else: 470 raise KeyError()
471
472 - def setdefault(self, key, failobj=None):
473 UserDict.setdefault(self, key, failobj) 474 if key not in self._keys: 475 self._keys.append(key)
476
477 - def update(self, data):
478 UserDict.update(self, data) 479 for key in self.keys(data): 480 if key not in self._keys: 481 self._keys.append(key)
482
483 - def values(self):
484 return map(self.get, self._keys)
485 486 ###################################################################### 487 # Lazy Sequences 488 ###################################################################### 489
490 -class AbstractLazySequence(object):
491 """ 492 An abstract base class for read-only sequences whose values are 493 computed as needed. Lazy sequences act like tuples -- they can be 494 indexed, sliced, and iterated over; but they may not be modified. 495 496 The most common application of lazy sequences in NLTK is for 497 I{corpus view} objects, which provide access to the contents of a 498 corpus without loading the entire corpus into memory, by loading 499 pieces of the corpus from disk as needed. 500 501 The result of modifying a mutable element of a lazy sequence is 502 undefined. In particular, the modifications made to the element 503 may or may not persist, depending on whether and when the lazy 504 sequence caches that element's value or reconstructs it from 505 scratch. 506 507 Subclasses are required to define two methods: 508 509 - L{__len__()} 510 - L{iterate_from()}. 511 """
512 - def __len__(self):
513 """ 514 Return the number of tokens in the corpus file underlying this 515 corpus view. 516 """ 517 raise NotImplementedError('should be implemented by subclass')
518
519 - def iterate_from(self, start):
520 """ 521 Return an iterator that generates the tokens in the corpus 522 file underlying this corpus view, starting at the token number 523 C{start}. If C{start>=len(self)}, then this iterator will 524 generate no tokens. 525 """ 526 raise NotImplementedError('should be implemented by subclass')
527
528 - def __getitem__(self, i):
529 """ 530 Return the C{i}th token in the corpus file underlying this 531 corpus view. Negative indices and spans are both supported. 532 """ 533 if isinstance(i, slice): 534 start, stop = slice_bounds(self, i) 535 return LazySubsequence(self, start, stop) 536 else: 537 # Handle negative indices 538 if i < 0: i += len(self) 539 if i < 0: raise IndexError('index out of range') 540 # Use iterate_from to extract it. 541 try: 542 return self.iterate_from(i).next() 543 except StopIteration: 544 raise IndexError('index out of range')
545
546 - def __iter__(self):
547 """Return an iterator that generates the tokens in the corpus 548 file underlying this corpus view.""" 549 return self.iterate_from(0)
550
551 - def count(self, value):
552 """Return the number of times this list contains C{value}.""" 553 return sum(1 for elt in self if elt==value)
554
555 - def index(self, value, start=None, stop=None):
556 """Return the index of the first occurance of C{value} in this 557 list that is greater than or equal to C{start} and less than 558 C{stop}. Negative start & stop values are treated like negative 559 slice bounds -- i.e., they count from the end of the list.""" 560 start, stop = slice_bounds(self, slice(start, stop)) 561 for i, elt in enumerate(islice(self, start, stop)): 562 if elt == value: return i+start 563 raise ValueError('index(x): x not in list')
564
565 - def __contains__(self, value):
566 """Return true if this list contains C{value}.""" 567 return bool(self.count(value))
568
569 - def __add__(self, other):
570 """Return a list concatenating self with other.""" 571 return LazyConcatenation([self, other])
572
573 - def __radd__(self, other):
574 """Return a list concatenating other with self.""" 575 return LazyConcatenation([other, self])
576
577 - def __mul__(self, count):
578 """Return a list concatenating self with itself C{count} times.""" 579 return LazyConcatenation([self] * count)
580
581 - def __rmul__(self, count):
582 """Return a list concatenating self with itself C{count} times.""" 583 return LazyConcatenation([self] * count)
584 585 _MAX_REPR_SIZE = 60
586 - def __repr__(self):
587 """ 588 @return: A string representation for this corpus view that is 589 similar to a list's representation; but if it would be more 590 than 60 characters long, it is truncated. 591 """ 592 pieces = [] 593 length = 5 594 for elt in self: 595 pieces.append(repr(elt)) 596 length += len(pieces[-1]) + 2 597 if length > self._MAX_REPR_SIZE and len(pieces) > 2: 598 return '[%s, ...]' % ', '.join(pieces[:-1]) 599 else: 600 return '[%s]' % ', '.join(pieces)
601
602 - def __cmp__(self, other):
603 """ 604 Return a number indicating how C{self} relates to other. 605 606 - If C{other} is not a corpus view or a C{list}, return -1. 607 - Otherwise, return C{cmp(list(self), list(other))}. 608 609 Note: corpus views do not compare equal to tuples containing 610 equal elements. Otherwise, transitivity would be violated, 611 since tuples do not compare equal to lists. 612 """ 613 if not isinstance(other, (AbstractLazySequence, list)): return -1 614 return cmp(list(self), list(other))
615
616 - def __hash__(self):
617 """ 618 @raise ValueError: Corpus view objects are unhashable. 619 """ 620 raise ValueError('%s objects are unhashable' % 621 self.__class__.__name__)
622 623
624 -class LazySubsequence(AbstractLazySequence):
625 """ 626 A subsequence produced by slicing a lazy sequence. This slice 627 keeps a reference to its source sequence, and generates its values 628 by looking them up in the source sequence. 629 """ 630 631 MIN_SIZE = 100 632 """The minimum size for which lazy slices should be created. If 633 C{LazySubsequence()} is called with a subsequence that is 634 shorter than C{MIN_SIZE}, then a tuple will be returned 635 instead.""" 636
637 - def __new__(cls, source, start, stop):
638 """ 639 Construct a new slice from a given underlying sequence. The 640 C{start} and C{stop} indices should be absolute indices -- 641 i.e., they should not be negative (for indexing from the back 642 of a list) or greater than the length of C{source}. 643 """ 644 # If the slice is small enough, just use a tuple. 645 if stop-start < cls.MIN_SIZE: 646 return list(islice(source.iterate_from(start), stop-start)) 647 else: 648 return object.__new__(cls, source, start, stop)
649
650 - def __init__(self, source, start, stop):
651 self._source = source 652 self._start = start 653 self._stop = stop
654
655 - def __len__(self):
656 return self._stop - self._start
657
658 - def iterate_from(self, start):
659 return islice(self._source.iterate_from(start+self._start), 660 max(0, len(self)-start))
661 662
663 -class LazyConcatenation(AbstractLazySequence):
664 """ 665 A lazy sequence formed by concatenating a list of lists. This 666 underlying list of lists may itself be lazy. C{LazyConcatenation} 667 maintains an index that it uses to keep track of the relationship 668 between offsets in the concatenated lists and offsets in the 669 sublists. 670 """
671 - def __init__(self, list_of_lists):
672 self._list = list_of_lists 673 self._offsets = [0]
674
675 - def __len__(self):
676 if len(self._offsets) <= len(self._list): 677 for tok in self.iterate_from(self._offsets[-1]): pass 678 return self._offsets[-1]
679
680 - def iterate_from(self, start_index):
681 if start_index < self._offsets[-1]: 682 sublist_index = bisect.bisect_right(self._offsets, start_index)-1 683 else: 684 sublist_index = len(self._offsets)-1 685 686 index = self._offsets[sublist_index] 687 688 # Construct an iterator over the sublists. 689 if isinstance(self._list, AbstractLazySequence): 690 sublist_iter = self._list.iterate_from(sublist_index) 691 else: 692 sublist_iter = islice(self._list, sublist_index, None) 693 694 for sublist in sublist_iter: 695 if sublist_index == (len(self._offsets)-1): 696 assert index+len(sublist) >= self._offsets[-1], ( 697 'offests not monotonic increasing!') 698 self._offsets.append(index+len(sublist)) 699 else: 700 assert self._offsets[sublist_index+1] == index+len(sublist), ( 701 'inconsistent list value (num elts)') 702 703 for value in sublist[max(0, start_index-index):]: 704 yield value 705 706 index += len(sublist) 707 sublist_index += 1
708 709
710 -class LazyMap(AbstractLazySequence):
711 """ 712 A lazy sequence whose elements are formed by applying a given 713 function to each element in one or more underlying lists. The 714 function is applied lazily -- i.e., when you read a value from the 715 list, C{LazyMap} will calculate that value by applying its 716 function to the underlying lists' value(s). C{LazyMap} is 717 essentially a lazy version of the Python primitive function 718 C{map}. In particular, the following two expressions are 719 equivalent: 720 721 >>> map(f, sequences...) 722 >>> list(LazyMap(f, sequences...)) 723 724 Like the Python C{map} primitive, if the source lists do not have 725 equal size, then the value C{None} will be supplied for the 726 'missing' elements. 727 728 Lazy maps can be useful for conserving memory, in cases where 729 individual values take up a lot of space. This is especially true 730 if the underlying list's values are constructed lazily, as is the 731 case with many corpus readers. 732 733 A typical example of a use case for this class is performing 734 feature detection on the tokens in a corpus. Since featuresets 735 are encoded as dictionaries, which can take up a lot of memory, 736 using a C{LazyMap} can significantly reduce memory usage when 737 training and running classifiers. 738 """
739 - def __init__(self, function, *lists, **config):
740 """ 741 @param function: The function that should be applied to 742 elements of C{lists}. It should take as many arguments 743 as there are C{lists}. 744 @param lists: The underlying lists. 745 @kwparam cache_size: Determines the size of the cache used 746 by this lazy map. (default=5) 747 """ 748 if not lists: 749 raise TypeError('LazyMap requires at least two args') 750 751 self._lists = lists 752 self._func = function 753 self._cache_size = config.get('cache_size', 5) 754 if self._cache_size > 0: 755 self._cache = {} 756 else: 757 self._cache = None 758 759 # If you just take bool() of sum() here _all_lazy will be true just 760 # in case n >= 1 list is an AbstractLazySequence. Presumably this 761 # isn't what's intended. 762 self._all_lazy = sum(isinstance(lst, AbstractLazySequence) 763 for lst in lists) == len(lists)
764
765 - def iterate_from(self, index):
766 # Special case: one lazy sublist 767 if len(self._lists) == 1 and self._all_lazy: 768 for value in self._lists[0].iterate_from(index): 769 yield self._func(value) 770 return 771 772 # Special case: one non-lazy sublist 773 elif len(self._lists) == 1: 774 while True: 775 try: yield self._func(self._lists[0][index]) 776 except IndexError: return 777 index += 1 778 779 # Special case: n lazy sublists 780 elif self._all_lazy: 781 iterators = [lst.iterate_from(index) for lst in self._lists] 782 while True: 783 elements = [] 784 for iterator in iterators: 785 try: elements.append(iterator.next()) 786 except: elements.append(None) 787 if elements == [None] * len(self._lists): 788 return 789 yield self._func(*elements) 790 index += 1 791 792 # general case 793 else: 794 while True: 795 try: elements = [lst[index] for lst in self._lists] 796 except IndexError: 797 elements = [None] * len(self._lists) 798 for i, lst in enumerate(self._lists): 799 try: elements[i] = lst[index] 800 except IndexError: pass 801 if elements == [None] * len(self._lists): 802 return 803 yield self._func(*elements) 804 index += 1
805
806 - def __getitem__(self, index):
807 if isinstance(index, slice): 808 sliced_lists = [lst[index] for lst in self._lists] 809 return LazyMap(self._func, *sliced_lists) 810 else: 811 # Handle negative indices 812 if index < 0: index += len(self) 813 if index < 0: raise IndexError('index out of range') 814 # Check the cache 815 if self._cache is not None and index in self._cache: 816 return self._cache[index] 817 # Calculate the value 818 try: val = self.iterate_from(index).next() 819 except StopIteration: 820 raise IndexError('index out of range') 821 # Update the cache 822 if self._cache is not None: 823 if len(self._cache) > self._cache_size: 824 self._cache.popitem() # discard random entry 825 self._cache[index] = val 826 # Return the value 827 return val
828
829 - def __len__(self):
830 return max(len(lst) for lst in self._lists)
831 832
833 -class LazyMappedList(Deprecated, LazyMap):
834 """Use LazyMap instead."""
835 - def __init__(self, lst, func):
836 LazyMap.__init__(self, func, lst)
837 838
839 -class LazyZip(LazyMap):
840 """ 841 A lazy sequence whose elements are tuples, each containing the i-th 842 element from each of the argument sequences. The returned list is 843 truncated in length to the length of the shortest argument sequence. The 844 tuples are constructed lazily -- i.e., when you read a value from the 845 list, C{LazyZip} will calculate that value by forming a C{tuple} from 846 the i-th element of each of the argument sequences. 847 848 C{LazyZip} is essentially a lazy version of the Python primitive function 849 C{zip}. In particular, the following two expressions are equivalent: 850 851 >>> zip(sequences...) 852 >>> list(LazyZip(sequences...)) 853 854 Lazy zips can be useful for conserving memory in cases where the argument 855 sequences are particularly long. 856 857 A typical example of a use case for this class is combining long sequences 858 of gold standard and predicted values in a classification or tagging task 859 in order to calculate accuracy. By constructing tuples lazily and 860 avoiding the creation of an additional long sequence, memory usage can be 861 significantly reduced. 862 """
863 - def __init__(self, *lists):
864 """ 865 @param lists: the underlying lists 866 @type lists: C{list} of C{list} 867 """ 868 LazyMap.__init__(self, lambda *elts: elts, *lists)
869
870 - def iterate_from(self, index):
871 iterator = LazyMap.iterate_from(self, index) 872 while index < len(self): 873 yield iterator.next() 874 index += 1 875 return
876
877 - def __len__(self):
878 return min(len(lst) for lst in self._lists)
879 880
881 -class LazyEnumerate(LazyZip):
882 """ 883 A lazy sequence whose elements are tuples, each ontaining a count (from 884 zero) and a value yielded by underlying sequence. C{LazyEnumerate} is 885 useful for obtaining an indexed list. The tuples are constructed lazily 886 -- i.e., when you read a value from the list, C{LazyEnumerate} will 887 calculate that value by forming a C{tuple} from the count of the i-th 888 element and the i-th element of the underlying sequence. 889 890 C{LazyEnumerate} is essentially a lazy version of the Python primitive 891 function C{enumerate}. In particular, the following two expressions are 892 equivalent: 893 894 >>> enumerate(sequence) 895 >>> list(LazyEnumerate(sequence)) 896 897 Lazy enumerations can be useful for conserving memory in cases where the 898 argument sequences are particularly long. 899 900 A typical example of a use case for this class is obtaining an indexed 901 list for a long sequence of values. By constructing tuples lazily and 902 avoiding the creation of an additional long sequence, memory usage can be 903 significantly reduced. 904 """ 905
906 - def __init__(self, lst):
907 """ 908 @param lst: the underlying list 909 @type lst: C{list} 910 """ 911 LazyZip.__init__(self, xrange(len(lst)), lst)
912 913
914 -class LazyMappedList(Deprecated, LazyMap):
915 """Use LazyMap instead."""
916 - def __init__(self, lst, func):
917 LazyMap.__init__(self, func, lst)
918 919
920 -class LazyMappedChain(Deprecated, LazyConcatenation):
921 """Use LazyConcatenation(LazyMap(func, lists)) instead."""
922 - def __init__(self, lst, func):
923 LazyConcatenation.__init__(self, LazyMap(func, lst))
924