1
2
3
4
5
6
7
8 import locale
9 import re
10 import types
11 import textwrap
12 import pydoc
13 import bisect
14
15 from itertools import islice
16 from pprint import pprint
17 from collections import defaultdict
18 from UserDict import UserDict
19
20 from nltk.internals import Deprecated, slice_bounds
21
22
23
24
25
26 -def usage(obj, selfname='self'):
27 import inspect
28 str(obj)
29
30 if not isinstance(obj, (types.TypeType, types.ClassType)):
31 obj = obj.__class__
32
33 print '%s supports the following operations:' % obj.__name__
34 for (name, method) in sorted(pydoc.allmethods(obj).items()):
35 if name.startswith('_'): continue
36 if getattr(method, '__deprecated__', False): continue
37
38 args, varargs, varkw, defaults = inspect.getargspec(method)
39 if (args and args[0]=='self' and
40 (defaults is None or len(args)>len(defaults))):
41 args = args[1:]
42 name = '%s.%s' % (selfname, name)
43 argspec = inspect.formatargspec(
44 args, varargs, varkw, defaults)
45 print textwrap.fill('%s%s' % (name, argspec),
46 initial_indent=' - ',
47 subsequent_indent=' '*(len(name)+5))
48
49
50
51
52
53 -def pr(data, start=0, end=None):
54 """
55 Pretty print a sequence of data items
56
57 @param data: the data stream to print
58 @type data: C{sequence} or C{iterator}
59 @param start: the start position
60 @type start: C{int}
61 @param end: the end position
62 @type end: C{int}
63 """
64 pprint(list(islice(data, start, end)))
65
67 """
68 Pretty print a string, breaking lines on whitespace
69
70 @param s: the string to print, consisting of words and spaces
71 @type s: C{string}
72 @param width: the display width
73 @type width: C{int}
74 """
75 while s:
76 s = s.strip()
77 try:
78 i = s[:width].rindex(' ')
79 except ValueError:
80 print s
81 return
82 print s[:i]
83 s = s[i:]
84
85
86
87
88
90 """
91 Find contexts where more than one possible target value can
92 appear. E.g. if targets are word-initial letters, and contexts
93 are the remainders of words, then we would like to find cases like
94 "fat" vs "cat", and "training" vs "draining". If targets are
95 parts-of-speech and contexts are words, then we would like to find
96 cases like wind (noun) 'air in rapid motion', vs wind (verb)
97 'coil, wrap'.
98 """
100 """
101 Create a new minimal set.
102
103 @param parameters: The (context, target, display) tuples for the item
104 @type parameters: C{list} of C{tuple} of C{string}
105 """
106 self._targets = set()
107 self._contexts = set()
108 self._seen = {}
109 self._displays = {}
110
111 if parameters:
112 for context, target, display in parameters:
113 self.add(context, target, display)
114
115 - def add(self, context, target, display):
116 """
117 Add a new item to the minimal set, having the specified
118 context, target, and display form.
119
120 @param context: The context in which the item of interest appears
121 @type context: C{string}
122 @param target: The item of interest
123 @type target: C{string}
124 @param display: The information to be reported for each item
125 @type display: C{string}
126 """
127
128 if context not in self._seen:
129 self._seen[context] = set()
130 self._seen[context].add(target)
131
132
133 self._contexts.add(context)
134 self._targets.add(target)
135
136
137 self._displays[(context, target)] = display
138
139 - def contexts(self, minimum=2):
140 """
141 Determine which contexts occurred with enough distinct targets.
142
143 @param minimum: the minimum number of distinct target forms
144 @type minimum: C{int}
145 @rtype C{list}
146 """
147 return [c for c in self._contexts if len(self._seen[c]) >= minimum]
148
149 - def display(self, context, target, default=""):
150 if (context, target) not in self._displays:
151 return self._displays[(context, target)]
152 else:
153 return default
154
156 result = []
157 for target in self._targets:
158 x = self.display(context, target)
159 if x: result.append(x)
160 return result
161
164
165
166
167
168
169
170 -def re_show(regexp, string, left="{", right="}"):
171 """
172 Search C{string} for substrings matching C{regexp} and wrap
173 the matches with braces. This is convenient for learning about
174 regular expressions.
175
176 @param regexp: The regular expression.
177 @type regexp: C{string}
178 @param string: The string being matched.
179 @type string: C{string}
180 @param left: The left delimiter (printed before the matched substring)
181 @type left: C{string}
182 @param right: The right delimiter (printed after the matched substring)
183 @type right: C{string}
184 @rtype: C{string}
185 @return: A string with markers surrounding the matched substrings.
186 """
187 print re.compile(regexp, re.M).sub(left + r"\g<0>" + right, string.rstrip())
188
189
190
191
192
193
194
196 if hasattr(f, 'read'):
197 return f.read()
198 elif isinstance(f, basestring):
199 return open(f).read()
200 else:
201 raise ValueError, "Must be called with a filename or file-like object"
202
203
204
205
206
208 """Traverse the nodes of a tree in breadth-first order.
209 (No need to check for cycles.)
210 The first argument should be the tree root;
211 children should be a function taking as argument a tree node
212 and returning an iterator of the node's children.
213 """
214 if queue == None:
215 queue = []
216 queue.append(tree)
217
218 while queue:
219 node = queue.pop(0)
220 yield node
221 if depth != 0:
222 try:
223 queue += children(node)
224 depth -= 1
225 except:
226 pass
227
228
229
230
231
232
233
234
236 """
237 Given a byte string, attempt to decode it.
238 Tries the standard 'UTF8' and 'latin-1' encodings,
239 Plus several gathered from locale information.
240
241 The calling program *must* first call::
242
243 locale.setlocale(locale.LC_ALL, '')
244
245 If successful it returns C{(decoded_unicode, successful_encoding)}.
246 If unsuccessful it raises a C{UnicodeError}.
247 """
248 successful_encoding = None
249
250 encodings = ['utf-8']
251
252
253 try:
254 encodings.append(locale.nl_langinfo(locale.CODESET))
255 except AttributeError:
256 pass
257 try:
258 encodings.append(locale.getlocale()[1])
259 except (AttributeError, IndexError):
260 pass
261 try:
262 encodings.append(locale.getdefaultlocale()[1])
263 except (AttributeError, IndexError):
264 pass
265
266
267 encodings.append('latin-1')
268 for enc in encodings:
269
270
271 if not enc:
272 continue
273 try:
274 decoded = unicode(data, enc)
275 successful_encoding = enc
276
277 except (UnicodeError, LookupError):
278 pass
279 else:
280 break
281 if not successful_encoding:
282 raise UnicodeError(
283 'Unable to decode input data. Tried the following encodings: %s.'
284 % ', '.join([repr(enc) for enc in encodings if enc]))
285 else:
286 return (decoded, successful_encoding)
287
288
289
290
291
292
300
301
302
303
304
305
306 from HTMLParser import HTMLParser
307 skip = ['script', 'style']
308
311 self.reset()
312 self.fed = []
313 self._flag = True
315 if self._flag:
316 self.fed.append(d)
318 if tag in skip:
319 self._flag = False
321 if tag in skip:
322 self._flag = True
323 - def clean_text(self):
324 return ''.join(self.fed)
325
327 """
328 Remove HTML markup from the given string.
329
330 @param html: the HTML string to be cleaned
331 @type html: C{string}
332 @rtype: C{string}
333 """
334
335 cleaner = HTMLCleaner()
336 cleaner.feed(html)
337 return cleaner.clean_text()
338
340 from urllib import urlopen
341 html = urlopen(url).read()
342 return clean_html(html)
343
344
345
346
347
349 """
350 A utility that produces a sequence of ngrams from a sequence of items.
351 For example:
352
353 >>> ngram([1,2,3,4,5], 3)
354 [(1, 2, 3), (2, 3, 4), (3, 4, 5)]
355
356 Use ingram for an iterator version of this function.
357
358 @param sequence: the source data to be converted into ngrams
359 @type sequence: C{sequence} or C{iterator}
360 @param n: the degree of the ngram
361 @type n: C{int}
362 @return: The ngrams
363 @rtype: C{list} of C{tuple}s
364 """
365
366 count = max(0, len(list(sequence)) - n + 1)
367 return [tuple(sequence[i:i+n]) for i in range(count)]
368
370 """
371 A utility that produces an iterator over ngrams generated from a sequence of items.
372
373 For example:
374
375 >>> list(ingram([1,2,3,4,5], 3))
376 [(1, 2, 3), (2, 3, 4), (3, 4, 5)]
377
378 Use ngram for a list version of this function.
379
380 @param sequence: the source data to be converted into ngrams
381 @type sequence: C{sequence} or C{iterator}
382 @param n: the degree of the ngram
383 @type n: C{int}
384 @return: The ngrams
385 @rtype: C{iterator} of C{tuple}s
386 """
387
388 sequence = iter(sequence)
389 history = []
390 while n > 1:
391 history.append(sequence.next())
392 n -= 1
393 for item in sequence:
394 history.append(item)
395 yield tuple(history)
396 del history[0]
397
398
399
400
401
403 - def __init__(self, data=None, **kwargs):
407
411
417
420
422 if not self._default_factory and key not in self._keys:
423 raise KeyError()
424 else:
425 return self._default_factory()
426
431
435
440
443
444 - def keys(self, data=None, keys=None):
462
464 if self._keys:
465 key = self._keys.pop()
466 value = self[key]
467 del self[key]
468 return (key, value)
469 else:
470 raise KeyError()
471
476
482
485
486
487
488
489
491 """
492 An abstract base class for read-only sequences whose values are
493 computed as needed. Lazy sequences act like tuples -- they can be
494 indexed, sliced, and iterated over; but they may not be modified.
495
496 The most common application of lazy sequences in NLTK is for
497 I{corpus view} objects, which provide access to the contents of a
498 corpus without loading the entire corpus into memory, by loading
499 pieces of the corpus from disk as needed.
500
501 The result of modifying a mutable element of a lazy sequence is
502 undefined. In particular, the modifications made to the element
503 may or may not persist, depending on whether and when the lazy
504 sequence caches that element's value or reconstructs it from
505 scratch.
506
507 Subclasses are required to define two methods:
508
509 - L{__len__()}
510 - L{iterate_from()}.
511 """
513 """
514 Return the number of tokens in the corpus file underlying this
515 corpus view.
516 """
517 raise NotImplementedError('should be implemented by subclass')
518
520 """
521 Return an iterator that generates the tokens in the corpus
522 file underlying this corpus view, starting at the token number
523 C{start}. If C{start>=len(self)}, then this iterator will
524 generate no tokens.
525 """
526 raise NotImplementedError('should be implemented by subclass')
527
529 """
530 Return the C{i}th token in the corpus file underlying this
531 corpus view. Negative indices and spans are both supported.
532 """
533 if isinstance(i, slice):
534 start, stop = slice_bounds(self, i)
535 return LazySubsequence(self, start, stop)
536 else:
537
538 if i < 0: i += len(self)
539 if i < 0: raise IndexError('index out of range')
540
541 try:
542 return self.iterate_from(i).next()
543 except StopIteration:
544 raise IndexError('index out of range')
545
547 """Return an iterator that generates the tokens in the corpus
548 file underlying this corpus view."""
549 return self.iterate_from(0)
550
552 """Return the number of times this list contains C{value}."""
553 return sum(1 for elt in self if elt==value)
554
555 - def index(self, value, start=None, stop=None):
556 """Return the index of the first occurance of C{value} in this
557 list that is greater than or equal to C{start} and less than
558 C{stop}. Negative start & stop values are treated like negative
559 slice bounds -- i.e., they count from the end of the list."""
560 start, stop = slice_bounds(self, slice(start, stop))
561 for i, elt in enumerate(islice(self, start, stop)):
562 if elt == value: return i+start
563 raise ValueError('index(x): x not in list')
564
566 """Return true if this list contains C{value}."""
567 return bool(self.count(value))
568
570 """Return a list concatenating self with other."""
571 return LazyConcatenation([self, other])
572
574 """Return a list concatenating other with self."""
575 return LazyConcatenation([other, self])
576
578 """Return a list concatenating self with itself C{count} times."""
579 return LazyConcatenation([self] * count)
580
582 """Return a list concatenating self with itself C{count} times."""
583 return LazyConcatenation([self] * count)
584
585 _MAX_REPR_SIZE = 60
587 """
588 @return: A string representation for this corpus view that is
589 similar to a list's representation; but if it would be more
590 than 60 characters long, it is truncated.
591 """
592 pieces = []
593 length = 5
594 for elt in self:
595 pieces.append(repr(elt))
596 length += len(pieces[-1]) + 2
597 if length > self._MAX_REPR_SIZE and len(pieces) > 2:
598 return '[%s, ...]' % ', '.join(pieces[:-1])
599 else:
600 return '[%s]' % ', '.join(pieces)
601
603 """
604 Return a number indicating how C{self} relates to other.
605
606 - If C{other} is not a corpus view or a C{list}, return -1.
607 - Otherwise, return C{cmp(list(self), list(other))}.
608
609 Note: corpus views do not compare equal to tuples containing
610 equal elements. Otherwise, transitivity would be violated,
611 since tuples do not compare equal to lists.
612 """
613 if not isinstance(other, (AbstractLazySequence, list)): return -1
614 return cmp(list(self), list(other))
615
617 """
618 @raise ValueError: Corpus view objects are unhashable.
619 """
620 raise ValueError('%s objects are unhashable' %
621 self.__class__.__name__)
622
623
625 """
626 A subsequence produced by slicing a lazy sequence. This slice
627 keeps a reference to its source sequence, and generates its values
628 by looking them up in the source sequence.
629 """
630
631 MIN_SIZE = 100
632 """The minimum size for which lazy slices should be created. If
633 C{LazySubsequence()} is called with a subsequence that is
634 shorter than C{MIN_SIZE}, then a tuple will be returned
635 instead."""
636
637 - def __new__(cls, source, start, stop):
638 """
639 Construct a new slice from a given underlying sequence. The
640 C{start} and C{stop} indices should be absolute indices --
641 i.e., they should not be negative (for indexing from the back
642 of a list) or greater than the length of C{source}.
643 """
644
645 if stop-start < cls.MIN_SIZE:
646 return list(islice(source.iterate_from(start), stop-start))
647 else:
648 return object.__new__(cls, source, start, stop)
649
650 - def __init__(self, source, start, stop):
654
656 return self._stop - self._start
657
661
662
664 """
665 A lazy sequence formed by concatenating a list of lists. This
666 underlying list of lists may itself be lazy. C{LazyConcatenation}
667 maintains an index that it uses to keep track of the relationship
668 between offsets in the concatenated lists and offsets in the
669 sublists.
670 """
672 self._list = list_of_lists
673 self._offsets = [0]
674
676 if len(self._offsets) <= len(self._list):
677 for tok in self.iterate_from(self._offsets[-1]): pass
678 return self._offsets[-1]
679
681 if start_index < self._offsets[-1]:
682 sublist_index = bisect.bisect_right(self._offsets, start_index)-1
683 else:
684 sublist_index = len(self._offsets)-1
685
686 index = self._offsets[sublist_index]
687
688
689 if isinstance(self._list, AbstractLazySequence):
690 sublist_iter = self._list.iterate_from(sublist_index)
691 else:
692 sublist_iter = islice(self._list, sublist_index, None)
693
694 for sublist in sublist_iter:
695 if sublist_index == (len(self._offsets)-1):
696 assert index+len(sublist) >= self._offsets[-1], (
697 'offests not monotonic increasing!')
698 self._offsets.append(index+len(sublist))
699 else:
700 assert self._offsets[sublist_index+1] == index+len(sublist), (
701 'inconsistent list value (num elts)')
702
703 for value in sublist[max(0, start_index-index):]:
704 yield value
705
706 index += len(sublist)
707 sublist_index += 1
708
709
710 -class LazyMap(AbstractLazySequence):
711 """
712 A lazy sequence whose elements are formed by applying a given
713 function to each element in one or more underlying lists. The
714 function is applied lazily -- i.e., when you read a value from the
715 list, C{LazyMap} will calculate that value by applying its
716 function to the underlying lists' value(s). C{LazyMap} is
717 essentially a lazy version of the Python primitive function
718 C{map}. In particular, the following two expressions are
719 equivalent:
720
721 >>> map(f, sequences...)
722 >>> list(LazyMap(f, sequences...))
723
724 Like the Python C{map} primitive, if the source lists do not have
725 equal size, then the value C{None} will be supplied for the
726 'missing' elements.
727
728 Lazy maps can be useful for conserving memory, in cases where
729 individual values take up a lot of space. This is especially true
730 if the underlying list's values are constructed lazily, as is the
731 case with many corpus readers.
732
733 A typical example of a use case for this class is performing
734 feature detection on the tokens in a corpus. Since featuresets
735 are encoded as dictionaries, which can take up a lot of memory,
736 using a C{LazyMap} can significantly reduce memory usage when
737 training and running classifiers.
738 """
739 - def __init__(self, function, *lists, **config):
740 """
741 @param function: The function that should be applied to
742 elements of C{lists}. It should take as many arguments
743 as there are C{lists}.
744 @param lists: The underlying lists.
745 @kwparam cache_size: Determines the size of the cache used
746 by this lazy map. (default=5)
747 """
748 if not lists:
749 raise TypeError('LazyMap requires at least two args')
750
751 self._lists = lists
752 self._func = function
753 self._cache_size = config.get('cache_size', 5)
754 if self._cache_size > 0:
755 self._cache = {}
756 else:
757 self._cache = None
758
759
760
761
762 self._all_lazy = sum(isinstance(lst, AbstractLazySequence)
763 for lst in lists) == len(lists)
764
766
767 if len(self._lists) == 1 and self._all_lazy:
768 for value in self._lists[0].iterate_from(index):
769 yield self._func(value)
770 return
771
772
773 elif len(self._lists) == 1:
774 while True:
775 try: yield self._func(self._lists[0][index])
776 except IndexError: return
777 index += 1
778
779
780 elif self._all_lazy:
781 iterators = [lst.iterate_from(index) for lst in self._lists]
782 while True:
783 elements = []
784 for iterator in iterators:
785 try: elements.append(iterator.next())
786 except: elements.append(None)
787 if elements == [None] * len(self._lists):
788 return
789 yield self._func(*elements)
790 index += 1
791
792
793 else:
794 while True:
795 try: elements = [lst[index] for lst in self._lists]
796 except IndexError:
797 elements = [None] * len(self._lists)
798 for i, lst in enumerate(self._lists):
799 try: elements[i] = lst[index]
800 except IndexError: pass
801 if elements == [None] * len(self._lists):
802 return
803 yield self._func(*elements)
804 index += 1
805
807 if isinstance(index, slice):
808 sliced_lists = [lst[index] for lst in self._lists]
809 return LazyMap(self._func, *sliced_lists)
810 else:
811
812 if index < 0: index += len(self)
813 if index < 0: raise IndexError('index out of range')
814
815 if self._cache is not None and index in self._cache:
816 return self._cache[index]
817
818 try: val = self.iterate_from(index).next()
819 except StopIteration:
820 raise IndexError('index out of range')
821
822 if self._cache is not None:
823 if len(self._cache) > self._cache_size:
824 self._cache.popitem()
825 self._cache[index] = val
826
827 return val
828
830 return max(len(lst) for lst in self._lists)
831
832
834 """Use LazyMap instead."""
837
838
840 """
841 A lazy sequence whose elements are tuples, each containing the i-th
842 element from each of the argument sequences. The returned list is
843 truncated in length to the length of the shortest argument sequence. The
844 tuples are constructed lazily -- i.e., when you read a value from the
845 list, C{LazyZip} will calculate that value by forming a C{tuple} from
846 the i-th element of each of the argument sequences.
847
848 C{LazyZip} is essentially a lazy version of the Python primitive function
849 C{zip}. In particular, the following two expressions are equivalent:
850
851 >>> zip(sequences...)
852 >>> list(LazyZip(sequences...))
853
854 Lazy zips can be useful for conserving memory in cases where the argument
855 sequences are particularly long.
856
857 A typical example of a use case for this class is combining long sequences
858 of gold standard and predicted values in a classification or tagging task
859 in order to calculate accuracy. By constructing tuples lazily and
860 avoiding the creation of an additional long sequence, memory usage can be
861 significantly reduced.
862 """
864 """
865 @param lists: the underlying lists
866 @type lists: C{list} of C{list}
867 """
868 LazyMap.__init__(self, lambda *elts: elts, *lists)
869
876
878 return min(len(lst) for lst in self._lists)
879
880
882 """
883 A lazy sequence whose elements are tuples, each ontaining a count (from
884 zero) and a value yielded by underlying sequence. C{LazyEnumerate} is
885 useful for obtaining an indexed list. The tuples are constructed lazily
886 -- i.e., when you read a value from the list, C{LazyEnumerate} will
887 calculate that value by forming a C{tuple} from the count of the i-th
888 element and the i-th element of the underlying sequence.
889
890 C{LazyEnumerate} is essentially a lazy version of the Python primitive
891 function C{enumerate}. In particular, the following two expressions are
892 equivalent:
893
894 >>> enumerate(sequence)
895 >>> list(LazyEnumerate(sequence))
896
897 Lazy enumerations can be useful for conserving memory in cases where the
898 argument sequences are particularly long.
899
900 A typical example of a use case for this class is obtaining an indexed
901 list for a long sequence of values. By constructing tuples lazily and
902 avoiding the creation of an additional long sequence, memory usage can be
903 significantly reduced.
904 """
905
907 """
908 @param lst: the underlying list
909 @type lst: C{list}
910 """
911 LazyZip.__init__(self, xrange(len(lst)), lst)
912
913
915 """Use LazyMap instead."""
918
919
921 """Use LazyConcatenation(LazyMap(func, lists)) instead."""
924