1
2
3
4
5
6
7
8
9
10
11
# Large-magnitude finite float used in place of negative infinity, e.g. as
# the log-probability reported for a sample whose probability is zero.
_NINF = -1e300
13
14 """
15 Classes for representing and processing probabilistic information.
16
17 The L{FreqDist} class is used to encode X{frequency distributions},
18 which count the number of times that each outcome of an experiment
19 occurs.
20
21 The L{ProbDistI} class defines a standard interface for X{probability
22 distributions}, which encode the probability of each outcome for an
23 experiment. There are two types of probability distribution:
24
25 - X{derived probability distributions} are created from frequency
26 distributions. They attempt to model the probability distribution
27 that generated the frequency distribution.
28 - X{analytic probability distributions} are created directly from
29 parameters (such as variance).
30
31 The L{ConditionalFreqDist} class and L{ConditionalProbDistI} interface
32 are used to encode conditional distributions. Conditional probability
33 distributions can be derived or analytic; but currently the only
34 implementation of the C{ConditionalProbDistI} interface is
35 L{ConditionalProbDist}, a derived distribution.
36
37 """
38
39 import math
40 import random
41 import warnings
42
43
44
45
46
48 """
49 A frequency distribution for the outcomes of an experiment. A
50 frequency distribution records the number of times each outcome of
51 an experiment has occurred. For example, a frequency distribution
52 could be used to record the frequency of each word type in a
53 document. Formally, a frequency distribution can be defined as a
54 function mapping from each sample to the number of times that
55 sample occurred as an outcome.
56
57 Frequency distributions are generally constructed by running a
58 number of experiments, and incrementing the count for a sample
59 every time it is an outcome of an experiment. For example, the
60 following code will produce a frequency distribution that encodes
61 how often each word occurs in a text:
62
63 >>> fdist = FreqDist()
64 >>> for word in tokenize.whitespace(sent):
65 ... fdist.inc(word.lower())
66
67 An equivalent way to do this is with the initializer:
68
69 >>> fdist = FreqDist(word.lower() for word in tokenize.whitespace(sent))
70
71 """
73 """
74 Construct a new frequency distribution. If C{samples} is
75 given, then the frequency distribution will be initialized
76 with the count of each object in C{samples}; otherwise, it
77 will be initialized to be empty.
78
79 In particular, C{FreqDist()} returns an empty frequency
80 distribution; and C{FreqDist(samples)} first creates an empty
81 frequency distribution, and then calls C{inc} for each element
82 in the list C{samples}.
83
84 @param samples: The samples to initialize the frequency
85 distribution with.
86 @type samples: Sequence
87 """
88 dict.__init__(self)
89 self._N = 0
90 self._Nr_cache = None
91 self._max_cache = None
92 if samples:
93 for sample in samples:
94 self.inc(sample)
95
def inc(self, sample, count=1):
    """
    Add C{count} to the number of outcomes recorded for C{sample}.

    A C{count} of zero is a no-op: nothing is stored and the cached
    statistics are left as they are.

    @param sample: The sample whose count should be incremented.
    @type sample: any
    @param count: The amount to add to the sample's count.
    @type count: C{int}
    @rtype: None
    """
    if count != 0:
        self._N += count
        previous = self.get(sample, 0)
        self[sample] = previous + count

        # The counts changed, so the memoized Nr table and the memoized
        # most-frequent sample can no longer be trusted.
        self._Nr_cache = self._max_cache = None
117
119 """
120 @return: The total number of sample outcomes that have been
121 recorded by this C{FreqDist}. For the number of unique
122 sample values (or bins) with counts greater than zero, use
123 C{FreqDist.B()}.
124 @rtype: C{int}
125 """
126 return self._N
127
129 """
130 @return: The total number of sample values (or X{bins}) that
131 have counts greater than zero. For the total
132 number of sample outcomes recorded, use C{FreqDist.N()}.
133 @rtype: C{int}
134 """
135 return len(self)
136
138 """
139 @return: A list of all samples that have been recorded as
140 outcomes by this frequency distribution. Use C{count()}
141 to determine the count for each sample.
142 @rtype: C{list}
143 """
144 return self.keys()
145
def Nr(self, r, bins=None):
    """
    @return: The number of samples with count r.
    @rtype: C{int}
    @type r: C{int}
    @param r: A sample count.
    @type bins: C{int}
    @param bins: The number of possible sample outcomes.  C{bins}
        is used to calculate Nr(0).  In particular, Nr(0) is
        C{bins-self.B()}.  If C{bins} is not specified, Nr(0) is 0.
    @raise IndexError: If C{r} is negative.
    """
    if r < 0:
        # Call form of raise: valid on both Python 2 and Python 3.
        raise IndexError('FreqDist.Nr(): r must be non-negative')

    # Nr(0) cannot be read from the recorded counts; it is derived from
    # the caller-supplied number of bins.
    if r == 0:
        if bins is None:
            return 0
        return bins - self.B()

    # The table of Nr values is built lazily and reused until the counts
    # change (the cache is reset to None elsewhere).
    if self._Nr_cache is None:
        self._cache_Nr_values()

    # Counts larger than any recorded count have no samples.
    if r >= len(self._Nr_cache):
        return 0
    return self._Nr_cache[r]
173
175 Nr = [0]
176 for sample in self:
177 c = self.get(sample, 0)
178 if c >= len(Nr):
179 Nr += [0]*(c+1-len(Nr))
180 Nr[c] += 1
181 self._Nr_cache = Nr
182
def count(self, sample):
    """
    Retired accessor for the count of a sample.  This method has been
    replaced by conventional dictionary indexing; use C{fd[item]}
    instead of C{fd.count(item)}.  Calling it always raises.

    @param sample: the sample whose count was requested (unused).
    @type sample: any.
    @raise AttributeError: Always, with a message pointing the caller
        at indexing.
    """
    # Call form of raise: valid on both Python 2 and Python 3.
    raise AttributeError("Use indexing to look up an entry in a FreqDist, e.g. fd[item]")
198
def freq(self, sample):
    """
    Return the frequency of a given sample.  The frequency of a
    sample is defined as the count of that sample divided by the
    total number of sample outcomes that have been recorded by
    this C{FreqDist}.  If no outcomes have been recorded, the
    frequency of every sample is C{0}.

    @return: The frequency of a given sample.
    @rtype: float
    @param sample: the sample whose frequency
        should be returned.
    @type sample: any
    """
    # Compare by value, not identity: "is 0" only works by accident for
    # small ints and misses other zero totals such as 0.0, which would
    # then fall through to a division by zero.
    if self._N == 0:
        return 0
    return float(self[sample]) / self._N
218
220 """
221 Return the sample with the greatest number of outcomes in this
222 frequency distribution. If two or more samples have the same
223 number of outcomes, return one of them; which sample is
224 returned is undefined. If no outcomes have occurred in this
225 frequency distribution, return C{None}.
226
227 @return: The sample with the maximum number of outcomes in this
228 frequency distribution.
229 @rtype: any or C{None}
230 """
231 if self._max_cache is None:
232 best_sample = None
233 best_count = -1
234 for sample in self:
235 if self[sample] > best_count:
236 best_sample = sample
237 best_count = self[sample]
238 self._max_cache = best_sample
239 return self._max_cache
240
242 raise AttributeError, "Use FreqDist.sorted() to get the sorted samples"
243
def plot(self, samples=None, *args, **kwargs):
    """
    Plot the given samples from the frequency distribution.
    If no samples are specified, use all samples, in lexical sort order.
    (Requires Matplotlib to be installed.)

    Counts are drawn on a logarithmic y axis; any extra positional and
    keyword arguments are passed through to C{pylab.semilogy}.

    @param samples: The samples to plot.
    @type samples: C{list}
    @raise ValueError: If the C{pylab} module cannot be imported.
    """
    try:
        import pylab
    except ImportError:
        # Trailing space keeps the two sentences of the message apart.
        raise ValueError('The plot function requires the matplotlib package. '
                         'See http://matplotlib.sourceforge.net/')
    if not samples:
        samples = sorted(self.samples())
    values = [self[sample] for sample in samples]
    if not args:
        # Default line format when the caller supplies none.
        args = ["bo"]
    pylab.grid(True, color="silver")
    pylab.semilogy(values, *args, **kwargs)
    # Label each x position with its sample.
    pylab.xticks(range(len(samples)), samples, rotation=45, color="b")
    pylab.show()
267
def zipf_plot(self, num=40, *args, **kwargs):
    """
    Plot the leading C{num} samples of C{self.sorted()}.  Extra
    positional and keyword arguments are handed on to C{plot}.
    (Requires Matplotlib to be installed.)

    @param num: The number of samples to plot.
    @type num: C{int}
    """
    ranked = self.sorted()
    self.plot(ranked[:num], *args, **kwargs)
278
279
281 """
282 Return the samples sorted in decreasing order of frequency. Instances
283 with the same count will be arbitrarily ordered. Instances with a
284 count of zero will be omitted. This method is C{O(N^2)}, where C{N} is
285 the number of samples, but will complete in a shorter time on average.
286
287 @return: The set of samples in sorted order.
288 @rtype: sequence of any
289 """
290 from operator import itemgetter
291 return [sample for (sample, count) in sorted(self.items(), key=itemgetter(1), reverse=True)]
292
294 """
295 @return: A string representation of this C{FreqDist}.
296 @rtype: string
297 """
298 return '<FreqDist with %d samples>' % self.N()
299
301 """
302 @return: A string representation of this C{FreqDist}.
303 @rtype: string
304 """
305 items = ['%r: %r' % (s, self[s]) for s in self.sorted()]
306 return '<FreqDist: %s>' % ', '.join(items)
307
309 return self.get(sample, 0)
310
311
312
313
314
316 """
317 A probability distribution for the outcomes of an experiment. A
318 probability distribution specifies how likely it is that an
319 experiment will have any given outcome. For example, a
320 probability distribution could be used to predict the probability
321 that a token in a document will have a given type. Formally, a
322 probability distribution can be defined as a function mapping from
323 samples to nonnegative real numbers, such that the sum of every
324 number in the function's range is 1.0. C{ProbDist}s are often
325 used to model the probability distribution of the experiment used
326 to generate a frequency distribution.
327 """
328 SUM_TO_ONE = True
329 """True if the probabilities of the samples in this probability
330 distribution will always sum to one."""
331
333 if self.__class__ == ProbDistI:
334 raise AssertionError, "Interfaces can't be instantiated"
335
def prob(self, sample):
    """
    Interface placeholder; this implementation only raises.

    @return: the probability for a given sample.  Probabilities
        are always real numbers in the range [0, 1].
    @rtype: float
    @param sample: The sample whose probability
        should be returned.
    @type sample: any
    @raise AssertionError: Always, in this placeholder implementation.
    """
    raise AssertionError
346
348 """
349 @return: the base 2 logarithm of the probability for a given
350 sample. Log probabilities range from negative infinity to
351 zero.
352 @rtype: float
353 @param sample: The sample whose probability
354 should be returned.
355 @type sample: any
356 """
357
358 p = self.prob(sample)
359 if p == 0:
360
361
362 return _NINF
363 else:
364 return math.log(p, 2)
365
367 """
368 @return: the sample with the greatest probability. If two or
369 more samples have the same probability, return one of them;
370 which sample is returned is undefined.
371 @rtype: any
372 """
373 raise AssertionError()
374
376 """
377 @return: A list of all samples that have nonzero
378 probabilities. Use C{prob} to find the probability of
379 each sample.
380 @rtype: C{list}
381 """
382 raise AssertionError()
383
384
386 """
387 @return: The ratio by which counts are discounted on average: c*/c
388 @rtype: C{float}
389 """
390 return 0.0
391
392
393
395 """
396 @return: A randomly selected sample from this probability
397 distribution. The probability of returning each sample
398 C{samp} is equal to C{self.prob(samp)}.
399 """
400 p = random.random()
401 for sample in self.samples():
402 p -= self.prob(sample)
403 if p <= 0: return sample
404
405 if p < .0001:
406 return sample
407
408 if self.SUM_TO_ONE:
409 warnings.warn("Probability distribution %r sums to %r; generate()"
410 " is returning an arbitrary sample." % (self, 1-p))
411 return random.choice(list(self.samples()))
412
443
445 """
446 A probability distribution whose probabilities are directly
447 specified by a given dictionary. The given dictionary maps
448 samples to probabilities.
449 """
def __init__(self, prob_dict=None, log=False, normalize=False):
    """
    Construct a new probability distribution from the given
    dictionary, which maps values to probabilities (or to log
    probabilities, if C{log} is true).  If C{normalize} is
    true, then the probability values are scaled by a constant
    factor such that they sum to 1.

    The given dictionary is copied, never modified.  If C{prob_dict}
    is omitted, the distribution starts out with no samples.

    @raise ValueError: If C{normalize} is true but there are no
        samples to normalize.
    """
    # The declared default is None, so it must not be dereferenced.
    if prob_dict is None:
        self._prob_dict = {}
    else:
        self._prob_dict = prob_dict.copy()
    self._log = log

    if not normalize:
        return

    probs = self._prob_dict
    if not probs:
        # Without this guard the uniform fallback below would divide
        # by len(probs) == 0.
        raise ValueError('A DictionaryProbDist must have at least one '
                         'sample before it can be normalized.')

    if log:
        # Pass a real list: dict views are not indexable on Python 3.
        value_sum = sum_logs(list(probs.values()))
        if value_sum <= _NINF:
            # Every entry is (effectively) impossible; fall back to a
            # uniform distribution.
            logp = math.log(1.0 / len(probs), 2)
            for x in probs:
                probs[x] = logp
        else:
            # Dividing probabilities is subtracting in log space.
            for x in probs:
                probs[x] -= value_sum
    else:
        value_sum = sum(probs.values())
        if value_sum == 0:
            # All weights are zero; fall back to a uniform distribution.
            p = 1.0 / len(probs)
            for x in probs:
                probs[x] = p
        else:
            norm_factor = 1.0 / value_sum
            for x in probs:
                probs[x] *= norm_factor
482
def prob(self, sample):
    """
    @return: the probability stored for C{sample}, or 0 when the
        sample is not in the dictionary.  When the dictionary holds
        base 2 log probabilities, the stored value is converted back
        with C{2**value}.
    @param sample: The sample whose probability should be returned.
    @type sample: any
    """
    table = self._prob_dict
    if not self._log:
        return table.get(sample, 0)
    if sample in table:
        return 2 ** (table[sample])
    return 0
489
491 if self._log:
492 return self._prob_dict.get(sample, _NINF)
493 else:
494 if sample not in self._prob_dict: return _NINF
495 elif self._prob_dict[sample] == 0: return _NINF
496 else: return math.log(self._prob_dict[sample], 2)
497
499 if not hasattr(self, '_max'):
500 self._max = max((p,v) for (v,p) in self._prob_dict.items())[1]
501 return self._max
503 return self._prob_dict.keys()
505 return '<ProbDist with %d samples>' % len(self._prob_dict)
506
508 """
509 The maximum likelihood estimate for the probability distribution
510 of the experiment used to generate a frequency distribution. The
511 X{maximum likelihood estimate} approximates the probability of
512 each sample as the frequency of that sample in the frequency
513 distribution.
514 """
516 """
517 Use the maximum likelihood estimate to create a probability
518 distribution for the experiment used to generate C{freqdist}.
519
520 @type freqdist: C{FreqDist}
521 @param freqdist: The frequency distribution that the
522 probability estimates should be based on.
523 """
524 if freqdist.N() == 0:
525 raise ValueError('An MLE probability distribution must '+
526 'have at least one sample.')
527
528 self._freqdist = freqdist
529
531 """
532 @return: The frequency distribution that this probability
533 distribution is based on.
534 @rtype: C{FreqDist}
535 """
536 return self._freqdist
537
def prob(self, sample):
    """
    @return: the value of C{freq(sample)} reported by the underlying
        frequency distribution.
    @param sample: The sample whose probability should be returned.
    @type sample: any
    """
    fdist = self._freqdist
    return fdist.freq(sample)
540
542 return self._freqdist.max()
543
545 return self._freqdist.keys()
546
548 """
549 @rtype: C{string}
550 @return: A string representation of this C{ProbDist}.
551 """
552 return '<MLEProbDist based on %d samples>' % self._freqdist.N()
553
555 """
556 The Lidstone estimate for the probability distribution of the
557 experiment used to generate a frequency distribution. The
558 C{Lidstone estimate} is parameterized by a real number M{gamma},
559 which typically ranges from 0 to 1. The X{Lidstone estimate}
560 approximates the probability of a sample with count M{c} from an
561 experiment with M{N} outcomes and M{B} bins as
562 M{(c+gamma)/(N+B*gamma)}. This is equivalent to adding
563 M{gamma} to the count for each bin, and taking the maximum
564 likelihood estimate of the resulting frequency distribution.
565 """
566 SUM_TO_ONE = False
def __init__(self, freqdist, gamma, bins=None):
    """
    Use the Lidstone estimate to create a probability distribution
    for the experiment used to generate C{freqdist}.

    @type freqdist: C{FreqDist}
    @param freqdist: The frequency distribution that the
        probability estimates should be based on.
    @type gamma: C{float}
    @param gamma: A real number used to parameterize the
        estimate.  The Lidstone estimate is equivalent to adding
        M{gamma} to the count for each bin, and taking the
        maximum likelihood estimate of the resulting frequency
        distribution.
    @type bins: C{int}
    @param bins: The number of sample values that can be generated
        by the experiment that is described by the probability
        distribution.  This value must be correctly set for the
        probabilities of the sample values to sum to one.  If
        C{bins} is not specified, it defaults to C{freqdist.B()}.
    @raise ValueError: If C{bins} is zero, if C{bins} is omitted and
        C{freqdist} has recorded no outcomes, or if C{bins} is smaller
        than C{freqdist.B()}.
    """
    if (bins == 0) or (bins is None and freqdist.N() == 0):
        # Drop the trailing "ProbDist" so subclasses report their own
        # estimate name in the message.
        name = self.__class__.__name__[:-8]
        raise ValueError('A %s probability distribution ' % name +
                         'must have at least one bin.')
    if (bins is not None) and (bins < freqdist.B()):
        name = self.__class__.__name__[:-8]
        # Report B(), the bin count being compared against, not N().
        raise ValueError('\nThe number of bins in a %s distribution ' % name +
                         '(%d) must be greater than or equal to\n' % bins +
                         'the number of bins in the FreqDist used ' +
                         'to create it (%d).' % freqdist.B())

    self._freqdist = freqdist
    self._gamma = float(gamma)
    self._N = self._freqdist.N()

    if bins is None:
        bins = freqdist.B()
    self._bins = bins
605
607 """
608 @return: The frequency distribution that this probability
609 distribution is based on.
610 @rtype: C{FreqDist}
611 """
612 return self._freqdist
613
def prob(self, sample):
    """
    @return: the Lidstone estimate M{(c+gamma)/(N+B*gamma)} for
        C{sample}, where M{c} is the sample's count in the underlying
        frequency distribution.
    @param sample: The sample whose probability should be returned.
    @type sample: any
    """
    smoothed_count = self._freqdist[sample] + self._gamma
    smoothed_total = self._N + self._bins * self._gamma
    return smoothed_count / smoothed_total
617
619
620
621
622 return self._freqdist.max()
623
625 return self._freqdist.keys()
626
628 gb = self._gamma * self._bins
629 return gb / (self._N + gb)
630
632 """
633 @rtype: C{string}
634 @return: A string representation of this C{ProbDist}.
635 """
636 return '<LidstoneProbDist based on %d samples>' % self._freqdist.N()
637
638
640 """
641 The Laplace estimate for the probability distribution of the
642 experiment used to generate a frequency distribution. The
643 X{Laplace estimate} approximates the probability of a sample with
644 count M{c} from an experiment with M{N} outcomes and M{B} bins as
645 M{(c+1)/(N+B)}. This is equivalent to adding one to the count for
646 each bin, and taking the maximum likelihood estimate of the
647 resulting frequency distribution.
648 """
def __init__(self, freqdist, bins=None):
    """
    Build a Laplace estimate for the experiment that produced
    C{freqdist}, by initializing the Lidstone estimate with a
    gamma of 1.

    @type freqdist: C{FreqDist}
    @param freqdist: The frequency distribution that the
        probability estimates should be based on.
    @type bins: C{int}
    @param bins: The number of sample values that can be generated
        by the experiment that is described by the probability
        distribution.  This value must be correctly set for the
        probabilities of the sample values to sum to one.  If
        C{bins} is not specified, it defaults to C{freqdist.B()}.
    """
    LidstoneProbDist.__init__(self, freqdist, 1, bins=bins)
665
667 """
668 @rtype: C{string}
669 @return: A string representation of this C{ProbDist}.
670 """
671 return '<LaplaceProbDist based on %d samples>' % self._freqdist.N()
672
674 """
675 The expected likelihood estimate for the probability distribution
676 of the experiment used to generate a frequency distribution. The
677 X{expected likelihood estimate} approximates the probability of a
678 sample with count M{c} from an experiment with M{N} outcomes and
679 M{B} bins as M{(c+0.5)/(N+B/2)}. This is equivalent to adding 0.5
680 to the count for each bin, and taking the maximum likelihood
681 estimate of the resulting frequency distribution.
682 """
def __init__(self, freqdist, bins=None):
    """
    Build an expected likelihood estimate for the experiment that
    produced C{freqdist}, by initializing the Lidstone estimate with
    a gamma of 0.5.

    @type freqdist: C{FreqDist}
    @param freqdist: The frequency distribution that the
        probability estimates should be based on.
    @type bins: C{int}
    @param bins: The number of sample values that can be generated
        by the experiment that is described by the probability
        distribution.  This value must be correctly set for the
        probabilities of the sample values to sum to one.  If
        C{bins} is not specified, it defaults to C{freqdist.B()}.
    """
    LidstoneProbDist.__init__(self, freqdist, 0.5, bins=bins)
699
701 """
702 @rtype: C{string}
703 @return: A string representation of this C{ProbDist}.
704 """
705 return '<ELEProbDist based on %d samples>' % self._freqdist.N()
706
708 """
709 The heldout estimate for the probability distribution of the
710 experiment used to generate two frequency distributions. These
711 two frequency distributions are called the "heldout frequency
712 distribution" and the "base frequency distribution." The
713 X{heldout estimate} uses the X{heldout frequency
714 distribution} to predict the probability of each sample, given its
715 frequency in the X{base frequency distribution}.
716
717 In particular, the heldout estimate approximates the probability
718 for a sample that occurs M{r} times in the base distribution as
719 the average frequency in the heldout distribution of all samples
720 that occur M{r} times in the base distribution.
721
722 This average frequency is M{Tr[r]/(Nr[r]*N)}, where:
723 - M{Tr[r]} is the total count in the heldout distribution for
724 all samples that occur M{r} times in the base
725 distribution.
726 - M{Nr[r]} is the number of samples that occur M{r} times in
727 the base distribution.
728 - M{N} is the number of outcomes recorded by the heldout
729 frequency distribution.
730
731 In order to increase the efficiency of the C{prob} member
732 function, M{Tr[r]/(Nr[r]*N)} is precomputed for each value of M{r}
733 when the C{HeldoutProbDist} is created.
734
735 @type _estimate: C{list} of C{float}
736 @ivar _estimate: A list mapping from M{r}, the number of
737 times that a sample occurs in the base distribution, to the
738 probability estimate for that sample. C{_estimate[M{r}]} is
739 calculated by finding the average frequency in the heldout
740 distribution of all samples that occur M{r} times in the base
741 distribution. In particular, C{_estimate[M{r}]} =
742 M{Tr[r]/(Nr[r]*N)}.
743 @type _max_r: C{int}
744 @ivar _max_r: The maximum number of times that any sample occurs
745 in the base distribution. C{_max_r} is used to decide how
746 large C{_estimate} must be.
747 """
748 SUM_TO_ONE = False
def __init__(self, base_fdist, heldout_fdist, bins=None):
    """
    Build a heldout estimate from the two frequency distributions
    C{base_fdist} and C{heldout_fdist}.  The per-count estimates are
    computed once here and stored in C{self._estimate}.

    @type base_fdist: C{FreqDist}
    @param base_fdist: The base frequency distribution.
    @type heldout_fdist: C{FreqDist}
    @param heldout_fdist: The heldout frequency distribution.
    @type bins: C{int}
    @param bins: The number of sample values that can be generated
        by the experiment that is described by the probability
        distribution.  It is passed to C{base_fdist.Nr()}, where it
        determines Nr(0).
    """
    self._base_fdist = base_fdist
    self._heldout_fdist = heldout_fdist

    # Largest count of any sample in the base distribution; it fixes
    # the length of the Tr, Nr and estimate tables.
    most_frequent = base_fdist.max()
    self._max_r = base_fdist[most_frequent]

    # Tr[r]: total heldout count of the samples seen r times in base.
    Tr = self._calculate_Tr()
    # Nr[r]: number of samples seen r times in base.
    Nr = []
    for r in range(self._max_r + 1):
        Nr.append(base_fdist.Nr(r, bins))
    # N: total number of heldout outcomes.
    N = heldout_fdist.N()

    # Precompute the estimate for every count r so that prob() is a
    # plain table lookup.
    self._estimate = self._calculate_estimate(Tr, Nr, N)
781
783 """
784 @return: the list M{Tr}, where M{Tr[r]} is the total count in
785 C{heldout_fdist} for all samples that occur M{r}
786 times in C{base_fdist}.
787 @rtype: C{list} of C{float}
788 """
789 Tr = [0.0] * (self._max_r+1)
790 for sample in self._heldout_fdist:
791 r = self._base_fdist[sample]
792 Tr[r] += self._heldout_fdist[sample]
793 return Tr
794
796 """
797 @return: the list M{estimate}, where M{estimate[r]} is the
798 probability estimate for any sample that occurs M{r} times
799 in the base frequency distribution. In particular,
800 M{estimate[r]} is M{Tr[r]/(N[r]*N)}. In the special case
801 that M{N[r]=0}, M{estimate[r]} will never be used; so we
802 define M{estimate[r]=None} for those cases.
803 @rtype: C{list} of C{float}
804 @type Tr: C{list} of C{float}
805 @param Tr: the list M{Tr}, where M{Tr[r]} is the total count in
806 the heldout distribution for all samples that occur M{r}
807 times in base distribution.
808 @type Nr: C{list} of C{float}
809 @param Nr: The list M{Nr}, where M{Nr[r]} is the number of
810 samples that occur M{r} times in the base distribution.
811 @type N: C{int}
812 @param N: The total number of outcomes recorded by the heldout
813 frequency distribution.
814 """
815 estimate = []
816 for r in range(self._max_r+1):
817 if Nr[r] == 0: estimate.append(None)
818 else: estimate.append(Tr[r]/(Nr[r]*N))
819 return estimate
820
822 """
823 @return: The base frequency distribution that this probability
824 distribution is based on.
825 @rtype: C{FreqDist}
826 """
827 return self._base_fdist
828
830 """
831 @return: The heldout frequency distribution that this
832 probability distribution is based on.
833 @rtype: C{FreqDist}
834 """
835 return self._heldout_fdist
836
838 return self._base_fdist.keys()
839
def prob(self, sample):
    """
    @return: the precomputed estimate for the number of times that
        C{sample} occurs in the base frequency distribution.
    @param sample: The sample whose probability should be returned.
    @type sample: any
    """
    return self._estimate[self._base_fdist[sample]]
844
846
847
848
849 return self._base_fdist.max()
850
852 raise NotImplementedError()
853
855 """
856 @rtype: C{string}
857 @return: A string representation of this C{ProbDist}.
858 """
859 s = '<HeldoutProbDist: %d base samples; %d heldout samples>'
860 return s % (self._base_fdist.N(), self._heldout_fdist.N())
861
863 """
864 The cross-validation estimate for the probability distribution of
865 the experiment used to generate a set of frequency distributions.
866 The X{cross-validation estimate} for the probability of a sample
867 is found by averaging the held-out estimates for the sample in
868 each pair of frequency distributions.
869 """
870 SUM_TO_ONE = False
872 """
873 Use the cross-validation estimate to create a probability
874 distribution for the experiment used to generate
875 C{freqdists}.
876
877 @type freqdists: C{list} of C{FreqDist}
878 @param freqdists: A list of the frequency distributions
879 generated by the experiment.
880 @type bins: C{int}
881 @param bins: The number of sample values that can be generated
882 by the experiment that is described by the probability
883 distribution. This value must be correctly set for the
884 probabilities of the sample values to sum to one. If
885 C{bins} is not specified, it defaults to C{freqdist.B()}.
886 """
887 self._freqdists = freqdists
888
889
890
891 self._heldout_probdists = []
892 for fdist1 in freqdists:
893 for fdist2 in freqdists:
894 if fdist1 is not fdist2:
895 probdist = HeldoutProbDist(fdist1, fdist2, bins)
896 self._heldout_probdists.append(probdist)
897
899 """
900 @rtype: C{list} of C{FreqDist}
901 @return: The list of frequency distributions that this
902 C{ProbDist} is based on.
903 """
904 return self._freqdists
905
907
908 return set(sum([fd.keys() for fd in self._freqdists], []))
909
def prob(self, sample):
    """
    @return: the mean of the probabilities that the stored heldout
        probability distributions assign to C{sample}.
    @param sample: The sample whose probability should be returned.
    @type sample: any
    """
    estimators = self._heldout_probdists
    total = sum((estimator.prob(sample) for estimator in estimators), 0.0)
    return total / len(estimators)
917
919 raise NotImplementedError()
920
922 """
923 @rtype: C{string}
924 @return: A string representation of this C{ProbDist}.
925 """
926 return '<CrossValidationProbDist: %d-way>' % len(self._freqdists)
927
929 """
930 The Witten-Bell estimate of a probability distribution. This distribution
931 allocates uniform probability mass to as yet unseen events by using the
932 number of events that have only been seen once. The probability mass
933 reserved for unseen events is equal to:
934
935 - M{T / (N + T)}
936
937 where M{T} is the number of observed event types and M{N} is the total
938 number of observed events. This equates to the maximum likelihood estimate
939 of a new type event occurring. The remaining probability mass is discounted
940 such that all probability estimates sum to one, yielding:
941
942 - M{p = T / Z (N + T)}, if count = 0
943 - M{p = c / (N + T)}, otherwise
944 """
945
def __init__(self, freqdist, bins=None):
    """
    Creates a distribution of Witten-Bell probability estimates.  This
    distribution allocates uniform probability mass to as yet unseen
    events.  The probability mass reserved for unseen events is equal to:

        - M{T / (N + T)}

    where M{T} is the number of observed event types and M{N} is the total
    number of observed events.  This equates to the maximum likelihood
    estimate of a new type event occurring.  The remaining probability mass
    is discounted such that all probability estimates sum to one,
    yielding:

        - M{p = T / Z (N + T)}, if count = 0
        - M{p = c / (N + T)}, otherwise

    The parameters M{T} and M{N} are taken from the C{freqdist} parameter
    (the C{B()} and C{N()} values).  The normalising factor M{Z} is
    C{bins - freqdist.B()}.

    @param freqdist: The frequency counts upon which to base the
        estimation.
    @type freqdist: C{FreqDist}
    @param bins: The number of possible event types.  This must be
        at least as large as the number of bins in the
        C{freqdist}.  If C{None}, then it's assumed to be
        equal to that of the C{freqdist}
    @type bins: C{Int}
    @raise AssertionError: If C{bins} is smaller than C{freqdist.B()}.
    """
    if bins is None:
        bins = freqdist.B()
    elif bins < freqdist.B():
        # Raised explicitly (rather than via an assert statement) so the
        # check is not stripped under "python -O"; the exception type is
        # kept for existing callers.
        raise AssertionError(
            'Bins parameter must not be less than freqdist.B()')
    self._freqdist = freqdist
    self._T = self._freqdist.B()
    self._Z = bins - self._freqdist.B()
    self._N = self._freqdist.N()
985
def prob(self, sample):
    """
    @return: the Witten-Bell estimate for C{sample}: M{c / (N + T)}
        for a sample with nonzero count M{c}, and M{T / (Z (N + T))}
        for a sample with count zero.  Two degenerate cases are
        handled for count-zero samples instead of dividing by zero:
        when M{Z} is zero (no unseen bins exist) the result is 0.0,
        and when nothing has been observed (M{N + T} is zero) the
        mass is spread uniformly over the M{Z} bins.
    @rtype: float
    @param sample: The sample whose probability should be returned.
    @type sample: any
    """
    c = self._freqdist[sample]
    if c != 0:
        return c / float(self._N + self._T)

    if self._Z == 0:
        # Every possible bin has been observed, so there is no unseen
        # bin to give probability mass to.
        return 0.0
    if self._N + self._T == 0:
        # Nothing observed at all: T / (Z (N + T)) would be 0/0, so
        # treat each of the Z bins as equally likely.
        return 1.0 / self._Z
    return self._T / float(self._Z * (self._N + self._T))
993
995 return self._freqdist.max()
996
998 return self._freqdist.keys()
999
1001 return self._freqdist
1002
1004 raise NotImplementedError()
1005
1007 """
1008 @rtype: C{string}
1009 @return: A string representation of this C{ProbDist}.
1010 """
1011 return '<WittenBellProbDist based on %d samples>' % self._freqdist.N()
1012
1014 """
1015 The Good-Turing estimate of a probability distribution. This method
1016 calculates the probability mass to assign to events with zero or low
1017 counts based on the number of events with higher counts. It does so by
1018 using the smoothed count M{c*}:
1019
1020 - M{c* = (c + 1) N(c + 1) / N(c)}
1021
1022 where M{c} is the original count, M{N(i)} is the number of event types
1023 observed with count M{i}. These smoothed counts are then normalised to
1024 yield a probability distribution.
1025 """
1026
1027
1028
def __init__(self, freqdist, bins=None):
    """
    Creates a Good-Turing probability distribution estimate.  This method
    calculates the probability mass to assign to events with zero or low
    counts based on the number of events with higher counts.  It does so by
    using the smoothed count M{c*}:

        - M{c* = (c + 1) N(c + 1) / N(c)}

    where M{c} is the original count, M{N(i)} is the number of event types
    observed with count M{i}.  These smoothed counts are then normalised to
    yield a probability distribution.

    The C{bins} parameter allows C{N(0)} to be estimated.

    @param freqdist: The frequency counts upon which to base the
        estimation.
    @type freqdist: C{FreqDist}
    @param bins: The number of possible event types.  This must be
        at least as large as the number of bins in the
        C{freqdist}.  If C{None}, then it's taken to be
        equal to C{freqdist.B()}.
    @type bins: C{Int}
    @raise AssertionError: If C{bins} is smaller than C{freqdist.B()}.
    """
    if bins is None:
        bins = freqdist.B()
    elif bins < freqdist.B():
        # Raised explicitly (rather than via an assert statement) so the
        # check is not stripped under "python -O"; the exception type is
        # kept for existing callers.
        raise AssertionError(
            'Bins parameter must not be less than freqdist.B()')
    self._freqdist = freqdist
    self._bins = bins
1059
def prob(self, sample):
    """
    @return: The Good-Turing estimate for C{sample}:
        M{(c + 1) N(c + 1) / (N(c) N)}, where M{c} is the sample's
        count, M{N(i)} is the number of event types with count M{i}
        and M{N} is the total number of outcomes.  C{0.0} is
        returned when M{N(c)} or M{N} is zero.
    @rtype: C{float}
    @param sample: The sample whose probability should be returned.
    @type sample: any
    """
    fdist = self._freqdist
    count = fdist[sample]
    types_at_count = fdist.Nr(count, self._bins)
    types_at_next = fdist.Nr(count + 1, self._bins)

    # Guard both factors of the denominator so that sparse or empty
    # data yields 0.0 instead of a ZeroDivisionError.
    if types_at_count == 0:
        return 0.0
    total = fdist.N()
    if total == 0:
        return 0.0

    return float(count + 1) * types_at_next / (types_at_count * total)
1071
1073 return self._freqdist.max()
1074
1076 return self._freqdist.keys()
1077
1079 raise NotImplementedError()
1080
1082 return self._freqdist
1083
1085 """
1086 @rtype: C{string}
1087 @return: A string representation of this C{ProbDist}.
1088 """
1089 return '<GoodTuringProbDist based on %d samples>' % self._freqdist.N()
1090
1092 """
1093 A mutable probdist where the probabilities may be easily modified. This
1094 simply copies an existing probdist, storing the probability values in a
1095 mutable dictionary and providing an update method.
1096 """
1097
def __init__(self, prob_dist, samples, store_logs=True):
    """
    Create the mutable probdist based on the given prob_dist and using
    the list of samples given.  The values are stored as (base 2) log
    probabilities if the store_logs flag is set, and as plain
    probabilities otherwise.

    If numpy cannot be imported, an error message is printed and
    C{SystemExit} is raised.

    @param prob_dist: the distribution from which to garner the
        probabilities
    @type prob_dist: ProbDist
    @param samples: the complete set of samples
    @type samples: sequence of any
    @param store_logs: whether to store the probabilities as logarithms
    @type store_logs: bool
    """
    try:
        import numpy
    except ImportError:
        print("Error: Please install numpy; for instructions see http://nltk.org/install.html")
        # Raised directly instead of calling exit(), which only exists
        # when the ``site`` module has been loaded.
        raise SystemExit()
    self._samples = samples
    # Maps each sample to its slot in self._data; for a repeated
    # sample the last occurrence wins.
    self._sample_dict = dict((sample, i) for i, sample in enumerate(samples))
    self._data = numpy.zeros(len(samples), numpy.float64)
    for i, sample in enumerate(samples):
        if store_logs:
            self._data[i] = prob_dist.logprob(sample)
        else:
            self._data[i] = prob_dist.prob(sample)
    self._logs = store_logs
1126
1128
1129 return self._samples
1130
def prob(self, sample):
    """
    @return: The stored probability of C{sample}, or C{0.0} if the
        sample is not one of the samples this distribution was
        built from.  When values are stored as logs, the stored
        value M{v} is converted back with M{2**v}.
    @param sample: the sample whose probability should be returned
    @type sample: C{any}
    """
    i = self._sample_dict.get(sample)
    # Identity test: index 0 is a valid slot and must not be confused
    # with "missing".
    if i is None:
        return 0.0
    if self._logs:
        return 2**(self._data[i])
    return self._data[i]
1141
1143
1144 i = self._sample_dict.get(sample)
1145 if i != None:
1146 if self._logs:
1147 return self._data[i]
1148 else:
1149 return math.log(self._data[i], 2)
1150 else:
1151 return float('-inf')
1152
def update(self, sample, prob, log=True):
    """
    Update the probability for the given sample.  This may cause the
    object to stop being a valid probability distribution - the user
    must ensure that they update the sample probabilities such that
    all samples have probabilities between 0 and 1 and that all
    probabilities sum to one.

    @param sample: the sample for which to update the probability
    @type sample: C{any}
    @param prob: the new probability
    @type prob: C{float}
    @param log: is the probability already logged (base 2)
    @type log: C{bool}
    @raise AssertionError: if C{sample} is not one of the samples
        this distribution was built from.
    """
    i = self._sample_dict.get(sample)
    if i is None:
        # Raised explicitly rather than via ``assert``: with
        # assertions stripped (python -O) execution would continue and
        # index self._data with None, which on a numpy array assigns
        # to every element.
        raise AssertionError('unknown sample: %r' % (sample,))

    # Convert the incoming value to the representation used for
    # storage (log2 when self._logs is set, plain otherwise).
    if self._logs:
        if not log:
            prob = math.log(prob, 2)
    elif log:
        prob = 2**(prob)
    self._data[i] = prob
1176
1177
1178
1179
1180
1182 if (not isinstance(test_pdist, ProbDistI) or
1183 not isinstance(actual_pdist, ProbDistI)):
1184 raise ValueError('expected a ProbDist.')
1185
1186 return sum(actual_pdist.prob(s) * math.log(test_pdist.prob(s), 2)
1187 for s in actual_pdist.keys())
1188
1190 probs = [pdist.prob(s) for s in pdist.samples()]
1191 return -sum([p * math.log(p,2) for p in probs])
1192
1193
1194
1195
1196
1198 """
1199 A collection of frequency distributions for a single experiment
1200 run under different conditions. Conditional frequency
1201 distributions are used to record the number of times each sample
1202 occurred, given the condition under which the experiment was run.
1203 For example, a conditional frequency distribution could be used to
1204 record the frequency of each word (type) in a document, given its
1205 length. Formally, a conditional frequency distribution can be
1206 defined as a function that maps from each condition to the
1207 C{FreqDist} for the experiment under that condition.
1208
1209 The frequency distribution for each condition is accessed using
1210 the indexing operator:
1211
1212 >>> cfdist[3]
1213 <FreqDist with 73 outcomes>
1214 >>> cfdist[3].freq('the')
1215 0.4
1216 >>> cfdist[3]['dog']
1217 2
1218
1219 When the indexing operator is used to access the frequency
1220 distribution for a condition that has not been accessed before,
1221 C{ConditionalFreqDist} creates a new empty C{FreqDist} for that
1222 condition.
1223
1224 Conditional frequency distributions are typically constructed by
1225 repeatedly running an experiment under a variety of conditions,
1226 and incrementing the sample outcome counts for the appropriate
1227 conditions. For example, the following code will produce a
1228 conditional frequency distribution that encodes how often each
1229 word type occurs, given the length of that word type:
1230
1231 >>> cfdist = ConditionalFreqDist()
1232 >>> for word in tokenize.whitespace(sent):
1233 ... condition = len(word)
1234 ... cfdist[condition].inc(word)
1235
1236 An equivalent way to do this is with the initializer:
1237
1238 >>> cfdist = ConditionalFreqDist((len(word), word) for word in tokenize.whitespace(sent))
1239
1240 """
def __init__(self, cond_samples=None):
    """
    Build a conditional frequency distribution.  Without
    C{cond_samples} the distribution starts out empty, i.e. the count
    for every sample, under every condition, is zero.  Otherwise each
    C{(condition, sample)} pair is counted once under its condition.

    @param cond_samples: The samples to initialize the conditional
        frequency distribution with
    @type cond_samples: Sequence of (condition, sample) tuples
    """
    self._fdists = {}
    # A false value (None, empty sequence) contributes no samples.
    for pair in (cond_samples or ()):
        cond, sample = pair
        self[cond].inc(sample)
1254
1256 """
1257 Return the frequency distribution that encodes the frequency
1258 of each sample outcome, given that the experiment was run
1259 under the given condition. If the frequency distribution for
1260 the given condition has not been accessed before, then this
1261 will create a new empty C{FreqDist} for that condition.
1262
1263 @return: The frequency distribution that encodes the frequency
1264 of each sample outcome, given that the experiment was run
1265 under the given condition.
1266 @rtype: C{FreqDist}
1267
1268 @param condition: The condition under which the experiment was
1269 run.
1270 @type condition: any
1271 """
1272
1273 if condition not in self._fdists:
1274 self._fdists[condition] = FreqDist()
1275 return self._fdists[condition]
1276
1278 """
1279 @return: A list of the conditions that have been accessed for
1280 this C{ConditionalFreqDist}. Use the indexing operator to
1281 access the frequency distribution for a given condition.
1282 Note that the frequency distributions for some conditions
1283 may contain zero sample outcomes.
1284 @rtype: C{list}
1285 """
1286 return self._fdists.keys()
1287
1289 """
1290 @return: The number of conditions that have been accessed
1291 for this C{ConditionalFreqDist}.
1292 @rtype: C{int}
1293 """
1294 return len(self._fdists)
1295
1297 """
1298 @return: A string representation of this
1299 C{ConditionalFreqDist}.
1300 @rtype: C{string}
1301 """
1302 n = len(self._fdists)
1303 return '<ConditionalFreqDist with %d conditions>' % n
1304
1306 """
1307 A collection of probability distributions for a single experiment
1308 run under different conditions. Conditional probability
1309 distributions are used to estimate the likelihood of each sample,
1310 given the condition under which the experiment was run. For
1311 example, a conditional probability distribution could be used to
1312 estimate the probability of each word type in a document, given
1313 the length of the word type. Formally, a conditional probability
1314 distribution can be defined as a function that maps from each
1315 condition to the C{ProbDist} for the experiment under that
1316 condition.
1317 """
1319 raise AssertionError, 'ConditionalProbDistI is an interface'
1320
1322 """
1323 @return: The probability distribution for the experiment run
1324 under the given condition.
1325 @rtype: C{ProbDistI}
1326 @param condition: The condition whose probability distribution
1327 should be returned.
1328 @type condition: any
1329 """
1330 raise AssertionError
1331
1333 """
1334 @return: The number of conditions that are represented by
1335 this C{ConditionalProbDist}.
1336 @rtype: C{int}
1337 """
1338 raise AssertionError
1339
1341 """
1342 @return: A list of the conditions that are represented by
1343 this C{ConditionalProbDist}. Use the indexing operator to
1344 access the probability distribution for a given condition.
1345 @rtype: C{list}
1346 """
1347 raise AssertionError
1348
1349
1350
1351
1352
1353
1355 """
1356 A conditional probability distribution modelling the experiments
1357 that were used to generate a conditional frequency distribution.
1358 A C{ConditionalProbDist} is constructed from a
1359 C{ConditionalFreqDist} and a X{C{ProbDist} factory}:
1360
1361 - The B{C{ConditionalFreqDist}} specifies the frequency
1362 distribution for each condition.
1363 - The B{C{ProbDist} factory} is a function that takes a
1364 condition's frequency distribution, and returns its
1365 probability distribution. A C{ProbDist} class's name (such as
1366 C{MLEProbDist} or C{HeldoutProbDist}) can be used to specify
1367 that class's constructor.
1368
1369 The first argument to the C{ProbDist} factory is the frequency
1370 distribution that it should model; and the remaining arguments are
1371 specified by the C{factory_args} parameter to the
1372 C{ConditionalProbDist} constructor. For example, the following
1373 code constructs a C{ConditionalProbDist}, where the probability
1374 distribution for each condition is an C{ELEProbDist} with 10 bins:
1375
1376 >>> cpdist = ConditionalProbDist(cfdist, ELEProbDist, False, 10)
1377 >>> print cpdist['run'].max()
1378 'NN'
1379 >>> print cpdist['run'].prob('NN')
1380 0.0813
1381 """
def __init__(self, cfdist, probdist_factory,
             supply_condition=False, *factory_args):
    """
    Build a conditional probability distribution from a conditional
    frequency distribution and a C{ProbDist} factory.  One
    probability distribution is created up front for every condition
    reported by C{cfdist.conditions()}.

    @type cfdist: L{ConditionalFreqDist}
    @param cfdist: The C{ConditionalFreqDist} specifying the
        frequency distribution for each condition.
    @type probdist_factory: C{class} or C{function}
    @param probdist_factory: The function or class that maps
        a condition's frequency distribution to its probability
        distribution.  It is called with the frequency distribution
        as its first argument, the condition as its second argument
        (only if C{supply_condition} is true), and C{factory_args}
        as its remaining arguments.
    @type supply_condition: C{bool}
    @param supply_condition: If true, then pass the condition as
        the second argument to C{probdist_factory}.
    @type factory_args: (any)
    @param factory_args: Extra arguments for C{probdist_factory}.
        These arguments are usually used to specify extra
        properties for the probability distributions of individual
        conditions, such as the number of bins they contain.
    """
    self._probdist_factory = probdist_factory
    self._cfdist = cfdist
    self._supply_condition = supply_condition
    self._factory_args = factory_args

    pdists = self._pdists = {}
    for condition in cfdist.conditions():
        fdist = cfdist[condition]
        # Assemble the positional arguments first, then make a single
        # factory call for either calling convention.
        if supply_condition:
            leading = (fdist, condition)
        else:
            leading = (fdist,)
        pdists[condition] = probdist_factory(*(leading + factory_args))
1420
1422 return condition in self._pdists
1423
def __getitem__(self, condition):
    """
    @return: The probability distribution for the experiment run
        under the given condition.  For a condition that has no
        distribution yet, one is created from an empty C{FreqDist}
        with the stored factory and cached for later lookups.
    @param condition: The condition whose probability distribution
        should be returned.
    @type condition: any
    """
    if condition not in self._pdists:
        # Use the same calling convention as the constructor: the
        # condition is passed as the second argument when
        # supply_condition was requested.
        if self._supply_condition:
            pdist = self._probdist_factory(FreqDist(), condition,
                                           *self._factory_args)
        else:
            pdist = self._probdist_factory(FreqDist(), *self._factory_args)
        self._pdists[condition] = pdist

    return self._pdists[condition]
1433
1435 return self._pdists.keys()
1436
1438 return len(self._pdists)
1439
1441 """
1442 @return: A string representation of this
1443 C{ConditionalProbDist}.
1444 @rtype: C{string}
1445 """
1446 return '<ConditionalProbDist with %d conditions>' % self.__len__()
1447
1448
1450 """
1451 An alternative ConditionalProbDist that simply wraps a dictionary of
1452 ProbDists rather than creating these from FreqDists.
1453 """
1454
1456 """
1457 @param probdist_dict: a dictionary containing the probdists indexed
1458 by the conditions
1459 @type probdist_dict: dict any -> probdist
1460 """
1461 self._dict = probdist_dict
1462
1464
1465
1466 return self._dict[condition]
1467
1469
1470 return self._dict.keys()
1471
1472
1473
1474
1475
1476
1477 _ADD_LOGS_MAX_DIFF = math.log(1e-30, 2)
1478
1480 """
1481 Given two numbers C{logx}=M{log(x)} and C{logy}=M{log(y)}, return
1482 M{log(x+y)}. Conceptually, this is the same as returning
1483 M{log(2**(C{logx})+2**(C{logy}))}, but the actual implementation
1484 avoids overflow errors that could result from direct computation.
1485 """
1486 if (logx < logy + _ADD_LOGS_MAX_DIFF):
1487 return logy
1488 if (logy < logx + _ADD_LOGS_MAX_DIFF):
1489 return logx
1490 base = min(logx, logy)
1491 return base + math.log(2**(logx-base) + 2**(logy-base), 2)
1492
1494 if len(logs) == 0:
1495
1496
1497 return _NINF
1498 else:
1499 return reduce(add_logs, logs[1:], logs[0])
1500
1501
1502
1503
1504
1506 """
1507 A mix-in class to associate probabilities with other classes
1508 (trees, rules, etc.). To use the C{ProbabilisticMixIn} class,
1509 define a new class that derives from an existing class and from
1510 ProbabilisticMixIn. You will need to define a new constructor for
1511 the new class, which explicitly calls the constructors of both its
1512 parent classes. For example:
1513
1514 >>> class A:
1515 ... def __init__(self, x, y): self.data = (x,y)
1516 ...
1517 >>> class ProbabilisticA(A, ProbabilisticMixIn):
1518 ... def __init__(self, x, y, **prob_kwarg):
1519 ... A.__init__(self, x, y)
1520 ... ProbabilisticMixIn.__init__(self, **prob_kwarg)
1521
1522 See the documentation for the ProbabilisticMixIn
1523 L{constructor<__init__>} for information about the arguments it
1524 expects.
1525
1526 You should generally also redefine the string representation
1527 methods, the comparison methods, and the hashing method.
1528 """
1530 """
1531 Initialize this object's probability. This initializer should
1532 be called by subclass constructors. C{prob} should generally be
1533 the first argument for those constructors.
1534
1535 @kwparam prob: The probability associated with the object.
1536 @type prob: C{float}
1537 @kwparam logprob: The log of the probability associated with
1538 the object.
1539 @type logprob: C{float}
1540 """
1541 if 'prob' in kwargs:
1542 if 'logprob' in kwargs:
1543 raise TypeError('Must specify either prob or logprob '
1544 '(not both)')
1545 else:
1546 ProbabilisticMixIn.set_prob(self, kwargs['prob'])
1547 elif 'logprob' in kwargs:
1548 ProbabilisticMixIn.set_logprob(self, kwargs['logprob'])
1549 else:
1550 self.__prob = self.__logprob = None
1551
1553 """
1554 Set the probability associated with this object to C{prob}.
1555 @param prob: The new probability
1556 @type prob: C{float}
1557 """
1558 self.__prob = prob
1559 self.__logprob = None
1560
def set_logprob(self, logprob):
    """
    Set the log probability associated with this object to
    C{logprob}.  I.e., set the probability associated with this
    object to C{2**(logprob)}.

    @param logprob: The new log probability
    @type logprob: C{float}
    """
    # Store the argument itself; the plain probability is cleared so
    # that it is recomputed from the new log value when next needed.
    self.__logprob = logprob
    self.__prob = None
1571
1573 """
1574 @return: The probability associated with this object.
1575 @rtype: C{float}
1576 """
1577 if self.__prob is None:
1578 if self.__logprob is None: return None
1579 self.__prob = 2**(self.__logprob)
1580 return self.__prob
1581
1583 """
1584 @return: C{log(p)}, where C{p} is the probability associated
1585 with this object.
1586
1587 @rtype: C{float}
1588 """
1589 if self.__logprob is None:
1590 if self.__prob is None: return None
1591 self.__logprob = math.log(self.__prob, 2)
1592 return self.__logprob
1593
1596 raise ValueError, '%s is immutable' % self.__class__.__name__
1598 raise ValueError, '%s is immutable' % self.__class__.__name__
1599
1600
1601
1602
1603
1605 """
1606 Create a new frequency distribution, with random samples. The
1607 samples are numbers from 1 to C{numsamples}, and are generated by
1608 summing two numbers, each of which has a uniform distribution.
1609 """
1610 import random
1611 from math import sqrt
1612 fdist = FreqDist()
1613 for x in range(numoutcomes):
1614 y = (random.randint(1, (1+numsamples)/2) +
1615 random.randint(0, numsamples/2))
1616 fdist.inc(y)
1617 return fdist
1618
1620 """
1621 Return the true probability distribution for the experiment
1622 C{_create_rand_fdist(numsamples, x)}.
1623 """
1624 fdist = FreqDist()
1625 for x in range(1, (1+numsamples)/2+1):
1626 for y in range(0, numsamples/2+1):
1627 fdist.inc(x+y)
1628 return MLEProbDist(fdist)
1629
def demo(numsamples=6, numoutcomes=500):
    """
    A demonstration of frequency distributions and probability
    distributions.  This demonstration samples a random process with
    C{numsamples} samples three times, recording each run in its own
    frequency distribution.  Each frequency distribution is sampled
    C{numoutcomes} times.  These three frequency distributions are
    then used to build six probability distributions.  Finally, the
    probability estimates of these distributions are compared to the
    actual probability of each sample, and every distribution is used
    to generate a fresh batch of random samples.

    @type numsamples: C{int}
    @param numsamples: The number of samples to use in each demo
        frequency distributions.
    @type numoutcomes: C{int}
    @param numoutcomes: The total number of outcomes for each
        demo frequency distribution.  These outcomes are divided into
        C{numsamples} bins.
    @rtype: C{None}
    """
    # Randomly sample a stochastic process three times.
    fdist1 = _create_rand_fdist(numsamples, numoutcomes)
    fdist2 = _create_rand_fdist(numsamples, numoutcomes)
    fdist3 = _create_rand_fdist(numsamples, numoutcomes)

    # Build the probability distributions; the last entry is the
    # reference ("Actual") distribution.
    pdists = [
        MLEProbDist(fdist1),
        LidstoneProbDist(fdist1, 0.5, numsamples),
        HeldoutProbDist(fdist1, fdist2, numsamples),
        HeldoutProbDist(fdist2, fdist1, numsamples),
        CrossValidationProbDist([fdist1, fdist2, fdist3], numsamples),
        _create_sum_pdist(numsamples),
        ]

    # One row per sample: (sample, observed frequency, estimates...).
    vals = []
    for n in range(1, numsamples+1):
        vals.append(tuple([n, fdist1.freq(n)] +
                          [pdist.prob(n) for pdist in pdists]))

    # Every print below uses the single-argument parenthesised form,
    # which produces identical output as a Python 2 print statement
    # and as a Python 3 function call.
    double_rule = '='*9*(len(pdists)+2)
    single_rule = '-'*9*(len(pdists)+2)

    # Print the results in a formatted table.
    print('%d samples (1-%d); %d outcomes were sampled for each FreqDist' %
          (numsamples, numsamples, numoutcomes))
    print(double_rule)
    FORMATSTR = ' FreqDist '+ '%8s '*(len(pdists)-1) + '| Actual'
    print(FORMATSTR % tuple(repr(pdist)[1:9] for pdist in pdists[:-1]))
    print(single_rule)
    FORMATSTR = '%3d %8.6f ' + '%8.6f '*(len(pdists)-1) + '| %8.6f'
    for val in vals:
        print(FORMATSTR % val)

    # Column totals, skipping the first column (the sample itself).
    # The zip result is turned into a list so that it can be sliced.
    columns = list(zip(*vals))
    sums = [sum(column) for column in columns[1:]]
    print(single_rule)
    FORMATSTR = 'Total ' + '%8.6f '*(len(pdists)) + '| %8.6f'
    print(FORMATSTR % tuple(sums))
    print(double_rule)

    # Only display the frequency distributions when they are short.
    if len(repr(str(fdist1))) < 70:
        print(' fdist1: %s' % str(fdist1))
        print(' fdist2: %s' % str(fdist2))
        print(' fdist3: %s' % str(fdist3))
    print('')

    print('Generating:')
    for pdist in pdists:
        fdist = FreqDist(pdist.generate() for i in range(5000))
        print('%20s %s' % (pdist.__class__.__name__[:20], str(fdist)[:55]))
    print('')
1704
1705 if __name__ == '__main__':
1706 demo(6, 10)
1707 demo(5, 5000)
1708
1709 __all__ = ['ConditionalFreqDist', 'ConditionalProbDist',
1710 'ConditionalProbDistI', 'CrossValidationProbDist',
1711 'DictionaryConditionalProbDist', 'DictionaryProbDist', 'ELEProbDist',
1712 'FreqDist', 'GoodTuringProbDist', 'HeldoutProbDist',
1713 'ImmutableProbabilisticMixIn', 'LaplaceProbDist', 'LidstoneProbDist',
1714 'MLEProbDist', 'MutableProbDist', 'ProbDistI', 'ProbabilisticMixIn',
1715 'UniformProbDist', 'WittenBellProbDist', 'add_logs', 'demo',
1716 'log_likelihood', 'sum_logs', 'entropy']
1717