nltk.probability

1 # Natural Language Toolkit: Probability and Statistics 2 # 3 # Copyright (C) 2001-2008 NLTK Project 4 # Author: Edward Loper <[email protected]> 5 # Steven Bird <[email protected]> (additions) 6 # Trevor Cohn <[email protected]> (additions) 7 # URL: <http://nltk.org> 8 # For license information, see LICENSE.TXT 9 # 10 # $Id: probability.py 6445 2008-08-16 07:46:34Z stevenbird $ 11 12 _NINF = float('-1e300') 13 14 """ 15 Classes for representing and processing probabilistic information. 16 17 The L{FreqDist} class is used to encode X{frequency distributions}, 18 which count the number of times that each outcome of an experiment 19 occurs. 20 21 The L{ProbDistI} class defines a standard interface for X{probability 22 distributions}, which encode the probability of each outcome for an 23 experiment. There are two types of probability distribution: 24 25 - X{derived probability distributions} are created from frequency 26 distributions. They attempt to model the probability distribution 27 that generated the frequency distribution. 28 - X{analytic probability distributions} are created directly from 29 parameters (such as variance). 30 31 The L{ConditionalFreqDist} class and L{ConditionalProbDistI} interface 32 are used to encode conditional distributions. Conditional probability 33 distributions can be derived or analytic; but currently the only 34 implementation of the C{ConditionalProbDistI} interface is 35 L{ConditionalProbDist}, a derived distribution. 36 37 """ 38 39 import math 40 import random 41 import warnings 42 43 ##////////////////////////////////////////////////////// 44 ## Frequency Distributions 45 ##////////////////////////////////////////////////////// 46

47 -class FreqDist(dict):

48 """ 49 A frequency distribution for the outcomes of an experiment. A 50 frequency distribution records the number of times each outcome of 51 an experiment has occurred. For example, a frequency distribution 52 could be used to record the frequency of each word type in a 53 document. Formally, a frequency distribution can be defined as a 54 function mapping from each sample to the number of times that 55 sample occurred as an outcome. 56 57 Frequency distributions are generally constructed by running a 58 number of experiments, and incrementing the count for a sample 59 every time it is an outcome of an experiment. For example, the 60 following code will produce a frequency distribution that encodes 61 how often each word occurs in a text: 62 63 >>> fdist = FreqDist() 64 >>> for word in tokenize.whitespace(sent): 65 ... fdist.inc(word.lower()) 66 67 An equivalent way to do this is with the initializer: 68 69 >>> fdist = FreqDist(word.lower() for word in tokenize.whitespace(sent)) 70 71 """

72 - def __init__(self, samples=None):

73 """ 74 Construct a new frequency distribution. If C{samples} is 75 given, then the frequency distribution will be initialized 76 with the count of each object in C{samples}; otherwise, it 77 will be initialized to be empty. 78 79 In particular, C{FreqDist()} returns an empty frequency 80 distribution; and C{FreqDist(samples)} first creates an empty 81 frequency distribution, and then calls C{inc} for each element 82 in the list C{samples}. 83 84 @param samples: The samples to initialize the frequency 85 distribution with. 86 @type samples: Sequence 87 """ 88 dict.__init__(self) 89 self._N = 0 90 self._Nr_cache = None 91 self._max_cache = None 92 if samples: 93 for sample in samples: 94 self.inc(sample)

95

96 - def inc(self, sample, count=1):

97 """ 98 Increment this C{FreqDist}'s count for the given 99 sample. 100 101 @param sample: The sample whose count should be incremented. 102 @type sample: any 103 @param count: The amount to increment the sample's count by. 104 @type count: C{int} 105 @rtype: None 106 @raise NotImplementedError: If C{sample} is not a 107 supported sample type. 108 """ 109 if count == 0: return 110 111 self._N += count 112 self[sample] = self.get(sample,0) + count 113 114 # Invalidate the Nr cache and max cache. 115 self._Nr_cache = None 116 self._max_cache = None

117

118 - def N(self):

119 """ 120 @return: The total number of sample outcomes that have been 121 recorded by this C{FreqDist}. For the number of unique 122 sample values (or bins) with counts greater than zero, use 123 C{FreqDist.B()}. 124 @rtype: C{int} 125 """ 126 return self._N

127

128 - def B(self):

129 """ 130 @return: The total number of sample values (or X{bins}) that 131 have counts greater than zero. For the total 132 number of sample outcomes recorded, use C{FreqDist.N()}. 133 @rtype: C{int} 134 """ 135 return len(self)

136

137 - def samples(self):

138 """ 139 @return: A list of all samples that have been recorded as 140 outcomes by this frequency distribution. Use C{count()} 141 to determine the count for each sample. 142 @rtype: C{list} 143 """ 144 return self.keys()

145

146 - def Nr(self, r, bins=None):

147 """ 148 @return: The number of samples with count r. 149 @rtype: C{int} 150 @type r: C{int} 151 @param r: A sample count. 152 @type bins: C{int} 153 @param bins: The number of possible sample outcomes. C{bins} 154 is used to calculate Nr(0). In particular, Nr(0) is 155 C{bins-self.B()}. If C{bins} is not specified, it 156 defaults to C{self.B()} (so Nr(0) will be 0). 157 """ 158 if r < 0: raise IndexError, 'FreqDist.Nr(): r must be non-negative' 159 160 # Special case for Nr(0): 161 if r == 0: 162 if bins is None: return 0 163 else: return bins-self.B() 164 165 # We have to search the entire distribution to find Nr. Since 166 # this is an expensive operation, and is likely to be used 167 # repeatedly, cache the results. 168 if self._Nr_cache is None: 169 self._cache_Nr_values() 170 171 if r >= len(self._Nr_cache): return 0 172 return self._Nr_cache[r]

173

174 - def _cache_Nr_values(self):

175 Nr = [0] 176 for sample in self: 177 c = self.get(sample, 0) 178 if c >= len(Nr): 179 Nr += [0]*(c+1-len(Nr)) 180 Nr[c] += 1 181 self._Nr_cache = Nr

182

183 - def count(self, sample):

184 """ 185 Return the count of a given sample. The count of a sample is 186 defined as the number of times that sample outcome was 187 recorded by this C{FreqDist}. Counts are non-negative 188 integers. This method has been replaced by conventional 189 dictionary indexing; use fd[item] instead of fd.count(item). 190 191 @return: The count of a given sample. 192 @rtype: C{int} 193 @param sample: the sample whose count 194 should be returned. 195 @type sample: any. 196 """ 197 raise AttributeError, "Use indexing to look up an entry in a FreqDist, e.g. fd[item]"

198

199 - def freq(self, sample):

200 """ 201 Return the frequency of a given sample. The frequency of a 202 sample is defined as the count of that sample divided by the 203 total number of sample outcomes that have been recorded by 204 this C{FreqDist}. The count of a sample is defined as the 205 number of times that sample outcome was recorded by this 206 C{FreqDist}. Frequencies are always real numbers in the range 207 [0, 1]. 208 209 @return: The frequency of a given sample. 210 @rtype: float 211 @param sample: the sample whose frequency 212 should be returned. 213 @type sample: any 214 """ 215 if self._N is 0: 216 return 0 217 return float(self[sample]) / self._N

218

219 - def max(self):

220 """ 221 Return the sample with the greatest number of outcomes in this 222 frequency distribution. If two or more samples have the same 223 number of outcomes, return one of them; which sample is 224 returned is undefined. If no outcomes have occurred in this 225 frequency distribution, return C{None}. 226 227 @return: The sample with the maximum number of outcomes in this 228 frequency distribution. 229 @rtype: any or C{None} 230 """ 231 if self._max_cache is None: 232 best_sample = None 233 best_count = -1 234 for sample in self: 235 if self[sample] > best_count: 236 best_sample = sample 237 best_count = self[sample] 238 self._max_cache = best_sample 239 return self._max_cache

240

241 - def sorted_samples(self):

242 raise AttributeError, "Use FreqDist.sorted() to get the sorted samples"

243

244 - def plot(self, samples=None, *args, **kwargs):

245 """ 246 Plot the given samples from the frequency distribution. 247 If no samples are specified, use all samples, in lexical sort order. 248 (Requires Matplotlib to be installed.) 249 250 @param samples: The samples to plot. 251 @type samples: C{list} 252 """ 253 try: 254 import pylab 255 except ImportError: 256 raise ValueError('The plot function requires the matplotlib package.' 257 'See http://matplotlib.sourceforge.net/') 258 if not samples: 259 samples = sorted(self.samples()) 260 values = [self[sample] for sample in samples] 261 if not args: 262 args = ["bo"] 263 pylab.grid(True, color="silver") 264 pylab.semilogy(values, *args, **kwargs) 265 pylab.xticks(range(len(samples)), samples, rotation=45, color="b") 266 pylab.show()

267

268 - def zipf_plot(self, num=40, *args, **kwargs):

269 """ 270 Plot the most frequent samples of the frequency distribution. 271 (Requires Matplotlib to be installed.) 272 273 @param num: The number of samples to plot. 274 @type num: C{int} 275 """ 276 samples = self.sorted()[:num] 277 self.plot(samples, *args, **kwargs)

278 279 # SB: cache the sorted samples?

280 - def sorted(self):

281 """ 282 Return the samples sorted in decreasing order of frequency. Instances 283 with the same count will be arbitrarily ordered. Instances with a 284 count of zero will be omitted. This method is C{O(N^2)}, where C{N} is 285 the number of samples, but will complete in a shorter time on average. 286 287 @return: The set of samples in sorted order. 288 @rtype: sequence of any 289 """ 290 from operator import itemgetter 291 return [sample for (sample, count) in sorted(self.items(), key=itemgetter(1), reverse=True)]

292

293 - def __repr__(self):

294 """ 295 @return: A string representation of this C{FreqDist}. 296 @rtype: string 297 """ 298 return '<FreqDist with %d samples>' % self.N()

299

300 - def __str__(self):

301 """ 302 @return: A string representation of this C{FreqDist}. 303 @rtype: string 304 """ 305 items = ['%r: %r' % (s, self[s]) for s in self.sorted()] 306 return '<FreqDist: %s>' % ', '.join(items)

307

308 - def __getitem__(self, sample):

309 return self.get(sample, 0)

310 311 ##////////////////////////////////////////////////////// 312 ## Probability Distributions 313 ##////////////////////////////////////////////////////// 314

315 -class ProbDistI(object):

316 """ 317 A probability distribution for the outcomes of an experiment. A 318 probability distribution specifies how likely it is that an 319 experiment will have any given outcome. For example, a 320 probability distribution could be used to predict the probability 321 that a token in a document will have a given type. Formally, a 322 probability distribution can be defined as a function mapping from 323 samples to nonnegative real numbers, such that the sum of every 324 number in the function's range is 1.0. C{ProbDist}s are often 325 used to model the probability distribution of the experiment used 326 to generate a frequency distribution. 327 """ 328 SUM_TO_ONE = True 329 """True if the probabilities of the samples in this probability 330 distribution will always sum to one.""" 331

332 - def __init__(self):

333 if self.__class__ == ProbDistI: 334 raise AssertionError, "Interfaces can't be instantiated"

335

336 - def prob(self, sample):

337 """ 338 @return: the probability for a given sample. Probabilities 339 are always real numbers in the range [0, 1]. 340 @rtype: float 341 @param sample: The sample whose probability 342 should be returned. 343 @type sample: any 344 """ 345 raise AssertionError()

346

347 - def logprob(self, sample):

348 """ 349 @return: the base 2 logarithm of the probability for a given 350 sample. Log probabilities range from negitive infinity to 351 zero. 352 @rtype: float 353 @param sample: The sample whose probability 354 should be returned. 355 @type sample: any 356 """ 357 # Default definition, in terms of prob() 358 p = self.prob(sample) 359 if p == 0: 360 # Use some approximation to infinity. What this does 361 # depends on your system's float implementation. 362 return _NINF 363 else: 364 return math.log(p, 2)

365

366 - def max(self):

367 """ 368 @return: the sample with the greatest probability. If two or 369 more samples have the same probability, return one of them; 370 which sample is returned is undefined. 371 @rtype: any 372 """ 373 raise AssertionError()

374

375 - def samples(self):

376 """ 377 @return: A list of all samples that have nonzero 378 probabilities. Use C{prob} to find the probability of 379 each sample. 380 @rtype: C{list} 381 """ 382 raise AssertionError()

383 384 # cf self.SUM_TO_ONE

385 - def discount(self):

386 """ 387 @return: The ratio by which counts are discounted on average: c*/c 388 @rtype: C{float} 389 """ 390 return 0.0

391 392 # Subclasses shuld define more efficient implementations of this, 393 # where possible.

394 - def generate(self):

395 """ 396 @return: A randomly selected sample from this probabilitiy 397 distribution. The probability of returning each sample 398 C{samp} is equal to C{self.prob(samp)}. 399 """ 400 p = random.random() 401 for sample in self.samples(): 402 p -= self.prob(sample) 403 if p <= 0: return sample 404 # allow for some rounding error: 405 if p < .0001: 406 return sample 407 # we *should* never get here 408 if self.SUM_TO_ONE: 409 warnings.warn("Probability distribution %r sums to %r; generate()" 410 " is returning an arbitrary sample." % (self, 1-p)) 411 return random.choice(list(self.samples()))

412

413 -class UniformProbDist(ProbDistI):

414 """ 415 A probability distribution that assigns equal probability to each 416 sample in a given set; and a zero probability to all other 417 samples. 418 """

419 - def __init__(self, samples):

420 """ 421 Construct a new uniform probability distribution, that assigns 422 equal probability to each sample in C{samples}. 423 424 @param samples: The samples that should be given uniform 425 probability. 426 @type samples: C{list} 427 @raise ValueError: If C{samples} is empty. 428 """ 429 if len(samples) == 0: 430 raise ValueError('A Uniform probability distribution must '+ 431 'have at least one sample.') 432 self._sampleset = set(samples) 433 self._prob = 1.0/len(self._sampleset) 434 self._samples = list(self._sampleset)

435

436 - def prob(self, sample):

437 if sample in self._sampleset: return self._prob 438 else: return 0

439 - def max(self): return self._samples[0]

440 - def samples(self): return self._samples

441 - def __repr__(self):

442 return '<UniformProbDist with %d samples>' % len(self._sampleset)

443

444 -class DictionaryProbDist(ProbDistI):

445 """ 446 A probability distribution whose probabilities are directly 447 specified by a given dictionary. The given dictionary maps 448 samples to probabilities. 449 """

450 - def __init__(self, prob_dict=None, log=False, normalize=False):

451 """ 452 Construct a new probability distribution from the given 453 dictionary, which maps values to probabilities (or to log 454 probabilities, if C{log} is true). If C{normalize} is 455 true, then the probability values are scaled by a constant 456 factor such that they sum to 1. 457 """ 458 self._prob_dict = prob_dict.copy() 459 self._log = log 460 461 # Normalize the distribution, if requested. 462 if normalize: 463 if log: 464 value_sum = sum_logs(self._prob_dict.values()) 465 if value_sum <= _NINF: 466 logp = math.log(1.0/len(prob_dict), 2) 467 for x in prob_dict.keys(): 468 self._prob_dict[x] = logp 469 else: 470 for (x, p) in self._prob_dict.items(): 471 self._prob_dict[x] -= value_sum 472 else: 473 value_sum = sum(self._prob_dict.values()) 474 if value_sum == 0: 475 p = 1.0/len(prob_dict) 476 for x in prob_dict: 477 self._prob_dict[x] = p 478 else: 479 norm_factor = 1.0/value_sum 480 for (x, p) in self._prob_dict.items(): 481 self._prob_dict[x] *= norm_factor

482

483 - def prob(self, sample):

484 if self._log: 485 if sample not in self._prob_dict: return 0 486 else: return 2**(self._prob_dict[sample]) 487 else: 488 return self._prob_dict.get(sample, 0)

489

490 - def logprob(self, sample):

491 if self._log: 492 return self._prob_dict.get(sample, _NINF) 493 else: 494 if sample not in self._prob_dict: return _NINF 495 elif self._prob_dict[sample] == 0: return _NINF 496 else: return math.log(self._prob_dict[sample], 2)

497

498 - def max(self):

499 if not hasattr(self, '_max'): 500 self._max = max((p,v) for (v,p) in self._prob_dict.items())[1] 501 return self._max

502 - def samples(self):

503 return self._prob_dict.keys()

504 - def __repr__(self):

505 return '<ProbDist with %d samples>' % len(self._prob_dict)

506

507 -class MLEProbDist(ProbDistI):

508 """ 509 The maximum likelihood estimate for the probability distribution 510 of the experiment used to generate a frequency distribution. The 511 X{maximum likelihood estimate} approximates the probability of 512 each sample as the frequency of that sample in the frequency 513 distribution. 514 """

515 - def __init__(self, freqdist):

516 """ 517 Use the maximum likelihood estimate to create a probability 518 distribution for the experiment used to generate C{freqdist}. 519 520 @type freqdist: C{FreqDist} 521 @param freqdist: The frequency distribution that the 522 probability estimates should be based on. 523 """ 524 if freqdist.N() == 0: 525 raise ValueError('An MLE probability distribution must '+ 526 'have at least one sample.') 527 528 self._freqdist = freqdist

529

530 - def freqdist(self):

531 """ 532 @return: The frequency distribution that this probability 533 distribution is based on. 534 @rtype: C{FreqDist} 535 """ 536 return self._freqdist

537

538 - def prob(self, sample):

539 return self._freqdist.freq(sample)

540

541 - def max(self):

542 return self._freqdist.max()

543

544 - def samples(self):

545 return self._freqdist.keys()

546

547 - def __repr__(self):

548 """ 549 @rtype: C{string} 550 @return: A string representation of this C{ProbDist}. 551 """ 552 return '<MLEProbDist based on %d samples>' % self._freqdist.N()

553

554 -class LidstoneProbDist(ProbDistI):

555 """ 556 The Lidstone estimate for the probability distribution of the 557 experiment used to generate a frequency distribution. The 558 C{Lidstone estimate} is paramaterized by a real number M{gamma}, 559 which typically ranges from 0 to 1. The X{Lidstone estimate} 560 approximates the probability of a sample with count M{c} from an 561 experiment with M{N} outcomes and M{B} bins as 562 M{(c+gamma)/(N+B*gamma)}. This is equivalant to adding 563 M{gamma} to the count for each bin, and taking the maximum 564 likelihood estimate of the resulting frequency distribution. 565 """ 566 SUM_TO_ONE = False

567 - def __init__(self, freqdist, gamma, bins=None):

568 """ 569 Use the Lidstone estimate to create a probability distribution 570 for the experiment used to generate C{freqdist}. 571 572 @type freqdist: C{FreqDist} 573 @param freqdist: The frequency distribution that the 574 probability estimates should be based on. 575 @type gamma: C{float} 576 @param gamma: A real number used to paramaterize the 577 estimate. The Lidstone estimate is equivalant to adding 578 M{gamma} to the count for each bin, and taking the 579 maximum likelihood estimate of the resulting frequency 580 distribution. 581 @type bins: C{int} 582 @param bins: The number of sample values that can be generated 583 by the experiment that is described by the probability 584 distribution. This value must be correctly set for the 585 probabilities of the sample values to sum to one. If 586 C{bins} is not specified, it defaults to C{freqdist.B()}. 587 """ 588 if (bins == 0) or (bins is None and freqdist.N() == 0): 589 name = self.__class__.__name__[:-8] 590 raise ValueError('A %s probability distribution ' % name + 591 'must have at least one bin.') 592 if (bins is not None) and (bins < freqdist.B()): 593 name = self.__class__.__name__[:-8] 594 raise ValueError('\nThe number of bins in a %s distribution ' % name + 595 '(%d) must be greater than or equal to\n' % bins + 596 'the number of bins in the FreqDist used ' + 597 'to create it (%d).' % freqdist.N()) 598 599 self._freqdist = freqdist 600 self._gamma = float(gamma) 601 self._N = self._freqdist.N() 602 603 if bins is None: bins = freqdist.B() 604 self._bins = bins

605

606 - def freqdist(self):

607 """ 608 @return: The frequency distribution that this probability 609 distribution is based on. 610 @rtype: C{FreqDist} 611 """ 612 return self._freqdist

613

614 - def prob(self, sample):

615 c = self._freqdist[sample] 616 return (c + self._gamma) / (self._N + self._bins * self._gamma)

617

618 - def max(self):

619 # For Lidstone distributions, probability is monotonic with 620 # frequency, so the most probable sample is the one that 621 # occurs most frequently. 622 return self._freqdist.max()

623

624 - def samples(self):

625 return self._freqdist.keys()

626

627 - def discount(self):

628 gb = self._gamma * self._bins 629 return gb / (self._N + gb)

630

631 - def __repr__(self):

632 """ 633 @rtype: C{string} 634 @return: A string representation of this C{ProbDist}. 635 """ 636 return '<LidstoneProbDist based on %d samples>' % self._freqdist.N()

637 638

639 -class LaplaceProbDist(LidstoneProbDist):

640 """ 641 The Laplace estimate for the probability distribution of the 642 experiment used to generate a frequency distribution. The 643 X{Lidstone estimate} approximates the probability of a sample with 644 count M{c} from an experiment with M{N} outcomes and M{B} bins as 645 M{(c+1)/(N+B)}. This is equivalant to adding one to the count for 646 each bin, and taking the maximum likelihood estimate of the 647 resulting frequency distribution. 648 """

649 - def __init__(self, freqdist, bins=None):

650 """ 651 Use the Laplace estimate to create a probability distribution 652 for the experiment used to generate C{freqdist}. 653 654 @type freqdist: C{FreqDist} 655 @param freqdist: The frequency distribution that the 656 probability estimates should be based on. 657 @type bins: C{int} 658 @param bins: The number of sample values that can be generated 659 by the experiment that is described by the probability 660 distribution. This value must be correctly set for the 661 probabilities of the sample values to sum to one. If 662 C{bins} is not specified, it defaults to C{freqdist.B()}. 663 """ 664 LidstoneProbDist.__init__(self, freqdist, 1, bins)

665

666 - def __repr__(self):

667 """ 668 @rtype: C{string} 669 @return: A string representation of this C{ProbDist}. 670 """ 671 return '<LaplaceProbDist based on %d samples>' % self._freqdist.N()

672

673 -class ELEProbDist(LidstoneProbDist):

674 """ 675 The expected likelihood estimate for the probability distribution 676 of the experiment used to generate a frequency distribution. The 677 X{expected likelihood estimate} approximates the probability of a 678 sample with count M{c} from an experiment with M{N} outcomes and 679 M{B} bins as M{(c+0.5)/(N+B/2)}. This is equivalant to adding 0.5 680 to the count for each bin, and taking the maximum likelihood 681 estimate of the resulting frequency distribution. 682 """

683 - def __init__(self, freqdist, bins=None):

684 """ 685 Use the expected likelihood estimate to create a probability 686 distribution for the experiment used to generate C{freqdist}. 687 688 @type freqdist: C{FreqDist} 689 @param freqdist: The frequency distribution that the 690 probability estimates should be based on. 691 @type bins: C{int} 692 @param bins: The number of sample values that can be generated 693 by the experiment that is described by the probability 694 distribution. This value must be correctly set for the 695 probabilities of the sample values to sum to one. If 696 C{bins} is not specified, it defaults to C{freqdist.B()}. 697 """ 698 LidstoneProbDist.__init__(self, freqdist, 0.5, bins)

699

700 - def __repr__(self):

701 """ 702 @rtype: C{string} 703 @return: A string representation of this C{ProbDist}. 704 """ 705 return '<ELEProbDist based on %d samples>' % self._freqdist.N()

706

707 -class HeldoutProbDist(ProbDistI):

708 """ 709 The heldout estimate for the probability distribution of the 710 experiment used to generate two frequency distributions. These 711 two frequency distributions are called the "heldout frequency 712 distribution" and the "base frequency distribution." The 713 X{heldout estimate} uses uses the X{heldout frequency 714 distribution} to predict the probability of each sample, given its 715 frequency in the X{base frequency distribution}. 716 717 In particular, the heldout estimate approximates the probability 718 for a sample that occurs M{r} times in the base distribution as 719 the average frequency in the heldout distribution of all samples 720 that occur M{r} times in the base distribution. 721 722 This average frequency is M{Tr[r]/(Nr[r]*N)}, where: 723 - M{Tr[r]} is the total count in the heldout distribution for 724 all samples that occur M{r} times in the base 725 distribution. 726 - M{Nr[r]} is the number of samples that occur M{r} times in 727 the base distribution. 728 - M{N} is the number of outcomes recorded by the heldout 729 frequency distribution. 730 731 In order to increase the efficiency of the C{prob} member 732 function, M{Tr[r]/(Nr[r]*N)} is precomputed for each value of M{r} 733 when the C{HeldoutProbDist} is created. 734 735 @type _estimate: C{list} of C{float} 736 @ivar _estimate: A list mapping from M{r}, the number of 737 times that a sample occurs in the base distribution, to the 738 probability estimate for that sample. C{_estimate[M{r}]} is 739 calculated by finding the average frequency in the heldout 740 distribution of all samples that occur M{r} times in the base 741 distribution. In particular, C{_estimate[M{r}]} = 742 M{Tr[r]/(Nr[r]*N)}. 743 @type _max_r: C{int} 744 @ivar _max_r: The maximum number of times that any sample occurs 745 in the base distribution. C{_max_r} is used to decide how 746 large C{_estimate} must be. 747 """ 748 SUM_TO_ONE = False

749 - def __init__(self, base_fdist, heldout_fdist, bins=None):

750 """ 751 Use the heldout estimate to create a probability distribution 752 for the experiment used to generate C{base_fdist} and 753 C{heldout_fdist}. 754 755 @type base_fdist: C{FreqDist} 756 @param base_fdist: The base frequency distribution. 757 @type heldout_fdist: C{FreqDist} 758 @param heldout_fdist: The heldout frequency distribution. 759 @type bins: C{int} 760 @param bins: The number of sample values that can be generated 761 by the experiment that is described by the probability 762 distribution. This value must be correctly set for the 763 probabilities of the sample values to sum to one. If 764 C{bins} is not specified, it defaults to C{freqdist.B()}. 765 """ 766 767 self._base_fdist = base_fdist 768 self._heldout_fdist = heldout_fdist 769 770 # The max number of times any sample occurs in base_fdist. 771 self._max_r = base_fdist[base_fdist.max()] 772 773 # Calculate Tr, Nr, and N. 774 Tr = self._calculate_Tr() 775 Nr = [base_fdist.Nr(r, bins) for r in range(self._max_r+1)] 776 N = heldout_fdist.N() 777 778 # Use Tr, Nr, and N to compute the probability estimate for 779 # each value of r. 780 self._estimate = self._calculate_estimate(Tr, Nr, N)

781

782 - def _calculate_Tr(self):

783 """ 784 @return: the list M{Tr}, where M{Tr[r]} is the total count in 785 C{heldout_fdist} for all samples that occur M{r} 786 times in C{base_fdist}. 787 @rtype: C{list} of C{float} 788 """ 789 Tr = [0.0] * (self._max_r+1) 790 for sample in self._heldout_fdist: 791 r = self._base_fdist[sample] 792 Tr[r] += self._heldout_fdist[sample] 793 return Tr

794

795 - def _calculate_estimate(self, Tr, Nr, N):

796 """ 797 @return: the list M{estimate}, where M{estimate[r]} is the 798 probability estimate for any sample that occurs M{r} times 799 in the base frequency distribution. In particular, 800 M{estimate[r]} is M{Tr[r]/(N[r]*N)}. In the special case 801 that M{N[r]=0}, M{estimate[r]} will never be used; so we 802 define M{estimate[r]=None} for those cases. 803 @rtype: C{list} of C{float} 804 @type Tr: C{list} of C{float} 805 @param Tr: the list M{Tr}, where M{Tr[r]} is the total count in 806 the heldout distribution for all samples that occur M{r} 807 times in base distribution. 808 @type Nr: C{list} of C{float} 809 @param Nr: The list M{Nr}, where M{Nr[r]} is the number of 810 samples that occur M{r} times in the base distribution. 811 @type N: C{int} 812 @param N: The total number of outcomes recorded by the heldout 813 frequency distribution. 814 """ 815 estimate = [] 816 for r in range(self._max_r+1): 817 if Nr[r] == 0: estimate.append(None) 818 else: estimate.append(Tr[r]/(Nr[r]*N)) 819 return estimate

820

821 - def base_fdist(self):

822 """ 823 @return: The base frequency distribution that this probability 824 distribution is based on. 825 @rtype: C{FreqDist} 826 """ 827 return self._base_fdist

828

829 - def heldout_fdist(self):

830 """ 831 @return: The heldout frequency distribution that this 832 probability distribution is based on. 833 @rtype: C{FreqDist} 834 """ 835 return self._heldout_fdist

836

837 - def samples(self):

838 return self._base_fdist.keys()

839

840 - def prob(self, sample):

841 # Use our precomputed probability estimate. 842 r = self._base_fdist[sample] 843 return self._estimate[r]

844

845 - def max(self):

846 # Note: the Heldout estimation is *not* necessarily monotonic; 847 # so this implementation is currently broken. However, it 848 # should give the right answer *most* of the time. :) 849 return self._base_fdist.max()

850

851 - def discount(self):

852 raise NotImplementedError()

853

854 - def __repr__(self):

855 """ 856 @rtype: C{string} 857 @return: A string representation of this C{ProbDist}. 858 """ 859 s = '<HeldoutProbDist: %d base samples; %d heldout samples>' 860 return s % (self._base_fdist.N(), self._heldout_fdist.N())

861

862 -class CrossValidationProbDist(ProbDistI):

863 """ 864 The cross-validation estimate for the probability distribution of 865 the experiment used to generate a set of frequency distribution. 866 The X{cross-validation estimate} for the probability of a sample 867 is found by averaging the held-out estimates for the sample in 868 each pair of frequency distributions. 869 """ 870 SUM_TO_ONE = False

871 - def __init__(self, freqdists, bins):

872 """ 873 Use the cross-validation estimate to create a probability 874 distribution for the experiment used to generate 875 C{freqdists}. 876 877 @type freqdists: C{list} of C{FreqDist} 878 @param freqdists: A list of the frequency distributions 879 generated by the experiment. 880 @type bins: C{int} 881 @param bins: The number of sample values that can be generated 882 by the experiment that is described by the probability 883 distribution. This value must be correctly set for the 884 probabilities of the sample values to sum to one. If 885 C{bins} is not specified, it defaults to C{freqdist.B()}. 886 """ 887 self._freqdists = freqdists 888 889 # Create a heldout probability distribution for each pair of 890 # frequency distributions in freqdists. 891 self._heldout_probdists = [] 892 for fdist1 in freqdists: 893 for fdist2 in freqdists: 894 if fdist1 is not fdist2: 895 probdist = HeldoutProbDist(fdist1, fdist2, bins) 896 self._heldout_probdists.append(probdist)

897

898 - def freqdists(self):

899 """ 900 @rtype: C{list} of C{FreqDist} 901 @return: The list of frequency distributions that this 902 C{ProbDist} is based on. 903 """ 904 return self._freqdists

905

906 - def samples(self):

907 # [xx] nb: this is not too efficient 908 return set(sum([fd.keys() for fd in self._freqdists], []))

909

910 - def prob(self, sample):

911 # Find the average probability estimate returned by each 912 # heldout distribution. 913 prob = 0.0 914 for heldout_probdist in self._heldout_probdists: 915 prob += heldout_probdist.prob(sample) 916 return prob/len(self._heldout_probdists)

917

918 - def discount(self):

919 raise NotImplementedError()

920

921 - def __repr__(self):

922 """ 923 @rtype: C{string} 924 @return: A string representation of this C{ProbDist}. 925 """ 926 return '<CrossValidationProbDist: %d-way>' % len(self._freqdists)

927

928 -class WittenBellProbDist(ProbDistI):

929 """ 930 The Witten-Bell estimate of a probability distribution. This distribution 931 allocates uniform probability mass to as yet unseen events by using the 932 number of events that have only been seen once. The probability mass 933 reserved for unseen events is equal to: 934 935 - M{T / (N + T)} 936 937 where M{T} is the number of observed event types and M{N} is the total 938 number of observed events. This equates to the maximum likelihood estimate 939 of a new type event occuring. The remaining probability mass is discounted 940 such that all probability estimates sum to one, yielding: 941 942 - M{p = T / Z (N + T)}, if count = 0 943 - M{p = c / (N + T)}, otherwise 944 """ 945

946 - def __init__(self, freqdist, bins=None):

947 """ 948 Creates a distribution of Witten-Bell probability estimates. This 949 distribution allocates uniform probability mass to as yet unseen 950 events by using the number of events that have only been seen once. 951 The probability mass reserved for unseen events is equal to: 952 953 - M{T / (N + T)} 954 955 where M{T} is the number of observed event types and M{N} is the total 956 number of observed events. This equates to the maximum likelihood 957 estimate of a new type event occuring. The remaining probability mass 958 is discounted such that all probability estimates sum to one, 959 yielding: 960 961 - M{p = T / Z (N + T)}, if count = 0 962 - M{p = c / (N + T)}, otherwise 963 964 The parameters M{T} and M{N} are taken from the C{freqdist} parameter 965 (the C{B()} and C{N()} values). The normalising factor M{Z} is 966 calculated using these values along with the C{bins} parameter. 967 968 @param freqdist: The frequency counts upon which to base the 969 estimation. 970 @type freqdist: C{FreqDist} 971 @param bins: The number of possible event types. This must be 972 at least as large as the number of bins in the 973 C{freqdist}. If C{None}, then it's assumed to be 974 equal to that of the C{freqdist} 975 @type bins: C{Int} 976 """ 977 assert bins == None or bins >= freqdist.B(),\ 978 'Bins parameter must not be less than freqdist.B()' 979 if bins == None: 980 bins = freqdist.B() 981 self._freqdist = freqdist 982 self._T = self._freqdist.B() 983 self._Z = bins - self._freqdist.B() 984 self._N = self._freqdist.N()

985

986 - def prob(self, sample):

987 # inherit docs from ProbDistI 988 c = self._freqdist[sample] 989 if c == 0: 990 return self._T / float(self._Z * (self._N + self._T)) 991 else: 992 return c / float(self._N + self._T)

993

994 - def max(self):

995 return self._freqdist.max()

996

997 - def samples(self):

998 return self._freqdist.keys()

999

1000 - def freqdist(self):

1001 return self._freqdist

1002

1003 - def discount(self):

1004 raise NotImplementedError()

1005

1006 - def __repr__(self):

1007 """ 1008 @rtype: C{string} 1009 @return: A string representation of this C{ProbDist}. 1010 """ 1011 return '<WittenBellProbDist based on %d samples>' % self._freqdist.N()

1012

1013 -class GoodTuringProbDist(ProbDistI):

1014 """ 1015 The Good-Turing estimate of a probability distribution. This method 1016 calculates the probability mass to assign to events with zero or low 1017 counts based on the number of events with higher counts. It does so by 1018 using the smoothed count M{c*}: 1019 1020 - M{c* = (c + 1) N(c + 1) / N(c)} 1021 1022 where M{c} is the original count, M{N(i)} is the number of event types 1023 observed with count M{i}. These smoothed counts are then normalised to 1024 yield a probability distribution. 1025 """ 1026 # TODO - add a cut-off parameter, above which the counts are unmodified 1027 # (see J&M p216) 1028

1029 - def __init__(self, freqdist, bins=None):

1030 """ 1031 Creates a Good-Turing probability distribution estimate. This method 1032 calculates the probability mass to assign to events with zero or low 1033 counts based on the number of events with higher counts. It does so by 1034 using the smoothed count M{c*}: 1035 1036 - M{c* = (c + 1) N(c + 1) / N(c)} 1037 1038 where M{c} is the original count, M{N(i)} is the number of event types 1039 observed with count M{i}. These smoothed counts are then normalised to 1040 yield a probability distribution. 1041 1042 The C{bins} parameter allows C{N(0)} to be estimated. 1043 1044 @param freqdist: The frequency counts upon which to base the 1045 estimation. 1046 @type freqdist: C{FreqDist} 1047 @param bins: The number of possible event types. This must be 1048 at least as large as the number of bins in the 1049 C{freqdist}. If C{None}, then it's taken to be 1050 equal to C{freqdist.B()}. 1051 @type bins: C{Int} 1052 """ 1053 assert bins == None or bins >= freqdist.B(),\ 1054 'Bins parameter must not be less than freqdist.B()' 1055 if bins == None: 1056 bins = freqdist.B() 1057 self._freqdist = freqdist 1058 self._bins = bins

1059

1060 - def prob(self, sample):

1061 # inherit docs from FreqDist 1062 c = self._freqdist[sample] 1063 nc = self._freqdist.Nr(c, self._bins) 1064 ncn = self._freqdist.Nr(c + 1, self._bins) 1065 1066 # avoid divide-by-zero errors for sparse datasets 1067 if nc == 0 or self._freqdist.N() == 0: 1068 return 0.0 1069 1070 return float(c + 1) * ncn / (nc * self._freqdist.N())

1071

1072 - def max(self):

1073 return self._freqdist.max()

1074

1075 - def samples(self):

1076 return self._freqdist.keys()

1077

1078 - def discount(self):

1079 raise NotImplementedError()

1080

1081 - def freqdist(self):

1082 return self._freqdist

1083

1084 - def __repr__(self):

1085 """ 1086 @rtype: C{string} 1087 @return: A string representation of this C{ProbDist}. 1088 """ 1089 return '<GoodTuringProbDist based on %d samples>' % self._freqdist.N()

1090

1091 -class MutableProbDist(ProbDistI):

1092 """ 1093 An mutable probdist where the probabilities may be easily modified. This 1094 simply copies an existing probdist, storing the probability values in a 1095 mutable dictionary and providing an update method. 1096 """ 1097

1098 - def __init__(self, prob_dist, samples, store_logs=True):

1099 """ 1100 Creates the mutable probdist based on the given prob_dist and using 1101 the list of samples given. These values are stored as log 1102 probabilities if the store_logs flag is set. 1103 1104 @param prob_dist: the distribution from which to garner the 1105 probabilities 1106 @type prob_dist: ProbDist 1107 @param samples: the complete set of samples 1108 @type samples: sequence of any 1109 @param store_logs: whether to store the probabilities as logarithms 1110 @type store_logs: bool 1111 """ 1112 try: 1113 import numpy 1114 except ImportError: 1115 print "Error: Please install numpy; for instructions see http://nltk.org/install.html" 1116 exit() 1117 self._samples = samples 1118 self._sample_dict = dict((samples[i], i) for i in range(len(samples))) 1119 self._data = numpy.zeros(len(samples), numpy.float64) 1120 for i in range(len(samples)): 1121 if store_logs: 1122 self._data[i] = prob_dist.logprob(samples[i]) 1123 else: 1124 self._data[i] = prob_dist.prob(samples[i]) 1125 self._logs = store_logs

1126

1127 - def samples(self):

1128 # inherit documentation 1129 return self._samples

1130

1131 - def prob(self, sample):

1132 # inherit documentation 1133 i = self._sample_dict.get(sample) 1134 if i != None: 1135 if self._logs: 1136 return 2**(self._data[i]) 1137 else: 1138 return self._data[i] 1139 else: 1140 return 0.0

1141

1142 - def logprob(self, sample):

1143 # inherit documentation 1144 i = self._sample_dict.get(sample) 1145 if i != None: 1146 if self._logs: 1147 return self._data[i] 1148 else: 1149 return math.log(self._data[i], 2) 1150 else: 1151 return float('-inf')

1152

1153 - def update(self, sample, prob, log=True):

1154 """ 1155 Update the probability for the given sample. This may cause the object 1156 to stop being the valid probability distribution - the user must 1157 ensure that they update the sample probabilities such that all samples 1158 have probabilities between 0 and 1 and that all probabilities sum to 1159 one. 1160 1161 @param sample: the sample for which to update the probability 1162 @type sample: C{any} 1163 @param prob: the new probability 1164 @type prob: C{float} 1165 @param log: is the probability already logged 1166 @type log: C{bool} 1167 """ 1168 i = self._sample_dict.get(sample) 1169 assert i != None 1170 if self._logs: 1171 if log: self._data[i] = prob 1172 else: self._data[i] = math.log(prob, 2) 1173 else: 1174 if log: self._data[i] = 2**(prob) 1175 else: self._data[i] = prob

1176 1177 ##////////////////////////////////////////////////////// 1178 ## Probability Distribution Operations 1179 ##////////////////////////////////////////////////////// 1180

1181 -def log_likelihood(test_pdist, actual_pdist):

1182 if (not isinstance(test_pdist, ProbDistI) or 1183 not isinstance(actual_pdist, ProbDistI)): 1184 raise ValueError('expected a ProbDist.') 1185 # Is this right? 1186 return sum(actual_pdist.prob(s) * math.log(test_pdist.prob(s), 2) 1187 for s in actual_pdist.keys())

1188

1189 -def entropy(pdist):

1190 probs = [pdist.prob(s) for s in pdist.samples()] 1191 return -sum([p * math.log(p,2) for p in probs])

1192 1193 ##////////////////////////////////////////////////////// 1194 ## Conditional Distributions 1195 ##////////////////////////////////////////////////////// 1196

1197 -class ConditionalFreqDist(object):

1198 """ 1199 A collection of frequency distributions for a single experiment 1200 run under different conditions. Conditional frequency 1201 distributions are used to record the number of times each sample 1202 occurred, given the condition under which the experiment was run. 1203 For example, a conditional frequency distribution could be used to 1204 record the frequency of each word (type) in a document, given its 1205 length. Formally, a conditional frequency distribution can be 1206 defined as a function that maps from each condition to the 1207 C{FreqDist} for the experiment under that condition. 1208 1209 The frequency distribution for each condition is accessed using 1210 the indexing operator: 1211 1212 >>> cfdist[3] 1213 <FreqDist with 73 outcomes> 1214 >>> cfdist[3].freq('the') 1215 0.4 1216 >>> cfdist[3]['dog'] 1217 2 1218 1219 When the indexing operator is used to access the frequency 1220 distribution for a condition that has not been accessed before, 1221 C{ConditionalFreqDist} creates a new empty C{FreqDist} for that 1222 condition. 1223 1224 Conditional frequency distributions are typically constructed by 1225 repeatedly running an experiment under a variety of conditions, 1226 and incrementing the sample outcome counts for the appropriate 1227 conditions. For example, the following code will produce a 1228 conditional frequency distribution that encodes how often each 1229 word type occurs, given the length of that word type: 1230 1231 >>> cfdist = ConditionalFreqDist() 1232 >>> for word in tokenize.whitespace(sent): 1233 ... condition = len(word) 1234 ... cfdist[condition].inc(word) 1235 1236 An equivalent way to do this is with the initializer: 1237 1238 >>> cfdist = ConditionalFreqDist((len(word), word) for word in tokenize.whitespace(sent)) 1239 1240 """

1241 - def __init__(self, cond_samples=None):

1242 """ 1243 Construct a new empty conditional frequency distribution. In 1244 particular, the count for every sample, under every condition, 1245 is zero. 1246 1247 @param cond_samples: The samples to initialize the conditional frequency distribution with 1248 @type cond_samples: Sequence of (condition, sample) tuples 1249 """ 1250 self._fdists = {} 1251 if cond_samples: 1252 for (cond, sample) in cond_samples: 1253 self[cond].inc(sample)

1254

1255 - def __getitem__(self, condition):

1256 """ 1257 Return the frequency distribution that encodes the frequency 1258 of each sample outcome, given that the experiment was run 1259 under the given condition. If the frequency distribution for 1260 the given condition has not been accessed before, then this 1261 will create a new empty C{FreqDist} for that condition. 1262 1263 @return: The frequency distribution that encodes the frequency 1264 of each sample outcome, given that the experiment was run 1265 under the given condition. 1266 @rtype: C{FreqDist} 1267 1268 @param condition: The condition under which the experiment was 1269 run. 1270 @type condition: any 1271 """ 1272 # Create the conditioned freq dist, if it doesn't exist 1273 if condition not in self._fdists: 1274 self._fdists[condition] = FreqDist() 1275 return self._fdists[condition]

1276

1277 - def conditions(self):

1278 """ 1279 @return: A list of the conditions that have been accessed for 1280 this C{ConditionalFreqDist}. Use the indexing operator to 1281 access the frequency distribution for a given condition. 1282 Note that the frequency distributions for some conditions 1283 may contain zero sample outcomes. 1284 @rtype: C{list} 1285 """ 1286 return self._fdists.keys()

1287

1288 - def __len__(self):

1289 """ 1290 @return: The number of conditions that have been accessed 1291 for this C{ConditionalFreqDist}. 1292 @rtype: C{int} 1293 """ 1294 return len(self._fdists)

1295

1296 - def __repr__(self):

1297 """ 1298 @return: A string representation of this 1299 C{ConditionalFreqDist}. 1300 @rtype: C{string} 1301 """ 1302 n = len(self._fdists) 1303 return '<ConditionalFreqDist with %d conditions>' % n

1304

1305 -class ConditionalProbDistI(object):

1306 """ 1307 A collection of probability distributions for a single experiment 1308 run under different conditions. Conditional probability 1309 distributions are used to estimate the likelihood of each sample, 1310 given the condition under which the experiment was run. For 1311 example, a conditional probability distribution could be used to 1312 estimate the probability of each word type in a document, given 1313 the length of the word type. Formally, a conditional probability 1314 distribution can be defined as a function that maps from each 1315 condition to the C{ProbDist} for the experiment under that 1316 condition. 1317 """

1318 - def __init__(self):

1319 raise AssertionError, 'ConditionalProbDistI is an interface'

1320

1321 - def __getitem__(self, condition):

1322 """ 1323 @return: The probability distribution for the experiment run 1324 under the given condition. 1325 @rtype: C{ProbDistI} 1326 @param condition: The condition whose probability distribution 1327 should be returned. 1328 @type condition: any 1329 """ 1330 raise AssertionError

1331

1332 - def __len__(self):

1333 """ 1334 @return: The number of conditions that are represented by 1335 this C{ConditionalProbDist}. 1336 @rtype: C{int} 1337 """ 1338 raise AssertionError

1339

1340 - def conditions(self):

1341 """ 1342 @return: A list of the conditions that are represented by 1343 this C{ConditionalProbDist}. Use the indexing operator to 1344 access the probability distribution for a given condition. 1345 @rtype: C{list} 1346 """ 1347 raise AssertionError

1348 1349 # For now, this is the only implementation of ConditionalProbDistI; 1350 # but we would want a different implementation if we wanted to build a 1351 # conditional probability distribution analytically (e.g., a gaussian 1352 # distribution), rather than basing it on an underlying frequency 1353 # distribution.

1354 -class ConditionalProbDist(ConditionalProbDistI):

1355 """ 1356 A conditional probability distribution modelling the experiments 1357 that were used to generate a conditional frequency distribution. 1358 A C{ConditoinalProbDist} is constructed from a 1359 C{ConditionalFreqDist} and a X{C{ProbDist} factory}: 1360 1361 - The B{C{ConditionalFreqDist}} specifies the frequency 1362 distribution for each condition. 1363 - The B{C{ProbDist} factory} is a function that takes a 1364 condition's frequency distribution, and returns its 1365 probability distribution. A C{ProbDist} class's name (such as 1366 C{MLEProbDist} or C{HeldoutProbDist}) can be used to specify 1367 that class's constructor. 1368 1369 The first argument to the C{ProbDist} factory is the frequency 1370 distribution that it should model; and the remaining arguments are 1371 specified by the C{factory_args} parameter to the 1372 C{ConditionalProbDist} constructor. For example, the following 1373 code constructs a C{ConditionalProbDist}, where the probability 1374 distribution for each condition is an C{ELEProbDist} with 10 bins: 1375 1376 >>> cpdist = ConditionalProbDist(cfdist, ELEProbDist, 10) 1377 >>> print cpdist['run'].max() 1378 'NN' 1379 >>> print cpdist['run'].prob('NN') 1380 0.0813 1381 """

1382 - def __init__(self, cfdist, probdist_factory, 1383 supply_condition=False, *factory_args):

1384 """ 1385 Construct a new conditional probability distribution, based on 1386 the given conditional frequency distribution and C{ProbDist} 1387 factory. 1388 1389 @type cfdist: L{ConditionalFreqDist} 1390 @param cfdist: The C{ConditionalFreqDist} specifying the 1391 frequency distribution for each condition. 1392 @type probdist_factory: C{class} or C{function} 1393 @param probdist_factory: The function or class that maps 1394 a condition's frequency distribution to its probability 1395 distribution. The function is called with the frequency 1396 distribution as its first argument, the condition as its 1397 second argument (only if C{supply_condition=True}), and 1398 C{factory_args} as its remaining arguments. 1399 @type supply_condition: C{bool} 1400 @param supply_condition: If true, then pass the condition as 1401 the second argument to C{probdist_factory}. 1402 @type factory_args: (any) 1403 @param factory_args: Extra arguments for C{probdist_factory}. 1404 These arguments are usually used to specify extra 1405 properties for the probability distributions of individual 1406 conditions, such as the number of bins they contain. 1407 """ 1408 self._probdist_factory = probdist_factory 1409 self._cfdist = cfdist 1410 self._supply_condition = supply_condition 1411 self._factory_args = factory_args 1412 1413 self._pdists = {} 1414 for c in cfdist.conditions(): 1415 if supply_condition: 1416 pdist = probdist_factory(cfdist[c], c, *factory_args) 1417 else: 1418 pdist = probdist_factory(cfdist[c], *factory_args) 1419 self._pdists[c] = pdist

1420

1421 - def __contains__(self, condition):

1422 return condition in self._pdists

1423

1424 - def __getitem__(self, condition):

1425 if condition not in self._pdists: 1426 # If it's a condition we haven't seen, create a new prob 1427 # dist from the empty freq dist. Typically, this will 1428 # give a uniform prob dist. 1429 pdist = self._probdist_factory(FreqDist(), *self._factory_args) 1430 self._pdists[condition] = pdist 1431 1432 return self._pdists[condition]

1433

1434 - def conditions(self):

1435 return self._pdists.keys()

1436

1437 - def __len__(self):

1438 return len(self._pdists)

1439

1440 - def __repr__(self):

1441 """ 1442 @return: A string representation of this 1443 C{ConditionalProbDist}. 1444 @rtype: C{string} 1445 """ 1446 return '<ConditionalProbDist with %d conditions>' % self.__len__()

1447 1448

1449 -class DictionaryConditionalProbDist(ConditionalProbDistI):

1450 """ 1451 An alternative ConditionalProbDist that simply wraps a dictionary of 1452 ProbDists rather than creating these from FreqDists. 1453 """ 1454

1455 - def __init__(self, probdist_dict):

1456 """ 1457 @param probdist_dict: a dictionary containing the probdists indexed 1458 by the conditions 1459 @type probdist_dict: dict any -> probdist 1460 """ 1461 self._dict = probdist_dict

1462

1463 - def __getitem__(self, condition):

1464 # inherit documentation 1465 # this will cause an exception for unseen conditions 1466 return self._dict[condition]

1467

1468 - def conditions(self):

1469 # inherit documentation 1470 return self._dict.keys()

1471 1472 ##////////////////////////////////////////////////////// 1473 ## Adding in log-space. 1474 ##////////////////////////////////////////////////////// 1475 1476 # If the difference is bigger than this, then just take the bigger one: 1477 _ADD_LOGS_MAX_DIFF = math.log(1e-30, 2) 1478

1479 -def add_logs(logx, logy):

1480 """ 1481 Given two numbers C{logx}=M{log(x)} and C{logy}=M{log(y)}, return 1482 M{log(x+y)}. Conceptually, this is the same as returning 1483 M{log(2**(C{logx})+2**(C{logy}))}, but the actual implementation 1484 avoids overflow errors that could result from direct computation. 1485 """ 1486 if (logx < logy + _ADD_LOGS_MAX_DIFF): 1487 return logy 1488 if (logy < logx + _ADD_LOGS_MAX_DIFF): 1489 return logx 1490 base = min(logx, logy) 1491 return base + math.log(2**(logx-base) + 2**(logy-base), 2)

1492

1493 -def sum_logs(logs):

1494 if len(logs) == 0: 1495 # Use some approximation to infinity. What this does 1496 # depends on your system's float implementation. 1497 return _NINF 1498 else: 1499 return reduce(add_logs, logs[1:], logs[0])

1500 1501 ##////////////////////////////////////////////////////// 1502 ## Probabilistic Mix-in 1503 ##////////////////////////////////////////////////////// 1504

1505 -class ProbabilisticMixIn(object):

1506 """ 1507 A mix-in class to associate probabilities with other classes 1508 (trees, rules, etc.). To use the C{ProbabilisticMixIn} class, 1509 define a new class that derives from an existing class and from 1510 ProbabilisticMixIn. You will need to define a new constructor for 1511 the new class, which explicitly calls the constructors of both its 1512 parent classes. For example: 1513 1514 >>> class A: 1515 ... def __init__(self, x, y): self.data = (x,y) 1516 ... 1517 >>> class ProbabilisticA(A, ProbabilisticMixIn): 1518 ... def __init__(self, x, y, **prob_kwarg): 1519 ... A.__init__(self, x, y) 1520 ... ProbabilisticMixIn.__init__(self, **prob_kwarg) 1521 1522 See the documentation for the ProbabilisticMixIn 1523 L{constructor<__init__>} for information about the arguments it 1524 expects. 1525 1526 You should generally also redefine the string representation 1527 methods, the comparison methods, and the hashing method. 1528 """

1529 - def __init__(self, **kwargs):

1530 """ 1531 Initialize this object's probability. This initializer should 1532 be called by subclass constructors. C{prob} should generally be 1533 the first argument for those constructors. 1534 1535 @kwparam prob: The probability associated with the object. 1536 @type prob: C{float} 1537 @kwparam logprob: The log of the probability associated with 1538 the object. 1539 @type logprob: C{float} 1540 """ 1541 if 'prob' in kwargs: 1542 if 'logprob' in kwargs: 1543 raise TypeError('Must specify either prob or logprob ' 1544 '(not both)') 1545 else: 1546 ProbabilisticMixIn.set_prob(self, kwargs['prob']) 1547 elif 'logprob' in kwargs: 1548 ProbabilisticMixIn.set_logprob(self, kwargs['logprob']) 1549 else: 1550 self.__prob = self.__logprob = None

1551

1552 - def set_prob(self, prob):

1553 """ 1554 Set the probability associated with this object to C{prob}. 1555 @param prob: The new probability 1556 @type prob: C{float} 1557 """ 1558 self.__prob = prob 1559 self.__logprob = None

1560

1561 - def set_logprob(self, logprob):

1562 """ 1563 Set the log probability associated with this object to 1564 C{logprob}. I.e., set the probability associated with this 1565 object to C{2**(logprob)}. 1566 @param logprob: The new log probability 1567 @type logprob: C{float} 1568 """ 1569 self.__logprob = prob 1570 self.__prob = None

1571

1572 - def prob(self):

1573 """ 1574 @return: The probability associated with this object. 1575 @rtype: C{float} 1576 """ 1577 if self.__prob is None: 1578 if self.__logprob is None: return None 1579 self.__prob = 2**(self.__logprob) 1580 return self.__prob

1581

1582 - def logprob(self):

1583 """ 1584 @return: C{log(p)}, where C{p} is the probability associated 1585 with this object. 1586 1587 @rtype: C{float} 1588 """ 1589 if self.__logprob is None: 1590 if self.__prob is None: return None 1591 self.__logprob = math.log(self.__prob, 2) 1592 return self.__logprob

1593

1594 -class ImmutableProbabilisticMixIn(ProbabilisticMixIn):

1595 - def set_prob(self, prob):

1596 raise ValueError, '%s is immutable' % self.__class__.__name__

1597 - def set_logprob(self, prob):

1598 raise ValueError, '%s is immutable' % self.__class__.__name__

1599 1600 ##////////////////////////////////////////////////////// 1601 ## Demonstration 1602 ##////////////////////////////////////////////////////// 1603

1604 -def _create_rand_fdist(numsamples, numoutcomes):

1605 """ 1606 Create a new frequency distribution, with random samples. The 1607 samples are numbers from 1 to C{numsamples}, and are generated by 1608 summing two numbers, each of which has a uniform distribution. 1609 """ 1610 import random 1611 from math import sqrt 1612 fdist = FreqDist() 1613 for x in range(numoutcomes): 1614 y = (random.randint(1, (1+numsamples)/2) + 1615 random.randint(0, numsamples/2)) 1616 fdist.inc(y) 1617 return fdist

1618

1619 -def _create_sum_pdist(numsamples):

1620 """ 1621 Return the true probability distribution for the experiment 1622 C{_create_rand_fdist(numsamples, x)}. 1623 """ 1624 fdist = FreqDist() 1625 for x in range(1, (1+numsamples)/2+1): 1626 for y in range(0, numsamples/2+1): 1627 fdist.inc(x+y) 1628 return MLEProbDist(fdist)

1629

1630 -def demo(numsamples=6, numoutcomes=500):

1631 """ 1632 A demonstration of frequency distributions and probability 1633 distributions. This demonstration creates three frequency 1634 distributions with, and uses them to sample a random process with 1635 C{numsamples} samples. Each frequency distribution is sampled 1636 C{numoutcomes} times. These three frequency distributions are 1637 then used to build six probability distributions. Finally, the 1638 probability estimates of these distributions are compared to the 1639 actual probability of each sample. 1640 1641 @type numsamples: C{int} 1642 @param numsamples: The number of samples to use in each demo 1643 frequency distributions. 1644 @type numoutcomes: C{int} 1645 @param numoutcomes: The total number of outcomes for each 1646 demo frequency distribution. These outcomes are divided into 1647 C{numsamples} bins. 1648 @rtype: C{None} 1649 """ 1650 1651 # Randomly sample a stochastic process three times. 1652 fdist1 = _create_rand_fdist(numsamples, numoutcomes) 1653 fdist2 = _create_rand_fdist(numsamples, numoutcomes) 1654 fdist3 = _create_rand_fdist(numsamples, numoutcomes) 1655 1656 # Use our samples to create probability distributions. 1657 pdists = [ 1658 MLEProbDist(fdist1), 1659 LidstoneProbDist(fdist1, 0.5, numsamples), 1660 HeldoutProbDist(fdist1, fdist2, numsamples), 1661 HeldoutProbDist(fdist2, fdist1, numsamples), 1662 CrossValidationProbDist([fdist1, fdist2, fdist3], numsamples), 1663 _create_sum_pdist(numsamples), 1664 ] 1665 1666 # Find the probability of each sample. 1667 vals = [] 1668 for n in range(1,numsamples+1): 1669 vals.append(tuple([n, fdist1.freq(n)] + 1670 [pdist.prob(n) for pdist in pdists])) 1671 1672 # Print the results in a formatted table. 1673 print ('%d samples (1-%d); %d outcomes were sampled for each FreqDist' % 1674 (numsamples, numsamples, numoutcomes)) 1675 print '='*9*(len(pdists)+2) 1676 FORMATSTR = ' FreqDist '+ '%8s '*(len(pdists)-1) + '| Actual' 1677 print FORMATSTR % tuple(`pdist`[1:9] for pdist in pdists[:-1]) 1678 print '-'*9*(len(pdists)+2) 1679 FORMATSTR = '%3d %8.6f ' + '%8.6f '*(len(pdists)-1) + '| %8.6f' 1680 for val in vals: 1681 print FORMATSTR % val 1682 1683 # Print the totals for each column (should all be 1.0) 1684 zvals = zip(*vals) 1685 def sum(lst): return reduce(lambda x,y:x+y, lst, 0) 1686 sums = [sum(val) for val in zvals[1:]] 1687 print '-'*9*(len(pdists)+2) 1688 FORMATSTR = 'Total ' + '%8.6f '*(len(pdists)) + '| %8.6f' 1689 print FORMATSTR % tuple(sums) 1690 print '='*9*(len(pdists)+2) 1691 1692 # Display the distributions themselves, if they're short enough. 1693 if len(`str(fdist1)`) < 70: 1694 print ' fdist1:', str(fdist1) 1695 print ' fdist2:', str(fdist2) 1696 print ' fdist3:', str(fdist3) 1697 print 1698 1699 print 'Generating:' 1700 for pdist in pdists: 1701 fdist = FreqDist(pdist.generate() for i in range(5000)) 1702 print '%20s %s' % (pdist.__class__.__name__[:20], str(fdist)[:55]) 1703 print

1704 1705 if __name__ == '__main__': 1706 demo(6, 10) 1707 demo(5, 5000) 1708 1709 __all__ = ['ConditionalFreqDist', 'ConditionalProbDist', 1710 'ConditionalProbDistI', 'CrossValidationProbDist', 1711 'DictionaryConditionalProbDist', 'DictionaryProbDist', 'ELEProbDist', 1712 'FreqDist', 'GoodTuringProbDist', 'HeldoutProbDist', 1713 'ImmutableProbabilisticMixIn', 'LaplaceProbDist', 'LidstoneProbDist', 1714 'MLEProbDist', 'MutableProbDist', 'ProbDistI', 'ProbabilisticMixIn', 1715 'UniformProbDist', 'WittenBellProbDist', 'add_logs', 'demo', 1716 'log_likelihood', 'sum_logs', 'entropy'] 1717

Source Code for Module nltk.probability