>>> import nltk >>> from nltk.probability import *
>>> text1 = ['no', 'good', 'fish', 'goes', 'anywhere', 'without', 'a', 'porpoise', '!'] >>> text2 = ['no', 'good', 'porpoise', 'likes', 'to', 'fish', 'fish', 'anywhere', '.']>>> fd1 = nltk.FreqDist(text1) >>> fd1 == nltk.FreqDist(text1) True
Note that items are sorted in order of decreasing frequency; two items of the same frequency appear in indeterminate order.
>>> import itertools >>> both = nltk.FreqDist(text1 + text2) >>> both_most_common = both.most_common() >>> list(itertools.chain(*(sorted(ys) for k, ys in itertools.groupby(both_most_common, key=lambda t: t[1])))) [('fish', 3), ('anywhere', 2), ('good', 2), ('no', 2), ('porpoise', 2), ('!', 1), ('.', 1), ('a', 1), ('goes', 1), ('likes', 1), ('to', 1), ('without', 1)]>>> both == fd1 + nltk.FreqDist(text2) True >>> fd1 == nltk.FreqDist(text1) # But fd1 is unchanged True>>> fd2 = nltk.FreqDist(text2) >>> fd1.update(fd2) >>> fd1 == both True>>> fd1 = nltk.FreqDist(text1) >>> fd1.update(text2) >>> fd1 == both True>>> fd1 = nltk.FreqDist(text1) >>> fd2 = nltk.FreqDist(fd1) >>> fd2 == fd1 True
nltk.FreqDist can be pickled:
>>> import pickle >>> fd1 = nltk.FreqDist(text1) >>> pickled = pickle.dumps(fd1) >>> fd1 == pickle.loads(pickled) True
We extract a small part (500 sentences) of the Brown corpus
>>> corpus = nltk.corpus.brown.tagged_sents(categories='adventure')[:500] >>> print(len(corpus)) 500
We create a HMM trainer - note that we need the tags and symbols from the whole corpus, not just the training corpus
>>> from nltk.util import unique_list >>> tag_set = unique_list(tag for sent in corpus for (word,tag) in sent) >>> print(len(tag_set)) 92 >>> symbols = unique_list(word for sent in corpus for (word,tag) in sent) >>> print(len(symbols)) 1464 >>> print(len(tag_set)) 92 >>> symbols = unique_list(word for sent in corpus for (word,tag) in sent) >>> print(len(symbols)) 1464 >>> trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols)
We divide the corpus into 90% training and 10% testing
>>> train_corpus = [] >>> test_corpus = [] >>> for i in range(len(corpus)): ... if i % 10: ... train_corpus += [corpus[i]] ... else: ... test_corpus += [corpus[i]] >>> print(len(train_corpus)) 450 >>> print(len(test_corpus)) 50
And now we can test the estimators
>>> def train_and_test(est): ... hmm = trainer.train_supervised(train_corpus, estimator=est) ... print('%.2f%%' % (100 * hmm.evaluate(test_corpus)))
Maximum Likelihood Estimation (this resulted in an initialization error before r7209)
>>> mle = lambda fd, bins: MLEProbDist(fd) >>> train_and_test(mle) 22.75%
Laplace (= Lidstone with gamma==1)
>>> train_and_test(LaplaceProbDist) 66.04%
Expected Likelihood Estimation (= Lidstone with gamma==0.5)
>>> train_and_test(ELEProbDist) 73.01%
Lidstone Estimation, for gamma==0.1, 0.5 and 1 (the latter two should be exactly equal to ELE and Laplace above)
>>> def lidstone(gamma): ... return lambda fd, bins: LidstoneProbDist(fd, gamma, bins) >>> train_and_test(lidstone(0.1)) 82.51% >>> train_and_test(lidstone(0.5)) 73.01% >>> train_and_test(lidstone(1.0)) 66.04%
Witten-Bell Estimation (this resulted in a ZeroDivisionError before r7209)
>>> train_and_test(WittenBellProbDist) 88.12%
Good Turing Estimation
>>> gt = lambda fd, bins: SimpleGoodTuringProbDist(fd, bins=1e5) >>> train_and_test(gt) 86.93%
Since the Kneser-Ney distribution is best suited for trigrams, we must adjust our testing accordingly.
>>> corpus = [[((x[0],y[0],z[0]),(x[1],y[1],z[1])) ... for x, y, z in nltk.trigrams(sent)] ... for sent in corpus[:100]]
>>> tag_set = unique_list(tag for sent in corpus for (word,tag) in sent) >>> len(tag_set) 906
>>> symbols = unique_list(word for sent in corpus for (word,tag) in sent) >>> len(symbols) 1341
>>> trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols) >>> train_corpus = [] >>> test_corpus = []
>>> for i in range(len(corpus)): ... if i % 10: ... train_corpus += [corpus[i]] ... else: ... test_corpus += [corpus[i]]
>>> len(train_corpus) 90 >>> len(test_corpus) 10
>>> kn = lambda fd, bins: KneserNeyProbDist(fd) >>> train_and_test(kn) 0.86%
Remaining to be added: tests for ``HeldoutProbDist``, ``CrossValidationProbDist`` and ``MutableProbDist``.
Issue 511: override pop and popitem to invalidate the cache
>>> fd = nltk.FreqDist('a') >>> list(fd.keys()) ['a'] >>> fd.pop('a') 1 >>> list(fd.keys()) []
Issue 533: access cumulative frequencies with no arguments
>>> fd = nltk.FreqDist('aab') >>> list(fd._cumulative_frequencies(['a'])) [2.0] >>> list(fd._cumulative_frequencies(['a', 'b'])) [2.0, 3.0]
Issue 579: override clear to reset some variables
>>> fd = FreqDist('aab') >>> fd.clear() >>> fd.N() 0
Issue 351: fix the fileids method of CategorizedCorpusReader so that it does not inadvertently add errant categories
>>> from nltk.corpus import brown >>> brown.fileids('blah') Traceback (most recent call last): ... ValueError: Category blah not found >>> brown.categories() ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
Issue 175: add the unseen bin to SimpleGoodTuringProbDist by default otherwise any unseen events get a probability of zero, i.e., they don't get smoothed
>>> from nltk import SimpleGoodTuringProbDist, FreqDist >>> fd = FreqDist({'a':1, 'b':1, 'c': 2, 'd': 3, 'e': 4, 'f': 4, 'g': 4, 'h': 5, 'i': 5, 'j': 6, 'k': 6, 'l': 6, 'm': 7, 'n': 7, 'o': 8, 'p': 9, 'q': 10}) >>> p = SimpleGoodTuringProbDist(fd) >>> p.prob('a') 0.017649766667026317... >>> p.prob('o') 0.08433050215340411... >>> p.prob('z') 0.022727272727272728... >>> p.prob('foobar') 0.022727272727272728...
``MLEProbDist``, ``ConditionalProbDist``, ``DictionaryConditionalProbDist`` and ``ConditionalFreqDist`` can be pickled:
>>> import pickle >>> pd = MLEProbDist(fd) >>> sorted(pd.samples()) == sorted(pickle.loads(pickle.dumps(pd)).samples()) True >>> dpd = DictionaryConditionalProbDist({'x': pd}) >>> unpickled = pickle.loads(pickle.dumps(dpd)) >>> dpd['x'].prob('a') 0.011363636... >>> dpd['x'].prob('a') == unpickled['x'].prob('a') True >>> cfd = nltk.probability.ConditionalFreqDist() >>> cfd['foo']['hello'] += 1 >>> cfd['foo']['hello'] += 1 >>> cfd['bar']['hello'] += 1 >>> cfd2 = pickle.loads(pickle.dumps(cfd)) >>> cfd2 == cfd True >>> cpd = ConditionalProbDist(cfd, SimpleGoodTuringProbDist) >>> cpd2 = pickle.loads(pickle.dumps(cpd)) >>> cpd['foo'].prob('hello') == cpd2['foo'].prob('hello') True