1
2
3
4
5
6
7
8 import random
9 from itertools import chain
10
11 from nltk.probability import ConditionalProbDist, ConditionalFreqDist, MLEProbDist
12 from nltk.util import ingram
13
14 from api import *
15
17 """
18 A processing interface for assigning a probability to the next word.
19 """
20
21
def __init__(self, n, train, estimator=None):
    """
    Creates an ngram language model to capture patterns in n consecutive
    words of training text.  An estimator smooths the probabilities derived
    from the text and may allow generation of ngrams not seen during training.

    @param n: the order of the language model (ngram size)
    @type n: C{int}
    @param train: the training text, given as one flat sequence of tokens.
        The tokens are chained directly after the padding prefix and each
        resulting ngram is stored in a set, so they must be hashable.
    @type train: C{list} of C{string}
    @param estimator: a function for generating a probability distribution
        from the frequency distribution of a single context.  When omitted,
        C{MLEProbDist} is applied to the frequency distribution and the
        bin count is ignored.
    @type estimator: a function that takes a frequency distribution and a
        bin count (the arguments the default estimator accepts) and returns
        a probability distribution
    """
    self._n = n

    # Identity test: None is a singleton, and '==' would dispatch to any
    # custom __eq__ defined on the estimator object.
    if estimator is None:
        estimator = lambda fdist, bins: MLEProbDist(fdist)

    cfd = ConditionalFreqDist()
    self._ngrams = set()
    # n - 1 empty-string tokens, so the first real token already has a
    # full-length context.
    self._prefix = ('',) * (n - 1)

    for ngram in ingram(chain(self._prefix, train), n):
        self._ngrams.add(ngram)
        # The last element is the predicted token; everything before it
        # is the conditioning context.
        context = tuple(ngram[:-1])
        token = ngram[-1]
        cfd[context].inc(token)

    # len(cfd) (the number of distinct contexts) is forwarded as the extra
    # estimator argument.
    # NOTE(review): the False flag presumably tells ConditionalProbDist not
    # to pass the condition to the estimator -- confirm against the NLTK
    # version in use.
    self._model = ConditionalProbDist(cfd, estimator, False, len(cfd))

    # Recursively build the lower-order model.  It only exists for n > 1,
    # so code reading self._backoff must check the order first.
    if n > 1:
        self._backoff = NgramModel(n - 1, train, estimator)
57
58
59 - def prob(self, word, context):
69
72
78
79
80
81 - def generate(self, num_words, context=()):
87
97
def entropy(self, text):
    """
    Evaluate the total entropy of a text with respect to the model.

    Every token from position C{n - 1} onwards is scored with
    C{self.logprob} against the C{n - 1} tokens that precede it, and each
    score is subtracted from a running total that starts at C{0.0}.

    @param text: the sequence of tokens to evaluate; it is indexed and
        sliced by position
    @return: the negated sum of C{self.logprob(token, context)} over the
        scored positions; C{0.0} when the text has fewer than C{n} tokens
    """
    # NOTE(review): the 'def' line of this method is not visible in the
    # source under review; the signature was reconstructed from the body's
    # use of 'self' and 'text' -- confirm the method name.
    # NOTE(review): whether subtracting is the right sign depends on the
    # convention of logprob, which is not visible here -- confirm.
    e = 0.0
    for i in range(self._n - 1, len(text)):
        # Slice (not a tuple index) covering the n - 1 tokens before
        # position i; it is empty for a unigram model.
        context = tuple(text[i - self._n + 1:i])
        token = text[i]
        e -= self.logprob(token, context)
    return e
108
110 return tuple(item) in self._model
111
113 return self._model[tuple(item)]
114
116 return '<NgramModel with %d %d-grams>' % (len(self._ngrams), self._n)
117
129
# Script entry point: the demo runs only when this file is executed
# directly, never as a side effect of importing the module.
if __name__ == "__main__":
    demo()
132