nltk.model.ngram

17 """ 18 A processing interface for assigning a probability to the next word. 19 """ 20 21 # add cutoff

def __init__(self, n, train, estimator=None):
    """
    Creates an ngram language model to capture patterns in n consecutive
    words of training text.  An estimator smooths the probabilities derived
    from the text and may allow generation of ngrams not seen during training.

    The model for order C{n} also builds, recursively, a model of every
    lower order down to 1 and keeps the next one in C{self._backoff}.

    @param n: the order of the language model (ngram size)
    @type n: C{int}
    @param train: the training text.  It is chained directly after a
        padding prefix of C{n - 1} empty strings and scanned as a single
        sequence, and it is passed on unchanged to each lower-order model,
        so it has to support being iterated more than once.
    @type train: C{list} of C{list} of C{string}
        (NOTE(review): the code treats C{train} as one flat sequence of
        tokens -- confirm whether nested lists are really intended.)
    @param estimator: a function for generating a probability distribution.
        When omitted, a default that accepts C{(fdist, bins)} and returns
        C{MLEProbDist(fdist)} is used.
    @type estimator: a function that takes a C{ConditionalFreqDist} and returns
        a C{ConditionalProbDist}
    """

    self._n = n

    # None is a singleton: test identity rather than equality, so that an
    # estimator object with a custom __eq__ cannot confuse the check.
    if estimator is None:
        estimator = lambda fdist, bins: MLEProbDist(fdist)

    cfd = ConditionalFreqDist()
    self._ngrams = set()
    # Padding placed in front of the text so the first real tokens also
    # have a context of n - 1 items.
    self._prefix = ('',) * (n - 1)

    for ngram in ingram(chain(self._prefix, train), n):
        self._ngrams.add(ngram)
        # Everything but the last item is the conditioning context.
        context = tuple(ngram[:-1])
        token = ngram[-1]
        cfd[context].inc(token)

    self._model = ConditionalProbDist(cfd, estimator, False, len(cfd))

    # recursively construct the lower-order models
    if n > 1:
        self._backoff = NgramModel(n - 1, train, estimator)

57 58 # Katz Backoff probability

def prob(self, word, context):
    '''
    Evaluate the probability of this word in this context.

    If the ngram C{context + (word,)} was recorded during training, the
    probability comes from this model's own distribution for the context.
    Otherwise, for models of order greater than 1, the result is
    C{self._alpha(context)} times the lower-order model's probability of
    the word in the shortened context.

    @param word: the word whose probability is wanted
    @param context: the words preceding C{word}; converted to a C{tuple}
    @return: the probability of C{word} following C{context}
    @raise RuntimeError: if the ngram is unseen and this is a unigram
        model, so there is no lower-order model to back off to
    '''

    context = tuple(context)
    if context + (word,) in self._ngrams:
        return self[context].prob(word)
    if self._n > 1:
        # Back off by dropping the OLDEST context word.  The lower-order
        # model is conditioned on the words immediately preceding the
        # token, so the most recent words must be the ones that survive
        # (this also matches what _generate_one does when it backs off).
        return self._alpha(context) * self._backoff.prob(word, context[1:])
    raise RuntimeError("No probability mass assigned to word %s in context %s" % (word, ' '.join(context)))

69

def _alpha(self, tokens):
    """
    Return the backoff weight for the context C{tokens}: this model's
    C{_beta} for the context divided by the lower-order model's C{_beta}
    for the shortened context.

    Requires C{self._backoff}, so it is only usable on models of order
    greater than 1.

    @param tokens: the context, as a C{tuple} of words
    """
    # Shorten the context from the left: the lower-order model is keyed on
    # the most recent words, so the oldest one is the one to discard.
    return self._beta(tokens) / self._backoff._beta(tokens[1:])

72

def _beta(self, tokens):
    """
    Return the C{discount()} of the distribution stored for the context
    C{tokens}, or C{1} when this model holds no distribution for it.

    @param tokens: the context to look up
    """
    # Unknown context: nothing was discounted, so report 1.
    if tokens not in self:
        return 1
    distribution = self[tokens]
    return distribution.discount()

# NB: generated text will always start with the same word, since the model
# is trained on a single text.

def generate(self, num_words, context=()):
    '''
    Generate random text based on the language model.

    @param num_words: how many words to append
    @param context: words to start from; they are copied into the result
    @return: a C{list} holding the initial context followed by the
        C{num_words} generated words
    '''
    words = list(context)
    for _ in range(num_words):
        # Each new word is chosen from everything produced so far.
        upcoming = self._generate_one(words)
        words.append(upcoming)
    return words

87

def _generate_one(self, context):
    """
    Generate a single word to follow C{context}.

    The context is left-padded with C{self._prefix} and cut down to its
    last C{n - 1} items.  If this model has a distribution for that
    context a word is drawn from it; otherwise the request is handed to
    the lower-order model with the oldest context word removed.  A
    unigram model that has no distribution for the empty context
    returns C{'.'}.

    @param context: the words generated so far (any iterable of words)
    @return: the generated word
    """
    padded = self._prefix + tuple(context)
    # Keep exactly the last n - 1 items.  A negative-index slice
    # ("[-self._n+1:]") turns into "[0:]" -- the whole tuple -- when
    # n == 1, which would make a unigram model look up its full history
    # instead of the empty context, so compute the start explicitly.
    start = max(len(padded) - (self._n - 1), 0)
    context = padded[start:]
    # print "Context (%d): <%s>" % (self._n, ','.join(context))
    if context in self:
        return self[context].generate()
    elif self._n > 1:
        return self._backoff._generate_one(context[1:])
    else:
        return '.'

97

def entropy(self, text):
    '''
    Evaluate the total entropy of a text with respect to the model.
    This is the sum of the log probability of each word in the message.

    The first C{n - 1} words of C{text} only serve as context; every
    later word is scored with C{self.logprob} given the C{n - 1} words
    before it, and each score is subtracted from the running total.

    @param text: the words to evaluate
    @type text: C{list} of C{string}
    @return: the accumulated total, as a C{float}
    '''

    e = 0.0
    for i in range(self._n - 1, len(text)):
        # Slice (not a tuple index) covering the n - 1 words that
        # immediately precede position i.
        context = tuple(text[i - self._n + 1:i])
        token = text[i]
        e -= self.logprob(token, context)
    return e

108

def __contains__(self, item):
    """
    Tell whether C{item}, converted to a C{tuple}, is present in the
    underlying conditional model.

    @param item: a context, given as any iterable of words
    """
    key = tuple(item)
    return key in self._model

111

def __getitem__(self, item):
    """
    Return the entry of the underlying conditional model stored under
    C{item}, converted to a C{tuple}.

    @param item: a context, given as any iterable of words
    """
    table = self._model
    return table[tuple(item)]

114

def __repr__(self):
    """
    Return a short description giving the number of distinct ngrams
    recorded and the order of the model.
    """
    count = len(self._ngrams)
    order = self._n
    return '<NgramModel with %d %d-grams>' % (count, order)

Source Code for Module nltk.model.ngram