
Source Code for Module nltk.model.ngram

# Natural Language Toolkit: Language Models
#
# Copyright (C) 2001-2008 NLTK Project
# Author: Steven Bird <[email protected]>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT

import random
import math  # used by entropy() below
from itertools import chain

from nltk.probability import ConditionalProbDist, ConditionalFreqDist, MLEProbDist
from nltk.util import ingram

from api import *
class NgramModel(ModelI):
    """
    A processing interface for assigning a probability to the next word.
    """

    # add cutoff

    def __init__(self, n, train, estimator=None):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during training.

        @param n: the order of the language model (ngram size)
        @type n: C{int}
        @param train: the training text
        @type train: C{list} of C{string}
        @param estimator: a function for generating a probability distribution
        @type estimator: a function that takes a C{ConditionalFreqDist} and returns
            a C{ConditionalProbDist}
        """

        self._n = n

        if estimator is None:
            estimator = lambda fdist, bins: MLEProbDist(fdist)

        cfd = ConditionalFreqDist()
        self._ngrams = set()
        self._prefix = ('',) * (n - 1)

        for ngram in ingram(chain(self._prefix, train), n):
            self._ngrams.add(ngram)
            context = tuple(ngram[:-1])
            token = ngram[-1]
            cfd[context].inc(token)

        self._model = ConditionalProbDist(cfd, estimator, False, len(cfd))

        # recursively construct the lower-order models
        if n > 1:
            self._backoff = NgramModel(n - 1, train, estimator)
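    # For example, with n=3 and train = ['a', 'b', 'c'], the padded stream
    # chain(self._prefix, train) yields ('', '', 'a', 'b', 'c'), so the trigrams
    # collected are ('', '', 'a'), ('', 'a', 'b') and ('a', 'b', 'c'); each
    # two-word context maps to a frequency distribution over the word that
    # follows it, which the estimator then smooths.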

    # Katz Backoff probability
    def prob(self, word, context):
        '''Evaluate the probability of this word in this context.'''

        context = tuple(context)
        if context + (word,) in self._ngrams:
            return self[context].prob(word)
        elif self._n > 1:
            # back off to the (n-1)-gram model by dropping the earliest word of
            # the context, consistent with _generate_one below
            return self._alpha(context) * self._backoff.prob(word, context[1:])
        else:
            raise RuntimeError("No probability mass assigned to word %s in context %s"
                               % (word, ' '.join(context)))
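    # In outline, prob() implements Katz backoff:
    #   P(w | context) = P_estimator(w | context)              if context + (w,) was seen
    #                  = alpha(context) * P(w | context[1:])   otherwise
    # where alpha(context) = beta(context) / beta(context[1:]) of the backoff
    # model, redistributing the probability mass discounted by the estimator
    # to the lower-order model.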

    def _alpha(self, tokens):
        # backoff weight: ratio of the discounts for this context and for the
        # backoff model's shortened context
        return self._beta(tokens) / self._backoff._beta(tokens[1:])

    def _beta(self, tokens):
        if tokens in self:
            return self[tokens].discount()
        else:
            return 1

    # NB, this will always start with same word since model
    # is trained on a single text
    def generate(self, num_words, context=()):
        '''Generate random text based on the language model.'''
        text = list(context)
        for i in range(num_words):
            text.append(self._generate_one(text))
        return text
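    # For example, given a trained model lm, lm.generate(5, context=['He', 'said'])
    # returns the two context words followed by five words sampled from the model.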

    def _generate_one(self, context):
        context = (self._prefix + tuple(context))[-self._n + 1:]
        # print "Context (%d): <%s>" % (self._n, ','.join(context))
        if context in self:
            return self[context].generate()
        elif self._n > 1:
            return self._backoff._generate_one(context[1:])
        else:
            return '.'

    def entropy(self, text):
        '''Evaluate the total entropy of a text with respect to the model.
        This is the negative of the sum of the log probability of each word
        in the message.'''

        e = 0.0
        for i in range(self._n - 1, len(text)):
            context = tuple(text[i - self._n + 1:i])
            token = text[i]
            # no logprob() is defined on this class, so take -log2 of prob() directly
            e -= math.log(self.prob(token, context), 2)
        return e
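    # i.e. entropy(text) = -sum_i log2 P(text[i] | text[i-n+1], ..., text[i-1]),
    # summed over every position i that has a full (n-1)-word context before it.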

    def __contains__(self, item):
        return tuple(item) in self._model

    def __getitem__(self, item):
        return self._model[tuple(item)]
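    # e.g. lm[('the', 'grand')] is the smoothed distribution over the word
    # following that context, assuming the context occurs in the training text.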

    def __repr__(self):
        return '<NgramModel with %d %d-grams>' % (len(self._ngrams), self._n)
def demo():
    from nltk.corpus import brown
    from nltk.probability import LidstoneProbDist, WittenBellProbDist
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    # estimator = lambda fdist, bins: WittenBellProbDist(fdist, 0.2)
    lm = NgramModel(3, brown.words(categories='a'), estimator)
    print lm
    # print lm.entropy(sent)
    text = lm.generate(100)
    import textwrap
    print '\n'.join(textwrap.wrap(' '.join(text)))

if __name__ == '__main__':
    demo()
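
A minimal usage sketch, following the same Python 2 / NLTK API as demo() above; it assumes the Brown corpus data is installed, and the variable names are illustrative:

    from nltk.corpus import brown
    from nltk.probability import LidstoneProbDist
    from nltk.model.ngram import NgramModel

    # train a trigram model on one section of the Brown corpus, with Lidstone smoothing
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    lm = NgramModel(3, brown.words(categories='a'), estimator)

    words = lm.generate(20)              # 20 words sampled from the model
    print ' '.join(words)
    print lm.prob(words[2], words[:2])   # P(third word | first two words), via Katz backoff
    print lm.entropy(words)              # negative sum of log2 probabilities over the sample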