Package nltk :: Module text
[hide private]
[frames] | [no frames]

Source Code for Module nltk.text

  1  # Natural Language Toolkit: Texts 
  2  # 
  3  # Copyright (C) 2001-2008 NLTK Project 
  4  # Author: Steven Bird <[email protected]> 
  5  # URL: <http://nltk.org> 
  6  # For license information, see LICENSE.TXT 
  7   
  8  import textwrap 
  9   
 10  from probability import FreqDist, LidstoneProbDist 
 11  from compat import defaultdict 
 12  from util import ngram 
 13  from model import NgramModel 
 14   
15 -class Text(list):
16 """A text object, which can be loaded with a sequence of words, 17 and which supports counting, concordancing, collocation discovery, etc. 18 This class is intended to support initial exploration of texts. 19 It is initialized with a list of words, e.g.: 20 21 >>> moby = Text(nltk.corpus.gutenberg.words('melville-moby_dick.txt')) 22 23 Many of the methods simply print their results, and are intended 24 for use via the interactive console. 25 """ 26
27 - def __init__(self, text, name=None):
28 """ 29 Create a Text object. 30 31 @param words: The source text. 32 @type words: C{sequence} of C{str} 33 """ 34 list.__init__(self, text) 35 36 if name: 37 self.name = name 38 elif ']' in self[:20]: 39 end = self[:20].index(']') 40 self.name = " ".join(self[1:end]) 41 else: 42 self.name = " ".join(self[:8]) + "..."
43
44 - def concordance(self, word, width=80, lines=25):
45 """ 46 Print a concordance for the word with the specified context window. 47 48 @param word: The target word 49 @type word: C{str} 50 @param width: The width of each line, in characters (default=80) 51 @type width: C{int} 52 @param lines: The number of lines to display (default=25) 53 @type lines: C{int} 54 """ 55 if '_offsets' not in self.__dict__: 56 print "Building index..." 57 self._offsets = defaultdict(list) 58 for i in range(len(self)): 59 w = self[i].lower() 60 self._offsets[w].append(i) 61 62 word = word.lower() 63 half_width = (width - len(word)) / 2 64 context = width/4 # approx number of words of context 65 if word in self._offsets: 66 lines = min(lines, self._offsets[word]) 67 print "Displaying %s of %s matches:" %\ 68 (lines, len(self._offsets[word])) 69 for i in self._offsets[word]: 70 left = ' ' * half_width + ' '.join(self[i-context:i]) 71 right = ' '.join(self[i+1:i+context]) 72 left = left[-half_width:] 73 right = right[:half_width] 74 print left, word, right 75 lines -= 1 76 if lines < 0: 77 break 78 else: 79 print "No matches"
80
81 - def collocations(self, num=20):
82 """ 83 Print collocations derived from the text. 84 85 @param num: The number of collocations to produce. 86 @type num: C{int} 87 """ 88 if '_collocations' not in self.__dict__: 89 print "Building word index..." 90 from operator import itemgetter 91 text = filter(lambda w: len(w) > 2, self) 92 fd = FreqDist(tuple(text[i:i+2]) 93 for i in range(len(text)-1)) 94 scored = [((w1,w2), fd[(w1,w2)] ** 3 / float(self.vocab()[w1] * self.vocab()[w2])) 95 for w1, w2 in fd] 96 scored.sort(key=itemgetter(1), reverse=True) 97 self._collocations = map(itemgetter(0), scored) 98 print '; '.join([w1+' '+w2 for w1, w2 in self._collocations[:num]])
99
100 - def readability(self, method):
101 # code from nltk_contrib.readability 102 raise NotImplementedError
103
104 - def generate(self, length=100):
105 """ 106 Print random text, generated using a trigram language model. 107 108 @param length: The length of text to generate (default=100) 109 @type length: C{int} 110 """ 111 if '_model' not in self.__dict__: 112 print "Building ngram index..." 113 estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2) 114 self._model = NgramModel(3, self, estimator) 115 text = self._model.generate(length) 116 print '\n'.join(textwrap.wrap(' '.join(text)))
117
118 - def similar(self, word, num=20):
119 """ 120 Distributional similarity: find other words which appear in the 121 same contexts as the specified word. 122 123 @param word: The word used to seed the similarity search 124 @type word: C{str} 125 @param num: The number of words to generate (default=20) 126 @type num: C{int} 127 """ 128 129 if '_word_context_map' not in self.__dict__: 130 print "Building word-context index..." 131 self._word_context_map = defaultdict(list) 132 for w1, w2, w3 in ngram([w.lower() for w in self], 3): 133 self._word_context_map[w2].append( (w1, w3) ) 134 135 word = word.lower() 136 if word in self._word_context_map: 137 contexts = set(self._word_context_map[word]) 138 fd = FreqDist(w for w in self._word_context_map 139 for c in self._word_context_map[w] 140 if c in contexts and not w == word) 141 words = fd.sorted()[:num] 142 print '\n'.join(textwrap.wrap(' '.join(words))) 143 else: 144 return "No matches"
145
146 - def dispersion_plot(self, words):
147 from nltk.draw import dispersion_plot 148 dispersion_plot(self, words)
149
150 - def zipf_plot(self, *args):
151 self.vocab().zipf_plot(*args)
152
153 - def vocab(self):
154 if "_vocab" not in self.__dict__: 155 print "Building vocabulary index..." 156 self._vocab = FreqDist(self) 157 return self._vocab
158
159 - def __str__(self):
160 """ 161 @return: A string representation of this C{FreqDist}. 162 @rtype: string 163 """ 164 return '<Text: %s>' % self.name
165
166 - def __repr__(self):
167 """ 168 @return: A string representation of this C{FreqDist}. 169 @rtype: string 170 """ 171 return self.__str__()
172
173 -def demo():
174 from nltk.corpus import brown 175 text = Text(brown.words(categories='a')) 176 print text 177 print 178 print "Concordance:" 179 text.concordance('news') 180 print 181 print "Distributionally similar words:" 182 text.similar('news') 183 print 184 print "Collocations:" 185 text.collocations() 186 print 187 print "Automatically generated text:" 188 text.generate() 189 print 190 print "Dispersion plot:" 191 text.dispersion_plot(['news', 'report', 'said', 'announced']) 192 print 193 print "Vocabulary plot:" 194 text.zipf_plot() 195 print 196 print "Indexing:" 197 print "text[3]:", text[3] 198 print "text[3:5]:", text[3:5] 199 print "text.vocab['news']:", text.vocab['news']
200 201 if __name__ == '__main__': 202 demo() 203