import textwrap

from probability import FreqDist, LidstoneProbDist
from compat import defaultdict
from util import ngram
from model import NgramModel

16 """A text object, which can be loaded with a sequence of words,
17 and which supports counting, concordancing, collocation discovery, etc.
18 This class is intended to support initial exploration of texts.
19 It is initialized with a list of words, e.g.:
20
21 >>> moby = Text(nltk.corpus.gutenberg.words('melville-moby_dick.txt'))
22
23 Many of the methods simply print their results, and are intended
24 for use via the interactive console.
25 """

    def __init__(self, text, name=None):
        """
        Create a Text object.

        @param text: The source text.
        @type text: C{sequence} of C{str}
        @param name: A name for the text (if not given, one is derived
            from the text itself).
        @type name: C{str}
        """
        list.__init__(self, text)

        if name:
            self.name = name
        elif ']' in self[:20]:
            # If the text begins with a bracketed title, use it as the name.
            end = self[:20].index(']')
            self.name = " ".join(self[1:end])
        else:
            self.name = " ".join(self[:8]) + "..."
    def concordance(self, word, width=80, lines=25):
        """
        Print a concordance for the word with the specified context window.

        @param word: The target word
        @type word: C{str}
        @param width: The width of each line, in characters (default=80)
        @type width: C{int}
        @param lines: The number of lines to display (default=25)
        @type lines: C{int}
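
        Illustrative use, following the C{moby} example in the class
        docstring ('whale' is just a sample query word; output omitted):

        >>> moby.concordance('whale', width=100, lines=10)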
54 """
55 if '_offsets' not in self.__dict__:
56 print "Building index..."
57 self._offsets = defaultdict(list)
58 for i in range(len(self)):
59 w = self[i].lower()
60 self._offsets[w].append(i)
61
62 word = word.lower()
63 half_width = (width - len(word)) / 2
64 context = width/4
65 if word in self._offsets:
66 lines = min(lines, self._offsets[word])
67 print "Displaying %s of %s matches:" %\
68 (lines, len(self._offsets[word]))
69 for i in self._offsets[word]:
70 left = ' ' * half_width + ' '.join(self[i-context:i])
71 right = ' '.join(self[i+1:i+context])
72 left = left[-half_width:]
73 right = right[:half_width]
74 print left, word, right
75 lines -= 1
76 if lines < 0:
77 break
78 else:
79 print "No matches"
80
    def collocations(self, num=20):
        """
        Print collocations derived from the text.

        @param num: The number of collocations to produce.
        @type num: C{int}
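
        Illustrative use (output omitted):

        >>> moby.collocations(num=10)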
87 """
88 if '_collocations' not in self.__dict__:
89 print "Building word index..."
90 from operator import itemgetter
91 text = filter(lambda w: len(w) > 2, self)
92 fd = FreqDist(tuple(text[i:i+2])
93 for i in range(len(text)-1))
94 scored = [((w1,w2), fd[(w1,w2)] ** 3 / float(self.vocab()[w1] * self.vocab()[w2]))
95 for w1, w2 in fd]
96 scored.sort(key=itemgetter(1), reverse=True)
97 self._collocations = map(itemgetter(0), scored)
98 print '; '.join([w1+' '+w2 for w1, w2 in self._collocations[:num]])
99
    def readability(self, method):
        raise NotImplementedError

    def generate(self, length=100):
        """
        Print random text, generated using a trigram language model.

        @param length: The length of text to generate (default=100)
        @type length: C{int}
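
        Illustrative use (output omitted; the first call also builds the
        trigram model, which may take some time on a large text):

        >>> moby.generate(length=50)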
110 """
111 if '_model' not in self.__dict__:
112 print "Building ngram index..."
113 estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
114 self._model = NgramModel(3, self, estimator)
115 text = self._model.generate(length)
116 print '\n'.join(textwrap.wrap(' '.join(text)))
117
    def similar(self, word, num=20):
        """
        Distributional similarity: find other words which appear in the
        same contexts as the specified word.

        @param word: The word used to seed the similarity search
        @type word: C{str}
        @param num: The number of words to generate (default=20)
        @type num: C{int}
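
        Illustrative use ('whale' is just a sample seed word; output omitted):

        >>> moby.similar('whale')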
127 """
128
129 if '_word_context_map' not in self.__dict__:
130 print "Building word-context index..."
131 self._word_context_map = defaultdict(list)
132 for w1, w2, w3 in ngram([w.lower() for w in self], 3):
133 self._word_context_map[w2].append( (w1, w3) )
134
135 word = word.lower()
136 if word in self._word_context_map:
137 contexts = set(self._word_context_map[word])
138 fd = FreqDist(w for w in self._word_context_map
139 for c in self._word_context_map[w]
140 if c in contexts and not w == word)
141 words = fd.sorted()[:num]
142 print '\n'.join(textwrap.wrap(' '.join(words)))
143 else:
144 return "No matches"
145
    def dispersion_plot(self, words):
        # Minimal sketch: assumed to delegate to the dispersion_plot
        # drawing helper (the "draw" module name is an assumption here,
        # mirroring the other local imports above).
        from draw import dispersion_plot
        dispersion_plot(self, words)

    def zipf_plot(self, *args):
        self.vocab().zipf_plot(*args)

154 if "_vocab" not in self.__dict__:
155 print "Building vocabulary index..."
156 self._vocab = FreqDist(self)
157 return self._vocab
158
160 """
161 @return: A string representation of this C{FreqDist}.
162 @rtype: string
163 """
164 return '<Text: %s>' % self.name

    def __repr__(self):
        """
        @return: A string representation of this C{Text}.
        @rtype: string
        """
        return self.__str__()
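

# demo() is invoked from the __main__ block below.  The following is a
# minimal sketch that exercises the methods defined above; it assumes
# the NLTK Gutenberg corpus used in the class docstring is installed,
# and 'monstrous' is just an example query word.

def demo():
    from nltk.corpus import gutenberg
    text = Text(gutenberg.words('melville-moby_dick.txt'))
    print text
    print
    print "Concordance:"
    text.concordance('monstrous')
    print
    print "Collocations:"
    text.collocations()
    print
    print "Distributionally similar words:"
    text.similar('monstrous')
    print
    print "Automatically generated text:"
    text.generate()
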
if __name__ == '__main__':
    demo()