1
2
3
4
5
6
7
8
9
10 import re
11 import nltk.corpus
12 from Tkinter import *
13 from nltk.draw import *
14 from string import join
15 import threading
16
# Regex fragment matching a single word or a single tag: one or more
# characters that are neither '/' nor a space.
WORD_OR_TAG = r'[^/ ]+'
# Regex word-boundary assertion.
BOUNDARY = r'\b'

# Names of the Tk virtual events used to signal state changes to the GUI.
CORPUS_LOADED_EVENT = '<<CL_EVENT>>'
SEARCH_TERMINATED_EVENT = '<<ST_EVENT>>'
SEARCH_ERROR_EVENT = '<<SE_EVENT>>'
ERROR_LOADING_CORPUS_EVENT = '<<ELC_EVENT>>'
24
25
26
27
# Display name of the corpus that is selected by default; must be a key of
# _CORPORA.
_DEFAULT = 'English: Brown Corpus (Humor, simplified)'

# Maps a corpus display name to a zero-argument callable that returns the
# tagged sentences (or posts) from the matching ``nltk.corpus`` reader.
# Wrapping each reader call in a lambda defers it until the callable is
# invoked, so building this table does not touch any corpus.
_CORPORA = {
    'Catalan: CESS-CAT Corpus (simplified)':
        lambda: nltk.corpus.cess_cat.tagged_sents(simplify_tags=True),
    'English: Brown Corpus':
        lambda: nltk.corpus.brown.tagged_sents(),
    'English: Brown Corpus (simplified)':
        lambda: nltk.corpus.brown.tagged_sents(simplify_tags=True),
    'English: Brown Corpus (Press, simplified)':
        lambda: nltk.corpus.brown.tagged_sents(categories='abc', simplify_tags=True),
    'English: Brown Corpus (Religion, simplified)':
        lambda: nltk.corpus.brown.tagged_sents(categories='d', simplify_tags=True),
    'English: Brown Corpus (Learned, simplified)':
        lambda: nltk.corpus.brown.tagged_sents(categories='j', simplify_tags=True),
    'English: Brown Corpus (Science Fiction, simplified)':
        lambda: nltk.corpus.brown.tagged_sents(categories='m', simplify_tags=True),
    'English: Brown Corpus (Romance, simplified)':
        lambda: nltk.corpus.brown.tagged_sents(categories='p', simplify_tags=True),
    'English: Brown Corpus (Humor, simplified)':
        lambda: nltk.corpus.brown.tagged_sents(categories='r', simplify_tags=True),
    'English: NPS Chat Corpus':
        lambda: nltk.corpus.nps_chat.tagged_posts(),
    'English: NPS Chat Corpus (simplified)':
        lambda: nltk.corpus.nps_chat.tagged_posts(simplify_tags=True),
    'English: Wall Street Journal Corpus':
        lambda: nltk.corpus.treebank.tagged_sents(),
    'English: Wall Street Journal Corpus (simplified)':
        lambda: nltk.corpus.treebank.tagged_sents(simplify_tags=True),
    'Chinese: Sinica Corpus':
        lambda: nltk.corpus.sinica_treebank.tagged_sents(),
    'Chinese: Sinica Corpus (simplified)':
        lambda: nltk.corpus.sinica_treebank.tagged_sents(simplify_tags=True),
    'Dutch: Alpino Corpus':
        lambda: nltk.corpus.alpino.tagged_sents(),
    'Dutch: Alpino Corpus (simplified)':
        lambda: nltk.corpus.alpino.tagged_sents(simplify_tags=True),
    'Hindi: Indian Languages Corpus':
        lambda: nltk.corpus.indian.tagged_sents(files='hindi.pos'),
    'Hindi: Indian Languages Corpus (simplified)':
        lambda: nltk.corpus.indian.tagged_sents(files='hindi.pos', simplify_tags=True),
    'Portuguese: Floresta Corpus (Portugal)':
        lambda: nltk.corpus.floresta.tagged_sents(),
    'Portuguese: Floresta Corpus (Portugal, simplified)':
        lambda: nltk.corpus.floresta.tagged_sents(simplify_tags=True),
    'Portuguese: MAC-MORPHO Corpus (Brazil)':
        lambda: nltk.corpus.mac_morpho.tagged_sents(),
    'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
        lambda: nltk.corpus.mac_morpho.tagged_sents(simplify_tags=True),
    'Spanish: CESS-ESP Corpus (simplified)':
        lambda: nltk.corpus.cess_esp.tagged_sents(simplify_tags=True),
}
79
# Background colour applied to the frames and labels of the window.
_BACKGROUND_COLOUR = '#FFF'

# Foreground colour and Text-widget tag name for highlighted words.
_HIGHLIGHT_WORD_COLOUR = '#F00'
_HIGHLIGHT_WORD_TAG = 'HL_WRD_TAG'

# Foreground colour and Text-widget tag name for highlighted labels.
_HIGHLIGHT_LABEL_COLOUR = '#C0C0C0'
_HIGHLIGHT_LABEL_TAG = 'HL_LBL_TAG'

# NOTE(review): not used in the visible code; presumably the fraction of the
# line kept to the left of the scroll position -- confirm against its user.
_FRACTION_LEFT_TEXT = 0.30

# Number of characters of the sentence displayed before the anchor position.
_CHAR_BEFORE = 75

# Number of characters of the sentence displayed from the anchor position on.
_CHAR_AFTER = 85
98
108
# Configure the top-level window: caption, initial size and position,
# minimum size, and Ctrl-q as the shortcut that runs self.destroy.
top.title('NLTK Concordance Search')
top.geometry('950x680+50+50')
top.minsize(950, 680)
top.bind('<Control-q>', self.destroy)
114
123
# Build the menu bar: a File menu with Exit, and an Edit menu whose
# 'Result Count' submenu picks how many results are shown.
self._result_size = IntVar(self.top)
menubar = Menu(self.top)

file_menu = Menu(menubar, tearoff=0, borderwidth=0)
file_menu.add_command(
    label='Exit',
    underline=1,
    command=self.destroy,
    accelerator='Ctrl-q',
)
menubar.add_cascade(label='File', underline=0, menu=file_menu)

edit_menu = Menu(menubar, tearoff=0)
count_menu = Menu(edit_menu, tearoff=0)
for size in (20, 50, 100):
    count_menu.add_radiobutton(
        label=str(size),
        variable=self._result_size,
        underline=0,
        value=size,
        command=self.set_result_size,
    )
# With tearoff disabled the entries are indexed from 0, so index 1 is the
# '50' entry; invoking it selects it and runs its command.
count_menu.invoke(1)

edit_menu.add_cascade(label='Result Count', underline=0, menu=count_menu)
menubar.add_cascade(label='Edit', underline=0, menu=edit_menu)

self.top.config(menu=menubar)
148
# Copy the result count currently selected in the menu into the model.
chosen_size = self._result_size.get()
self.model.result_count = chosen_size
151
# Build the corpus-selection row: a ' Corpus: ' label followed by a
# drop-down that starts on the model's default corpus and offers the
# model's non-default corpora as the remaining choices.
innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
self.var = StringVar(innerframe)
self.var.set(self.model.DEFAULT_CORPUS)
Label(
    innerframe,
    justify=LEFT,
    text=' Corpus: ',
    background=self._BACKGROUND_COLOUR,
    padx=2,
    pady=1,
    border=0,
).pack(side='left')

# The former ``other_corpora = ...keys().remove(...)`` statement was dropped:
# list.remove() returns None and the name was never read, so it had no
# effect on the menu built below.
om = OptionMenu(
    innerframe,
    self.var,
    self.model.DEFAULT_CORPUS,
    command=self.corpus_selected,
    *self.model.non_default_corpora()
)
om['borderwidth'] = 0
om['highlightthickness'] = 1
om.pack(side='left')
innerframe.pack(side='top', fill='x', anchor='n')
164
168
# Build the query row: a text entry and a Search button laid out side by
# side inside a nested frame. Pressing Return in the entry runs
# self.search_enter_keypress_handler.
innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
query_row = Frame(innerframe, background=self._BACKGROUND_COLOUR)
row_layout = dict(side='left', fill='x', pady=25, anchor='center')

self.query_box = Entry(query_row, width=60)
self.query_box.pack(**row_layout)

self.search_button = Button(
    query_row,
    text='Search',
    command=self.search,
    borderwidth=1,
    highlightthickness=1,
)
self.search_button.pack(**row_layout)

self.query_box.bind('<KeyPress-Return>', self.search_enter_keypress_handler)
query_row.pack()
innerframe.pack(side='top', fill='x', anchor='n')
179
182
# Build the results area: a read-only, non-wrapping Text widget with a
# vertical scrollbar beside it and a horizontal scrollbar in a row below.
innerframe = Frame(parent)
text_row = Frame(innerframe)
hscroll_row = Frame(innerframe)
vscrollbar = Scrollbar(text_row, borderwidth=1)
hscrollbar = Scrollbar(hscroll_row, borderwidth=1, orient='horiz')

results_font = tkFont.Font(family='courier', size='16')
self.results_box = Text(
    text_row,
    font=results_font,
    state='disabled',
    borderwidth=1,
    yscrollcommand=vscrollbar.set,
    xscrollcommand=hscrollbar.set,
    wrap='none',
    width='40',
    height='20',
)
self.results_box.pack(side='left', fill='both', expand=True)

# Register the two highlight tags with their foreground colours.
highlight_styles = (
    (self._HIGHLIGHT_WORD_TAG, self._HIGHLIGHT_WORD_COLOUR),
    (self._HIGHLIGHT_LABEL_TAG, self._HIGHLIGHT_LABEL_COLOUR),
)
for tag_name, colour in highlight_styles:
    self.results_box.tag_config(tag_name, foreground=colour)

# Wire each scrollbar to the matching view of the text widget.
vscrollbar.pack(side='left', fill='y', anchor='e')
vscrollbar.config(command=self.results_box.yview)
hscrollbar.pack(side='left', fill='x', expand=True, anchor='w')
hscrollbar.config(command=self.results_box.xview)

# Filler label packed after the horizontal scrollbar in its row.
Label(hscroll_row, text=' ', background=self._BACKGROUND_COLOUR).pack(side='left', anchor='e')
text_row.pack(side='top', fill='both', expand=True, anchor='n')
hscroll_row.pack(side='bottom', fill='x', anchor='s')
innerframe.pack(side='top', fill='both', expand=True)
206
# Build the paging row: 'Previous' and 'Next' buttons, both created
# disabled, and start on page 0.
innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
button_options = dict(width='10', borderwidth=1, highlightthickness=1, state='disabled')

self.prev = Button(innerframe, text='Previous', command=self.previous, **button_options)
self.prev.pack(side='left', anchor='center')

# ``command=self.next`` is evaluated before ``self.next`` is rebound to the
# button, so the callback is whatever ``self.next`` referred to beforehand
# (presumably the paging handler; its definition is not visible here).
self.next = Button(innerframe, text='Next', command=self.next, **button_options)
self.next.pack(side='right', anchor='center')

innerframe.pack(side='top', fill='y')
self.current_page = 0
215
220
225
# Show the 'About' text in a tkMessageBox dialog; if that module cannot be
# imported or the dialog fails, fall back to a ShowText window.
ABOUT = "NLTK Concordance Search Demo\n"
TITLE = 'About: NLTK Concordance Search Demo'
try:
    from tkMessageBox import Message
    Message(message=ABOUT, title=TITLE, parent=self.main_frame).show()
except Exception:
    # Deliberately broad best-effort fallback, but no longer a bare
    # ``except:`` so SystemExit and KeyboardInterrupt still propagate.
    ShowText(self.top, TITLE, ABOUT)
234
240
246
252
264
265
269
# Load whichever corpus name the drop-down variable currently holds.
self.load_corpus(self.var.get())
273
279
289
290
# Render the given results into the results box, one non-empty sentence per
# text row, and tag the word and label spans reported by
# self.words_and_labels. The box is made editable only while writing.
box = self.results_box
box['state'] = 'normal'
row = 1
for hit in results:
    sent, pos1, pos2 = hit[0].strip(), hit[1], hit[2]
    if len(sent) == 0:
        # Empty sentences take no row.
        continue
    if pos1 < self._CHAR_BEFORE:
        # Not enough text before the anchor; let self.pad adjust the
        # sentence and both positions.
        sent, pos1, pos2 = self.pad(sent, pos1, pos2)
    window_start = pos1 - self._CHAR_BEFORE
    window_end = pos1 + self._CHAR_AFTER
    sentence = sent[window_start:window_end]
    if row != len(results):
        sentence += '\n'
    line = str(row)
    # Text indices have the form '<line>.<column>'.
    box.insert(line + '.0', sentence)
    word_markers, label_markers = self.words_and_labels(sent, pos1, pos2)
    for marker in word_markers:
        box.tag_add(self._HIGHLIGHT_WORD_TAG, line + '.' + str(marker[0]), line + '.' + str(marker[1]))
    for marker in label_markers:
        box.tag_add(self._HIGHLIGHT_LABEL_TAG, line + '.' + str(marker[0]), line + '.' + str(marker[1]))
    row += 1
box['state'] = 'disabled'
308
325
326 - def pad(self, sent, hstart, hend):
332
# Destroy the top-level window once; afterwards self.top is None, so a
# repeated call does nothing.
if self.top is not None:
    self.top.destroy()
    self.top = None
337
342
# Empty the results box. It is switched to 'normal' only for the deletion
# and put back to 'disabled' afterwards.
box = self.results_box
box['state'] = 'normal'
box.delete("1.0", END)
box['state'] = 'disabled'
347
# Disable the query entry, the search button and both paging buttons.
for widget in (self.query_box, self.search_button, self.prev, self.next):
    widget['state'] = 'disabled'
353
358
368
370
# Post the event on the top-level window; when='tail' queues it behind the
# events that are already pending.
window = self.top
window.event_generate(event, when='tail')
372
def mainloop(self, *args, **kwargs):
    """Run the Tk main loop of the top-level window.

    Does nothing when ``in_idle()`` is true (presumably because IDLE already
    runs its own Tk loop -- the helper is defined elsewhere). All arguments
    are passed straight through to ``self.top.mainloop``.
    """
    if not in_idle():
        self.top.mainloop(*args, **kwargs)
376
# Initial model state: no listeners, the module-level corpus table and
# default corpus name, nothing selected, and query/results reset.
self.listeners = []
self.CORPORA, self.DEFAULT_CORPUS = _CORPORA, _DEFAULT
self.selected_corpus = None
self.reset_query()
self.reset_results()
# Set after the resets, as in the original statement order.
self.result_count = None
self.last_sent_searched = 0
387
394
400
401 - def search(self, query, page):
405
406 - def next(self, page):
412
413 - def prev(self, page):
416
# Register the listener so later notifications reach it.
registered = self.listeners
registered.append(listener)
419
# Forward the event to every registered listener, in registration order.
for listener in self.listeners:
    listener.fire_event(event)
423
# Forget all cached result pages and restart scanning from sentence 0.
self.results = []
self.last_page = None
self.last_sent_searched = 0
428
431
434
436 return self.results[self.last_requested_page - 1]
437
def has_more_pages(self, page):
    """Return True if there may be a page of results after ``page``.

    :param page: the page number to compare against ``self.last_page``.
    :return: False when there are no results at all or the first page is
        empty; True while the last page number is still unknown
        (``self.last_page`` is None); otherwise whether ``page`` lies
        before ``self.last_page``.
    """
    if not self.results or not self.results[0]:
        return False
    if self.last_page is None:
        # The final page has not been recorded yet, so more may follow.
        return True
    return page < self.last_page
444
457
459 - def __init__(self, model, page, count):
462
487
499
503
506
507 if __name__ == '__main__':
508 demo()
509