Package nltk :: Package draw :: Module concordance
[hide private]
[frames | no frames]

Source Code for Module nltk.draw.concordance

  1  # Natural Language Toolkit: Concordance Search Demo 
  2  # 
  3  # Copyright (C) 2001-2008 NLTK Project 
  4  # Author: Sumukh Ghodke <[email protected]> 
  5  # URL: <http://nltk.org> 
  6  # For license information, see LICENSE.TXT 
  7  # 
  8  # $Id: concordance.py 6121 2008-07-11 02:10:33Z stevenbird $ 
  9   
 10  import re 
 11  import nltk.corpus 
 12  from Tkinter import * 
 13  from nltk.draw import * 
 14  from string import join 
 15  import threading 
 16   
 17  WORD_OR_TAG = '[^/ ]+' 
 18  BOUNDARY = r'\b' 
 19   
 20  CORPUS_LOADED_EVENT = '<<CL_EVENT>>' 
 21  SEARCH_TERMINATED_EVENT = '<<ST_EVENT>>' 
 22  SEARCH_ERROR_EVENT = '<<SE_EVENT>>' 
 23  ERROR_LOADING_CORPUS_EVENT = '<<ELC_EVENT>>' 
 24   
 25  # NB All corpora must be specified in a lambda expression so as not to be 
 26  # loaded when the module is imported. 
 27   
 28  _DEFAULT = 'English: Brown Corpus (Humor, simplified)' 
 29  _CORPORA = { 
 30              'Catalan: CESS-CAT Corpus (simplified)': 
 31                  lambda: nltk.corpus.cess_cat.tagged_sents(simplify_tags=True), 
 32              'English: Brown Corpus': 
 33                  lambda: nltk.corpus.brown.tagged_sents(), 
 34              'English: Brown Corpus (simplified)': 
 35                  lambda: nltk.corpus.brown.tagged_sents(simplify_tags=True), 
 36              'English: Brown Corpus (Press, simplified)': 
 37                  lambda: nltk.corpus.brown.tagged_sents(categories='abc', simplify_tags=True), 
 38              'English: Brown Corpus (Religion, simplified)': 
 39                  lambda: nltk.corpus.brown.tagged_sents(categories='d', simplify_tags=True), 
 40              'English: Brown Corpus (Learned, simplified)': 
 41                  lambda: nltk.corpus.brown.tagged_sents(categories='j', simplify_tags=True), 
 42              'English: Brown Corpus (Science Fiction, simplified)': 
 43                  lambda: nltk.corpus.brown.tagged_sents(categories='m', simplify_tags=True), 
 44              'English: Brown Corpus (Romance, simplified)': 
 45                  lambda: nltk.corpus.brown.tagged_sents(categories='p', simplify_tags=True), 
 46              'English: Brown Corpus (Humor, simplified)': 
 47                  lambda: nltk.corpus.brown.tagged_sents(categories='r', simplify_tags=True), 
 48              'English: NPS Chat Corpus': 
 49                  lambda: nltk.corpus.nps_chat.tagged_posts(), 
 50              'English: NPS Chat Corpus (simplified)': 
 51                  lambda: nltk.corpus.nps_chat.tagged_posts(simplify_tags=True), 
 52              'English: Wall Street Journal Corpus': 
 53                  lambda: nltk.corpus.treebank.tagged_sents(), 
 54              'English: Wall Street Journal Corpus (simplified)': 
 55                  lambda: nltk.corpus.treebank.tagged_sents(simplify_tags=True), 
 56              'Chinese: Sinica Corpus': 
 57                  lambda: nltk.corpus.sinica_treebank.tagged_sents(), 
 58              'Chinese: Sinica Corpus (simplified)': 
 59                  lambda: nltk.corpus.sinica_treebank.tagged_sents(simplify_tags=True), 
 60              'Dutch: Alpino Corpus': 
 61                  lambda: nltk.corpus.alpino.tagged_sents(), 
 62              'Dutch: Alpino Corpus (simplified)': 
 63                  lambda: nltk.corpus.alpino.tagged_sents(simplify_tags=True), 
 64              'Hindi: Indian Languages Corpus': 
 65                  lambda: nltk.corpus.indian.tagged_sents(files='hindi.pos'), 
 66              'Hindi: Indian Languages Corpus (simplified)': 
 67                  lambda: nltk.corpus.indian.tagged_sents(files='hindi.pos', simplify_tags=True), 
 68              'Portuguese: Floresta Corpus (Portugal)': 
 69                  lambda: nltk.corpus.floresta.tagged_sents(), 
 70              'Portuguese: Floresta Corpus (Portugal, simplified)': 
 71                  lambda: nltk.corpus.floresta.tagged_sents(simplify_tags=True), 
 72              'Portuguese: MAC-MORPHO Corpus (Brazil)': 
 73                  lambda: nltk.corpus.mac_morpho.tagged_sents(), 
 74              'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)': 
 75                  lambda: nltk.corpus.mac_morpho.tagged_sents(simplify_tags=True), 
 76              'Spanish: CESS-ESP Corpus (simplified)': 
 77                  lambda: nltk.corpus.cess_esp.tagged_sents(simplify_tags=True), 
 78             } 
 79   
class CategorySearchView(object):
    """Tkinter window for the part-of-speech concordance search demo.

    The view creates a CategorySearchModel and registers itself as a
    listener.  The model reports back through fire_event(), which posts a
    Tk virtual event; the handle_* methods bound to those events then
    update the widgets.
    """

    _BACKGROUND_COLOUR = '#FFF'  # white

    # Colour of highlighted results
    _HIGHLIGHT_WORD_COLOUR = '#F00'  # red
    _HIGHLIGHT_WORD_TAG = 'HL_WRD_TAG'

    _HIGHLIGHT_LABEL_COLOUR = '#C0C0C0'  # grey
    _HIGHLIGHT_LABEL_TAG = 'HL_LBL_TAG'

    # Fraction of the text left of the horizontal scrollbar position
    _FRACTION_LEFT_TEXT = 0.30

    # Number of characters shown before the position of the search item
    _CHAR_BEFORE = 75
    # Number of characters shown after the position of the search item
    _CHAR_AFTER = 85

    def __init__(self):
        """Build the window and start loading the default corpus."""
        self.model = CategorySearchModel()
        self.model.add_listener(self)
        self.top = Tk()
        self._init_top(self.top)
        # The menubar is built before the widgets: building it invokes a
        # "Result Count" entry, which stores the page size on the model.
        self._init_menubar()
        self._init_widgets(self.top)
        self._bind_event_handlers()
        self.load_corpus(self.model.DEFAULT_CORPUS)

    def _init_top(self, top):
        """Configure geometry, title and the quit shortcut of the window."""
        top.geometry('950x680+50+50')
        top.title('NLTK Concordance Search')
        top.bind('<Control-q>', self.destroy)
        top.minsize(950, 680)

    def _init_widgets(self, parent):
        """Create the main frame and every widget group inside it."""
        self.main_frame = Frame(parent, dict(background=self._BACKGROUND_COLOUR,
                                             padx=1, pady=1, border=1))
        self._init_corpus_select(self.main_frame)
        self._init_query_box(self.main_frame)
        self._init_results_box(self.main_frame)
        self._init_paging(self.main_frame)
        self._init_status(self.main_frame)
        self.main_frame.pack(fill='both', expand=True)

    def _init_menubar(self):
        """Create the File and Edit menus and pick the initial page size."""
        self._result_size = IntVar(self.top)
        menubar = Menu(self.top)

        filemenu = Menu(menubar, tearoff=0, borderwidth=0)
        filemenu.add_command(label='Exit', underline=1,
                             command=self.destroy, accelerator='Ctrl-q')
        menubar.add_cascade(label='File', underline=0, menu=filemenu)

        editmenu = Menu(menubar, tearoff=0)
        rescntmenu = Menu(editmenu, tearoff=0)
        rescntmenu.add_radiobutton(label='20', variable=self._result_size,
                                   underline=0, value=20, command=self.set_result_size)
        rescntmenu.add_radiobutton(label='50', variable=self._result_size,
                                   underline=0, value=50, command=self.set_result_size)
        rescntmenu.add_radiobutton(label='100', variable=self._result_size,
                                   underline=0, value=100, command=self.set_result_size)
        # Select the second entry ('50') as the initial result count.
        rescntmenu.invoke(1)

        editmenu.add_cascade(label='Result Count', underline=0, menu=rescntmenu)
        menubar.add_cascade(label='Edit', underline=0, menu=editmenu)

        self.top.config(menu=menubar)

    def set_result_size(self, **kwargs):
        """Copy the result count chosen in the menu to the model."""
        self.model.result_count = self._result_size.get()

    def _init_corpus_select(self, parent):
        """Create the corpus drop-down, with the default corpus listed first."""
        innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
        self.var = StringVar(innerframe)
        self.var.set(self.model.DEFAULT_CORPUS)
        Label(innerframe, justify=LEFT, text=' Corpus: ',
              background=self._BACKGROUND_COLOUR,
              padx=2, pady=1, border=0).pack(side='left')

        om = OptionMenu(innerframe, self.var, self.model.DEFAULT_CORPUS,
                        command=self.corpus_selected,
                        *self.model.non_default_corpora())
        om['borderwidth'] = 0
        om['highlightthickness'] = 1
        om.pack(side='left')
        innerframe.pack(side='top', fill='x', anchor='n')

    def _init_status(self, parent):
        """Create the status label shown below the paging buttons."""
        self.status = Label(parent, justify=LEFT, relief=SUNKEN,
                            background=self._BACKGROUND_COLOUR,
                            border=0, padx=1, pady=0)
        self.status.pack(side='top', anchor='sw')

    def _init_query_box(self, parent):
        """Create the query entry and the Search button."""
        innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
        another = Frame(innerframe, background=self._BACKGROUND_COLOUR)
        self.query_box = Entry(another, width=60)
        self.query_box.pack(side='left', fill='x', pady=25, anchor='center')
        self.search_button = Button(another, text='Search', command=self.search,
                                    borderwidth=1, highlightthickness=1)
        self.search_button.pack(side='left', fill='x', pady=25, anchor='center')
        self.query_box.bind('<KeyPress-Return>', self.search_enter_keypress_handler)
        another.pack()
        innerframe.pack(side='top', fill='x', anchor='n')

    def search_enter_keypress_handler(self, *event):
        """Run a search when Return is pressed in the query box."""
        self.search()

    def _init_results_box(self, parent):
        """Create the read-only results text box with both scrollbars."""
        innerframe = Frame(parent)
        i1 = Frame(innerframe)
        i2 = Frame(innerframe)
        vscrollbar = Scrollbar(i1, borderwidth=1)
        hscrollbar = Scrollbar(i2, borderwidth=1, orient='horiz')
        self.results_box = Text(i1,
                                font=tkFont.Font(family='courier', size='16'),
                                state='disabled', borderwidth=1,
                                yscrollcommand=vscrollbar.set,
                                xscrollcommand=hscrollbar.set,
                                wrap='none', width='40', height='20')
        self.results_box.pack(side='left', fill='both', expand=True)
        self.results_box.tag_config(self._HIGHLIGHT_WORD_TAG,
                                    foreground=self._HIGHLIGHT_WORD_COLOUR)
        self.results_box.tag_config(self._HIGHLIGHT_LABEL_TAG,
                                    foreground=self._HIGHLIGHT_LABEL_COLOUR)
        vscrollbar.pack(side='left', fill='y', anchor='e')
        vscrollbar.config(command=self.results_box.yview)
        hscrollbar.pack(side='left', fill='x', expand=True, anchor='w')
        hscrollbar.config(command=self.results_box.xview)
        # A filler label is the only way to stop the scrollbars from
        # overlapping while using the pack layout manager.
        Label(i2, text='   ', background=self._BACKGROUND_COLOUR).pack(side='left', anchor='e')
        i1.pack(side='top', fill='both', expand=True, anchor='n')
        i2.pack(side='bottom', fill='x', anchor='s')
        innerframe.pack(side='top', fill='both', expand=True)

    def _init_paging(self, parent):
        """Create the Previous/Next buttons (initially disabled)."""
        innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
        self.prev = Button(innerframe, text='Previous', command=self.previous,
                           width='10', borderwidth=1, highlightthickness=1,
                           state='disabled')
        self.prev.pack(side='left', anchor='center')
        # `command=self.next` is evaluated before the assignment below, so
        # the button is bound to the next() method.  From here on the
        # instance attribute `next` is the button itself, which is what
        # freeze_editable() and set_paging_button_states() rely on.
        self.next = Button(innerframe, text='Next', command=self.next,
                           width='10', borderwidth=1, highlightthickness=1,
                           state='disabled')
        self.next.pack(side='right', anchor='center')
        innerframe.pack(side='top', fill='y')
        self.current_page = 0

    def previous(self):
        """Ask the model for the page before the current one."""
        self.clear_results_box()
        self.freeze_editable()
        self.model.prev(self.current_page - 1)

    def next(self):
        """Ask the model for the page after the current one."""
        self.clear_results_box()
        self.freeze_editable()
        self.model.next(self.current_page + 1)

    def about(self, *e):
        """Show the About dialog, falling back to ShowText on any failure."""
        ABOUT = ("NLTK Concordance Search Demo\n")
        TITLE = 'About: NLTK Concordance Search Demo'
        try:
            from tkMessageBox import Message
            Message(message=ABOUT, title=TITLE, parent=self.main_frame).show()
        except Exception:
            ShowText(self.top, TITLE, ABOUT)

    def _bind_event_handlers(self):
        """Bind each model event to the handler that updates the widgets."""
        # NOTE(review): the body of this method was missing from the file;
        # it is reconstructed from the four event constants and the four
        # handle_* methods of this class -- confirm against upstream.
        self.top.bind(CORPUS_LOADED_EVENT, self.handle_corpus_loaded)
        self.top.bind(SEARCH_TERMINATED_EVENT, self.handle_search_terminated)
        self.top.bind(SEARCH_ERROR_EVENT, self.handle_search_error)
        self.top.bind(ERROR_LOADING_CORPUS_EVENT, self.handle_error_loading_corpus)

    def handle_error_loading_corpus(self, event):
        """Report a corpus loading failure and leave the controls frozen."""
        self.status['text'] = 'Error in loading ' + self.var.get()
        # The entry must be editable for clear_all() to empty it.
        self.unfreeze_editable()
        self.clear_all()
        self.freeze_editable()

    def handle_corpus_loaded(self, event):
        """Report a loaded corpus and make the query box ready for input."""
        self.status['text'] = self.var.get() + ' is loaded'
        self.unfreeze_editable()
        self.clear_all()
        self.query_box.focus_set()

    def handle_search_terminated(self, event):
        """Display the page the model just produced."""
        # todo: refactor the model such that it is less state sensitive
        results = self.model.get_results()
        self.write_results(results)
        self.status['text'] = ''
        if not results:
            self.status['text'] = 'No results found for ' + self.model.query
        else:
            self.current_page = self.model.last_requested_page
        self.unfreeze_editable()
        self.results_box.xview_moveto(self._FRACTION_LEFT_TEXT)

    def handle_search_error(self, event):
        """Report an invalid query and re-enable the controls."""
        self.status['text'] = 'Error in query ' + self.model.query
        self.unfreeze_editable()

    def corpus_selected(self, *args):
        """Load the corpus currently chosen in the drop-down."""
        new_selection = self.var.get()
        self.load_corpus(new_selection)

    def load_corpus(self, selection):
        """Start loading `selection` unless it is already the selected corpus."""
        if self.model.selected_corpus != selection:
            self.status['text'] = 'Loading ' + selection + '...'
            self.freeze_editable()
            self.model.load_corpus(selection)

    def search(self):
        """Discard old results and search for the text in the query box."""
        self.current_page = 0
        self.clear_results_box()
        self.model.reset_results()
        query = self.query_box.get()
        if not query.strip():
            return
        self.status['text'] = 'Searching for ' + query
        self.freeze_editable()
        self.model.search(query, self.current_page + 1)

    def write_results(self, results):
        """Write one page of results into the text box and highlight matches.

        `results` is a sequence of (sentence, start, end) triples.  Each
        sentence is cut to a window in which the match starts at column
        _CHAR_BEFORE; shorter left contexts are padded with spaces.
        """
        self.results_box['state'] = 'normal'
        row = 1
        for each in results:
            sent, pos1, pos2 = each[0].strip(), each[1], each[2]
            if not sent:
                continue
            if pos1 < self._CHAR_BEFORE:
                sent, pos1, pos2 = self.pad(sent, pos1, pos2)
            sentence = sent[pos1 - self._CHAR_BEFORE:pos1 + self._CHAR_AFTER]
            if row != len(results):
                sentence += '\n'
            self.results_box.insert('%d.0' % row, sentence)
            word_markers, label_markers = self.words_and_labels(sent, pos1, pos2)
            for start, end in word_markers:
                self.results_box.tag_add(self._HIGHLIGHT_WORD_TAG,
                                         '%d.%d' % (row, start),
                                         '%d.%d' % (row, end))
            for start, end in label_markers:
                self.results_box.tag_add(self._HIGHLIGHT_LABEL_TAG,
                                         '%d.%d' % (row, start),
                                         '%d.%d' % (row, end))
            row += 1
        self.results_box['state'] = 'disabled'

    def words_and_labels(self, sentence, pos1, pos2):
        """Locate the words and tags inside the matched span.

        Returns a pair (words, labels) of lists of (start, end) column
        offsets, counted from _CHAR_BEFORE, the column at which the match
        starts in the displayed line.
        """
        search_exp = sentence[pos1:pos2]
        words, labels = [], []
        index = 0
        for each in search_exp.split(' '):
            if each == '':
                index += 1
            elif '/' in each:
                # Split on the last slash only, so a word that itself
                # contains '/' does not break the unpacking.
                word, label = each.rsplit('/', 1)
                words.append((self._CHAR_BEFORE + index,
                              self._CHAR_BEFORE + index + len(word)))
                index += len(word) + 1
                labels.append((self._CHAR_BEFORE + index,
                               self._CHAR_BEFORE + index + len(label)))
                index += len(label)
            else:
                # Token without a tag: highlight it as a word only.
                words.append((self._CHAR_BEFORE + index,
                              self._CHAR_BEFORE + index + len(each)))
                index += len(each)
            index += 1
        return words, labels

    def pad(self, sent, hstart, hend):
        """Left-pad `sent` with spaces so the match starts at _CHAR_BEFORE.

        Returns the (possibly unchanged) sentence and the shifted start
        and end positions of the match.
        """
        if hstart >= self._CHAR_BEFORE:
            return sent, hstart, hend
        d = self._CHAR_BEFORE - hstart
        return ' ' * d + sent, hstart + d, hend + d

    def destroy(self, *e):
        """Destroy the window; later calls do nothing."""
        if self.top is None:
            return
        self.top.destroy()
        self.top = None

    def clear_all(self):
        """Empty the query box, the model's query and the results box."""
        self.query_box.delete(0, END)
        self.model.reset_query()
        self.clear_results_box()

    def clear_results_box(self):
        """Delete all text from the results box."""
        # The box is kept disabled; enable it only for the deletion.
        self.results_box['state'] = 'normal'
        self.results_box.delete("1.0", END)
        self.results_box['state'] = 'disabled'

    def freeze_editable(self):
        """Disable the query box and all buttons."""
        self.query_box['state'] = 'disabled'
        self.search_button['state'] = 'disabled'
        self.prev['state'] = 'disabled'
        self.next['state'] = 'disabled'

    def unfreeze_editable(self):
        """Enable the query box and Search button; refresh paging buttons."""
        self.query_box['state'] = 'normal'
        self.search_button['state'] = 'normal'
        self.set_paging_button_states()

    def set_paging_button_states(self):
        """Enable Previous/Next according to the current page."""
        if self.current_page in (0, 1):
            self.prev['state'] = 'disabled'
        else:
            self.prev['state'] = 'normal'
        if self.model.has_more_pages(self.current_page):
            self.next['state'] = 'normal'
        else:
            self.next['state'] = 'disabled'

    def fire_event(self, event):
        """Post `event` on the top-level window."""
        # Firing an event so that rendering of widgets happens in the
        # mainloop thread.
        self.top.event_generate(event, when='tail')

    def mainloop(self, *args, **kwargs):
        """Enter the Tk main loop, unless in_idle() reports true."""
        if in_idle():
            return
        self.top.mainloop(*args, **kwargs)
376
class CategorySearchModel(object):
    """State and background workers behind the concordance search view.

    Corpus loading and searching run in threading.Thread subclasses.
    When a worker finishes it calls notify_listeners(), which calls
    fire_event(event) on every registered listener.
    """

    def __init__(self):
        self.listeners = []
        self.CORPORA = _CORPORA
        self.DEFAULT_CORPUS = _DEFAULT
        self.selected_corpus = None
        self.reset_query()
        self.reset_results()
        # Maximum number of results per page; set by the view.
        self.result_count = None
        self.last_sent_searched = 0

    def non_default_corpora(self):
        """Return the sorted corpus names, excluding the default corpus."""
        return sorted(name for name in self.CORPORA
                      if name != self.DEFAULT_CORPUS)

    def load_corpus(self, name):
        """Start a thread that loads the corpus called `name`."""
        self.selected_corpus = name
        self.tagged_sents = []
        runner_thread = self.LoadCorpus(name, self)
        runner_thread.start()

    def search(self, query, page):
        """Start a thread that searches for `query` and fills `page`."""
        self.query = query
        self.last_requested_page = page
        self.SearchCorpus(self, page, self.result_count).start()

    def next(self, page):
        """Make `page` current, searching further only if it is not cached."""
        self.last_requested_page = page
        if len(self.results) < page:
            self.search(self.query, page)
        else:
            self.notify_listeners(SEARCH_TERMINATED_EVENT)

    def prev(self, page):
        """Make the already cached `page` current."""
        self.last_requested_page = page
        self.notify_listeners(SEARCH_TERMINATED_EVENT)

    def add_listener(self, listener):
        """Register an object whose fire_event() receives model events."""
        self.listeners.append(listener)

    def notify_listeners(self, event):
        """Call fire_event(event) on every registered listener."""
        for each in self.listeners:
            each.fire_event(event)

    def reset_results(self):
        """Forget all cached pages and restart scanning from sentence 0."""
        self.last_sent_searched = 0
        self.results = []
        self.last_page = None

    def reset_query(self):
        """Forget the current query."""
        self.query = None

    def set_results(self, page, resultset):
        """Store `resultset` as the results of the 1-based `page`."""
        self.results.insert(page - 1, resultset)

    def get_results(self):
        """Return the results of the last requested page."""
        return self.results[self.last_requested_page - 1]

    def has_more_pages(self, page):
        """Tell whether a page after `page` may exist.

        Returns False when there are no results, True while the last page
        has not been found yet, and otherwise compares against it.
        """
        if not self.results or not self.results[0]:
            return False
        if self.last_page is None:
            return True
        return page < self.last_page

    class LoadCorpus(threading.Thread):
        """Thread that loads a corpus into model.tagged_sents."""

        def __init__(self, name, model):
            self.model, self.name = model, name
            threading.Thread.__init__(self)

        def run(self):
            """Load the corpus as "word/TAG word/TAG ..." strings and notify."""
            try:
                ts = self.model.CORPORA[self.name]()
                self.model.tagged_sents = [
                    ' '.join(w + '/' + t for (w, t) in sent) for sent in ts]
            except Exception:
                # Any failure to load is reported to the listeners rather
                # than left to die silently inside the thread.
                self.model.notify_listeners(ERROR_LOADING_CORPUS_EVENT)
            else:
                self.model.notify_listeners(CORPUS_LOADED_EVENT)

    class SearchCorpus(threading.Thread):
        """Thread that collects one page of matches for model.query."""

        def __init__(self, model, page, count):
            self.model, self.count, self.page = model, count, page
            threading.Thread.__init__(self)

        def run(self):
            """Search from where the last page stopped and notify listeners."""
            # Compile once per search: an invalid query is reported before
            # any sentence is scanned.
            try:
                pattern = re.compile(self.processed_query())
            except re.error:
                self.model.reset_results()
                self.model.notify_listeners(SEARCH_ERROR_EVENT)
                return
            hits, exhausted = self._collect_page(pattern)
            if exhausted:
                self.model.last_page = self.page
            self.model.set_results(self.page, hits)
            self.model.notify_listeners(SEARCH_TERMINATED_EVENT)

        def _collect_page(self, pattern):
            """Collect up to self.count matches from the unsearched sentences.

            Returns (hits, exhausted): `hits` is a list of
            (sentence, start, end) triples and `exhausted` is True when
            every remaining sentence was scanned.  model.last_sent_searched
            is advanced to the index at which the next page must resume.
            """
            model = self.model
            first = model.last_sent_searched
            hits = []
            scanned = 0
            for sent in model.tagged_sents[first:]:
                m = pattern.search(sent)
                if m:
                    if len(hits) == self.count:
                        # One match more than fits on this page.  Resume at
                        # this very sentence so that it opens the next page
                        # and nothing already shown is scanned again.
                        model.last_sent_searched = first + scanned
                        return hits, False
                    hits.append((sent, m.start(), m.end()))
                scanned += 1
            model.last_sent_searched = first + scanned
            return hits, True

        def processed_query(self):
            """Translate model.query into a regular expression.

            Each whitespace-separated term has '.' replaced by a single
            non-slash, non-space character.  An all-capitals term is
            matched as a tag, a term containing '/' is matched as given,
            and any other term is matched as a word.
            """
            new = []
            for term in self.model.query.split():
                term = term.replace('.', '[^/ ]')
                if re.match('[A-Z]+$', term):
                    new.append(BOUNDARY + WORD_OR_TAG + '/' + term + BOUNDARY)
                elif '/' in term:
                    new.append(BOUNDARY + term + BOUNDARY)
                else:
                    new.append(BOUNDARY + term + '/' + WORD_OR_TAG + BOUNDARY)
            return ' '.join(new)
499
def pos_concordance():
    """Open the concordance search window and run its main loop."""
    view = CategorySearchView()
    view.mainloop()
503
def demo():
    """Run the concordance search demo."""
    pos_concordance()
# Launch the demo only when the module is executed as a script.
if __name__ == '__main__':
    demo()