10 """
11 A graphical tool for exploring the regular expression based chunk
12 parser (L{RegexpChunkParser<nltk.chunk.regex.RegexpChunkParser>}).
13
14 @todo: Add a way to select the development set from the menubar. This
15 might just need to be a selection box (conll vs treebank etc) plus
16 configuration parameters to select what's being chunked (eg VP vs NP)
17 and what part of the data is being used as the development set.
18 """
19
20 from Tkinter import *
21 from tkFileDialog import asksaveasfilename, askopenfilename
22 import tkFont
23 import time
24 import textwrap
25 import re
26 import random
27
28 import nltk
29
30 from nltk.draw import *
31
33 """
34 A graphical tool for exploring the regular expression based chunk
35 parser (L{RegexpChunkParser<nltk.chunk.regex.RegexpChunkParser>}).
36
37 See L{HELP} for instructional text.
38 """

TAGSET = {
    'CC':   'Coordinating conjunction',   'PRP$': 'Possessive pronoun',
    'CD':   'Cardinal number',            'RB':   'Adverb',
    'DT':   'Determiner',                 'RBR':  'Adverb, comparative',
    'EX':   'Existential there',          'RBS':  'Adverb, superlative',
    'FW':   'Foreign word',               'RP':   'Particle',
    'JJ':   'Adjective',                  'TO':   'to',
    'JJR':  'Adjective, comparative',     'UH':   'Interjection',
    'JJS':  'Adjective, superlative',     'VB':   'Verb, base form',
    'LS':   'List item marker',           'VBD':  'Verb, past tense',
    'MD':   'Modal',                      'NNS':  'Noun, plural',
    'NN':   'Noun, singular or mass',     'VBN':  'Verb, past participle',
    'VBZ':  'Verb, 3rd ps. sing. present', 'NNP': 'Proper noun, singular',
    'NNPS': 'Proper noun, plural',        'WDT':  'wh-determiner',
    'PDT':  'Predeterminer',              'WP':   'wh-pronoun',
    'POS':  'Possessive ending',          'WP$':  'Possessive wh-pronoun',
    'PRP':  'Personal pronoun',           'WRB':  'wh-adverb',
    '(':    'open parenthesis',           ')':    'close parenthesis',
    '``':   'open quote',                 ',':    'comma',
    "''":   'close quote',                '.':    'period',
    '#':    'pound sign (currency marker)',
    '$':    'dollar sign (currency marker)',
    'IN':   'Preposition/subord. conjunction',
    'SYM':  'Symbol (mathematical or scientific)',
    'VBG':  'Verb, gerund/present participle',
    'VBP':  'Verb, non-3rd ps. sing. present',
    ':':    'colon',
    }

HELP = [
    ('Help', '20',
     "Welcome to the regular expression chunk-parser grammar editor. "
     "You can use this editor to develop and test chunk parser grammars "
     "based on NLTK's RegexpChunkParser class.\n\n"

     "Use this box ('Help') to learn more about the editor; click on the "
     "tabs for help on specific topics:"
     "<indent>\n"
     "Rules: grammar rule types\n"
     "Regexps: regular expression syntax\n"
     "Tags: part of speech tags\n</indent>\n"

     "Use the upper-left box ('Grammar') to edit your grammar. "
     "Each line of your grammar specifies a single 'rule', "
     "which performs an action such as creating a chunk or merging "
     "two chunks.\n\n"

     "The lower-left box ('Development Set') runs your grammar on the "
     "development set, and displays the results. "
     "Your grammar's chunks are <highlight>highlighted</highlight>, and "
     "the correct (gold standard) chunks are "
     "<underline>underlined</underline>. If they "
     "match, they are displayed in <green>green</green>; otherwise, "
     "they are displayed in <red>red</red>. The box displays a single "
     "sentence from the development set at a time; use the scrollbar or "
     "the next/previous buttons to view additional sentences.\n\n"

     "The lower-right box ('Evaluation') tracks the performance of "
     "your grammar on the development set. The 'precision' axis "
     "indicates what fraction of your grammar's chunks are correct; and "
     "the 'recall' axis indicates what fraction of the gold standard "
     "chunks your grammar found. Typically, you should try to "
     "design a grammar that scores high on both metrics. The "
     "exact precision and recall of the current grammar, as well "
     "as their harmonic mean (the 'f-score'), are displayed in "
     "the status bar at the bottom of the window."
     ),
    ('Rules', '10',
     "<h1>{...regexp...}</h1>"
     "<indent>\nChunk rule: creates new chunks from words matching "
     "regexp.</indent>\n\n"
     "<h1>}...regexp...{</h1>"
     "<indent>\nChink rule: removes words matching regexp from existing "
     "chunks.</indent>\n\n"
     "<h1>...regexp1...}{...regexp2...</h1>"
     "<indent>\nSplit rule: splits chunks that match regexp1 followed by "
     "regexp2 in two.</indent>\n\n"
     "<h1>...regexp1...{}...regexp2...</h1>"
     "<indent>\nMerge rule: joins consecutive chunks that match regexp1 "
     "and regexp2.</indent>\n"
     ),
    ('Regexps', '10 60',

     "<h1>Pattern\t\tMatches...</h1>\n"
     "<hangindent>"
     "\t<<var>T</var>>\ta word with tag <var>T</var> "
     "(where <var>T</var> may be a regexp).\n"
     "\t<var>x</var>?\tan optional <var>x</var>\n"
     "\t<var>x</var>+\ta sequence of 1 or more <var>x</var>'s\n"
     "\t<var>x</var>*\ta sequence of 0 or more <var>x</var>'s\n"
     "\t<var>x</var>|<var>y</var>\t<var>x</var> or <var>y</var>\n"
     "\t.\tmatches any character\n"
     "\t(<var>x</var>)\tTreats <var>x</var> as a group\n"
     "\t# <var>x...</var>\tTreats <var>x...</var> "
     "(to the end of the line) as a comment\n"
     "\t\\<var>C</var>\tmatches character <var>C</var> "
     "(useful when <var>C</var> is a special character "
     "like + or #)\n"
     "</hangindent>"
     "\n<h1>Examples:</h1>\n"
     "<hangindent>"
     '\t<regexp><NN></regexp>\n'
     '\t\tMatches <match>"cow/NN"</match>\n'
     '\t\tMatches <match>"green/NN"</match>\n'
     '\t<regexp><VB.*></regexp>\n'
     '\t\tMatches <match>"eating/VBG"</match>\n'
     '\t\tMatches <match>"ate/VBD"</match>\n'
     '\t<regexp><IN><DT><NN></regexp>\n'
     '\t\tMatches <match>"on/IN the/DT car/NN"</match>\n'
     '\t<regexp><RB>?<VBD></regexp>\n'
     '\t\tMatches <match>"ran/VBD"</match>\n'
     '\t\tMatches <match>"slowly/RB ate/VBD"</match>\n'
     '\t<regexp><\#><CD> # This is a comment...</regexp>\n'
     '\t\tMatches <match>"#/# 100/CD"</match>\n'
     "</hangindent>"
     ),
    ('Tags', '10 60',
     "<h1>Part of Speech Tags:</h1>\n" +
     '<hangindent>' +
     '<<TAGSET>>' +
     '</hangindent>\n'),
    ]
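
# Illustrative sketch (not part of the app's own code): the rule syntax
# documented in the HELP text above is the same syntax that this tool feeds
# to RegexpChunkParser.  For example, a chunk rule plus a chink rule could
# be parsed and applied to a hand-tagged sentence like this (the sentence
# and rule strings below are made up for illustration):
#
#     rules = [nltk.chunk.regexp.RegexpChunkRule.parse(line)
#              for line in ['{<DT>?<JJ>*<NN.*>+}', '}<IN>{']]
#     chunker = nltk.RegexpChunkParser(rules)
#     sent = [('the', 'DT'), ('little', 'JJ'), ('cat', 'NN'),
#             ('sat', 'VBD'), ('on', 'IN'), ('the', 'DT'), ('mat', 'NN')]
#     print chunker.parse(sent)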

HELP_AUTOTAG = [
    ('red', dict(foreground='#a00')),
    ('green', dict(foreground='#080')),
    ('highlight', dict(background='#ddd')),
    ('underline', dict(underline=True)),
    ('h1', dict(underline=True)),
    ('indent', dict(lmargin1=20, lmargin2=20)),
    ('hangindent', dict(lmargin1=0, lmargin2=60)),
    ('var', dict(foreground='#88f')),
    ('regexp', dict(foreground='#ba7')),
    ('match', dict(foreground='#6a6')),
    ]

_EVAL_DELAY = 1
"""If the user has not pressed any key for this amount of time (in
seconds), and the current grammar has not been evaluated, then
the eval demon will evaluate it."""

_EVAL_CHUNK = 15
"""The number of sentences that should be evaluated by the eval
demon each time it runs."""
_EVAL_FREQ = 0.2
"""The frequency (in seconds) at which the eval demon is run"""
_EVAL_DEMON_MIN = .02
"""The minimum amount of time that the eval demon should take each time
it runs -- if it takes less than this time, _EVAL_CHUNK will be
modified upwards."""
_EVAL_DEMON_MAX = .04
"""The maximum amount of time that the eval demon should take each time
it runs -- if it takes more than this time, _EVAL_CHUNK will be
modified downwards."""
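
# The routine that applies these bounds, _adaptively_modify_eval_chunk(),
# is not included in this excerpt.  As a rough sketch of the mechanism the
# docstrings above describe (an assumption, not the original code): after
# each run the demon could rescale the chunk size toward the
# [_EVAL_DEMON_MIN, _EVAL_DEMON_MAX] time window, e.g.
#
#     def _adaptively_modify_eval_chunk(self, t):
#         if t > self._EVAL_DEMON_MAX and self._EVAL_CHUNK > 5:
#             self._EVAL_CHUNK = max(5, int(self._EVAL_CHUNK * 0.9))
#         elif t < self._EVAL_DEMON_MIN:
#             self._EVAL_CHUNK = int(self._EVAL_CHUNK * 1.1) + 1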

_GRAMMARBOX_PARAMS = dict(
    width=40, height=12, background='#efe', highlightbackground='#efe',
    highlightthickness=1, relief='groove', border=2, wrap='word')
_HELPBOX_PARAMS = dict(
    width=15, height=15, background='#efe', highlightbackground='#efe',
    foreground='#555',
    highlightthickness=1, relief='groove', border=2, wrap='word')
_DEVSETBOX_PARAMS = dict(
    width=70, height=10, background='#eef', highlightbackground='#eef',
    highlightthickness=1, relief='groove', border=2, wrap='word',
    tabs=(30,))
_STATUS_PARAMS = dict(
    background='#9bb', relief='groove', border=2)
_FONT_PARAMS = dict(
    family='helvetica', size=-20)
_FRAME_PARAMS = dict(
    background='#777', padx=2, pady=2, border=3)
_EVALBOX_PARAMS = dict(
    background='#eef', highlightbackground='#eef',
    highlightthickness=1, relief='groove', border=2,
    width=300, height=280)
_BUTTON_PARAMS = dict(
    background='#777', activebackground='#777',
    highlightbackground='#777')
_HELPTAB_BG_COLOR = '#aba'
_HELPTAB_FG_COLOR = '#efe'

_HELPTAB_FG_PARAMS = dict(background='#efe')
_HELPTAB_BG_PARAMS = dict(background='#aba')
_HELPTAB_SPACER = 6


def __init__(self, devset_name='conll2000', devset=None,
             grammar='', chunk_node='NP', tagset=None):
    """
    @param devset_name: The name of the development set; used for
        display & for save files.  If either the name 'treebank'
        or the name 'conll2000' is used, and devset is None, then
        devset will be set automatically.
    @param devset: A list of chunked sentences
    @param grammar: The initial grammar to display.
    @param chunk_node: The node label that marks chunks in the
        development set (e.g. 'NP').
    @param tagset: Dictionary from tags to string descriptions, used
        for the help page.  Defaults to C{self.TAGSET}.
    """
    self._chunk_node = chunk_node

    if tagset is None: tagset = self.TAGSET
    self.tagset = tagset

    if devset is None:
        if devset_name == 'conll2000':
            devset = nltk.corpus.conll2000.chunked_sents('train.txt')
        elif devset_name == 'treebank':
            devset = nltk.corpus.treebank_chunk.chunked_sents()
        else:
            raise ValueError('Unknown development set %s' % devset_name)

    self.chunker = None
    """The chunker built from the grammar string"""

    self.grammar = grammar
    """The unparsed grammar string"""

    self.normalized_grammar = None
    """A normalized version of L{self.grammar}."""

    self.grammar_changed = 0
    """The last time() that the grammar was changed."""

    self.devset = devset
    """The development set -- a list of chunked sentences."""

    self.devset_name = devset_name
    """The name of the development set (for save files)."""

    self.devset_index = -1
    """The index into the development set of the first instance
       that's currently being viewed."""

    self._last_keypress = 0
    """The time() when a key was most recently pressed"""

    self._history = []
    """A list of (grammar, precision, recall, fscore) tuples for
       grammars that the user has already tried."""

    self._history_index = 0
    """When the user is scrolling through previous grammars, this
       is used to keep track of which grammar they're looking at."""

    self._eval_grammar = None
    """The grammar that is being currently evaluated by the eval
       demon."""

    self._eval_normalized_grammar = None
    """A normalized copy of L{_eval_grammar}."""

    self._eval_index = 0
    """The index of the next sentence in the development set that
       should be looked at by the eval demon."""

    self._eval_score = nltk.chunk.ChunkScore(chunk_node=chunk_node)
    """The L{ChunkScore <nltk.chunk.ChunkScore>} object that's used
       to keep track of the score of the current grammar on the
       development set."""
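    # A minimal sketch of how this ChunkScore object is used further down
    # in this file (the tree names here are placeholders):
    #
    #     score = nltk.chunk.ChunkScore(chunk_node='NP')
    #     score.score(gold_tree, guess_tree)
    #     print score.precision(), score.recall(), score.f_measure()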

    # Set up the main window.
    top = self.top = Tk()
    top.geometry('+50+50')
    top.title('Regexp Chunk Parser Demo')
    top.bind('<Control-q>', self.destroy)

    # Variable controlling how much of the devset is used.
    self._devset_size = IntVar(top)
    self._devset_size.set(100)

    # Set up the Tkinter widgets.
    self._init_fonts(top)
    self._init_widgets(top)
    self._init_bindings(top)
    self._init_menubar(top)
    self.grammarbox.focus()

    # If a grammar was given, then display it.
    if grammar:
        self.grammarbox.insert('end', grammar+'\n')
        self.grammarbox.mark_set('insert', '1.0')

    # Display the first sentence of the development set, and evaluate.
    self.show_devset(0)
    self.update()

def _init_bindings(self, top):
    top.bind('<Control-n>', self._devset_next)
    top.bind('<Control-p>', self._devset_prev)
    top.bind('<Control-t>', self.toggle_show_trace)
    top.bind('<KeyPress>', self.update)
    top.bind('<Control-s>', lambda e: self.save_grammar())
    top.bind('<Control-o>', lambda e: self.load_grammar())
    self.grammarbox.bind('<Control-t>', self.toggle_show_trace)
    self.grammarbox.bind('<Control-n>', self._devset_next)
    self.grammarbox.bind('<Control-p>', self._devset_prev)

    # Redraw the eval graph when the window is resized.
    self.evalbox.bind('<Configure>', self._eval_plot)

def _init_fonts(self, top):
    self._size = IntVar(top)
    self._size.set(20)
    self._font = tkFont.Font(family='helvetica',
                             size=-self._size.get())
    self._smallfont = tkFont.Font(family='helvetica',
                                  size=-(self._size.get()*14/20))

def _init_menubar(self, parent):
    menubar = Menu(parent)

    filemenu = Menu(menubar, tearoff=0)
    filemenu.add_command(label='Reset Demo', underline=0,
                         command=self.reset)
    filemenu.add_command(label='Save Current Grammar', underline=0,
                         accelerator='Ctrl-s',
                         command=self.save_grammar)
    filemenu.add_command(label='Load Grammar', underline=0,
                         accelerator='Ctrl-o',
                         command=self.load_grammar)

    filemenu.add_command(label='Save Grammar History', underline=13,
                         command=self.save_history)

    filemenu.add_command(label='Exit', underline=1,
                         command=self.destroy, accelerator='Ctrl-q')
    menubar.add_cascade(label='File', underline=0, menu=filemenu)

    viewmenu = Menu(menubar, tearoff=0)
    viewmenu.add_radiobutton(label='Tiny', variable=self._size,
                             underline=0, value=10, command=self.resize)
    viewmenu.add_radiobutton(label='Small', variable=self._size,
                             underline=0, value=16, command=self.resize)
    viewmenu.add_radiobutton(label='Medium', variable=self._size,
                             underline=0, value=20, command=self.resize)
    viewmenu.add_radiobutton(label='Large', variable=self._size,
                             underline=0, value=24, command=self.resize)
    viewmenu.add_radiobutton(label='Huge', variable=self._size,
                             underline=0, value=34, command=self.resize)
    menubar.add_cascade(label='View', underline=0, menu=viewmenu)

    devsetmenu = Menu(menubar, tearoff=0)
    devsetmenu.add_radiobutton(label='50 sentences',
                               variable=self._devset_size,
                               value=50, command=self.set_devset_size)
    devsetmenu.add_radiobutton(label='100 sentences',
                               variable=self._devset_size,
                               value=100, command=self.set_devset_size)
    devsetmenu.add_radiobutton(label='200 sentences',
                               variable=self._devset_size,
                               value=200, command=self.set_devset_size)
    devsetmenu.add_radiobutton(label='500 sentences',
                               variable=self._devset_size,
                               value=500, command=self.set_devset_size)
    menubar.add_cascade(label='Development-Set', underline=0,
                        menu=devsetmenu)

    helpmenu = Menu(menubar, tearoff=0)
    helpmenu.add_command(label='About', underline=0,
                         command=self.about)
    menubar.add_cascade(label='Help', underline=0, menu=helpmenu)

    parent.config(menu=menubar)


_SCALE_N = 5          # number of recent grammars used when autoscaling the plot
_DRAW_LINES = False

def _eval_plot(self, *e, **config):
    width = config.get('width', self.evalbox.winfo_width())
    height = config.get('height', self.evalbox.winfo_height())

    # Clear the canvas.
    self.evalbox.delete('all')

    # Draw the axis labels, and work out where the plot area goes.
    tag = self.evalbox.create_text(10, height/2-10, justify='left',
                                   anchor='w', text='Precision')
    left, right = self.evalbox.bbox(tag)[2] + 5, width-10
    tag = self.evalbox.create_text(left + (width-left)/2, height-10,
                                   anchor='s', text='Recall',
                                   justify='center')
    top, bot = 10, self.evalbox.bbox(tag)[1]-10

    # Mask out the area outside the plot.
    bg = self._EVALBOX_PARAMS['background']
    self.evalbox.lower(self.evalbox.create_rectangle(0, 0, left-1, 5000,
                                                     fill=bg, outline=bg))
    self.evalbox.lower(self.evalbox.create_rectangle(0, bot+1, 5000, 5000,
                                                     fill=bg, outline=bg))

    # Decide on the plot's range, autoscaling to the most recent
    # grammars if requested.
    if self._autoscale.get() and len(self._history) > 1:
        max_precision = max_recall = 0
        min_precision = min_recall = 1
        for i in range(1, min(len(self._history), self._SCALE_N+1)):
            grammar, precision, recall, fmeasure = self._history[-i]
            min_precision = min(precision, min_precision)
            min_recall = min(recall, min_recall)
            max_precision = max(precision, max_precision)
            max_recall = max(recall, max_recall)
        min_precision = max(min_precision-.01, 0)
        min_recall = max(min_recall-.01, 0)
        max_precision = min(max_precision+.01, 1)
        max_recall = min(max_recall+.01, 1)
    else:
        min_precision = min_recall = 0
        max_precision = max_recall = 1

    # Draw the grid and the axes.
    for i in range(11):
        x = left + (right-left)*((i/10.-min_recall)/
                                 (max_recall-min_recall))
        y = bot - (bot-top)*((i/10.-min_precision)/
                             (max_precision-min_precision))
        if left < x < right:
            self.evalbox.create_line(x, top, x, bot, fill='#888')
        if top < y < bot:
            self.evalbox.create_line(left, y, right, y, fill='#888')
    self.evalbox.create_line(left, top, left, bot)
    self.evalbox.create_line(left, bot, right, bot)

    # Label the axis endpoints.
    self.evalbox.create_text(
        left-3, bot, justify='right', anchor='se',
        text='%d%%' % (100*min_precision))
    self.evalbox.create_text(
        left-3, top, justify='right', anchor='ne',
        text='%d%%' % (100*max_precision))
    self.evalbox.create_text(
        left, bot+3, justify='center', anchor='nw',
        text='%d%%' % (100*min_recall))
    self.evalbox.create_text(
        right, bot+3, justify='center', anchor='ne',
        text='%d%%' % (100*max_recall))

    # Plot one point per grammar in the history; highlight the one
    # that is currently selected.
    prev_x = prev_y = None
    for i, (_, precision, recall, fscore) in enumerate(self._history):
        x = left + (right-left) * ((recall-min_recall) /
                                   (max_recall-min_recall))
        y = bot - (bot-top) * ((precision-min_precision) /
                               (max_precision-min_precision))
        if i == self._history_index:
            self.evalbox.create_oval(x-2, y-2, x+2, y+2,
                                     fill='#0f0', outline='#000')
            self.status['text'] = (
                'Precision: %.2f%%\t' % (precision*100) +
                'Recall: %.2f%%\t' % (recall*100) +
                'F-score: %.2f%%' % (fscore*100))
        else:
            self.evalbox.lower(
                self.evalbox.create_oval(x-2, y-2, x+2, y+2,
                                         fill='#afa', outline='#8c8'))
        if prev_x is not None and self._eval_lines.get():
            self.evalbox.lower(
                self.evalbox.create_line(prev_x, prev_y, x, y,
                                         fill='#8c8'))
        prev_x, prev_y = x, y

_eval_demon_running = False

def _eval_demon(self):
    if self.top is None: return
    if self.chunker is None:
        self._eval_demon_running = False
        return

    # Note our starting time.
    t0 = time.time()

    # If the grammar changed very recently, then don't evaluate it
    # yet; wait until the user has stopped typing for a moment.
    if (time.time()-self._last_keypress < self._EVAL_DELAY and
        self.normalized_grammar != self._eval_normalized_grammar):
        self._eval_demon_running = True
        return self.top.after(int(self._EVAL_FREQ*1000), self._eval_demon)

    # If the grammar changed since the last evaluation started, then
    # restart the evaluation.
    if self.normalized_grammar != self._eval_normalized_grammar:
        # If this grammar was already evaluated, reuse the result
        # recorded in the history.
        for (g, p, r, f) in self._history:
            if self.normalized_grammar == self.normalize_grammar(g):
                self._history.append( (g, p, r, f) )
                self._history_index = len(self._history) - 1
                self._eval_plot()
                self._eval_demon_running = False
                self._eval_normalized_grammar = None
                return
        self._eval_index = 0
        self._eval_score = nltk.chunk.ChunkScore(
            chunk_node=self._chunk_node)
        self._eval_grammar = self.grammar
        self._eval_normalized_grammar = self.normalized_grammar

    # If the grammar is empty, then don't bother evaluating it.
    if self.normalized_grammar.strip() == '':
        self._eval_demon_running = False
        return

    # Score the next batch of sentences from the development set.
    for gold in self.devset[self._eval_index:
                            min(self._eval_index+self._EVAL_CHUNK,
                                self._devset_size.get())]:
        guess = self._chunkparse(gold.leaves())
        self._eval_score.score(gold, guess)

    # Update our index into the devset.
    self._eval_index += self._EVAL_CHUNK

    # Either record the finished result, or schedule the next batch.
    if self._eval_index >= self._devset_size.get():
        self._history.append( (self._eval_grammar,
                               self._eval_score.precision(),
                               self._eval_score.recall(),
                               self._eval_score.f_measure()) )
        self._history_index = len(self._history)-1
        self._eval_plot()
        self._eval_demon_running = False
        self._eval_normalized_grammar = None
    else:
        progress = 100*self._eval_index/self._devset_size.get()
        self.status['text'] = ('Evaluating on Development Set (%d%%)' %
                               progress)
        self._eval_demon_running = True
        self._adaptively_modify_eval_chunk(time.time() - t0)
        self.top.after(int(self._EVAL_FREQ*1000), self._eval_demon)


# (Widget construction, grammar normalization, and the adaptive eval-chunk
#  logic referenced above are omitted from this excerpt.)

_showing_trace = False

def show_trace(self, *e):
    self._showing_trace = True
    self.trace_button['state'] = 'disabled'
    self.devset_button['state'] = 'normal'

    self.devsetbox['state'] = 'normal'

    self.devsetbox.delete('1.0', 'end')
    self.devsetlabel['text'] = 'Development Set (%d/%d)' % (
        self.devset_index+1, self._devset_size.get())

    if self.chunker is None:
        self.devsetbox.insert('1.0', 'Trace: waiting for a valid grammar.')
        self.devsetbox.tag_add('error', '1.0', 'end')
        return

    gold_tree = self.devset[self.devset_index]
    rules = self.chunker.rules()

    # Record the tag sequence, and the column at which each word
    # starts, so that chunks can be colored by character position.
    tagseq = '\t'
    charnum = [1]
    for wordnum, (word, pos) in enumerate(gold_tree.leaves()):
        tagseq += '%s ' % pos
        charnum.append(len(tagseq))
    self.charnum = dict(((i, j), charnum[j])
                        for i in range(len(rules)+1)
                        for j in range(len(charnum)))
    self.linenum = dict((i, i*2+2) for i in range(len(rules)+1))

    # For each rule, show the tag sequence and mark gold vs. guessed chunks.
    for i in range(len(rules)+1):
        if i == 0:
            self.devsetbox.insert('end', 'Start:\n')
            self.devsetbox.tag_add('trace', 'end -2c linestart', 'end -2c')
        else:
            self.devsetbox.insert('end', 'Apply %s:\n' % rules[i-1])
            self.devsetbox.tag_add('trace', 'end -2c linestart', 'end -2c')

        self.devsetbox.insert('end', tagseq+'\n')
        self.devsetbox.tag_add('wrapindent', 'end -2c linestart', 'end -2c')

        chunker = nltk.RegexpChunkParser(rules[:i])
        test_tree = self._chunkparse(gold_tree.leaves())
        gold_chunks = self._chunks(gold_tree)
        test_chunks = self._chunks(test_tree)

        for chunk in gold_chunks.intersection(test_chunks):
            self._color_chunk(i, chunk, 'true-pos')
        for chunk in gold_chunks - test_chunks:
            self._color_chunk(i, chunk, 'false-neg')
        for chunk in test_chunks - gold_chunks:
            self._color_chunk(i, chunk, 'false-pos')
    self.devsetbox.insert('end', 'Finished.\n')
    self.devsetbox.tag_add('trace', 'end -2c linestart', 'end -2c')

    # Scroll the view to the right, past the leading tab.
    self.top.after(100, self.devset_xscroll.set, 0, .3)

def show_help(self, tab):
    self.helpbox['state'] = 'normal'
    self.helpbox.delete('1.0', 'end')
    for (name, tabstops, text) in self.HELP:
        if name == tab:
            text = text.replace('<<TAGSET>>', '\n'.join(
                ('\t%s\t%s' % item for item in sorted(self.tagset.items(),
                 key=lambda (t,w): re.match('\w+', t) and (0,t) or (1,t)))))

            self.helptabs[name].config(**self._HELPTAB_FG_PARAMS)
            self.helpbox.config(tabs=tabstops)
            self.helpbox.insert('1.0', text+'\n'*20)
            C = '1.0 + %d chars'
            for (tag, params) in self.HELP_AUTOTAG:
                pattern = '(?s)(<%s>)(.*?)(</%s>)' % (tag, tag)
                for m in re.finditer(pattern, text):
                    self.helpbox.tag_add('elide',
                                         C % m.start(1), C % m.end(1))
                    self.helpbox.tag_add('tag-%s' % tag,
                                         C % m.start(2), C % m.end(2))
                    self.helpbox.tag_add('elide',
                                         C % m.start(3), C % m.end(3))
        else:
            self.helptabs[name].config(**self._HELPTAB_BG_PARAMS)
    self.helpbox['state'] = 'disabled'

def _history_prev(self, *e):
    self._view_history(self._history_index-1)
    return 'break'

def _history_next(self, *e):
    self._view_history(self._history_index+1)
    return 'break'

def _view_history(self, index):
    # Bounds & sanity checking:
    index = max(0, min(len(self._history)-1, index))
    if not self._history: return
    # Already viewing the requested history item?
    if index == self._history_index:
        return

    # Show the requested grammar, and rebuild the chunker from it.
    self.grammarbox['state'] = 'normal'
    self.grammarbox.delete('1.0', 'end')
    self.grammarbox.insert('end', self._history[index][0])
    self.grammarbox.mark_set('insert', '1.0')
    self._history_index = index
    self._syntax_highlight_grammar(self._history[index][0])

    self.normalized_grammar = self.normalize_grammar(
        self._history[index][0])
    if self.normalized_grammar:
        rules = [nltk.chunk.regexp.RegexpChunkRule.parse(line)
                 for line in self.normalized_grammar.split('\n')]
    else:
        rules = []
    self.chunker = nltk.RegexpChunkParser(rules)

    # Show the score and the devset for the selected grammar.
    self._eval_plot()
    self._highlight_devset()
    if self._showing_trace: self.show_trace()

    # Update the grammar label.
    if self._history_index < len(self._history)-1:
        self.grammarlabel['text'] = 'Grammar %s/%s:' % (
            self._history_index+1, len(self._history))
    else:
        self.grammarlabel['text'] = 'Grammar:'


def destroy(self, *e):
    if self.top is None: return
    self.top.destroy()
    self.top = None


def show_devset(self, index=None):
    if index is None: index = self.devset_index

    # Bounds checking.
    index = min(max(0, index), self._devset_size.get()-1)

    if index == self.devset_index and not self._showing_trace: return
    self.devset_index = index

    self._showing_trace = False
    self.trace_button['state'] = 'normal'
    self.devset_button['state'] = 'disabled'

    # Clear the text box.
    self.devsetbox['state'] = 'normal'
    self.devsetbox['wrap'] = 'word'
    self.devsetbox.delete('1.0', 'end')
    self.devsetlabel['text'] = 'Development Set (%d/%d)' % (
        self.devset_index+1, self._devset_size.get())

    # Add the sentences, remembering the character index at which
    # each word starts (so chunks can be highlighted later).
    sample = self.devset[self.devset_index:self.devset_index+1]
    self.charnum = {}
    self.linenum = {0: 1}
    for sentnum, sent in enumerate(sample):
        linestr = ''
        for wordnum, (word, pos) in enumerate(sent.leaves()):
            self.charnum[sentnum, wordnum] = len(linestr)
            linestr += '%s/%s ' % (word, pos)
            self.charnum[sentnum, wordnum+1] = len(linestr)
        self.devsetbox.insert('end', linestr[:-1]+'\n\n')

    # Highlight chunks in the dev set.
    if self.chunker is not None:
        self._highlight_devset()
    self.devsetbox['state'] = 'disabled'

    # Update the scrollbar.
    first = float(self.devset_index)/self._devset_size.get()
    last = float(self.devset_index+2)/self._devset_size.get()
    self.devset_scroll.set(first, last)

def _chunks(self, tree):
    chunks = set()
    wordnum = 0
    for child in tree:
        if isinstance(child, nltk.Tree):
            if child.node == self._chunk_node:
                chunks.add( (wordnum, wordnum+len(child)) )
            wordnum += len(child)
        else:
            wordnum += 1
    return chunks

def _syntax_highlight_grammar(self, grammar):
    if self.top is None: return
    self.grammarbox.tag_remove('comment', '1.0', 'end')
    self.grammarbox.tag_remove('angle', '1.0', 'end')
    self.grammarbox.tag_remove('brace', '1.0', 'end')
    self.grammarbox.tag_add('hangindent', '1.0', 'end')
    for lineno, line in enumerate(grammar.split('\n')):
        if not line.strip(): continue
        m = re.match(r'(\\.|[^#])*(#.*)?', line)
        comment_start = None
        if m.group(2):
            comment_start = m.start(2)
            s = '%d.%d' % (lineno+1, m.start(2))
            e = '%d.%d' % (lineno+1, m.end(2))
            self.grammarbox.tag_add('comment', s, e)
        for m in re.finditer('[<>{}]', line):
            if comment_start is not None and m.start() >= comment_start:
                break
            s = '%d.%d' % (lineno+1, m.start())
            e = '%d.%d' % (lineno+1, m.end())
            if m.group() in '<>':
                self.grammarbox.tag_add('angle', s, e)
            else:
                self.grammarbox.tag_add('brace', s, e)

def _grammarcheck(self, grammar):
    if self.top is None: return
    self.grammarbox.tag_remove('error', '1.0', 'end')
    self._grammarcheck_errs = []
    for lineno, line in enumerate(grammar.split('\n')):
        line = re.sub(r'((\\.|[^#])*)(#.*)?', r'\1', line)
        line = line.strip()
        if line:
            try: nltk.chunk.regexp.RegexpChunkRule.parse(line)
            except ValueError, e:
                self.grammarbox.tag_add('error', '%s.0' % (lineno+1),
                                        '%s.0 lineend' % (lineno+1))
    self.status['text'] = ''


def _highlight_devset(self, sample=None):
    if sample is None:
        sample = self.devset[self.devset_index:self.devset_index+1]

    self.devsetbox.tag_remove('true-pos', '1.0', 'end')
    self.devsetbox.tag_remove('false-neg', '1.0', 'end')
    self.devsetbox.tag_remove('false-pos', '1.0', 'end')

    # Run the grammar on the sample and color each chunk by comparing
    # it against the gold-standard chunks.
    for sentnum, gold_tree in enumerate(sample):
        test_tree = self._chunkparse(gold_tree.leaves())

        gold_chunks = self._chunks(gold_tree)
        test_chunks = self._chunks(test_tree)

        for chunk in gold_chunks.intersection(test_chunks):
            self._color_chunk(sentnum, chunk, 'true-pos')
        for chunk in gold_chunks - test_chunks:
            self._color_chunk(sentnum, chunk, 'false-neg')
        for chunk in test_chunks - gold_chunks:
            self._color_chunk(sentnum, chunk, 'false-pos')

def _chunkparse(self, words):
    try:
        return self.chunker.parse(words)
    except (ValueError, IndexError), e:
        # There's an error somewhere in the grammar, but we're not sure
        # exactly where, so just mark the whole grammar as bad.
        self.grammarbox.tag_add('error', '1.0', 'end')
        # Fall back to returning the unchunked word list.
        return words

def _color_chunk(self, sentnum, chunk, tag):
    start, end = chunk
    self.devsetbox.tag_add(tag,
        '%s.%s' % (self.linenum[sentnum], self.charnum[sentnum, start]),
        '%s.%s' % (self.linenum[sentnum], self.charnum[sentnum, end]-1))

def reset(self):
    # Clear the grammar and our history of grammars.
    self.chunker = None
    self.grammar = None
    self.normalized_grammar = None
    self.grammar_changed = 0
    self._history = []
    self._history_index = 0

    self.grammarbox.delete('1.0', 'end')
    self.show_devset(0)
    self.update()

SAVE_GRAMMAR_TEMPLATE = (
    '# Regexp Chunk Parsing Grammar\n'
    '# Saved %(date)s\n'
    '#\n'
    '# Development set: %(devset)s\n'
    '# Precision: %(precision)s\n'
    '# Recall: %(recall)s\n'
    '# F-score: %(fscore)s\n\n'
    '%(grammar)s\n')
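
# For reference, save_grammar() below renders this template into a plain
# text file whose header looks roughly like (values are illustrative):
#
#     # Regexp Chunk Parsing Grammar
#     # Saved <output of time.ctime()>
#     #
#     # Development set: conll2000
#     # Precision: 83.21%
#     # Recall: 78.90%
#     # F-score: 80.99%
#
# and load_grammar() strips everything up through the "F-score:" line when
# such a file is reopened.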

def save_grammar(self, filename=None):
    if not filename:
        ftypes = [('Chunk Grammar', '.chunk'),
                  ('All files', '*')]
        filename = asksaveasfilename(filetypes=ftypes,
                                     defaultextension='.chunk')
        if not filename: return
    if (self._history and self.normalized_grammar ==
        self.normalize_grammar(self._history[-1][0])):
        precision, recall, fscore = ['%.2f%%' % (100*v) for v in
                                     self._history[-1][1:]]
    elif self.chunker is None:
        precision = recall = fscore = 'Grammar not well formed'
    else:
        precision = recall = fscore = 'Not finished evaluation yet'

    out = open(filename, 'w')
    out.write(self.SAVE_GRAMMAR_TEMPLATE % dict(
        date=time.ctime(), devset=self.devset_name,
        precision=precision, recall=recall, fscore=fscore,
        grammar=self.grammar.strip()))
    out.close()

def load_grammar(self, filename=None):
    if not filename:
        ftypes = [('Chunk Grammar', '.chunk'),
                  ('All files', '*')]
        filename = askopenfilename(filetypes=ftypes,
                                   defaultextension='.chunk')
        if not filename: return
    self.grammarbox.delete('1.0', 'end')
    self.update()
    grammar = open(filename).read()
    grammar = re.sub('^\# Regexp Chunk Parsing Grammar[\s\S]*'
                     'F-score:.*\n', '', grammar).lstrip()
    self.grammarbox.insert('1.0', grammar)
    self.update()

def save_history(self, filename=None):
    if not filename:
        ftypes = [('Chunk Grammar History', '.txt'),
                  ('All files', '*')]
        filename = asksaveasfilename(filetypes=ftypes,
                                     defaultextension='.txt')
        if not filename: return

    out = open(filename, 'w')
    out.write('# Regexp Chunk Parsing Grammar History\n')
    out.write('# Saved %s\n' % time.ctime())
    out.write('# Development set: %s\n' % self.devset_name)
    for i, (g, p, r, f) in enumerate(self._history):
        hdr = ('Grammar %d/%d (precision=%.2f%%, recall=%.2f%%, '
               'fscore=%.2f%%)' % (i+1, len(self._history),
                                   p*100, r*100, f*100))
        out.write('\n%s\n' % hdr)
        out.write(''.join(' %s\n' % line for line in g.strip().split('\n')))

    # Also record the current grammar, if it hasn't been evaluated yet.
    if not (self._history and self.normalized_grammar ==
            self.normalize_grammar(self._history[-1][0])):
        if self.chunker is None:
            out.write('\nCurrent Grammar (not well-formed)\n')
        else:
            out.write('\nCurrent Grammar (not evaluated)\n')
        out.write(''.join(' %s\n' % line for line
                          in self.grammar.strip().split('\n')))
    out.close()

def about(self, *e):
    ABOUT = ("NLTK RegExp Chunk Parser Demo\n" +
             "Written by Edward Loper")
    TITLE = 'About: Regular Expression Chunk Parser Demo'
    try:
        from tkMessageBox import Message
        Message(message=ABOUT, title=TITLE).show()
    except:
        ShowText(self.top, TITLE, ABOUT)


def resize(self, size=None):
    # (body not included in this excerpt)

def mainloop(self, *args, **kwargs):
    """
    Enter the Tkinter mainloop.  This function must be called if
    this demo is created from a non-interactive program (e.g.
    from a script); otherwise, the demo will close as soon as
    the script completes.
    """
    if in_idle(): return
    self.top.mainloop(*args, **kwargs)


if __name__ == '__main__':
    demo()