Package nltk :: Package draw :: Module rechunkparser
[hide private]
[frames | no frames]

Source Code for Module nltk.draw.rechunkparser

   1  # Natural Language Toolkit: Regexp Chunk Parser Demo 
   2  # 
   3  # Copyright (C) 2008 NLTK Project 
   4  # Author: Edward Loper <[email protected]> 
   5  # URL: <http://nltk.org> 
   6  # For license information, see LICENSE.TXT 
   7  # 
   8  # $Id: srparser.py 5609 2007-12-31 03:02:41Z stevenbird $ 
   9   
  10  """ 
  11  A graphical tool for exploring the regular expression based chunk 
  12  parser (L{RegexpChunkParser<nltk.chunk.regex.RegexpChunkParser>}). 
  13   
  14  @todo: Add a way to select the development set from the menubar.  This 
  15      might just need to be a selection box (conll vs treebank etc) plus 
  16      configuration parameters to select what's being chunked (eg VP vs NP) 
  17      and what part of the data is being used as the development set. 
  18  """ 
  19   
  20  from Tkinter import * 
  21  from tkFileDialog import asksaveasfilename, askopenfilename 
  22  import tkFont 
  23  import time 
  24  import textwrap 
  25  import re 
  26  import random 
  27   
  28  import nltk 
  29   
  30  from nltk.draw import * 
  31   
32 -class RegexpChunkDemo(object):
"""
A graphical tool for exploring the regular expression based chunk
parser (L{RegexpChunkParser<nltk.chunk.regex.RegexpChunkParser>}).

See L{HELP} for instructional text.
"""

##/////////////////////////////////////////////////////////////////
##  Help Text
##/////////////////////////////////////////////////////////////////

#: A dictionary mapping from part of speech tags to descriptions,
#: which is used in the help text.  (This should probably live with
#: the conll and/or treebank corpus instead.)
TAGSET = {
    'CC':   'Coordinating conjunction',
    'CD':   'Cardinal number',
    'DT':   'Determiner',
    'EX':   'Existential there',
    'FW':   'Foreign word',
    'IN':   'Preposition/subord. conjunction',
    'JJ':   'Adjective',
    'JJR':  'Adjective, comparative',
    'JJS':  'Adjective, superlative',
    'LS':   'List item marker',
    'MD':   'Modal',
    'NN':   'Noun, singular or mass',
    'NNS':  'Noun, plural',
    'NNP':  'Proper noun, singular',
    'NNPS': 'Proper noun plural',
    'PDT':  'Predeterminer',
    'POS':  'Possessive ending',
    'PRP':  'Personal pronoun',
    'PRP$': 'Possessive pronoun',
    'RB':   'Adverb',
    'RBR':  'Adverb, comparative',
    'RBS':  'Adverb, superlative',
    'RP':   'Particle',
    'SYM':  'Symbol (mathematical or scientific)',
    'TO':   'to',
    'UH':   'Interjection',
    'VB':   'Verb, base form',
    'VBD':  'Verb, past tense',
    'VBG':  'Verb, gerund/present participle',
    'VBN':  'Verb, past participle',
    'VBP':  'Verb, non-3rd ps. sing. present',
    'VBZ':  'Verb, 3rd ps. sing. present',
    'WDT':  'wh-determiner',
    'WP':   'wh-pronoun',
    'WP$':  'Possessive wh-pronoun',
    'WRB':  'wh-adverb',
    '(':    'open parenthesis',
    ')':    'close parenthesis',
    '``':   'open quote',
    "''":   'close quote',
    ',':    'comma',
    '.':    'period',
    ':':    'colon',
    '#':    'pound sign (currency marker)',
    '$':    'dollar sign (currency marker)',
    }

#: Contents for the help box.  This is a list of tuples, one for
#: each help page, where each tuple has three elements:
#:   - A title (displayed as a tab)
#:   - A string description of tabstops (see Tkinter.Text for details)
#:   - The text contents for the help page.  You can use expressions
#:     like <red>...</red> to colorize the text; see L{HELP_AUTOTAG}
#:     for a list of tags you can use for colorizing.
HELP = [
    ('Help', '20',
     "Welcome to the regular expression chunk-parser grammar editor. "
     "You can use this editor to develop and test chunk parser grammars "
     "based on NLTK's RegexpChunkParser class.\n\n"
     # Help box.
     "Use this box ('Help') to learn more about the editor; click on the "
     "tabs for help on specific topics:"
     "<indent>\n"
     "Rules: grammar rule types\n"
     "Regexps: regular expression syntax\n"
     "Tags: part of speech tags\n</indent>\n"
     # Grammar.
     "Use the upper-left box ('Grammar') to edit your grammar. "
     "Each line of your grammar specifies a single 'rule', "
     "which performs an action such as creating a chunk or merging "
     "two chunks.\n\n"
     # Dev set.
     "The lower-left box ('Development Set') runs your grammar on the "
     "development set, and displays the results. "
     "Your grammar's chunks are <highlight>highlighted</highlight>, and "
     "the correct (gold standard) chunks are "
     "<underline>underlined</underline>. If they "
     "match, they are displayed in <green>green</green>; otherwise, "
     "they are displayed in <red>red</red>. The box displays a single "
     "sentence from the development set at a time; use the scrollbar or "
     "the next/previous buttons to view additional sentences.\n\n"
     # Performance
     "The lower-right box ('Evaluation') tracks the performance of "
     "your grammar on the development set. The 'precision' axis "
     "indicates how many of your grammar's chunks are correct; and "
     "the 'recall' axis indicates how many of the gold standard "
     "chunks your system generated. Typically, you should try to "
     "design a grammar that scores high on both metrics. The "
     "exact precision and recall of the current grammar, as well "
     "as their harmonic mean (the 'f-score'), are displayed in "
     "the status bar at the bottom of the window."
     ),
    ('Rules', '10',
     "<h1>{...regexp...}</h1>"
     "<indent>\nChunk rule: creates new chunks from words matching "
     "regexp.</indent>\n\n"
     "<h1>}...regexp...{</h1>"
     "<indent>\nChink rule: removes words matching regexp from existing "
     "chunks.</indent>\n\n"
     "<h1>...regexp1...}{...regexp2...</h1>"
     "<indent>\nSplit rule: splits chunks that match regexp1 followed by "
     "regexp2 in two.</indent>\n\n"
     "<h1>...regexp1...{}...regexp2...</h1>"
     "<indent>\nMerge rule: joins consecutive chunks that match regexp1 "
     "and regexp2</indent>\n"
     ),
    ('Regexps', '10 60',
     #"Regular Expression Syntax Summary:\n\n"
     "<h1>Pattern\t\tMatches...</h1>\n"
     "<hangindent>"
     "\t<<var>T</var>>\ta word with tag <var>T</var> "
     "(where <var>T</var> may be a regexp).\n"
     "\t<var>x</var>?\tan optional <var>x</var>\n"
     "\t<var>x</var>+\ta sequence of 1 or more <var>x</var>'s\n"
     "\t<var>x</var>*\ta sequence of 0 or more <var>x</var>'s\n"
     "\t<var>x</var>|<var>y</var>\t<var>x</var> or <var>y</var>\n"
     "\t.\tmatches any character\n"
     "\t(<var>x</var>)\tTreats <var>x</var> as a group\n"
     "\t# <var>x...</var>\tTreats <var>x...</var> "
     "(to the end of the line) as a comment\n"
     "\t\\<var>C</var>\tmatches character <var>C</var> "
     "(useful when <var>C</var> is a special character "
     "like + or #)\n"
     "</hangindent>"
     "\n<h1>Examples:</h1>\n"
     "<hangindent>"
     '\t<regexp><NN></regexp>\n'
     '\t\tMatches <match>"cow/NN"</match>\n'
     '\t\tMatches <match>"green/NN"</match>\n'
     '\t<regexp><VB.*></regexp>\n'
     '\t\tMatches <match>"eating/VBG"</match>\n'
     '\t\tMatches <match>"ate/VBD"</match>\n'
     '\t<regexp><IN><DT><NN></regexp>\n'
     '\t\tMatches <match>"on/IN the/DT car/NN"</match>\n'
     '\t<regexp><RB>?<VBD></regexp>\n'
     '\t\tMatches <match>"ran/VBD"</match>\n'
     '\t\tMatches <match>"slowly/RB ate/VBD"</match>\n'
     # The backslash is doubled so the help text shows a literal "\#".
     '\t<regexp><\\#><CD> # This is a comment...</regexp>\n'
     '\t\tMatches <match>"#/# 100/CD"</match>\n'
     "</hangindent>"
     ),
    ('Tags', '10 60',
     "<h1>Part of Speech Tags:</h1>\n" +
     '<hangindent>' +
     '<<TAGSET>>' + # this gets auto-substituted w/ self.TAGSET
     '</hangindent>\n')
    ]

#: The markup tags that may appear in L{HELP} text, each paired with
#: the Tkinter text-tag options used to render it.
HELP_AUTOTAG = [
    ('red', dict(foreground='#a00')),
    ('green', dict(foreground='#080')),
    ('highlight', dict(background='#ddd')),
    ('underline', dict(underline=True)),
    ('h1', dict(underline=True)),
    ('indent', dict(lmargin1=20, lmargin2=20)),
    ('hangindent', dict(lmargin1=0, lmargin2=60)),
    ('var', dict(foreground='#88f')),
    ('regexp', dict(foreground='#ba7')),
    ('match', dict(foreground='#6a6')),
    ]

##/////////////////////////////////////////////////////////////////
##  Config Parameters
##/////////////////////////////////////////////////////////////////

#: If the user has not pressed any key for this amount of time (in
#: seconds), and the current grammar has not been evaluated, then
#: the eval demon will evaluate it.
_EVAL_DELAY = 1

#: The number of sentences that should be evaluated by the eval
#: demon each time it runs.
_EVAL_CHUNK = 15

#: The frequency (in seconds) at which the eval demon is run.
_EVAL_FREQ = 0.2

#: The minimum amount of time that the eval demon should take each time
#: it runs -- if it takes less than this time, _EVAL_CHUNK will be
#: modified upwards.
_EVAL_DEMON_MIN = .02

#: The maximum amount of time that the eval demon should take each time
#: it runs -- if it takes more than this time, _EVAL_CHUNK will be
#: modified downwards.
_EVAL_DEMON_MAX = .04

# Keyword options for the individual widgets.
_GRAMMARBOX_PARAMS = dict(
    width=40, height=12, background='#efe', highlightbackground='#efe',
    highlightthickness=1, relief='groove', border=2, wrap='word')
_HELPBOX_PARAMS = dict(
    width=15, height=15, background='#efe', highlightbackground='#efe',
    foreground='#555',
    highlightthickness=1, relief='groove', border=2, wrap='word')
_DEVSETBOX_PARAMS = dict(
    width=70, height=10, background='#eef', highlightbackground='#eef',
    highlightthickness=1, relief='groove', border=2, wrap='word',
    tabs=(30,))
_STATUS_PARAMS = dict(
    background='#9bb', relief='groove', border=2)
_FONT_PARAMS = dict(
    family='helvetica', size=-20)
_FRAME_PARAMS = dict(
    background='#777', padx=2, pady=2, border=3)
_EVALBOX_PARAMS = dict(
    background='#eef', highlightbackground='#eef',
    highlightthickness=1, relief='groove', border=2,
    width=300, height=280)
_BUTTON_PARAMS = dict(
    background='#777', activebackground='#777',
    highlightbackground='#777')
_HELPTAB_BG_COLOR = '#aba'
_HELPTAB_FG_COLOR = '#efe'

# Options applied to the selected (FG) and unselected (BG) help tabs.
_HELPTAB_FG_PARAMS = dict(background='#efe')
_HELPTAB_BG_PARAMS = dict(background='#aba')
_HELPTAB_SPACER = 6
def normalize_grammar(self, grammar):
    """
    Return a canonical form of a grammar string, so that two grammars
    that differ only in comments or whitespace compare equal.

    @param grammar: The grammar text, one rule per line.
    @return: The grammar with comments removed, runs of spaces
        collapsed, blank lines and whitespace around line breaks
        dropped, and every unescaped C{$} backslash-escaped.
    """
    # Strip comments.  A backslash-escaped '#' is not a comment start.
    grammar = re.sub(r'((\\.|[^#])*)(#.*)?', r'\1', grammar)
    # Normalize whitespace.  Blanks before a line break are removed as
    # well, so "rule  # comment" and "rule" normalize identically.
    grammar = re.sub(' +', ' ', grammar)
    grammar = re.sub(r'[ \t]*\n\s*', '\n', grammar)
    grammar = grammar.strip()
    # [xx] Hack: automatically backslash $!  A lookbehind is used so
    # that a '$' at the very start, or directly after another '$', is
    # escaped too.
    grammar = re.sub(r'(?<!\\)\$', r'\\$', grammar)
    return grammar
254
def __init__(self, devset_name='conll2000', devset=None,
             grammar = '', chunk_node='NP', tagset=None):
    """
    Build the demo window and show the first development sentence.

    @param devset_name: The name of the development set; used for
        display & for save files.  If either the name 'treebank'
        or the name 'conll2000' is used, and devset is None, then
        devset will be set automatically.
    @param devset: A list of chunked sentences
    @param grammar: The initial grammar to display.
    @param chunk_node: The chunk label passed to the
        L{ChunkScore <nltk.chunk.ChunkScore>} used for evaluation.
    @param tagset: Dictionary from tags to string descriptions, used
        for the help page.  Defaults to C{self.TAGSET}.
    @raise ValueError: If devset is None and devset_name is not one
        of the known names.
    """
    self._chunk_node = chunk_node

    if tagset is None: tagset = self.TAGSET
    self.tagset = tagset

    # Named development sets.  The name (not the still-None devset)
    # selects the corpus.
    if devset is None:
        if devset_name == 'conll2000':
            devset = nltk.corpus.conll2000.chunked_sents('train.txt')#[:100]
        elif devset_name == 'treebank':
            devset = nltk.corpus.treebank_chunk.chunked_sents()#[:100]
        else:
            raise ValueError('Unknown development set %s' % devset_name)

    #: The chunker built from the grammar string
    self.chunker = None

    #: The unparsed grammar string
    self.grammar = grammar

    #: A normalized version of L{self.grammar}.
    self.normalized_grammar = None

    #: The last time() that the grammar was changed.
    self.grammar_changed = 0

    #: The development set -- a list of chunked sentences.
    self.devset = devset

    #: The name of the development set (for save files).
    self.devset_name = devset_name

    #: The index into the development set of the first instance
    #: that's currently being viewed.
    self.devset_index = -1

    #: The time() when a key was most recently pressed
    self._last_keypress = 0

    #: A list of (grammar, precision, recall, fscore) tuples for
    #: grammars that the user has already tried.
    self._history = []

    #: When the user is scrolling through previous grammars, this
    #: is used to keep track of which grammar they're looking at.
    self._history_index = 0

    #: The grammar that is being currently evaluated by the eval
    #: demon.
    self._eval_grammar = None

    #: A normalized copy of L{_eval_grammar}.
    self._eval_normalized_grammar = None

    #: The index of the next sentence in the development set that
    #: should be looked at by the eval demon.
    self._eval_index = 0

    #: The L{ChunkScore <nltk.chunk.ChunkScore>} object that's used
    #: to keep track of the score of the current grammar on the
    #: development set.
    self._eval_score = nltk.chunk.ChunkScore(chunk_node=chunk_node)

    # Set up the main window.
    top = self.top = Tk()
    top.geometry('+50+50')
    top.title('Regexp Chunk Parser Demo')
    top.bind('<Control-q>', self.destroy)

    # Variable that restricts how much of the devset we look at.
    self._devset_size = IntVar(top)
    self._devset_size.set(100)

    # Set up all the tkinter widgets
    self._init_fonts(top)
    self._init_widgets(top)
    self._init_bindings(top)
    self._init_menubar(top)
    self.grammarbox.focus()

    # If a grammar was given, then display it.
    if grammar:
        self.grammarbox.insert('end', grammar+'\n')
        self.grammarbox.mark_set('insert', '1.0')

    # Display the first item in the development set
    self.show_devset(0)
    self.update()
356
def _init_bindings(self, top):
    """
    Install the keyboard shortcuts and the eval-plot resize hook.

    @param top: The top-level window that receives the global bindings.
    """
    window_bindings = (
        ('<Control-n>', self._devset_next),
        ('<Control-p>', self._devset_prev),
        ('<Control-t>', self.toggle_show_trace),
        ('<KeyPress>', self.update),
        ('<Control-s>', lambda e: self.save_grammar()),
        ('<Control-o>', lambda e: self.load_grammar()),
        )
    for sequence, handler in window_bindings:
        top.bind(sequence, handler)

    # The grammar box gets its own copies of the navigation shortcuts.
    grammarbox_bindings = (
        ('<Control-t>', self.toggle_show_trace),
        ('<Control-n>', self._devset_next),
        ('<Control-p>', self._devset_prev),
        )
    for sequence, handler in grammarbox_bindings:
        self.grammarbox.bind(sequence, handler)

    # Redraw the eval graph when the window size changes
    self.evalbox.bind('<Configure>', self._eval_plot)
370
def _init_fonts(self, top):
    """
    Create the font-size variable and the two fonts derived from it.

    @param top: The top-level window that owns the size variable.
    """
    # What's our font size (default=same as sysfont)
    size_var = IntVar(top)
    size_var.set(20)
    self._size = size_var

    # Negative sizes are passed straight to tkFont; the small font is
    # 14/20 of the main one.
    main_size = -size_var.get()
    self._font = tkFont.Font(family='helvetica', size=main_size)
    small_size = -(size_var.get()*14/20)
    self._smallfont = tkFont.Font(family='helvetica', size=small_size)
379
def _init_menubar(self, parent):
    """
    Build the File / View / Development-Set / Help menus and attach
    them to the window.

    @param parent: The window that receives the menu bar.
    """
    menubar = Menu(parent)

    # File menu.
    filemenu = Menu(menubar, tearoff=0)
    filemenu.add_command(label='Reset Demo', underline=0,
                         command=self.reset)
    filemenu.add_command(label='Save Current Grammar', underline=0,
                         accelerator='Ctrl-s',
                         command=self.save_grammar)
    filemenu.add_command(label='Load Grammar', underline=0,
                         accelerator='Ctrl-o',
                         command=self.load_grammar)
    filemenu.add_command(label='Save Grammar History', underline=13,
                         command=self.save_history)
    filemenu.add_command(label='Exit', underline=1,
                         command=self.destroy, accelerator='Ctrl-q')
    menubar.add_cascade(label='File', underline=0, menu=filemenu)

    # View menu: one radio button per font size.
    font_sizes = (('Tiny', 10), ('Small', 16), ('Medium', 20),
                  ('Large', 24), ('Huge', 34))
    viewmenu = Menu(menubar, tearoff=0)
    for caption, points in font_sizes:
        viewmenu.add_radiobutton(label=caption, variable=self._size,
                                 underline=0, value=points,
                                 command=self.resize)
    menubar.add_cascade(label='View', underline=0, menu=viewmenu)

    # Development-Set menu: one radio button per devset size.
    devset_sizes = (('50 sentences', 50), ('100 sentences', 100),
                    ('200 sentences', 200), ('500 sentences', 500))
    devsetmenu = Menu(menubar, tearoff=0)
    for caption, count in devset_sizes:
        devsetmenu.add_radiobutton(label=caption,
                                   variable=self._devset_size,
                                   value=count,
                                   command=self.set_devset_size)
    menubar.add_cascade(label='Development-Set', underline=0,
                        menu=devsetmenu)

    # Help menu.
    helpmenu = Menu(menubar, tearoff=0)
    helpmenu.add_command(label='About', underline=0,
                         command=self.about)
    menubar.add_cascade(label='Help', underline=0, menu=helpmenu)

    parent.config(menu=menubar)
435
def toggle_show_trace(self, *e):
    """
    Switch the development-set box between the example view and the
    trace view.

    @param e: Ignored; present so the method can serve as an event
        handler.
    @return: C{'break'}
    """
    if self._showing_trace:
        switch_view = self.show_devset
    else:
        switch_view = self.show_trace
    switch_view()
    return 'break'


_SCALE_N = 5 # center on the last 5 examples.
_DRAW_LINES = False
def _eval_plot(self, *e, **config):
    """
    Redraw the precision/recall plot of every grammar in the history,
    and show the scores of the currently selected one in the status bar.

    @param e: Ignored; present so the method can serve as an event
        handler.
    @param config: Optional C{width} and C{height} overrides; the
        canvas's own size is used for any that are missing.
    """
    width = config.get('width', self.evalbox.winfo_width())
    height = config.get('height', self.evalbox.winfo_height())

    # Clear the canvas
    self.evalbox.delete('all')

    # Draw the precision & recall labels.  Their bounding boxes fix
    # the left and bottom edges of the plot area.
    tag = self.evalbox.create_text(10, height/2-10, justify='left',
                                   anchor='w', text='Precision')
    left, right = self.evalbox.bbox(tag)[2] + 5, width-10
    tag = self.evalbox.create_text(left + (width-left)/2, height-10,
                                   anchor='s', text='Recall',
                                   justify='center')
    top, bot = 10, self.evalbox.bbox(tag)[1]-10

    # Draw masks for clipping the plot.
    bg = self._EVALBOX_PARAMS['background']
    self.evalbox.lower(self.evalbox.create_rectangle(0, 0, left-1, 5000,
                                                     fill=bg, outline=bg))
    self.evalbox.lower(self.evalbox.create_rectangle(0, bot+1, 5000, 5000,
                                                     fill=bg, outline=bg))

    # Calculate the plot's scale.
    if self._autoscale.get() and len(self._history) > 1:
        # Zoom in on the last _SCALE_N entries.  Slicing covers all of
        # them even when the history is shorter than _SCALE_N.
        recent = self._history[-self._SCALE_N:]
        max_precision = max_recall = 0
        min_precision = min_recall = 1
        for _, precision, recall, _ in recent:
            min_precision = min(precision, min_precision)
            min_recall = min(recall, min_recall)
            max_precision = max(precision, max_precision)
            max_recall = max(recall, max_recall)
        # Pad by one percentage point (clamped to [0, 1]); this also
        # keeps each range non-empty when all the scores are equal.
        min_precision = max(min_precision-.01, 0)
        min_recall = max(min_recall-.01, 0)
        max_precision = min(max_precision+.01, 1)
        max_recall = min(max_recall+.01, 1)
    else:
        min_precision = min_recall = 0
        max_precision = max_recall = 1

    # Draw the axis lines & grid lines
    for i in range(11):
        x = left + (right-left)*((i/10.-min_recall)/
                                 (max_recall-min_recall))
        y = bot - (bot-top)*((i/10.-min_precision)/
                             (max_precision-min_precision))
        if left < x < right:
            self.evalbox.create_line(x, top, x, bot, fill='#888')
        if top < y < bot:
            self.evalbox.create_line(left, y, right, y, fill='#888')
    self.evalbox.create_line(left, top, left, bot)
    self.evalbox.create_line(left, bot, right, bot)

    # Display the plot's scale
    self.evalbox.create_text(
        left-3, bot, justify='right', anchor='se',
        text='%d%%' % (100*min_precision))
    self.evalbox.create_text(
        left-3, top, justify='right', anchor='ne',
        text='%d%%' % (100*max_precision))
    self.evalbox.create_text(
        left, bot+3, justify='center', anchor='nw',
        text='%d%%' % (100*min_recall))
    self.evalbox.create_text(
        right, bot+3, justify='center', anchor='ne',
        text='%d%%' % (100*max_recall))

    # Display the scores.
    prev_x = prev_y = None
    for i, (_, precision, recall, fscore) in enumerate(self._history):
        x = left + (right-left) * ((recall-min_recall) /
                                   (max_recall-min_recall))
        y = bot - (bot-top) * ((precision-min_precision) /
                               (max_precision-min_precision))
        if i == self._history_index:
            # The selected grammar is drawn on top, and its scores go
            # to the status bar.
            self.evalbox.create_oval(x-2,y-2,x+2,y+2,
                                     fill='#0f0', outline='#000')
            self.status['text'] = (
                'Precision: %.2f%%\t' % (precision*100)+
                'Recall: %.2f%%\t' % (recall*100)+
                'F-score: %.2f%%' % (fscore*100))
        else:
            self.evalbox.lower(
                self.evalbox.create_oval(x-2,y-2,x+2,y+2,
                                         fill='#afa', outline='#8c8'))
        if prev_x is not None and self._eval_lines.get():
            self.evalbox.lower(
                self.evalbox.create_line(prev_x, prev_y, x, y,
                                         fill='#8c8'))
        prev_x, prev_y = x, y


_eval_demon_running = False
def _eval_demon(self):
    """
    Background task that scores the current grammar against the
    development set a few sentences at a time, re-scheduling itself
    with C{self.top.after} until the whole set has been scored.
    """
    if self.top is None:
        return
    if self.chunker is None:
        self._eval_demon_running = False
        return

    # Note our starting time.
    started = time.time()
    delay_ms = int(self._EVAL_FREQ*1000)
    grammar_changed = (self.normalized_grammar !=
                       self._eval_normalized_grammar)

    # If the user is still typing, then wait for them to finish.
    idle = time.time()-self._last_keypress
    if idle < self._EVAL_DELAY and grammar_changed:
        self._eval_demon_running = True
        return self.top.after(delay_ms, self._eval_demon)

    # If the grammar changed, restart the evaluation.
    if grammar_changed:
        # Check if we've seen this grammar already.  If so, then
        # just use the old evaluation values.
        for entry in self._history:
            (old_grammar, p, r, f) = entry
            if self.normalized_grammar != self.normalize_grammar(old_grammar):
                continue
            self._history.append( (old_grammar, p, r, f) )
            self._history_index = len(self._history) - 1
            self._eval_plot()
            self._eval_demon_running = False
            self._eval_normalized_grammar = None
            return
        self._eval_index = 0
        self._eval_score = nltk.chunk.ChunkScore(
            chunk_node=self._chunk_node)
        self._eval_grammar = self.grammar
        self._eval_normalized_grammar = self.normalized_grammar

    # If the grammar is empty, then don't bother evaluating it, or
    # recording it in history -- the score will just be 0.
    if self.normalized_grammar.strip() == '':
        self._eval_demon_running = False
        return

    # Score the next set of examples
    first = self._eval_index
    last = min(first+self._EVAL_CHUNK, self._devset_size.get())
    for gold in self.devset[first:last]:
        guess = self._chunkparse(gold.leaves())
        self._eval_score.score(gold, guess)

    # update our index in the devset.
    self._eval_index += self._EVAL_CHUNK

    # Not done yet: report progress and schedule the next slice.
    if self._eval_index < self._devset_size.get():
        progress = 100*self._eval_index/self._devset_size.get()
        self.status['text'] = ('Evaluating on Development Set (%d%%)' %
                               progress)
        self._eval_demon_running = True
        self._adaptively_modify_eval_chunk(time.time() - started)
        self.top.after(delay_ms, self._eval_demon)
        return

    # Done: record the final scores in the history and redraw.
    self._history.append( (self._eval_grammar,
                           self._eval_score.precision(),
                           self._eval_score.recall(),
                           self._eval_score.f_measure()) )
    self._history_index = len(self._history)-1
    self._eval_plot()
    self._eval_demon_running = False
    self._eval_normalized_grammar = None
617
def _adaptively_modify_eval_chunk(self, t):
    """
    Modify _EVAL_CHUNK to try to keep the amount of time that the
    eval demon takes between _EVAL_DEMON_MIN and _EVAL_DEMON_MAX.
    The chunk size changes by at least 1 and at most 10 per call,
    and never drops below 1.

    @param t: The amount of time that the eval demon took.
    """
    chunk = self._EVAL_CHUNK
    if t > self._EVAL_DEMON_MAX and chunk > 5:
        # Too slow: shrink in proportion to the overshoot.  The floor
        # of 1 stops a large overshoot from producing a chunk size of
        # 0, which would score no sentences on the next run.
        scaled = int(chunk*(self._EVAL_DEMON_MAX/t))
        self._EVAL_CHUNK = max(1, min(chunk-1, max(scaled, chunk-10)))
    elif t < self._EVAL_DEMON_MIN:
        # Too fast: grow in proportion to the slack.  A measured time
        # of zero (below the clock's resolution) cannot be divided by,
        # so it takes the largest allowed step.
        if t > 0:
            scaled = int(chunk*(self._EVAL_DEMON_MIN/t))
        else:
            scaled = chunk+10
        self._EVAL_CHUNK = max(chunk+1, min(scaled, chunk+10))
633
def _init_widgets(self, top):
    """
    Create and lay out every widget of the demo window: the grammar
    editor, the help box with its tabs, the development-set box, the
    evaluation plot, their buttons and scrollbars, and the status bar.

    @param top: The top-level window that holds the widgets.
    """
    frame_bg = self._FRAME_PARAMS['background']
    button_opts = self._BUTTON_PARAMS

    frame0 = Frame(top, **self._FRAME_PARAMS)
    frame0.grid_columnconfigure(0, weight=4)
    frame0.grid_columnconfigure(3, weight=2)
    frame0.grid_rowconfigure(1, weight=1)
    frame0.grid_rowconfigure(5, weight=1)

    # The grammar
    self.grammarbox = Text(frame0, font=self._font,
                           **self._GRAMMARBOX_PARAMS)
    self.grammarlabel = Label(
        frame0, font=self._font, text='Grammar:',
        highlightcolor='black',
        background=self._GRAMMARBOX_PARAMS['background'])
    self.grammarlabel.grid(column=0, row=0, sticky='SW')
    self.grammarbox.grid(column=0, row=1, sticky='NEWS')

    # Scroll bar for grammar
    grammar_scrollbar = Scrollbar(frame0, command=self.grammarbox.yview)
    grammar_scrollbar.grid(column=1, row=1, sticky='NWS')
    self.grammarbox.config(yscrollcommand=grammar_scrollbar.set)

    # grammar buttons
    frame3 = Frame(frame0, background=frame_bg)
    frame3.grid(column=0, row=2, sticky='EW')
    Button(frame3, text='Prev Grammar', command=self._history_prev,
           **button_opts).pack(side='left')
    Button(frame3, text='Next Grammar', command=self._history_next,
           **button_opts).pack(side='left')

    # Help box, with one clickable tab label per help page.
    self.helpbox = Text(frame0, font=self._smallfont,
                        **self._HELPBOX_PARAMS)
    self.helpbox.grid(column=3, row=1, sticky='NEWS')
    self.helptabs = {}
    helptab_frame = Frame(frame0, background=frame_bg)
    helptab_frame.grid(column=3, row=0, sticky='SW')
    for i, (tab, tabstops, text) in enumerate(self.HELP):
        label = Label(helptab_frame, text=tab, font=self._smallfont)
        label.grid(column=i*2, row=0, sticky='S')
        # 'tab=tab' binds the current title to this label's handler.
        label.bind('<ButtonPress>', lambda e, tab=tab: self.show_help(tab))
        self.helptabs[tab] = label
        spacer = Frame(helptab_frame, height=1,
                       width=self._HELPTAB_SPACER, background=frame_bg)
        spacer.grid(column=i*2+1, row=0)
    first_tab = self.HELP[0][0]
    self.helptabs[first_tab].configure(font=self._font)
    self.helpbox.tag_config('elide', elide=True)
    for (tag, params) in self.HELP_AUTOTAG:
        self.helpbox.tag_config('tag-%s' % tag, **params)
    self.show_help(self.HELP[0][0])

    # Scroll bar for helpbox
    help_scrollbar = Scrollbar(frame0, command=self.helpbox.yview)
    self.helpbox.config(yscrollcommand=help_scrollbar.set)
    help_scrollbar.grid(column=4, row=1, sticky='NWS')

    # The dev set
    frame4 = Frame(frame0, background=self._FRAME_PARAMS['background'])
    self.devsetbox = Text(frame4, font=self._font,
                          **self._DEVSETBOX_PARAMS)
    self.devsetbox.pack(expand=True, fill='both')
    self.devsetlabel = Label(
        frame0, font=self._font,
        text='Development Set:', justify='right',
        background=self._DEVSETBOX_PARAMS['background'])
    self.devsetlabel.grid(column=0, row=4, sticky='SW')
    frame4.grid(column=0, row=5, sticky='NEWS')

    # dev set scrollbars
    self.devset_scroll = Scrollbar(frame0, command=self._devset_scroll)
    self.devset_scroll.grid(column=1, row=5, sticky='NWS')
    self.devset_xscroll = Scrollbar(frame4, command=self.devsetbox.xview,
                                    orient='horiz')
    self.devsetbox['xscrollcommand'] = self.devset_xscroll.set
    self.devset_xscroll.pack(side='bottom', fill='x')

    # dev set buttons
    frame1 = Frame(frame0, background=frame_bg)
    frame1.grid(column=0, row=7, sticky='EW')
    Button(frame1, text='Prev Example (Ctrl-p)',
           command=self._devset_prev,
           **button_opts).pack(side='left')
    Button(frame1, text='Next Example (Ctrl-n)',
           command=self._devset_next,
           **button_opts).pack(side='left')
    self.devset_button = Button(frame1, text='Show example',
                                command=self.show_devset,
                                state='disabled',
                                **button_opts)
    self.devset_button.pack(side='right')
    self.trace_button = Button(frame1, text='Show trace',
                               command=self.show_trace,
                               **button_opts)
    self.trace_button.pack(side='right')

    # evaluation box
    self.evalbox = Canvas(frame0, **self._EVALBOX_PARAMS)
    label = Label(frame0, font=self._font, text='Evaluation:',
                  justify='right',
                  background=self._EVALBOX_PARAMS['background'])
    label.grid(column=3, row=4, sticky='SW')
    self.evalbox.grid(column=3, row=5, sticky='NEWS', columnspan=2)

    # evaluation box buttons
    frame2 = Frame(frame0, background=frame_bg)
    frame2.grid(column=3, row=7, sticky='EW')
    self._autoscale = IntVar(self.top)
    self._autoscale.set(False)
    Checkbutton(frame2, variable=self._autoscale, command=self._eval_plot,
                text='Zoom', **button_opts).pack(side='left')
    self._eval_lines = IntVar(self.top)
    self._eval_lines.set(False)
    Checkbutton(frame2, variable=self._eval_lines, command=self._eval_plot,
                text='Lines', **button_opts).pack(side='left')
    Button(frame2, text='History',
           **button_opts).pack(side='right')

    # The status label
    self.status = Label(frame0, font=self._font, **self._STATUS_PARAMS)
    self.status.grid(column=0, row=9, sticky='NEW', padx=3, pady=2,
                     columnspan=5)

    # Help box & devset box can't be edited.
    self.helpbox['state'] = 'disabled'
    self.devsetbox['state'] = 'disabled'

    # Spacers: (height, width, column, row)
    spacers = ((10, 0, 0, 3), (0, 10, 2, 0), (6, 0, 0, 8))
    for (tall, wide, col, row) in spacers:
        Frame(frame0, height=tall, width=wide,
              background=frame_bg).grid(column=col, row=row)

    # pack the frame.
    frame0.pack(fill='both', expand=True)

    # Set up colors for the devset box
    devset_tags = (
        ('true-pos', dict(background='#afa', underline='True')),
        ('false-neg', dict(underline='True', foreground='#800')),
        ('false-pos', dict(background='#faa')),
        ('trace', dict(foreground='#666', wrap='none')),
        ('wrapindent', dict(lmargin2=30, wrap='none')),
        ('error', dict(foreground='#800')),
        )
    for (tag, params) in devset_tags:
        self.devsetbox.tag_config(tag, **params)

    # And for the grammarbox
    grammar_tags = (
        ('error', dict(background='#fec')),
        ('comment', dict(foreground='#840')),
        ('angle', dict(foreground='#00f')),
        ('brace', dict(foreground='#0a0')),
        ('hangindent', dict(lmargin1=0, lmargin2=40)),
        )
    for (tag, params) in grammar_tags:
        self.grammarbox.tag_config(tag, **params)


_showing_trace = False
def show_trace(self, *e):
    """
    Fill the development-set box with a trace of the current sentence:
    the tag sequence is shown once before any rule is applied and once
    after each rule, with chunks colored by whether they match the
    gold standard.

    @param e: Ignored; present so the method can serve as an event
        handler.
    """
    self._showing_trace = True
    self.trace_button['state'] = 'disabled'
    self.devset_button['state'] = 'normal'

    self.devsetbox['state'] = 'normal'
    #self.devsetbox['wrap'] = 'none'
    self.devsetbox.delete('1.0', 'end')
    self.devsetlabel['text']='Development Set (%d/%d)' % (
        (self.devset_index+1, self._devset_size.get()))

    if self.chunker is None:
        self.devsetbox.insert('1.0', 'Trace: waiting for a valid grammar.')
        self.devsetbox.tag_add('error', '1.0', 'end')
        return # can't do anything more

    gold_tree = self.devset[self.devset_index]
    rules = self.chunker.rules()

    # Calculate the tag sequence, recording the column at which each
    # tag starts.
    tagseq = '\t'
    charnum = [1]
    for (word, pos) in gold_tree.leaves():
        tagseq += '%s ' % pos
        charnum.append(len(tagseq))
    # Lookup tables for the trace layout: step i's tag line is text
    # line i*2+2, and word j starts at column charnum[j] on every step.
    self.charnum = dict(((i, j), charnum[j])
                        for i in range(len(rules)+1)
                        for j in range(len(charnum)))
    self.linenum = dict((i,i*2+2) for i in range(len(rules)+1))

    full_chunker = self.chunker
    for i in range(len(rules)+1):
        if i == 0:
            self.devsetbox.insert('end', 'Start:\n')
            self.devsetbox.tag_add('trace', 'end -2c linestart', 'end -2c')
        else:
            self.devsetbox.insert('end', 'Apply %s:\n' % rules[i-1])
            self.devsetbox.tag_add('trace', 'end -2c linestart', 'end -2c')
        # Display the tag sequence.
        self.devsetbox.insert('end', tagseq+'\n')
        self.devsetbox.tag_add('wrapindent','end -2c linestart','end -2c')
        # Run a partial parser (only the first i rules), and extract
        # gold & test chunks.  The partial chunker is installed as
        # self.chunker for the duration of the parse and then restored.
        # NOTE(review): this assumes _chunkparse parses with
        # self.chunker -- confirm against its definition.
        self.chunker = nltk.RegexpChunkParser(rules[:i])
        try:
            test_tree = self._chunkparse(gold_tree.leaves())
        finally:
            self.chunker = full_chunker
        gold_chunks = self._chunks(gold_tree)
        test_chunks = self._chunks(test_tree)
        # Compare them.
        for chunk in gold_chunks.intersection(test_chunks):
            self._color_chunk(i, chunk, 'true-pos')
        for chunk in gold_chunks - test_chunks:
            self._color_chunk(i, chunk, 'false-neg')
        for chunk in test_chunks - gold_chunks:
            self._color_chunk(i, chunk, 'false-pos')
    self.devsetbox.insert('end', 'Finished.\n')
    self.devsetbox.tag_add('trace', 'end -2c linestart', 'end -2c')

    # This is a hack, because the x-scrollbar isn't updating its
    # position right -- I'm not sure what the underlying cause is
    # though.  (This is on OS X w/ python 2.5)
    self.top.after(100, self.devset_xscroll.set, 0, .3)
849
def show_help(self, tab):
    """
    Display the help page named C{tab} in the help box.

    Every entry of C{self.HELP} is visited: the tab whose name equals
    C{tab} is configured with the foreground parameters and its text is
    inserted into the help box; all other tabs are configured with the
    background parameters.  In the selected text, C{<<TAGSET>>} is
    replaced by a table built from C{self.tagset}, and each
    C{<tag>...</tag>} pair listed in C{self.HELP_AUTOTAG} is turned into
    Text-widget tags (the markers get C{'elide'}, the enclosed text gets
    C{'tag-<tag>'}).

    @param tab: Name of the help page to show.
    """
    def tagset_sort_key(item):
        # Tags that start with a word character sort before the
        # punctuation-style tags; ties are broken by the tag itself.
        tag = item[0]
        if re.match(r'\w+', tag):
            return (0, tag)
        return (1, tag)

    self.helpbox['state'] = 'normal'
    self.helpbox.delete('1.0', 'end')
    for (name, tabstops, text) in self.HELP:
        if name == tab:
            tagset_rows = ['\t%s\t%s' % item for item in
                           sorted(self.tagset.items(), key=tagset_sort_key)]
            text = text.replace('<<TAGSET>>', '\n'.join(tagset_rows))

            self.helptabs[name].config(**self._HELPTAB_FG_PARAMS)
            self.helpbox.config(tabs=tabstops)
            self.helpbox.insert('1.0', text+'\n'*20)
            # Text-widget index for a character offset into the text.
            C = '1.0 + %d chars'
            for (tag, _params) in self.HELP_AUTOTAG:
                pattern = '(?s)(<%s>)(.*?)(</%s>)' % (tag, tag)
                for m in re.finditer(pattern, text):
                    self.helpbox.tag_add('elide',
                                         C % m.start(1), C % m.end(1))
                    self.helpbox.tag_add('tag-%s' % tag,
                                         C % m.start(2), C % m.end(2))
                    self.helpbox.tag_add('elide',
                                         C % m.start(3), C % m.end(3))
        else:
            self.helptabs[name].config(**self._HELPTAB_BG_PARAMS)
    self.helpbox['state'] = 'disabled'
875
876 - def _history_prev(self, *e):
877 self._view_history(self._history_index-1) 878 return 'break'
879
880 - def _history_next(self, *e):
881 self._view_history(self._history_index+1) 882 return 'break'
883
884 - def _view_history(self, index):
885 # Bounds & sanity checking: 886 index = max(0, min(len(self._history)-1, index)) 887 if not self._history: return 888 # Already viewing the requested history item? 889 if index == self._history_index: 890 return 891 # Show the requested grammar. It will get added to _history 892 # only if they edit it (causing self.update() to get run.) 893 self.grammarbox['state'] = 'normal' 894 self.grammarbox.delete('1.0', 'end') 895 self.grammarbox.insert('end', self._history[index][0]) 896 self.grammarbox.mark_set('insert', '1.0') 897 self._history_index = index 898 self._syntax_highlight_grammar(self._history[index][0]) 899 # Record the normalized grammar & regenerate the chunker. 900 self.normalized_grammar = self.normalize_grammar( 901 self._history[index][0]) 902 if self.normalized_grammar: 903 rules = [nltk.chunk.regexp.RegexpChunkRule.parse(line) 904 for line in self.normalized_grammar.split('\n')] 905 else: 906 rules = [] 907 self.chunker = nltk.RegexpChunkParser(rules) 908 # Show the score. 909 self._eval_plot() 910 # Update the devset box 911 self._highlight_devset() 912 if self._showing_trace: self.show_trace() 913 # Update the grammar label 914 if self._history_index < len(self._history)-1: 915 self.grammarlabel['text'] = 'Grammar %s/%s:' % ( 916 self._history_index+1, len(self._history)) 917 else: 918 self.grammarlabel['text'] = 'Grammar:'
919
920 - def _devset_next(self, *e):
921 self._devset_scroll('scroll', 1, 'page') 922 return 'break'
923
924 - def _devset_prev(self, *e):
925 self._devset_scroll('scroll', -1, 'page') 926 return 'break'
927
def destroy(self, *e):
    """
    Destroy the demo's top-level window, if there still is one, and
    forget it by setting C{self.top} to C{None}.  Calling this again
    afterwards does nothing.

    @param e: Ignored callback arguments.
    """
    window = self.top
    if window is not None:
        window.destroy()
        self.top = None
932
933 - def _devset_scroll(self, command, *args):
934 N = 1 # size of a page -- one sentence. 935 showing_trace = self._showing_trace 936 if command == 'scroll' and args[1].startswith('unit'): 937 self.show_devset(self.devset_index+int(args[0])) 938 elif command == 'scroll' and args[1].startswith('page'): 939 self.show_devset(self.devset_index+N*int(args[0])) 940 elif command == 'moveto': 941 self.show_devset(int(float(args[0])*self._devset_size.get())) 942 else: 943 assert 0, 'bad scroll command %s %s' % (command, args) 944 if showing_trace: 945 self.show_trace()
946
def show_devset(self, index=None):
    """
    Display one development sentence as C{word/tag} pairs in the devset
    box and highlight its chunks.

    @param index: Index of the sentence to show; defaults to the
        current one.  It is clamped to the range allowed by the current
        development set size.  If the clamped index is already shown and
        the trace view is not active, nothing is done.
    """
    total = self._devset_size.get()
    if index is None:
        index = self.devset_index

    # Bounds checking
    index = min(max(0, index), total-1)

    if not self._showing_trace and index == self.devset_index:
        return
    self.devset_index = index

    # Switch the buttons to "sentence view".
    self._showing_trace = False
    self.trace_button['state'] = 'normal'
    self.devset_button['state'] = 'disabled'

    # Clear the text box.
    box = self.devsetbox
    box['state'] = 'normal'
    box['wrap'] = 'word'
    box.delete('1.0', 'end')
    self.devsetlabel['text'] = 'Development Set (%d/%d)' % (
        (index+1, total))

    # Add the sentences, recording where each word starts so that
    # chunks can later be mapped to text positions.
    self.charnum = {}
    self.linenum = {0: 1}
    sample = self.devset[index:index+1]
    sentnum = 0
    for sent in sample:
        linestr = ''
        wordnum = 0
        for (word, pos) in sent.leaves():
            self.charnum[sentnum, wordnum] = len(linestr)
            linestr += '%s/%s ' % (word, pos)
            wordnum += 1
            self.charnum[sentnum, wordnum] = len(linestr)
        box.insert('end', linestr[:-1]+'\n\n')
        sentnum += 1

    # Highlight chunks in the dev set
    if self.chunker is not None:
        self._highlight_devset()
    box['state'] = 'disabled'

    # Update the scrollbar
    first = float(index)/total
    last = float(index+2)/total
    self.devset_scroll.set(first, last)
988
def _chunks(self, tree):
    """
    Collect the word spans of the chunks found directly under C{tree}.

    @param tree: An iterable whose children are either C{nltk.Tree}
        subtrees or single tokens.
    @return: A set of C{(start, end)} word-index pairs, one for each
        child subtree whose node equals C{self._chunk_node}.
    """
    spans = set()
    position = 0
    for child in tree:
        if not isinstance(child, nltk.Tree):
            # A bare token occupies a single word position.
            position += 1
            continue
        if child.node == self._chunk_node:
            spans.add( (position, position+len(child)) )
        # Subtrees of any type still advance past all their words.
        position += len(child)
    return spans
1000
1001 - def _syntax_highlight_grammar(self, grammar):
1002 if self.top is None: return 1003 self.grammarbox.tag_remove('comment', '1.0', 'end') 1004 self.grammarbox.tag_remove('angle', '1.0', 'end') 1005 self.grammarbox.tag_remove('brace', '1.0', 'end') 1006 self.grammarbox.tag_add('hangindent', '1.0', 'end') 1007 for lineno, line in enumerate(grammar.split('\n')): 1008 if not line.strip(): continue 1009 m = re.match(r'(\\.|[^#])*(#.*)?', line) 1010 comment_start = None 1011 if m.group(2): 1012 comment_start = m.start(2) 1013 s = '%d.%d' % (lineno+1, m.start(2)) 1014 e = '%d.%d' % (lineno+1, m.end(2)) 1015 self.grammarbox.tag_add('comment', s, e) 1016 for m in re.finditer('[<>{}]', line): 1017 if comment_start is not None and m.start() >= comment_start: 1018 break 1019 s = '%d.%d' % (lineno+1, m.start()) 1020 e = '%d.%d' % (lineno+1, m.end()) 1021 if m.group() in '<>': 1022 self.grammarbox.tag_add('angle', s, e) 1023 else: 1024 self.grammarbox.tag_add('brace', s, e)
1025 1026
1027 - def _grammarcheck(self, grammar):
1028 if self.top is None: return 1029 self.grammarbox.tag_remove('error', '1.0', 'end') 1030 self._grammarcheck_errs = [] 1031 for lineno, line in enumerate(grammar.split('\n')): 1032 line = re.sub(r'((\\.|[^#])*)(#.*)?', r'\1', line) 1033 line = line.strip() 1034 if line: 1035 try: nltk.chunk.regexp.RegexpChunkRule.parse(line) 1036 except ValueError, e: 1037 self.grammarbox.tag_add('error', '%s.0' % (lineno+1), 1038 '%s.0 lineend' % (lineno+1)) 1039 self.status['text'] = ''
1040
def update(self, *event):
    """
    Re-read the grammar box and, if the grammar really changed, rebuild
    the chunker and refresh the display.

    If the normalized grammar is unchanged nothing else happens.  If it
    fails to parse, the offending lines are flagged and
    C{self.chunker} is set to C{None}.  Otherwise the devset box (or
    trace) is refreshed and the evaluation demon is started unless it
    is already running.

    @param event: Callback arguments; when any are given, the time of
        the call is recorded as the last keypress.
    """
    # Record when update was called (for grammarcheck)
    if event:
        self._last_keypress = time.time()

    # Read the grammar from the Text box.
    grammar = self.grammarbox.get('1.0', 'end')
    self.grammar = grammar

    # If the grammar hasn't changed, do nothing:
    normalized = self.normalize_grammar(grammar)
    if normalized == self.normalized_grammar:
        return
    self.normalized_grammar = normalized

    # If the grammar has changed, and we're looking at history,
    # then stop looking at history.
    viewing_history = self._history_index < len(self._history)-1
    if viewing_history:
        self.grammarlabel['text'] = 'Grammar:'

    self._syntax_highlight_grammar(grammar)

    # The grammar has changed; try parsing it.  If it doesn't
    # parse, flag the bad lines and stop.
    rules = []
    try:
        # Note: the normalized grammar has no blank lines.
        if normalized:
            for line in normalized.split('\n'):
                rules.append(nltk.chunk.regexp.RegexpChunkRule.parse(line))
    except ValueError:
        # Use the un-normalized grammar for error highlighting.
        self._grammarcheck(grammar)
        self.chunker = None
        return

    self.chunker = nltk.RegexpChunkParser(rules)
    self.grammarbox.tag_remove('error', '1.0', 'end')
    self.grammar_changed = time.time()

    # Display the results
    if self._showing_trace:
        self.show_trace()
    else:
        self._highlight_devset()

    # Start the eval demon
    if not self._eval_demon_running:
        self._eval_demon()
1089
1090 - def _highlight_devset(self, sample=None):
1091 if sample is None: 1092 sample = self.devset[self.devset_index:self.devset_index+1] 1093 1094 self.devsetbox.tag_remove('true-pos', '1.0', 'end') 1095 self.devsetbox.tag_remove('false-neg', '1.0', 'end') 1096 self.devsetbox.tag_remove('false-pos', '1.0', 'end') 1097 1098 # Run the grammar on the test cases. 1099 for sentnum, gold_tree in enumerate(sample): 1100 # Run the chunk parser 1101 test_tree = self._chunkparse(gold_tree.leaves()) 1102 # Extract gold & test chunks 1103 gold_chunks = self._chunks(gold_tree) 1104 test_chunks = self._chunks(test_tree) 1105 # Compare them. 1106 for chunk in gold_chunks.intersection(test_chunks): 1107 self._color_chunk(sentnum, chunk, 'true-pos') 1108 for chunk in gold_chunks - test_chunks: 1109 self._color_chunk(sentnum, chunk, 'false-neg') 1110 for chunk in test_chunks - gold_chunks: 1111 self._color_chunk(sentnum, chunk, 'false-pos')
1112
1113 - def _chunkparse(self, words):
1114 try: 1115 return self.chunker.parse(words) 1116 except (ValueError, IndexError), e: 1117 # There's an error somewhere in the grammar, but we're not sure 1118 # exactly where, so just mark the whole grammar as bad. 1119 # E.g., this is caused by: "({<NN>})" 1120 self.grammarbox.tag_add('error', '1.0', 'end') 1121 # Treat it as tagging nothing: 1122 return words
1123
1124 - def _color_chunk(self, sentnum, chunk, tag):
1125 start, end = chunk 1126 self.devsetbox.tag_add(tag, 1127 '%s.%s' % (self.linenum[sentnum], self.charnum[sentnum, start]), 1128 '%s.%s' % (self.linenum[sentnum], self.charnum[sentnum, end]-1))
1129
def reset(self):
    """
    Discard the current grammar, chunker and history, then refresh the
    display: the grammar box is emptied, the first development sentence
    is shown, and L{update} is run.
    """
    # Forget the grammar history.
    self._history = []
    self._history_index = 0
    # Forget the current grammar and the chunker built from it.
    self.grammar = None
    self.normalized_grammar = None
    self.grammar_changed = 0
    self.chunker = None
    # Update the on-screen display.
    self.grammarbox.delete('1.0', 'end')
    self.show_devset(0)
    self.update()
1142 #self._eval_plot() 1143 1144 SAVE_GRAMMAR_TEMPLATE = ( 1145 '# Regexp Chunk Parsing Grammar\n' 1146 '# Saved %(date)s\n' 1147 '#\n' 1148 '# Development set: %(devset)s\n' 1149 '# Precision: %(precision)s\n' 1150 '# Recall: %(recall)s\n' 1151 '# F-score: %(fscore)s\n\n' 1152 '%(grammar)s\n') 1153
def save_grammar(self, filename=None):
    """
    Write the current grammar, preceded by a header describing its
    scores, to a file.

    The scores are taken from the last history entry when that entry
    matches the current (normalized) grammar; otherwise the header says
    that the grammar is not well formed or not yet evaluated.

    @param filename: Path of the file to write.  If omitted, the user
        is asked with a save dialog; cancelling the dialog does nothing.
    """
    if not filename:
        ftypes = [('Chunk Grammar', '.chunk'),
                  ('All files', '*')]
        filename = asksaveasfilename(filetypes=ftypes,
                                     defaultextension='.chunk')
        if not filename: return
    if (self._history and self.normalized_grammar ==
        self.normalize_grammar(self._history[-1][0])):
        # History entries are (grammar, precision, recall, fscore).
        precision, recall, fscore = ['%.2f%%' % (100*v) for v in
                                     self._history[-1][1:]]
    elif self.chunker is None:
        precision = recall = fscore = 'Grammar not well formed'
    else:
        precision = recall = fscore = 'Not finished evaluation yet'

    # Build the text before opening the file, so that a formatting
    # error cannot leave a truncated file behind.
    contents = self.SAVE_GRAMMAR_TEMPLATE % dict(
        date=time.ctime(), devset=self.devset_name,
        precision=precision, recall=recall, fscore=fscore,
        grammar=self.grammar.strip())

    out = open(filename, 'w')
    try:
        out.write(contents)
    finally:
        # Close the file even if the write fails.
        out.close()
1176
def load_grammar(self, filename=None):
    """
    Replace the contents of the grammar box with a grammar read from a
    file, dropping the header that L{save_grammar} writes.

    @param filename: Path of the file to read.  If omitted, the user is
        asked with an open dialog; cancelling the dialog does nothing.
    """
    if not filename:
        ftypes = [('Chunk Grammar', '.chunk'),
                  ('All files', '*')]
        filename = askopenfilename(filetypes=ftypes,
                                   defaultextension='.chunk')
        if not filename: return

    # Read the file before touching the editor, so that a failed read
    # does not throw away the grammar that is currently being edited.
    infile = open(filename)
    try:
        grammar = infile.read()
    finally:
        infile.close()
    # Strip everything from the first header line through the
    # "F-score:" line, plus any leading whitespace that follows.
    grammar = re.sub(r'^\# Regexp Chunk Parsing Grammar[\s\S]*'
                     r'F-score:.*\n', '', grammar).lstrip()

    self.grammarbox.delete('1.0', 'end')
    self.update()
    self.grammarbox.insert('1.0', grammar)
    self.update()
1191
def save_history(self, filename=None):
    """
    Write every grammar in the history, with its scores, to a text
    file.  If the current grammar differs from the last history entry
    (or the history is empty) it is appended as well, marked as not
    well-formed or not evaluated.

    @param filename: Path of the file to write.  If omitted, the user
        is asked with a save dialog; cancelling the dialog does nothing.
    """
    if not filename:
        ftypes = [('Chunk Grammar History', '.txt'),
                  ('All files', '*')]
        filename = asksaveasfilename(filetypes=ftypes,
                                     defaultextension='.txt')
        if not filename: return

    def indented(grammar):
        # One output line per grammar line.  Splitting on newlines (not
        # on arbitrary whitespace) keeps rules that contain spaces or
        # comments in one piece.
        return ''.join(' %s\n' % line
                       for line in grammar.strip().split('\n'))

    out = open(filename, 'w')
    try:
        out.write('# Regexp Chunk Parsing Grammar History\n')
        out.write('# Saved %s\n' % time.ctime())
        out.write('# Development set: %s\n' % self.devset_name)
        for i, (g, p, r, f) in enumerate(self._history):
            hdr = ('Grammar %d/%d (precision=%.2f%%, recall=%.2f%%, '
                   'fscore=%.2f%%)' % (i+1, len(self._history),
                                       p*100, r*100, f*100))
            out.write('\n%s\n' % hdr)
            out.write(indented(g))

        if not (self._history and self.normalized_grammar ==
                self.normalize_grammar(self._history[-1][0])):
            if self.chunker is None:
                out.write('\nCurrent Grammar (not well-formed)\n')
            else:
                out.write('\nCurrent Grammar (not evaluated)\n')
            out.write(indented(self.grammar))
    finally:
        # Close the file even if one of the writes fails.
        out.close()
1220
def about(self, *e):
    """
    Show the "about" text for the demo.

    A C{tkMessageBox} message dialog is tried first; if importing or
    showing it fails, the text is shown with C{ShowText} instead.

    @param e: Ignored callback arguments.
    """
    ABOUT = ("NLTK RegExp Chunk Parser Demo\n"+
             "Written by Edward Loper")
    TITLE = 'About: Regular Expression Chunk Parser Demo'
    try:
        from tkMessageBox import Message
        Message(message=ABOUT, title=TITLE).show()
    except Exception:
        # Best-effort fallback.  Catching Exception rather than using a
        # bare "except" lets KeyboardInterrupt and SystemExit through.
        ShowText(self.top, TITLE, ABOUT)
1230
def set_devset_size(self, size=None):
    """
    Set how many sentences of the development set are used, capped at
    the number of sentences available, and redisplay the first one.

    @param size: The requested size; if omitted, the value already
        stored in C{self._devset_size} is only capped.
    """
    if size is not None:
        self._devset_size.set(size)
    capped = min(len(self.devset), self._devset_size.get())
    self._devset_size.set(capped)
    # Show sentence 1 and then sentence 0 (presumably so that the view
    # of sentence 0 is redrawn even if it was already displayed).
    for index in (1, 0):
        self.show_devset(index)
# TODO: What about the history?  Its entries may have been evaluated
# at different development set sizes.
def resize(self, size=None):
    """
    Reconfigure the demo's fonts for a new size.

    The main font gets size C{-abs(size)}; the small font gets 14/20 of
    that, capped at -10.

    @param size: The new size; if omitted, the value already stored in
        C{self._size} is used.
    """
    if size is not None:
        self._size.set(size)
    magnitude = abs(self._size.get())
    main_size = -magnitude
    small_size = min(-10, -magnitude*14/20)
    self._font.configure(size=main_size)
    self._smallfont.configure(size=small_size)
1243
def mainloop(self, *args, **kwargs):
    """
    Enter the Tkinter mainloop.  This function must be called if
    this demo is created from a non-interactive program (e.g.
    from a script); otherwise, the demo will close as soon as
    the script completes.

    Nothing is done when C{in_idle()} reports true.  All arguments are
    passed on to the top-level window's C{mainloop}.
    """
    if not in_idle():
        self.top.mainloop(*args, **kwargs)
1253
def demo():
    """
    Create a L{RegexpChunkDemo} with its default arguments and run its
    main loop.
    """
    app = RegexpChunkDemo()
    app.mainloop()

# Run the demo when this module is executed as a script.
if __name__ == '__main__':
    demo()