Code Coverage for nltk.draw.rechunkparser
Untested Functions
- demo()
- RegexpChunkDemo: __init__(), _adaptively_modify_eval_chunk(), _chunkparse(), _chunks(), _color_chunk(), _devset_next(), _devset_prev(), _devset_scroll(), _eval_demon(), _eval_plot(), _grammarcheck(), _highlight_devset(), _history_next(), _history_prev(), _init_bindings(), _init_fonts(), _init_menubar(), _init_widgets(), _syntax_highlight_grammar(), _view_history(), about(), destroy(), load_grammar(), mainloop(), normalize_grammar(), reset(), resize(), save_grammar(), save_history(), set_devset_size(), show_devset(), show_help(), show_trace(), toggle_show_trace(), update()
"""
A graphical tool for exploring the regular expression based chunk
parser (L{RegexpChunkParser<nltk.chunk.regex.RegexpChunkParser>}).
@todo: Add a way to select the development set from the menubar. This
might just need to be a selection box (conll vs treebank etc) plus
configuration parameters to select what's being chunked (eg VP vs NP)
and what part of the data is being used as the development set.
"""
from Tkinter import *
from tkFileDialog import asksaveasfilename, askopenfilename
import tkFont
import time
import textwrap
import re
import random
import nltk
from nltk.draw import *
class RegexpChunkDemo(object):
"""
A graphical tool for exploring the regular expression based chunk
parser (L{RegexpChunkParser<nltk.chunk.regex.RegexpChunkParser>}).
See L{HELP} for instructional text.
"""
# Default mapping from part-of-speech tag to a human-readable description.
# Used as the fallback for the ``tagset`` constructor argument and rendered
# on the 'Tags' help page.
TAGSET = {
    'CC': 'Coordinating conjunction', 'PRP$': 'Possessive pronoun',
    'CD': 'Cardinal number', 'RB': 'Adverb',
    'DT': 'Determiner', 'RBR': 'Adverb, comparative',
    'EX': 'Existential there', 'RBS': 'Adverb, superlative',
    'FW': 'Foreign word', 'RP': 'Particle',
    'JJ': 'Adjective', 'TO': 'to',
    'JJR': 'Adjective, comparative', 'UH': 'Interjection',
    'JJS': 'Adjective, superlative', 'VB': 'Verb, base form',
    'LS': 'List item marker', 'VBD': 'Verb, past tense',
    'MD': 'Modal', 'NNS': 'Noun, plural',
    'NN': 'Noun, singular or mass', 'VBN': 'Verb, past participle',
    'VBZ': 'Verb, 3rd ps. sing. present', 'NNP': 'Proper noun, singular',
    'NNPS': 'Proper noun plural', 'WDT': 'wh-determiner',
    'PDT': 'Predeterminer', 'WP': 'wh-pronoun',
    'POS': 'Possessive ending', 'WP$': 'Possessive wh-pronoun',
    'PRP': 'Personal pronoun', 'WRB': 'wh-adverb',
    '(': 'open parenthesis', ')': 'close parenthesis',
    '``': 'open quote', ',': 'comma',
    "''": 'close quote', '.': 'period',
    '#': 'pound sign (currency marker)',
    '$': 'dollar sign (currency marker)',
    'IN': 'Preposition/subord. conjunction',
    'SYM': 'Symbol (mathematical or scientific)',
    'VBG': 'Verb, gerund/present participle',
    'VBP': 'Verb, non-3rd ps. sing. present',
    ':': 'colon',
    }
# Contents of the help pane: a list of (tab name, tab stops, text) tuples.
# The text uses a small markup language: every <tag>...</tag> pair named in
# HELP_AUTOTAG is styled by show_help(), and the '<<TAGSET>>' placeholder is
# replaced by the tag/description table.
HELP = [
    ('Help', '20',
     "Welcome to the regular expression chunk-parser grammar editor. "
     "You can use this editor to develop and test chunk parser grammars "
     "based on NLTK's RegexpChunkParser class.\n\n"
     "Use this box ('Help') to learn more about the editor; click on the "
     "tabs for help on specific topics:"
     "<indent>\n"
     "Rules: grammar rule types\n"
     "Regexps: regular expression syntax\n"
     "Tags: part of speech tags\n</indent>\n"
     "Use the upper-left box ('Grammar') to edit your grammar. "
     "Each line of your grammar specifies a single 'rule', "
     "which performs an action such as creating a chunk or merging "
     "two chunks.\n\n"
     "The lower-left box ('Development Set') runs your grammar on the "
     "development set, and displays the results. "
     "Your grammar's chunks are <highlight>highlighted</highlight>, and "
     "the correct (gold standard) chunks are "
     "<underline>underlined</underline>. If they "
     "match, they are displayed in <green>green</green>; otherwise, "
     "they are displayed in <red>red</red>. The box displays a single "
     "sentence from the development set at a time; use the scrollbar or "
     "the next/previous buttons to view additional sentences.\n\n"
     "The lower-right box ('Evaluation') tracks the performance of "
     "your grammar on the development set. The 'precision' axis "
     "indicates how many of your grammar's chunks are correct; and "
     "the 'recall' axis indicates how many of the gold standard "
     "chunks your system generated. Typically, you should try to "
     "design a grammar that scores high on both metrics. The "
     "exact precision and recall of the current grammar, as well "
     "as their harmonic mean (the 'f-score'), are displayed in "
     "the status bar at the bottom of the window."
     ),
    ('Rules', '10',
     "<h1>{...regexp...}</h1>"
     "<indent>\nChunk rule: creates new chunks from words matching "
     "regexp.</indent>\n\n"
     "<h1>}...regexp...{</h1>"
     "<indent>\nChink rule: removes words matching regexp from existing "
     "chunks.</indent>\n\n"
     "<h1>...regexp1...}{...regexp2...</h1>"
     "<indent>\nSplit rule: splits chunks that match regexp1 followed by "
     "regexp2 in two.</indent>\n\n"
     "<h1>...regexp1...{}...regexp2...</h1>"
     "<indent>\nMerge rule: joins consecutive chunks that match regexp1 "
     "and regexp2</indent>\n"
     ),
    ('Regexps', '10 60',
     "<h1>Pattern\t\tMatches...</h1>\n"
     "<hangindent>"
     "\t<<var>T</var>>\ta word with tag <var>T</var> "
     "(where <var>T</var> may be a regexp).\n"
     "\t<var>x</var>?\tan optional <var>x</var>\n"
     "\t<var>x</var>+\ta sequence of 1 or more <var>x</var>'s\n"
     "\t<var>x</var>*\ta sequence of 0 or more <var>x</var>'s\n"
     "\t<var>x</var>|<var>y</var>\t<var>x</var> or <var>y</var>\n"
     "\t.\tmatches any character\n"
     "\t(<var>x</var>)\tTreats <var>x</var> as a group\n"
     "\t# <var>x...</var>\tTreats <var>x...</var> "
     "(to the end of the line) as a comment\n"
     "\t\\<var>C</var>\tmatches character <var>C</var> "
     "(useful when <var>C</var> is a special character "
     "like + or #)\n"
     "</hangindent>"
     "\n<h1>Examples:</h1>\n"
     "<hangindent>"
     '\t<regexp><NN></regexp>\n'
     '\t\tMatches <match>"cow/NN"</match>\n'
     '\t\tMatches <match>"green/NN"</match>\n'
     '\t<regexp><VB.*></regexp>\n'
     '\t\tMatches <match>"eating/VBG"</match>\n'
     '\t\tMatches <match>"ate/VBD"</match>\n'
     '\t<regexp><IN><DT><NN></regexp>\n'
     '\t\tMatches <match>"on/IN the/DT car/NN"</match>\n'
     '\t<regexp><RB>?<VBD></regexp>\n'
     '\t\tMatches <match>"ran/VBD"</match>\n'
     '\t\tMatches <match>"slowly/RB ate/VBD"</match>\n'
     # The backslash is doubled so the literal does not rely on Python
     # passing an unknown escape sequence through unchanged.
     '\t<regexp><\\#><CD> # This is a comment...</regexp>\n'
     '\t\tMatches <match>"#/# 100/CD"</match>\n'
     "</hangindent>"
     ),
    ('Tags', '10 60',
     "<h1>Part of Speech Tags:</h1>\n" +
     '<hangindent>' +
     '<<TAGSET>>' +  # replaced by the tag table in show_help()
     '</hangindent>\n')
    ]
# Markup tags recognised in the HELP text, paired with the Text-widget tag
# options used to render whatever each <tag>...</tag> pair encloses.
HELP_AUTOTAG = [
    ('red', {'foreground': '#a00'}),
    ('green', {'foreground': '#080'}),
    ('highlight', {'background': '#ddd'}),
    ('underline', {'underline': True}),
    ('h1', {'underline': True}),
    ('indent', {'lmargin1': 20, 'lmargin2': 20}),
    ('hangindent', {'lmargin1': 0, 'lmargin2': 60}),
    ('var', {'foreground': '#88f'}),
    ('regexp', {'foreground': '#ba7'}),
    ('match', {'foreground': '#6a6'}),
    ]
_EVAL_DELAY = 1
"""If the user has not pressed any key for this amount of time (in
seconds), and the current grammar has not been evaluated, then
the eval demon will evaluate it."""
_EVAL_CHUNK = 15
"""The number of sentences that should be evaluated by the eval
demon each time it runs."""
_EVAL_FREQ = 0.2
"""The frequency (in seconds) at which the eval demon is run"""
_EVAL_DEMON_MIN = .02
"""The minimum amount of time that the eval demon should take each time
it runs -- if it takes less than this time, _EVAL_CHUNK will be
modified upwards."""
_EVAL_DEMON_MAX = .04
"""The maximum amount of time that the eval demon should take each time
it runs -- if it takes more than this time, _EVAL_CHUNK will be
modified downwards."""
_GRAMMARBOX_PARAMS = dict(
width=40, height=12, background='#efe', highlightbackground='#efe',
highlightthickness=1, relief='groove', border=2, wrap='word')
_HELPBOX_PARAMS = dict(
width=15, height=15, background='#efe', highlightbackground='#efe',
foreground='#555',
highlightthickness=1, relief='groove', border=2, wrap='word')
_DEVSETBOX_PARAMS = dict(
width=70, height=10, background='#eef', highlightbackground='#eef',
highlightthickness=1, relief='groove', border=2, wrap='word',
tabs=(30,))
_STATUS_PARAMS = dict(
background='#9bb', relief='groove', border=2)
_FONT_PARAMS = dict(
family='helvetica', size=-20)
_FRAME_PARAMS = dict(
background='#777', padx=2, pady=2, border=3)
_EVALBOX_PARAMS = dict(
background='#eef', highlightbackground='#eef',
highlightthickness=1, relief='groove', border=2,
width=300, height=280)
_BUTTON_PARAMS = dict(
background='#777', activebackground='#777',
highlightbackground='#777')
_HELPTAB_BG_COLOR = '#aba'
_HELPTAB_FG_COLOR = '#efe'
_HELPTAB_FG_PARAMS = dict(background='#efe')
_HELPTAB_BG_PARAMS = dict(background='#aba')
_HELPTAB_SPACER = 6
def normalize_grammar(self, grammar):
    """
    Return a canonical form of a grammar string, used to decide
    whether two grammars are equivalent and as the text from which
    rules are parsed.

    @param grammar: The grammar text, as typed by the user.
    @return: The grammar with comments removed, runs of spaces
        collapsed, blank lines and leading indentation dropped,
        surrounding whitespace stripped, and every unescaped C{$}
        backslash-escaped.
    """
    # Strip comments: an unescaped '#' and everything after it on
    # the same line ('\#' is kept as part of the rule).
    grammar = re.sub(r'((\\.|[^#])*)(#.*)?', r'\1', grammar)
    # Collapse runs of spaces.
    grammar = re.sub(' +', ' ', grammar)
    # Drop blank lines and any whitespace that starts a line.
    grammar = re.sub(r'\n\s+', '\n', grammar)
    grammar = grammar.strip()
    # Escape every '$' that is not already escaped.  A lookbehind is
    # used (rather than consuming the preceding character) so that a
    # '$' at the very start of the grammar and the second of two
    # adjacent '$' characters are escaped as well.
    grammar = re.sub(r'(?<!\\)\$', r'\\$', grammar)
    return grammar
def __init__(self, devset_name='conll2000', devset=None,
             grammar = '', chunk_node='NP', tagset=None):
    """
    @param devset_name: The name of the development set; used for
        display & for save files.  If either the name 'treebank'
        or the name 'conll2000' is used, and devset is None, then
        devset will be set automatically.
    @param devset: A list of chunked sentences
    @param grammar: The initial grammar to display.
    @param chunk_node: The node label of the chunks to evaluate;
        it is passed to the chunk scorer and compared against tree
        node labels when chunks are extracted.
    @param tagset: Dictionary from tags to string descriptions, used
        for the help page.  Defaults to C{self.TAGSET}.
    @raise ValueError: If devset is None and devset_name is neither
        'conll2000' nor 'treebank'.
    """
    self._chunk_node = chunk_node

    if tagset is None: tagset = self.TAGSET
    self.tagset = tagset

    # Named development sets are loaded on demand.
    if devset is None:
        if devset_name == 'conll2000':
            devset = nltk.corpus.conll2000.chunked_sents('train.txt')
        # This branch must test the *name*: devset itself is always
        # None here, so comparing it with 'treebank' could never match.
        elif devset_name == 'treebank':
            devset = nltk.corpus.treebank_chunk.chunked_sents()
        else:
            raise ValueError('Unknown development set %s' % devset_name)

    self.chunker = None
    """The chunker built from the grammar string"""

    self.grammar = grammar
    """The unparsed grammar string"""

    self.normalized_grammar = None
    """A normalized version of L{self.grammar}."""

    self.grammar_changed = 0
    """The last time() that the grammar was changed."""

    self.devset = devset
    """The development set -- a list of chunked sentences."""

    self.devset_name = devset_name
    """The name of the development set (for save files)."""

    self.devset_index = -1
    """The index into the development set of the first instance
       that's currently being viewed."""

    self._last_keypress = 0
    """The time() when a key was most recently pressed"""

    self._history = []
    """A list of (grammar, precision, recall, fscore) tuples for
       grammars that the user has already tried."""

    self._history_index = 0
    """When the user is scrolling through previous grammars, this
       is used to keep track of which grammar they're looking at."""

    self._eval_grammar = None
    """The grammar that is being currently evaluated by the eval
       demon."""

    self._eval_normalized_grammar = None
    """A normalized copy of L{_eval_grammar}."""

    self._eval_index = 0
    """The index of the next sentence in the development set that
       should be looked at by the eval demon."""

    self._eval_score = nltk.chunk.ChunkScore(chunk_node=chunk_node)
    """The L{ChunkScore <nltk.chunk.ChunkScore>} object that's used
       to keep track of the score of the current grammar on the
       development set."""

    # Set up the main window.
    top = self.top = Tk()
    top.geometry('+50+50')
    top.title('Regexp Chunk Parser Demo')
    top.bind('<Control-q>', self.destroy)

    # Variable that restricts how much of the devset we look at.
    self._devset_size = IntVar(top)
    self._devset_size.set(100)

    # Build the widgets; the bindings refer to widgets created by
    # _init_widgets, so the order of these calls matters.
    self._init_fonts(top)
    self._init_widgets(top)
    self._init_bindings(top)
    self._init_menubar(top)
    self.grammarbox.focus()

    # Show the initial grammar, with the cursor at its start.
    if grammar:
        self.grammarbox.insert('end', grammar+'\n')
        self.grammarbox.mark_set('insert', '1.0')

    # Display the first development-set sentence, then parse the
    # initial grammar.
    self.show_devset(0)
    self.update()
def _init_bindings(self, top):
top.bind('<Control-n>', self._devset_next)
top.bind('<Control-p>', self._devset_prev)
top.bind('<Control-t>', self.toggle_show_trace)
top.bind('<KeyPress>', self.update)
top.bind('<Control-s>', lambda e: self.save_grammar())
top.bind('<Control-o>', lambda e: self.load_grammar())
self.grammarbox.bind('<Control-t>', self.toggle_show_trace)
self.grammarbox.bind('<Control-n>', self._devset_next)
self.grammarbox.bind('<Control-p>', self._devset_prev)
self.evalbox.bind('<Configure>', self._eval_plot)
def _init_fonts(self, top):
    """
    Create the size variable and the two fonts used by the widgets.

    @param top: The top-level window that owns the size variable.
    """
    # Font size; negated below before being handed to tkFont.
    size_var = IntVar(top)
    size_var.set(20)
    self._size = size_var

    self._font = tkFont.Font(family='helvetica',
                             size=-size_var.get())
    # The small font is 14/20 of the main font size.
    self._smallfont = tkFont.Font(family='helvetica',
                                  size=-(size_var.get()*14/20))
def _init_menubar(self, parent):
    """
    Build the File, View, Development-Set and Help menus and attach
    the menu bar to the given window.

    @param parent: The window that receives the menu bar.
    """
    menubar = Menu(parent)

    # File menu.
    file_menu = Menu(menubar, tearoff=0)
    file_menu.add_command(label='Reset Demo', underline=0,
                          command=self.reset)
    file_menu.add_command(label='Save Current Grammar', underline=0,
                          accelerator='Ctrl-s',
                          command=self.save_grammar)
    file_menu.add_command(label='Load Grammar', underline=0,
                          accelerator='Ctrl-o',
                          command=self.load_grammar)
    file_menu.add_command(label='Save Grammar History', underline=13,
                          command=self.save_history)
    file_menu.add_command(label='Exit', underline=1,
                          command=self.destroy, accelerator='Ctrl-q')
    menubar.add_cascade(label='File', underline=0, menu=file_menu)

    # View menu: one radio button per font size.
    view_menu = Menu(menubar, tearoff=0)
    font_sizes = (('Tiny', 10), ('Small', 16), ('Medium', 20),
                  ('Large', 24), ('Huge', 34))
    for size_label, pixels in font_sizes:
        view_menu.add_radiobutton(label=size_label, variable=self._size,
                                  underline=0, value=pixels,
                                  command=self.resize)
    menubar.add_cascade(label='View', underline=0, menu=view_menu)

    # Development-set menu: how many sentences are used.
    devset_menu = Menu(menubar, tearoff=0)
    for sentence_count in (50, 100, 200, 500):
        devset_menu.add_radiobutton(label='%d sentences' % sentence_count,
                                    variable=self._devset_size,
                                    value=sentence_count,
                                    command=self.set_devset_size)
    menubar.add_cascade(label='Development-Set', underline=0,
                        menu=devset_menu)

    # Help menu.
    help_menu = Menu(menubar, tearoff=0)
    help_menu.add_command(label='About', underline=0,
                          command=self.about)
    menubar.add_cascade(label='Help', underline=0, menu=help_menu)

    parent.config(menu=menubar)
def toggle_show_trace(self, *e):
if self._showing_trace:
self.show_devset()
else:
self.show_trace()
return 'break'
# Bounds how many of the most recent history entries _eval_plot inspects
# when choosing the axis ranges in 'Zoom' (autoscale) mode.
_SCALE_N = 5
# NOTE(review): not referenced anywhere in the visible part of this file;
# _eval_plot reads self._eval_lines instead -- confirm whether it is used.
_DRAW_LINES = False
def _eval_plot(self, *e, **config):
    """
    Redraw the precision/recall plot on the evaluation canvas, with
    one point per grammar in the history.  For the history entry at
    C{self._history_index} the status bar is also updated with its
    precision, recall and f-score.

    @param e: Ignored; allows use as a Tk event handler.
    @param config: Optional C{width} and C{height} overrides; the
        canvas's current size is used for whichever is missing.
    """
    width = config.get('width', self.evalbox.winfo_width())
    height = config.get('height', self.evalbox.winfo_height())

    # Clear the canvas
    self.evalbox.delete('all')

    # Draw the axis labels; their bounding boxes determine where the
    # plot area starts (left) and ends (bot).
    tag = self.evalbox.create_text(10, height/2-10, justify='left',
                             anchor='w', text='Precision')
    left, right = self.evalbox.bbox(tag)[2] + 5, width-10
    tag = self.evalbox.create_text(left + (width-left)/2, height-10,
                            anchor='s', text='Recall', justify='center')
    top, bot = 10, self.evalbox.bbox(tag)[1]-10

    # Background-coloured rectangles covering the margins to the left
    # of and below the plot area.  Items lowered later in this method
    # end up beneath them -- presumably so that points falling outside
    # the axes are hidden.
    bg = self._EVALBOX_PARAMS['background']
    self.evalbox.lower(self.evalbox.create_rectangle(0, 0, left-1, 5000,
                                                     fill=bg, outline=bg))
    self.evalbox.lower(self.evalbox.create_rectangle(0, bot+1, 5000, 5000,
                                                     fill=bg, outline=bg))

    # Choose the axis ranges: in 'Zoom' mode, the range spanned by
    # recent history entries padded by .01 and clipped to [0, 1];
    # otherwise the full [0, 1] range.
    if self._autoscale.get() and len(self._history) > 1:
        max_precision = max_recall = 0
        min_precision = min_recall = 1
        # NOTE(review): when the history holds fewer than _SCALE_N+1
        # entries this range stops one short, so the oldest entry is
        # not considered -- confirm whether that is intended.
        for i in range(1, min(len(self._history), self._SCALE_N+1)):
            grammar, precision, recall, fmeasure = self._history[-i]
            min_precision = min(precision, min_precision)
            min_recall = min(recall, min_recall)
            max_precision = max(precision, max_precision)
            max_recall = max(recall, max_recall)
        min_precision = max(min_precision-.01, 0)
        min_recall = max(min_recall-.01, 0)
        max_precision = min(max_precision+.01, 1)
        max_recall = min(max_recall+.01, 1)
    else:
        min_precision = min_recall = 0
        max_precision = max_recall = 1

    # Draw grid lines at every 10% that falls strictly inside the
    # plot area.
    for i in range(11):
        x = left + (right-left)*((i/10.-min_recall)/
                            (max_recall-min_recall))
        y = bot - (bot-top)*((i/10.-min_precision)/
                            (max_precision-min_precision))
        if left < x < right:
            self.evalbox.create_line(x, top, x, bot, fill='#888')
        if top < y < bot:
            self.evalbox.create_line(left, y, right, y, fill='#888')

    # Draw the two axes.
    self.evalbox.create_line(left, top, left, bot)
    self.evalbox.create_line(left, bot, right, bot)

    # Label the ends of both axes with the range being displayed.
    self.evalbox.create_text(
        left-3, bot, justify='right', anchor='se',
        text='%d%%' % (100*min_precision))
    self.evalbox.create_text(
        left-3, top, justify='right', anchor='ne',
        text='%d%%' % (100*max_precision))
    self.evalbox.create_text(
        left, bot+3, justify='center', anchor='nw',
        text='%d%%' % (100*min_recall))
    self.evalbox.create_text(
        right, bot+3, justify='center', anchor='ne',
        text='%d%%' % (100*max_recall))

    # Plot one point per history entry.
    prev_x = prev_y = None
    for i, (_, precision, recall, fscore) in enumerate(self._history):
        x = left + (right-left) * ((recall-min_recall) /
                            (max_recall-min_recall))
        y = bot - (bot-top) * ((precision-min_precision) /
                            (max_precision-min_precision))
        if i == self._history_index:
            # The selected grammar: bright point, plus status text.
            self.evalbox.create_oval(x-2,y-2,x+2,y+2,
                                     fill='#0f0', outline='#000')
            self.status['text'] = (
                'Precision: %.2f%%\t' % (precision*100)+
                'Recall: %.2f%%\t' % (recall*100)+
                'F-score: %.2f%%' % (fscore*100))
        else:
            # Other grammars: pale point, lowered in the stacking order.
            self.evalbox.lower(
                self.evalbox.create_oval(x-2,y-2,x+2,y+2,
                                         fill='#afa', outline='#8c8'))
        # With 'Lines' checked, connect consecutive history points.
        if prev_x is not None and self._eval_lines.get():
            self.evalbox.lower(
                self.evalbox.create_line(prev_x, prev_y, x, y,
                                         fill='#8c8'))
        prev_x, prev_y = x, y
# Set to True by _eval_demon when it reschedules itself and to False when it
# stops; update() starts a new demon only while this is False.
_eval_demon_running = False
def _eval_demon(self):
    """
    Score the current grammar against the development set, a batch
    of C{_EVAL_CHUNK} sentences at a time, rescheduling itself with
    C{self.top.after} until the first C{self._devset_size} sentences
    have been scored.  The final precision, recall and f-measure are
    appended to the history and the evaluation plot is redrawn.
    """
    # Nothing to do once the window is gone or while there is no
    # valid chunker.
    if self.top is None: return
    if self.chunker is None:
        self._eval_demon_running = False
        return

    # Note our starting time.
    t0 = time.time()

    # If a key was pressed less than _EVAL_DELAY seconds ago and the
    # grammar differs from the one being evaluated, try again later.
    if (time.time()-self._last_keypress < self._EVAL_DELAY and
        self.normalized_grammar != self._eval_normalized_grammar):
        self._eval_demon_running = True
        return self.top.after(int(self._EVAL_FREQ*1000), self._eval_demon)

    # If the grammar changed, restart the evaluation.
    if self.normalized_grammar != self._eval_normalized_grammar:
        # If an equivalent grammar is already in the history, reuse
        # its scores (appending a copy) instead of re-evaluating.
        for (g, p, r, f) in self._history:
            if self.normalized_grammar == self.normalize_grammar(g):
                self._history.append( (g, p, r, f) )
                self._history_index = len(self._history) - 1
                self._eval_plot()
                self._eval_demon_running = False
                self._eval_normalized_grammar = None
                return
        self._eval_index = 0
        self._eval_score = nltk.chunk.ChunkScore(chunk_node=
                                                 self._chunk_node)
        self._eval_grammar = self.grammar
        self._eval_normalized_grammar = self.normalized_grammar

    # An empty grammar is neither evaluated nor recorded in the
    # history.
    if self.normalized_grammar.strip() == '':
        self._eval_demon_running = False
        return

    # Score the next batch of sentences, never going past the
    # configured development-set size.
    for gold in self.devset[self._eval_index:
                            min(self._eval_index+self._EVAL_CHUNK,
                                self._devset_size.get())]:
        guess = self._chunkparse(gold.leaves())
        self._eval_score.score(gold, guess)

    # Advance our position in the development set.
    self._eval_index += self._EVAL_CHUNK

    # Check whether we are done.
    if self._eval_index >= self._devset_size.get():
        self._history.append( (self._eval_grammar,
                               self._eval_score.precision(),
                               self._eval_score.recall(),
                               self._eval_score.f_measure()) )
        self._history_index = len(self._history)-1
        self._eval_plot()
        self._eval_demon_running = False
        # Forget the evaluated grammar, so that the next call treats
        # the current grammar as changed.
        self._eval_normalized_grammar = None
    else:
        # Report progress, tune the batch size from how long this
        # run took, and schedule the next run.
        progress = 100*self._eval_index/self._devset_size.get()
        self.status['text'] = ('Evaluating on Development Set (%d%%)' %
                               progress)
        self._eval_demon_running = True
        self._adaptively_modify_eval_chunk(time.time() - t0)
        self.top.after(int(self._EVAL_FREQ*1000), self._eval_demon)
def _adaptively_modify_eval_chunk(self, t):
"""
Modify _EVAL_CHUNK to try to keep the amount of time that the
eval demon takes between _EVAL_DEMON_MIN and _EVAL_DEMON_MAX.
@param t: The amount of time that the eval demon took.
"""
if t > self._EVAL_DEMON_MAX and self._EVAL_CHUNK > 5:
self._EVAL_CHUNK = min(self._EVAL_CHUNK-1,
max(int(self._EVAL_CHUNK*(self._EVAL_DEMON_MAX/t)),
self._EVAL_CHUNK-10))
elif t < self._EVAL_DEMON_MIN:
self._EVAL_CHUNK = max(self._EVAL_CHUNK+1,
min(int(self._EVAL_CHUNK*(self._EVAL_DEMON_MIN/t)),
self._EVAL_CHUNK+10))
def _init_widgets(self, top):
    """
    Create and lay out all widgets of the main window: the grammar
    editor, the help pane with its tabs, the development-set box, the
    evaluation canvas, the button rows and the status bar.

    @param top: The top-level window that contains the widgets.
    """
    # Everything lives in one frame, laid out on a grid.
    frame0 = Frame(top, **self._FRAME_PARAMS)
    frame0.grid_columnconfigure(0, weight=4)
    frame0.grid_columnconfigure(3, weight=2)
    frame0.grid_rowconfigure(1, weight=1)
    frame0.grid_rowconfigure(5, weight=1)

    # The grammar editor, its label and its scrollbar.
    self.grammarbox = Text(frame0, font=self._font,
                           **self._GRAMMARBOX_PARAMS)
    self.grammarlabel = Label(frame0, font=self._font, text='Grammar:',
                  highlightcolor='black',
                  background=self._GRAMMARBOX_PARAMS['background'])
    self.grammarlabel.grid(column=0, row=0, sticky='SW')
    self.grammarbox.grid(column=0, row=1, sticky='NEWS')

    grammar_scrollbar = Scrollbar(frame0, command=self.grammarbox.yview)
    grammar_scrollbar.grid(column=1, row=1, sticky='NWS')
    self.grammarbox.config(yscrollcommand=grammar_scrollbar.set)

    # Buttons for stepping through the grammar history.
    bg = self._FRAME_PARAMS['background']
    frame3 = Frame(frame0, background=bg)
    frame3.grid(column=0, row=2, sticky='EW')
    Button(frame3, text='Prev Grammar', command=self._history_prev,
           **self._BUTTON_PARAMS).pack(side='left')
    Button(frame3, text='Next Grammar', command=self._history_next,
           **self._BUTTON_PARAMS).pack(side='left')

    # The help pane, with one clickable tab label per HELP entry.
    self.helpbox = Text(frame0, font=self._smallfont,
                        **self._HELPBOX_PARAMS)
    self.helpbox.grid(column=3, row=1, sticky='NEWS')
    self.helptabs = {}
    bg = self._FRAME_PARAMS['background']
    helptab_frame = Frame(frame0, background=bg)
    helptab_frame.grid(column=3, row=0, sticky='SW')
    for i, (tab, tabstops, text) in enumerate(self.HELP):
        label = Label(helptab_frame, text=tab, font=self._smallfont)
        label.grid(column=i*2, row=0, sticky='S')
        # 'tab=tab' binds the current tab name at definition time.
        label.bind('<ButtonPress>', lambda e, tab=tab: self.show_help(tab))
        self.helptabs[tab] = label
        Frame(helptab_frame, height=1, width=self._HELPTAB_SPACER,
              background=bg).grid(column=i*2+1, row=0)
    self.helptabs[self.HELP[0][0]].configure(font=self._font)
    # Text tags used by show_help: 'elide' hides the markup itself,
    # the 'tag-*' tags style the text each markup pair encloses.
    self.helpbox.tag_config('elide', elide=True)
    for (tag, params) in self.HELP_AUTOTAG:
        self.helpbox.tag_config('tag-%s' % tag, **params)
    self.show_help(self.HELP[0][0])

    help_scrollbar = Scrollbar(frame0, command=self.helpbox.yview)
    self.helpbox.config(yscrollcommand=help_scrollbar.set)
    help_scrollbar.grid(column=4, row=1, sticky='NWS')

    # The development-set box and its label.
    frame4 = Frame(frame0, background=self._FRAME_PARAMS['background'])
    self.devsetbox = Text(frame4, font=self._font,
                          **self._DEVSETBOX_PARAMS)
    self.devsetbox.pack(expand=True, fill='both')
    self.devsetlabel = Label(frame0, font=self._font,
                  text='Development Set:', justify='right',
                  background=self._DEVSETBOX_PARAMS['background'])
    self.devsetlabel.grid(column=0, row=4, sticky='SW')
    frame4.grid(column=0, row=5, sticky='NEWS')

    # The vertical scrollbar is routed through _devset_scroll rather
    # than to the Text widget's own yview.
    self.devset_scroll = Scrollbar(frame0, command=self._devset_scroll)
    self.devset_scroll.grid(column=1, row=5, sticky='NWS')
    self.devset_xscroll = Scrollbar(frame4, command=self.devsetbox.xview,
                                    orient='horiz')
    self.devsetbox['xscrollcommand'] = self.devset_xscroll.set
    self.devset_xscroll.pack(side='bottom', fill='x')

    # Buttons below the development-set box.
    bg = self._FRAME_PARAMS['background']
    frame1 = Frame(frame0, background=bg)
    frame1.grid(column=0, row=7, sticky='EW')
    Button(frame1, text='Prev Example (Ctrl-p)',
           command=self._devset_prev,
           **self._BUTTON_PARAMS).pack(side='left')
    Button(frame1, text='Next Example (Ctrl-n)',
           command=self._devset_next,
           **self._BUTTON_PARAMS).pack(side='left')
    self.devset_button = Button(frame1, text='Show example',
                               command=self.show_devset,
                                state='disabled',
                               **self._BUTTON_PARAMS)
    self.devset_button.pack(side='right')
    self.trace_button = Button(frame1, text='Show trace',
                               command=self.show_trace,
                               **self._BUTTON_PARAMS)
    self.trace_button.pack(side='right')

    # The evaluation canvas and its label.
    self.evalbox = Canvas(frame0, **self._EVALBOX_PARAMS)
    label = Label(frame0, font=self._font, text='Evaluation:',
          justify='right', background=self._EVALBOX_PARAMS['background'])
    label.grid(column=3, row=4, sticky='SW')
    self.evalbox.grid(column=3, row=5, sticky='NEWS', columnspan=2)

    # Controls below the evaluation canvas.
    bg = self._FRAME_PARAMS['background']
    frame2 = Frame(frame0, background=bg)
    frame2.grid(column=3, row=7, sticky='EW')
    self._autoscale = IntVar(self.top)
    self._autoscale.set(False)
    Checkbutton(frame2, variable=self._autoscale, command=self._eval_plot,
                text='Zoom', **self._BUTTON_PARAMS).pack(side='left')
    self._eval_lines = IntVar(self.top)
    self._eval_lines.set(False)
    Checkbutton(frame2, variable=self._eval_lines, command=self._eval_plot,
                text='Lines', **self._BUTTON_PARAMS).pack(side='left')
    # NOTE(review): this button is created without a command, so
    # pressing it does nothing -- confirm whether that is intended.
    Button(frame2, text='History',
           **self._BUTTON_PARAMS).pack(side='right')

    # The status bar, spanning the bottom of the window.
    self.status = Label(frame0, font=self._font, **self._STATUS_PARAMS)
    self.status.grid(column=0, row=9, sticky='NEW', padx=3, pady=2,
                     columnspan=5)

    # The help and development-set boxes are read-only for the user.
    self.helpbox['state'] = 'disabled'
    self.devsetbox['state'] = 'disabled'

    # Fixed-size empty frames that act as spacers in the grid.
    bg = self._FRAME_PARAMS['background']
    Frame(frame0, height=10, width=0, background=bg).grid(column=0, row=3)
    Frame(frame0, height=0, width=10, background=bg).grid(column=2, row=0)
    Frame(frame0, height=6, width=0, background=bg).grid(column=0, row=8)

    frame0.pack(fill='both', expand=True)

    # Text tags for the development-set box.
    self.devsetbox.tag_config('true-pos', background='#afa',
                              underline='True')
    self.devsetbox.tag_config('false-neg', underline='True',
                            foreground='#800')
    self.devsetbox.tag_config('false-pos', background='#faa')
    self.devsetbox.tag_config('trace', foreground='#666', wrap='none')
    self.devsetbox.tag_config('wrapindent', lmargin2=30, wrap='none')
    self.devsetbox.tag_config('error', foreground='#800')

    # Text tags for the grammar editor (syntax highlighting and
    # error marking).
    self.grammarbox.tag_config('error', background='#fec')
    self.grammarbox.tag_config('comment', foreground='#840')
    self.grammarbox.tag_config('angle', foreground='#00f')
    self.grammarbox.tag_config('brace', foreground='#0a0')
    self.grammarbox.tag_config('hangindent', lmargin1=0, lmargin2=40)
# True while the development-set box shows the rule trace: set by
# show_trace() and cleared by show_devset().
_showing_trace = False
def show_trace(self, *e):
    """
    Fill the development-set box with a trace of the grammar on the
    current sentence: the tag sequence is shown once before any rule
    is applied and once more after each rule, with the chunks found
    by the rules applied so far coloured by whether they agree with
    the gold standard.

    @param e: Ignored; allows use as a Tk event handler.
    """
    self._showing_trace = True
    self.trace_button['state'] = 'disabled'
    self.devset_button['state'] = 'normal'

    # Clear the text box and update the label.
    self.devsetbox['state'] = 'normal'
    self.devsetbox.delete('1.0', 'end')
    self.devsetlabel['text']='Development Set (%d/%d)' % (
        (self.devset_index+1, self._devset_size.get()))

    if self.chunker is None:
        self.devsetbox.insert('1.0', 'Trace: waiting for a valid grammar.')
        self.devsetbox.tag_add('error', '1.0', 'end')
        return  # can't do anything more

    gold_tree = self.devset[self.devset_index]
    rules = self.chunker.rules()

    # Build the tag sequence, recording the column at which each
    # word's tag starts (the line begins with a tab, hence the 1).
    tagseq = '\t'
    charnum = [1]
    for wordnum, (word, pos) in enumerate(gold_tree.leaves()):
        tagseq += '%s ' % pos
        charnum.append(len(tagseq))
    # Position tables consulted when chunks are coloured: trace step i
    # is treated as "sentence" i, shown on every second line.
    self.charnum = dict(((i, j), charnum[j])
                        for i in range(len(rules)+1)
                        for j in range(len(charnum)))
    self.linenum = dict((i,i*2+2) for i in range(len(rules)+1))

    # The gold-standard chunks are the same at every step.
    gold_chunks = self._chunks(gold_tree)

    # Each step must be parsed with only the first i rules.  The
    # original code built that partial chunker but never used it, so
    # every step displayed the result of the complete grammar.
    # _chunkparse takes only the words -- presumably it parses with
    # self.chunker -- so the partial chunker is installed temporarily
    # and the full one is always restored afterwards.
    full_chunker = self.chunker
    try:
        for i in range(len(rules)+1):
            if i == 0:
                self.devsetbox.insert('end', 'Start:\n')
            else:
                self.devsetbox.insert('end', 'Apply %s:\n' % rules[i-1])
            self.devsetbox.tag_add('trace', 'end -2c linestart',
                                   'end -2c')
            # Display the tag sequence.
            self.devsetbox.insert('end', tagseq+'\n')
            self.devsetbox.tag_add('wrapindent','end -2c linestart',
                                   'end -2c')
            # Run the partial parser and extract its chunks.
            self.chunker = nltk.RegexpChunkParser(rules[:i])
            test_tree = self._chunkparse(gold_tree.leaves())
            test_chunks = self._chunks(test_tree)
            # Compare them with the gold standard.
            for chunk in gold_chunks.intersection(test_chunks):
                self._color_chunk(i, chunk, 'true-pos')
            for chunk in gold_chunks - test_chunks:
                self._color_chunk(i, chunk, 'false-neg')
            for chunk in test_chunks - gold_chunks:
                self._color_chunk(i, chunk, 'false-pos')
    finally:
        self.chunker = full_chunker

    self.devsetbox.insert('end', 'Finished.\n')
    self.devsetbox.tag_add('trace', 'end -2c linestart', 'end -2c')

    # Reset the horizontal scrollbar's position shortly after the
    # trace has been drawn.
    self.top.after(100, self.devset_xscroll.set, 0, .3)
def show_help(self, tab):
self.helpbox['state'] = 'normal'
self.helpbox.delete('1.0', 'end')
for (name, tabstops, text) in self.HELP:
if name == tab:
text = text.replace('<<TAGSET>>', '\n'.join(
('\t%s\t%s' % item for item in sorted(self.tagset.items(),
key=lambda (t,w):re.match('\w+',t) and (0,t) or (1,t)))))
self.helptabs[name].config(**self._HELPTAB_FG_PARAMS)
self.helpbox.config(tabs=tabstops)
self.helpbox.insert('1.0', text+'\n'*20)
C = '1.0 + %d chars'
for (tag, params) in self.HELP_AUTOTAG:
pattern = '(?s)(<%s>)(.*?)(</%s>)' % (tag, tag)
for m in re.finditer(pattern, text):
self.helpbox.tag_add('elide',
C % m.start(1), C % m.end(1))
self.helpbox.tag_add('tag-%s' % tag,
C % m.start(2), C % m.end(2))
self.helpbox.tag_add('elide',
C % m.start(3), C % m.end(3))
else:
self.helptabs[name].config(**self._HELPTAB_BG_PARAMS)
self.helpbox['state'] = 'disabled'
def _history_prev(self, *e):
self._view_history(self._history_index-1)
return 'break'
def _history_next(self, *e):
self._view_history(self._history_index+1)
return 'break'
def _view_history(self, index):
    """
    Load the grammar stored at the given history position into the
    editor, and refresh the plot, the development-set display and
    the grammar label to match.

    @param index: The requested history position; it is clamped to
        the valid range.
    """
    history = self._history
    index = max(0, min(len(history)-1, index))
    # Nothing to show for an empty history, or if the requested entry
    # is already the one on display.
    if not history:
        return
    if index == self._history_index:
        return

    # Put the stored grammar into the editor, cursor at the start.
    stored_grammar = history[index][0]
    self.grammarbox['state'] = 'normal'
    self.grammarbox.delete('1.0', 'end')
    self.grammarbox.insert('end', stored_grammar)
    self.grammarbox.mark_set('insert', '1.0')
    self._history_index = index
    self._syntax_highlight_grammar(stored_grammar)

    # Rebuild the chunker from the stored grammar.
    self.normalized_grammar = self.normalize_grammar(stored_grammar)
    rules = []
    if self.normalized_grammar:
        for line in self.normalized_grammar.split('\n'):
            rules.append(nltk.chunk.regexp.RegexpChunkRule.parse(line))
    self.chunker = nltk.RegexpChunkParser(rules)

    # Refresh everything that depends on the grammar.
    self._eval_plot()
    self._highlight_devset()
    if self._showing_trace:
        self.show_trace()

    # The label shows the history position, except for the newest
    # entry.
    total = len(self._history)
    if self._history_index < total-1:
        self.grammarlabel['text'] = 'Grammar %s/%s:' % (
            self._history_index+1, total)
    else:
        self.grammarlabel['text'] = 'Grammar:'
def _devset_next(self, *e):
self._devset_scroll('scroll', 1, 'page')
return 'break'
def _devset_prev(self, *e):
self._devset_scroll('scroll', -1, 'page')
return 'break'
def destroy(self, *e):
if self.top is None: return
self.top.destroy()
self.top = None
def _devset_scroll(self, command, *args):
N = 1
showing_trace = self._showing_trace
if command == 'scroll' and args[1].startswith('unit'):
self.show_devset(self.devset_index+int(args[0]))
elif command == 'scroll' and args[1].startswith('page'):
self.show_devset(self.devset_index+N*int(args[0]))
elif command == 'moveto':
self.show_devset(int(float(args[0])*self._devset_size.get()))
else:
assert 0, 'bad scroll command %s %s' % (command, args)
if showing_trace:
self.show_trace()
def show_devset(self, index=None):
    """
    Display one development-set sentence (as word/tag pairs) in the
    development-set box, highlight its chunks if a chunker is
    available, and update the label and the vertical scrollbar.

    @param index: The index of the sentence to show; it is clamped
        to the configured development-set size.  Defaults to the
        sentence that is currently shown.
    """
    if index is None: index = self.devset_index

    # Bounds checking.
    # NOTE(review): the bound is the configured size rather than
    # len(self.devset); presumably the development set is assumed to
    # hold at least that many sentences -- confirm.
    index = min(max(0, index), self._devset_size.get()-1)

    # Nothing to redraw if this sentence is already displayed in the
    # plain (non-trace) view.
    if index == self.devset_index and not self._showing_trace: return
    self.devset_index = index

    # Leave trace mode and update the two view buttons.
    self._showing_trace = False
    self.trace_button['state'] = 'normal'
    self.devset_button['state'] = 'disabled'

    # Clear the text box and update the label.
    self.devsetbox['state'] = 'normal'
    self.devsetbox['wrap'] = 'word'
    self.devsetbox.delete('1.0', 'end')
    self.devsetlabel['text']='Development Set (%d/%d)' % (
        (self.devset_index+1, self._devset_size.get()))

    # Add the sentence, recording for each word the character offset
    # at which it starts (and, after the last word, where the line
    # ends).
    sample = self.devset[self.devset_index:self.devset_index+1]
    self.charnum = {}
    self.linenum = {0:1}
    for sentnum, sent in enumerate(sample):
        linestr = ''
        for wordnum, (word, pos) in enumerate(sent.leaves()):
            self.charnum[sentnum, wordnum] = len(linestr)
            linestr += '%s/%s ' % (word, pos)
            self.charnum[sentnum, wordnum+1] = len(linestr)
        # Drop the trailing space before inserting the line.
        self.devsetbox.insert('end', linestr[:-1]+'\n\n')

    # Highlight chunks in the development set.
    if self.chunker is not None:
        self._highlight_devset()
    self.devsetbox['state'] = 'disabled'

    # Update the scrollbar.
    first = float(self.devset_index)/self._devset_size.get()
    last = float(self.devset_index+2)/self._devset_size.get()
    self.devset_scroll.set(first, last)
def _chunks(self, tree):
    """
    Collect the word spans of the chunks in a chunked sentence.

    @param tree: A chunked sentence, whose children are either
        subtrees or individual leaves.
    @return: A set of C{(start, end)} word-index pairs, one for each
        child subtree whose node label is C{self._chunk_node}.
    """
    spans = set()
    start = 0
    for child in tree:
        if not isinstance(child, nltk.Tree):
            # A bare leaf occupies a single word position.
            start += 1
            continue
        end = start + len(child)
        if child.node == self._chunk_node:
            spans.add( (start, end) )
        # Subtrees with other labels still advance the position.
        start = end
    return spans
def _syntax_highlight_grammar(self, grammar):
if self.top is None: return
self.grammarbox.tag_remove('comment', '1.0', 'end')
self.grammarbox.tag_remove('angle', '1.0', 'end')
self.grammarbox.tag_remove('brace', '1.0', 'end')
self.grammarbox.tag_add('hangindent', '1.0', 'end')
for lineno, line in enumerate(grammar.split('\n')):
if not line.strip(): continue
m = re.match(r'(\\.|[^#])*(#.*)?', line)
comment_start = None
if m.group(2):
comment_start = m.start(2)
s = '%d.%d' % (lineno+1, m.start(2))
e = '%d.%d' % (lineno+1, m.end(2))
self.grammarbox.tag_add('comment', s, e)
for m in re.finditer('[<>{}]', line):
if comment_start is not None and m.start() >= comment_start:
break
s = '%d.%d' % (lineno+1, m.start())
e = '%d.%d' % (lineno+1, m.end())
if m.group() in '<>':
self.grammarbox.tag_add('angle', s, e)
else:
self.grammarbox.tag_add('brace', s, e)
def _grammarcheck(self, grammar):
if self.top is None: return
self.grammarbox.tag_remove('error', '1.0', 'end')
self._grammarcheck_errs = []
for lineno, line in enumerate(grammar.split('\n')):
line = re.sub(r'((\\.|[^#])*)(#.*)?', r'\1', line)
line = line.strip()
if line:
try: nltk.chunk.regexp.RegexpChunkRule.parse(line)
except ValueError, e:
self.grammarbox.tag_add('error', '%s.0' % (lineno+1),
'%s.0 lineend' % (lineno+1))
self.status['text'] = ''
def update(self, *event):
    """
    Synchronize the demo with the current contents of the grammar box.

    The grammar text is re-read and normalized.  If the normalized
    grammar is unchanged, nothing else happens.  Otherwise the
    grammar is re-highlighted and parsed into rules:

      - If any rule fails to parse, the bad lines are flagged with
        L{_grammarcheck} and C{self.chunker} is set to C{None}.
      - Otherwise a new chunker is built, the C{'error'} tag is
        cleared, C{self.grammar_changed} is set to the current
        time, the trace or development-set highlighting is
        refreshed, and the evaluation demon is started if it is
        not already running.

    @param event: Optional callback arguments.  If any are given,
        the current time is recorded in C{self._last_keypress}.
    """
    # Record when update was triggered by an event.
    if event:
        self._last_keypress = time.time()

    # Read the grammar from the text box.
    self.grammar = grammar = self.grammarbox.get('1.0', 'end')

    # If the grammar hasn't effectively changed, there is nothing to do.
    normalized_grammar = self.normalize_grammar(grammar)
    if normalized_grammar == self.normalized_grammar:
        return
    self.normalized_grammar = normalized_grammar

    # The grammar was edited while an older history entry was selected
    # (index is before the last entry); restore the plain label.
    if self._history_index < len(self._history)-1:
        self.grammarlabel['text'] = 'Grammar:'

    self._syntax_highlight_grammar(grammar)

    # Try to compile the new grammar, one rule per normalized line.
    try:
        if normalized_grammar:
            rules = [nltk.chunk.regexp.RegexpChunkRule.parse(line)
                     for line in normalized_grammar.split('\n')]
        else:
            rules = []
    except ValueError:
        # Use the un-normalized grammar so that the highlighted line
        # numbers match what is shown in the grammar box.
        self._grammarcheck(grammar)
        self.chunker = None
        return

    self.chunker = nltk.RegexpChunkParser(rules)
    self.grammarbox.tag_remove('error', '1.0', 'end')
    self.grammar_changed = time.time()

    # Display the results.
    if self._showing_trace:
        self.show_trace()
    else:
        self._highlight_devset()

    # Make sure the evaluation demon is running.
    if not self._eval_demon_running:
        self._eval_demon()
def _highlight_devset(self, sample=None):
    """
    Color the chunks of the displayed sentences according to how the
    current chunker's output compares with the gold standard.

    For each sentence, spans found in both the gold tree and the
    chunker output are tagged C{'true-pos'}, spans found only in the
    gold tree are tagged C{'false-neg'}, and spans found only in the
    chunker output are tagged C{'false-pos'}.  All three tags are
    cleared from the development-set box first.

    @param sample: The gold-standard trees to highlight.  Defaults
        to the one-sentence slice of C{self.devset} starting at
        C{self.devset_index}.
    """
    if sample is None:
        sample = self.devset[self.devset_index:self.devset_index+1]

    for stale_tag in ('true-pos', 'false-neg', 'false-pos'):
        self.devsetbox.tag_remove(stale_tag, '1.0', 'end')

    for sentnum, gold_tree in enumerate(sample):
        # Re-chunk the raw tagged words of the gold sentence.
        test_tree = self._chunkparse(gold_tree.leaves())
        gold = self._chunks(gold_tree)
        guessed = self._chunks(test_tree)

        outcomes = (
            ('true-pos', gold & guessed),
            ('false-neg', gold - guessed),
            ('false-pos', guessed - gold),
        )
        for tag, spans in outcomes:
            for span in spans:
                self._color_chunk(sentnum, span, tag)
def _chunkparse(self, words):
    """
    Run the current chunker over a tagged sentence.

    @param words: The tagged words to chunk.
    @return: The result of C{self.chunker.parse(words)}.  If that
        call raises C{ValueError} or C{IndexError}, the entire
        grammar box is tagged C{'error'} and C{words} is returned
        unchanged instead.
    """
    try:
        return self.chunker.parse(words)
    except (ValueError, IndexError):
        # The exception object is not needed: the failure cannot be
        # attributed to a single rule here, so the whole grammar is
        # flagged.
        self.grammarbox.tag_add('error', '1.0', 'end')
        return words
def _color_chunk(self, sentnum, chunk, tag):
    """
    Apply a text tag to one chunk in the development-set box.

    @param sentnum: Index of the sentence, used as a key into
        C{self.linenum} and C{self.charnum}.
    @param chunk: A C{(start, end)} pair of word indices.
    @param tag: Name of the text tag to add.
    """
    first_word, past_last_word = chunk
    row = self.linenum[sentnum]
    begin = '%s.%s' % (row, self.charnum[sentnum, first_word])
    # Stop one character short of the recorded offset for the end word.
    finish = '%s.%s' % (row, self.charnum[sentnum, past_last_word] - 1)
    self.devsetbox.tag_add(tag, begin, finish)
def reset(self):
    """
    Return the demo to its initial state: no chunker, no grammar, an
    empty history, an empty grammar box, and the development set
    shown from index 0.
    """
    # Forget the compiled chunker and both copies of the grammar text.
    self.chunker = self.grammar = self.normalized_grammar = None
    self.grammar_changed = 0

    # Drop all recorded history.
    self._history, self._history_index = [], 0

    # Clear the editor, then refresh the display from the empty state.
    self.grammarbox.delete('1.0', 'end')
    self.show_devset(0)
    self.update()
# Layout of a saved grammar file: a commented header followed by the
# grammar itself.  Filled in with the keys date, devset, precision,
# recall, fscore and grammar.
SAVE_GRAMMAR_TEMPLATE = ''.join([
    '# Regexp Chunk Parsing Grammar\n',
    '# Saved %(date)s\n',
    '#\n',
    '# Development set: %(devset)s\n',
    '# Precision: %(precision)s\n',
    '# Recall: %(recall)s\n',
    '# F-score: %(fscore)s\n\n',
    '%(grammar)s\n',
])
def save_grammar(self, filename=None):
    """
    Write the current grammar, preceded by a commented header giving
    the date, development set and evaluation scores, to a file.

    The scores are taken from the last history entry when that entry
    matches the current (normalized) grammar.  Otherwise a short
    explanation is written in their place: either the grammar is not
    well formed, or its evaluation has not finished.

    @param filename: Path of the file to write.  If empty or
        C{None}, a save dialog is shown; cancelling the dialog
        aborts the save.
    @type filename: C{string}
    """
    if not filename:
        ftypes = [('Chunk Grammar', '.chunk'),
                  ('All files', '*')]
        filename = asksaveasfilename(filetypes=ftypes,
                                     defaultextension='.chunk')
        if not filename:
            return

    if (self._history and self.normalized_grammar ==
            self.normalize_grammar(self._history[-1][0])):
        # History entries are (grammar, precision, recall, fscore).
        precision, recall, fscore = ['%.2f%%' % (100*v) for v in
                                     self._history[-1][1:]]
    elif self.chunker is None:
        precision = recall = fscore = 'Grammar not well formed'
    else:
        precision = recall = fscore = 'Not finished evaluation yet'

    # Build the full text before opening the file, so that a
    # formatting failure cannot leave a truncated file behind.
    contents = self.SAVE_GRAMMAR_TEMPLATE % dict(
        date=time.ctime(), devset=self.devset_name,
        precision=precision, recall=recall, fscore=fscore,
        grammar=self.grammar.strip())

    out = open(filename, 'w')
    try:
        out.write(contents)
    finally:
        # Always release the file handle, even if the write fails.
        out.close()
def load_grammar(self, filename=None):
    """
    Replace the contents of the grammar box with a grammar read from
    a file.

    If the file starts with the header written by L{save_grammar}
    (everything from C{# Regexp Chunk Parsing Grammar} through the
    C{F-score:} line), that header is removed, along with any
    leading whitespace, before the text is inserted.

    @param filename: Path of the file to read.  If empty or C{None},
        an open dialog is shown; cancelling the dialog aborts the
        load.
    @type filename: C{string}
    """
    if not filename:
        ftypes = [('Chunk Grammar', '.chunk'),
                  ('All files', '*')]
        filename = askopenfilename(filetypes=ftypes,
                                   defaultextension='.chunk')
        if not filename:
            return

    # Read the file before touching the editor, so that a failed read
    # leaves the grammar currently being edited intact.
    infile = open(filename)
    try:
        grammar = infile.read()
    finally:
        infile.close()

    # Strip the saved-file header.  The match is non-greedy so that it
    # stops at the first "F-score:" line and cannot swallow grammar
    # text that happens to mention "F-score:" later on.
    grammar = re.sub(r'^\# Regexp Chunk Parsing Grammar[\s\S]*?'
                     r'F-score:.*\n', '', grammar).lstrip()

    self.grammarbox.delete('1.0', 'end')
    self.update()
    self.grammarbox.insert('1.0', grammar)
    self.update()
def save_history(self, filename=None):
    """
    Write every grammar in the history, with its scores, to a text
    file.

    Each history entry is written as a header line giving its
    position, precision, recall and f-score, followed by the grammar
    with one indented output line per non-blank grammar line.  If
    the current grammar differs from the last history entry (or the
    history is empty), it is appended under a heading saying whether
    it is not well formed or not yet evaluated.

    @param filename: Path of the file to write.  If empty or
        C{None}, a save dialog is shown; cancelling the dialog
        aborts the save.
    @type filename: C{string}
    """
    if not filename:
        ftypes = [('Chunk Grammar History', '.txt'),
                  ('All files', '*')]
        filename = asksaveasfilename(filetypes=ftypes,
                                     defaultextension='.txt')
        if not filename:
            return

    def indented(grammar):
        # Split on line boundaries only: splitting on arbitrary
        # whitespace would break a rule or comment containing spaces
        # into several output lines.  Blank lines are dropped.
        return ''.join([' %s\n' % line
                        for line in grammar.strip().splitlines()
                        if line.strip()])

    out = open(filename, 'w')
    try:
        out.write('# Regexp Chunk Parsing Grammar History\n')
        out.write('# Saved %s\n' % time.ctime())
        out.write('# Development set: %s\n' % self.devset_name)
        for i, (g, p, r, f) in enumerate(self._history):
            hdr = ('Grammar %d/%d (precision=%.2f%%, recall=%.2f%%, '
                   'fscore=%.2f%%)' % (i+1, len(self._history),
                                       p*100, r*100, f*100))
            out.write('\n%s\n' % hdr)
            out.write(indented(g))

        # Append the grammar being edited if it has not been recorded
        # as the last history entry.
        if not (self._history and self.normalized_grammar ==
                self.normalize_grammar(self._history[-1][0])):
            if self.chunker is None:
                out.write('\nCurrent Grammar (not well-formed)\n')
            else:
                out.write('\nCurrent Grammar (not evaluated)\n')
            out.write(indented(self.grammar))
    finally:
        # Always release the file handle, even if a write fails.
        out.close()
def about(self, *e):
    """
    Show the "about" information for the demo.

    A C{tkMessageBox} message dialog is tried first; if importing or
    showing it fails, the text is displayed with C{ShowText} instead.

    @param e: Ignored; accepted so that the method can be used
        directly as a callback.
    """
    ABOUT = ("NLTK RegExp Chunk Parser Demo\n"+
             "Written by Edward Loper")
    TITLE = 'About: Regular Expression Chunk Parser Demo'
    try:
        from tkMessageBox import Message
        Message(message=ABOUT, title=TITLE).show()
    except Exception:
        # Best-effort fallback.  Catching Exception rather than using a
        # bare except keeps KeyboardInterrupt and SystemExit from being
        # swallowed.
        ShowText(self.top, TITLE, ABOUT)
def set_devset_size(self, size=None):
    """
    Change how many development-set sentences are in use, then
    redisplay the development set.

    @param size: The requested size.  If C{None}, the currently
        stored size is kept.  Either way the stored size is capped
        at C{len(self.devset)}.
    """
    size_var = self._devset_size
    if size is not None:
        size_var.set(size)

    # Never ask for more sentences than the development set holds.
    capped = min(len(self.devset), size_var.get())
    size_var.set(capped)

    # Show index 1 and then index 0, as the original code does.
    self.show_devset(1)
    self.show_devset(0)
def resize(self, size=None):
    """
    Change the size of the demo's fonts.

    The main font is set to C{-abs(size)}.  The small font is set to
    14/20 of that value, rounded down, and is never allowed above
    C{-10}.

    @param size: The new size.  If C{None}, the currently stored
        size is re-applied.
    @type size: C{int}
    """
    if size is not None:
        self._size.set(size)
    size = self._size.get()
    self._font.configure(size=-(abs(size)))
    # Explicit floor division keeps the font size an integer whether
    # or not true division is in effect; for ints it gives the same
    # result as the classic "/" operator.
    self._smallfont.configure(size=min(-10, -(abs(size))*14//20))
def mainloop(self, *args, **kwargs):
    """
    Enter the Tkinter mainloop.  This function must be called if
    this demo is created from a non-interactive program (e.g.
    from a script); otherwise, the demo will close as soon as
    the script completes.

    Returns immediately, without entering the mainloop, when
    C{in_idle()} is true.  All arguments are passed through to the
    top-level window's C{mainloop}.
    """
    if not in_idle():
        self.top.mainloop(*args, **kwargs)
def demo():
    """
    Create a L{RegexpChunkDemo} and run its mainloop.
    """
    app = RegexpChunkDemo()
    app.mainloop()
# Launch the demo when this module is executed directly as a script.
if __name__ == '__main__':
    demo()