1
2
3
4
5
6
7
8
9
10 """
11 Visualization tools for CFGs.
12 """
13
14 import re
15
16
17 """
18 Idea for a nice demo:
19 - 3 panes: grammar, treelet, working area
20 - grammar is a list of productions
21 - when you select a production, the treelet that it licenses appears
22 in the treelet area
23 - the working area has the text on the bottom, and S at top. When
24 you select a production, it shows (ghosted) the locations where
25 that production's treelet could be attached to either the text
26 or the tree rooted at S.
27 - the user can drag the treelet onto one of those (or click on them?)
28 - the user can delete pieces of the tree from the working area
29 (right click?)
30 - connecting top to bottom? drag one NP onto another?
31
+-------------------------------------------------------------+
| S -> NP VP   |                 S                            |
|[NP -> Det N ]|                / \                           |
|     ...      |              NP  VP                          |
| N -> 'dog'   |                                              |
| N -> 'cat'   |                                              |
|     ...      |                                              |
+--------------+                                              |
|      NP      |                      Det     N               |
|     /  \     |                       |      |               |
|   Det   N    |  the    cat    saw   the    dog              |
|              |                                              |
+--------------+----------------------------------------------+
45
46 Operations:
47 - connect a new treelet -- drag or click shadow
48 - delete a treelet -- right click
49 - if only connected to top, delete everything below
50 - if only connected to bottom, delete everything above
51 - connect top & bottom -- drag a leaf to a root or a root to a leaf
52 - disconnect top & bottom -- right click
53 - if connected to top & bottom, then disconnect
54 """
55
56 from nltk.draw import *
57 from nltk.cfg import *
58 from Tkinter import *
59 from nltk.tree import *
60 from nltk.draw.tree import *
61
62
63
64
65
66
86
87
88
89
90
# User-facing help text for the CFG editor: describes the production syntax
# the editor accepts and what each of its buttons does.
# NOTE(review): presumably shown by the editor's help command; the code that
# displays it is not visible in this part of the file -- confirm.
_CFGEditor_HELP = """

The CFG Editor can be used to create or modify context free grammars.
A context free grammar consists of a start symbol and a list of
productions. The start symbol is specified by the text entry field in
the upper right hand corner of the editor; and the list of productions
is specified in the main text editing box.

Every non-blank line specifies a single production. Each production
has the form "LHS -> RHS," where LHS is a single nonterminal, and RHS
is a list of nonterminals and terminals.

Nonterminals must be a single word, such as S or NP or NP_subj.
Currently, nonterminals must consist of alphanumeric characters and
underscores (_). Nonterminals are colored blue. If you place the
mouse over any nonterminal, then all occurrences of that nonterminal
will be highlighted.

Terminals must be surrounded by single quotes (') or double
quotes (\"). For example, "dog" and "New York" are terminals.
Currently, the string within the quotes must consist of alphanumeric
characters, underscores, and spaces.

To enter a new production, go to a blank line, and type a nonterminal,
followed by an arrow (->), followed by a sequence of terminals and
nonterminals. Note that "->" (dash + greater-than) is automatically
converted to an arrow symbol. When you move your cursor to a
different line, your production will automatically be colorized. If
there are any errors, they will be highlighted in red.

Note that the order of the productions is significant for some
algorithms. To re-order the productions, use cut and paste to move
them.

Use the buttons at the bottom of the window when you are done editing
the CFG:
- Ok: apply the new CFG, and exit the editor.
- Apply: apply the new CFG, and do not exit the editor.
- Reset: revert to the original CFG, and do not exit the editor.
- Cancel: revert to the original CFG, and exit the editor.

"""
133
135 """
136 A dialog window for creating and editing context free grammars.
137 C{CFGEditor} places the following restrictions on what C{CFG}s can
138 be edited:
139 - All nonterminals must be strings consisting of word
140 characters.
141 - All terminals must be strings consisting of word characters
142 and space characters.
143 """
144
145
# Pattern constants used to validate and tokenize production lines; other
# code in this file reaches them as CFGEditor._LHS_RE, CFGEditor._ARROW_RE,
# CFGEditor._PRODUCTION_RE, CFGEditor._TOKEN_RE and CFGEditor._BOLD.
#
# ARROW is the symbol-font character that stands in for the two-character
# "->"; every pattern below accepts either spelling.  All pattern pieces
# are raw strings so that regex escapes such as \s and \w are never
# interpreted as (invalid) string escapes, and ARROW is passed through
# re.escape() so it is always matched literally, whatever character it is.
ARROW = SymbolWidget.SYMBOLS['rightarrow']
# A left-hand side: one word, optional surrounding blanks, then an arrow.
_LHS_RE = re.compile(r"(^\s*\w+\s*)(->|(" + re.escape(ARROW) + r"))")
# An arrow (either spelling) together with the whitespace around it.
_ARROW_RE = re.compile(r"\s*(->|(" + re.escape(ARROW) + r"))\s*")
# A complete, well-formed production line: LHS, arrow, then any number of
# nonterminals, quoted terminals, or "|" separators.
_PRODUCTION_RE = re.compile(r"(^\s*\w+\s*)" +
                            r"(->|(" + re.escape(ARROW) + r"))\s*" +
                            r"((\w+|'[\w ]*'|\"[\w ]*\"|\|)\s*)*$")
# A single token: a word, an arrow, or a non-empty quoted terminal.
_TOKEN_RE = re.compile(r"\w+|->|'[\w ]+'|\"[\w ]+\"|(" +
                       re.escape(ARROW) + r")")
# Font used for nonterminal tags.
_BOLD = ('helvetica', -12, 'bold')
154
155 - def __init__(self, parent, cfg=None, set_cfg_callback=None):
175
177 frame = self._startframe = Frame(self._top)
178 self._start = Entry(frame)
179 self._start.pack(side='right')
180 Label(frame, text='Start Symbol:').pack(side='right')
181 Label(frame, text='Productions:').pack(side='left')
182 self._start.insert(0, self._cfg.start().symbol())
183
196
198 self._top.title('CFG Editor')
199 self._top.bind('<Control-q>', self._cancel)
200 self._top.bind('<Alt-q>', self._cancel)
201 self._top.bind('<Control-d>', self._cancel)
202
203 self._top.bind('<Alt-x>', self._cancel)
204 self._top.bind('<Escape>', self._cancel)
205
206 self._top.bind('<Alt-c>', self._cancel)
207
208 self._top.bind('<Control-o>', self._ok)
209 self._top.bind('<Alt-o>', self._ok)
210 self._top.bind('<Control-a>', self._apply)
211 self._top.bind('<Alt-a>', self._apply)
212 self._top.bind('<Control-r>', self._reset)
213 self._top.bind('<Alt-r>', self._reset)
214 self._top.bind('<Control-h>', self._help)
215 self._top.bind('<Alt-h>', self._help)
216 self._top.bind('<F1>', self._help)
217
219 self._prodframe = Frame(self._top)
220
221
222 self._textwidget = Text(self._prodframe, background='#e0e0e0',
223 exportselection=1)
224 self._textscroll = Scrollbar(self._prodframe, takefocus=0,
225 orient='vertical')
226 self._textwidget.config(yscrollcommand = self._textscroll.set)
227 self._textscroll.config(command=self._textwidget.yview)
228 self._textscroll.pack(side='right', fill='y')
229 self._textwidget.pack(expand=1, fill='both', side='left')
230
231
232
233 self._textwidget.tag_config('terminal', foreground='#006000')
234 self._textwidget.tag_config('arrow', font='symbol')
235 self._textwidget.tag_config('error', background='red')
236
237
238
239 self._linenum = 0
240
241
242 self._top.bind('>', self._replace_arrows)
243
244
245 self._top.bind('<<Paste>>', self._analyze)
246 self._top.bind('<KeyPress>', self._check_analyze)
247 self._top.bind('<ButtonPress>', self._check_analyze)
248
249
def cycle(e, textwidget=self._textwidget):
    """
    Event handler: hand the keyboard focus to the widget that follows
    C{textwidget} in Tk's focus traversal order.  The event C{e} itself
    is not inspected.
    """
    following = textwidget.tk_focusNext()
    following.focus()
252 self._textwidget.bind('<Tab>', cycle)
253
254 prod_tuples = [(p.lhs(),[p.rhs()]) for p in self._cfg.productions()]
255 for i in range(len(prod_tuples)-1,0,-1):
256 if (prod_tuples[i][0] == prod_tuples[i-1][0]):
257 if () in prod_tuples[i][1]: continue
258 if () in prod_tuples[i-1][1]: continue
259 print prod_tuples[i-1][1]
260 print prod_tuples[i][1]
261 prod_tuples[i-1][1].extend(prod_tuples[i][1])
262 del prod_tuples[i]
263
264 for lhs, rhss in prod_tuples:
265 print lhs, rhss
266 s = '%s ->' % lhs
267 for rhs in rhss:
268 for elt in rhs:
269 if isinstance(elt, Nonterminal): s += ' %s' % elt
270 else: s += ' %r' % elt
271 s += ' |'
272 s = s[:-2] + '\n'
273 self._textwidget.insert('end', s)
274
275 self._analyze()
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
312
314 """
315 Check if we've moved to a new line. If we have, then remove
316 all colorization from the line we moved to, and re-colorize
317 the line that we moved from.
318 """
319 linenum = int(self._textwidget.index('insert').split('.')[0])
320 if linenum != self._linenum:
321 self._clear_tags(linenum)
322 self._analyze_line(self._linenum)
323 self._linenum = linenum
324
326 """
327 Replace any C{'->'} text strings with arrows (char \\256, in
328 symbol font). This searches the whole buffer, but is fast
329 enough to be done anytime they press '>'.
330 """
331 arrow = '1.0'
332 while 1:
333 arrow = self._textwidget.search('->', arrow, 'end+1char')
334 if arrow == '': break
335 self._textwidget.delete(arrow, arrow+'+2char')
336 self._textwidget.insert(arrow, self.ARROW, 'arrow')
337 self._textwidget.insert(arrow, '\t')
338
339 arrow = '1.0'
340 while 1:
341 arrow = self._textwidget.search(self.ARROW, arrow+'+1char',
342 'end+1char')
343 if arrow == '': break
344 self._textwidget.tag_add('arrow', arrow, arrow+'+1char')
345
347 """
348 Given a line number and a regexp match for a token on that
349 line, colorize the token. Note that the regexp match gives us
350 the token's text, start index (on the line), and end index (on
351 the line).
352 """
353
354 if match.group()[0] in "'\"": tag = 'terminal'
355 elif match.group() in ('->', self.ARROW): tag = 'arrow'
356 else:
357
358
359
360 tag = 'nonterminal_'+match.group()
361 if tag not in self._textwidget.tag_names():
362 self._init_nonterminal_tag(tag)
363
364 start = '%d.%d' % (linenum, match.start())
365 end = '%d.%d' % (linenum, match.end())
366 self._textwidget.tag_add(tag, start, end)
367
369 self._textwidget.tag_config(tag, foreground=foreground,
370 font=CFGEditor._BOLD)
371 if not self._highlight_matching_nonterminals:
372 return
def enter(e, textwidget=self._textwidget, tag=tag):
    """
    Event handler: give every stretch of text carrying C{tag} a light
    green background.  The event C{e} itself is not inspected.
    """
    highlighted = {'background': '#80ff80'}
    textwidget.tag_config(tag, **highlighted)
def leave(e, textwidget=self._textwidget, tag=tag):
    """
    Event handler: reset the background of the text carrying C{tag} to
    the empty (default) setting.  The event C{e} itself is not inspected.
    """
    plain = {'background': ''}
    textwidget.tag_config(tag, **plain)
377 self._textwidget.tag_bind(tag, '<Enter>', enter)
378 self._textwidget.tag_bind(tag, '<Leave>', leave)
379
397 CFGEditor._TOKEN_RE.sub(analyze_token, line)
398 elif line.strip() != '':
399
400 self._mark_error(linenum, line)
401
403 """
404 Mark the location of an error in a line.
405 """
406 arrowmatch = CFGEditor._ARROW_RE.search(line)
407 if not arrowmatch:
408
409 start = '%d.0' % linenum
410 end = '%d.end' % linenum
411 elif not CFGEditor._LHS_RE.match(line):
412
413 start = '%d.0' % linenum
414 end = '%d.%d' % (linenum, arrowmatch.start())
415 else:
416
417 start = '%d.%d' % (linenum, arrowmatch.end())
418 end = '%d.end' % linenum
419
420
421 if self._textwidget.compare(start, '==', end):
422 start = '%d.0' % linenum
423 end = '%d.end' % linenum
424 self._textwidget.tag_add('error', start, end)
425
427 """
428 Replace C{->} with arrows, and colorize the entire buffer.
429 """
430 self._replace_arrows()
431 numlines = int(self._textwidget.index('end').split('.')[0])
432 for linenum in range(1, numlines+1):
433 self._analyze_line(linenum)
434
470
472 if self._top is None: return
473 self._top.destroy()
474 self._top = None
475
479
486
488 self._textwidget.delete('1.0', 'end')
489 for production in self._cfg.productions():
490 self._textwidget.insert('end', '%s\n' % production)
491 self._analyze()
492 if self._set_cfg_callback is not None:
493 self._set_cfg_callback(self._cfg)
494
499
508
509
510
511
512
537
538
539
540
541
544
546
548
555
557 self._treelet_canvas = Canvas(parent, background='white')
558 self._treelet_canvas.pack(side='bottom', fill='x')
559 self._treelet = None
560
566
567
568
569
570
572 c = self._workspace.canvas()
573 fontsize = int(self._size.get())
574 node_font = ('helvetica', -(fontsize+4), 'bold')
575 leaf_font = ('helvetica', -(fontsize+2))
576
577
578 if self._tree is not None:
579 self._workspace.remove_widget(self._tree)
580
581
582 start = self._grammar.start().symbol()
583 rootnode = TextWidget(c, start, font=node_font, draggable=1)
584
585
586 leaves = []
587 for word in self._text:
588 if isinstance(word, Token): word = word.type()
589 leaves.append(TextWidget(c, word, font=leaf_font, draggable=1))
590
591
592 self._tree = TreeSegmentWidget(c, rootnode, leaves,
593 color='white')
594
595
596 self._workspace.add_widget(self._tree)
597
598
599 for leaf in leaves: leaf.move(0,100)
600
601
602
603
606
627
628
629
630
631
633 canvas = self._treelet_canvas
634
635 self._prodlist.highlight(production)
636 if self._treelet is not None: self._treelet.destroy()
637
638
639 from nltk import Tree
640 rhs = production.rhs()
641 for (i, elt) in enumerate(rhs):
642 if isinstance(elt, Nonterminal): elt = Tree(elt)
643 tree = Tree(production.lhs().symbol(), *rhs)
644
645
646 fontsize = int(self._size.get())
647 node_font = ('helvetica', -(fontsize+4), 'bold')
648 leaf_font = ('helvetica', -(fontsize+2))
649 self._treelet = tree_to_treesegment(canvas, tree,
650 node_font=node_font,
651 leaf_font=leaf_font)
652 self._treelet['draggable'] = 1
653
654
655 (x1, y1, x2, y2) = self._treelet.bbox()
656 w, h = int(canvas['width']), int(canvas['height'])
657 self._treelet.move((w-x1-x2)/2, (h-y1-y2)/2)
658
659
660 self._markproduction(production)
661
664
def mainloop(self, *args, **kwargs):
    """
    Run the event loop of this object's top-level window.  All
    positional and keyword arguments are forwarded unchanged to
    C{self._top.mainloop}; nothing is returned.
    """
    run_event_loop = self._top.mainloop
    run_event_loop(*args, **kwargs)
667
669 from nltk import cfg
670 nonterminals = 'S VP NP PP P N Name V Det'
671 (S, VP, NP, PP, P, N, Name, V, Det) = [cfg.Nonterminal(s)
672 for s in nonterminals.split()]
673 productions = (
674
675 cfg.Production(S, [NP, VP]),
676 cfg.Production(NP, [Det, N]),
677 cfg.Production(NP, [NP, PP]),
678 cfg.Production(VP, [VP, PP]),
679 cfg.Production(VP, [V, NP, PP]),
680 cfg.Production(VP, [V, NP]),
681 cfg.Production(PP, [P, NP]),
682 cfg.Production(PP, []),
683
684 cfg.Production(PP, ['up', 'over', NP]),
685
686
687 cfg.Production(NP, ['I']), cfg.Production(Det, ['the']),
688 cfg.Production(Det, ['a']), cfg.Production(N, ['man']),
689 cfg.Production(V, ['saw']), cfg.Production(P, ['in']),
690 cfg.Production(P, ['with']), cfg.Production(N, ['park']),
691 cfg.Production(N, ['dog']), cfg.Production(N, ['statue']),
692 cfg.Production(Det, ['my']),
693 )
694 grammar = cfg.Grammar(S, productions)
695
696 text = 'I saw a man in the park'.split()
697 d=CFGDemo(grammar, text)
698 d.mainloop()
699
700
701
702
703
705 from nltk import cfg
706 nonterminals = 'S VP NP PP P N Name V Det'
707 (S, VP, NP, PP, P, N, Name, V, Det) = [cfg.Nonterminal(s)
708 for s in nonterminals.split()]
709
710 grammar = cfg.parse_cfg("""
711 S -> NP VP
712 PP -> P NP
713 NP -> Det N
714 NP -> NP PP
715 VP -> V NP
716 VP -> VP PP
717 Det -> 'a'
718 Det -> 'the'
719 Det -> 'my'
720 NP -> 'I'
721 N -> 'dog'
722 N -> 'man'
723 N -> 'park'
724 N -> 'statue'
725 V -> 'saw'
726 P -> 'in'
727 P -> 'up'
728 P -> 'over'
729 P -> 'with'
730 """)
731
def cb(grammar):
    """Callback handed to the editor: print the grammar it is given."""
    # print(...) with a single argument produces the same output on
    # Python 2 and Python 3; the bare print statement is a syntax error
    # on Python 3.
    print(grammar)
733 top = Tk()
734 editor = CFGEditor(top, grammar, cb)
735 Label(top, text='\nTesting CFG Editor\n').pack()
736 Button(top, text='Quit', command=top.destroy).pack()
737 top.mainloop()
738
740 from nltk import cfg
741 (S, VP, NP, PP, P, N, Name, V, Det) = \
742 nonterminals('S, VP, NP, PP, P, N, Name, V, Det')
743
744 productions = (
745
746 cfg.Production(S, [NP, VP]),
747 cfg.Production(NP, [Det, N]),
748 cfg.Production(NP, [NP, PP]),
749 cfg.Production(VP, [VP, PP]),
750 cfg.Production(VP, [V, NP, PP]),
751 cfg.Production(VP, [V, NP]),
752 cfg.Production(PP, [P, NP]),
753 cfg.Production(PP, []),
754
755 cfg.Production(PP, ['up', 'over', NP]),
756
757
758 cfg.Production(NP, ['I']), cfg.Production(Det, ['the']),
759 cfg.Production(Det, ['a']), cfg.Production(N, ['man']),
760 cfg.Production(V, ['saw']), cfg.Production(P, ['in']),
761 cfg.Production(P, ['with']), cfg.Production(N, ['park']),
762 cfg.Production(N, ['dog']), cfg.Production(N, ['statue']),
763 cfg.Production(Det, ['my']),
764 )
765
766 t = Tk()
def destroy(e, t=t):
    """Event handler: destroy the window C{t}; the event C{e} is unused."""
    t.destroy()
768 t.bind('q', destroy)
769 p = ProductionList(t, productions)
770 p.pack(expand=1, fill='both')
771 p.add_callback('select', p.markonly)
772 p.add_callback('move', p.markonly)
773 p.focus()
774 p.mark(productions[2])
775 p.mark(productions[8])
776
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    demo()
778