Package nltk :: Package inference :: Module discourse
Source Code for Module nltk.inference.discourse

  1  # Natural Language Toolkit: Discourse Processing 
  2  # 
  3  # Author: Ewan Klein <[email protected]> 
  4  # 
  5  # URL: <http://nltk.org> 
  6  # For license information, see LICENSE.TXT 
  7  # $Id: discourse.py 6471 2008-08-22 03:35:08Z DHGarrette $ 
  8   
  9  import os 
 10   
 11  from nltk.sem import root_semrep, Expression 
 12  from nltk import parse 
 13  from nltk.data import show_cfg 
 14   
 15  from nltk.inference import MaceCommand, spacer, get_prover 
 16   
 17  """ 
 18  Module for incrementally developing simple discourses, and checking for semantic ambiguity,  
 19  consistency and informativeness. 
 20   
 21  Many of the ideas are based on the CURT family of programs of Blackburn and Bos  
 22  (see U{http://homepages.inf.ed.ac.uk/jbos/comsem/book1.html}). 
 23   
 24  Consistency checking is carried out  by using the L{mace} module to call the Mace4 model builder. 
 25  Informativeness checking is carried out with a call to C{get_prover()} from 
 26  the L{inference}  module. 
 27   
 28  C{DiscourseTester} is a constructor for discourses.  
 29  The basic data structure is a list of sentences, stored as C{self._sentences}. Each sentence in the list 
 30  is assigned a I{sentence ID} (C{sid}) of the form C{s}I{i}. For example:: 
 31   
 32      s0: A boxer walks 
 33      s1: Every boxer chases a girl 
 34   
 35  Each sentence can be ambiguous between a number of readings, each of which receives a  
 36  I{reading ID} (C{rid}) of the form C{s}I{i} -C{r}I{j}. For example:: 
 37   
 38      s0 readings: 
 39      ------------------------------ 
 40      s0-r1: some x.((boxer x) and (walk x)) 
 41      s0-r0: some x.((boxerdog x) and (walk x)) 
 42   
 43  A I{thread} is a list of readings, represented 
 44  as a list of C{rid}s. Each thread receives a I{thread ID} (C{tid}) of the form C{d}I{i}.  
 45  For example:: 
 46   
 47      d0: ['s0-r0', 's1-r0'] 
 48   
 49  The set of all threads for a discourse is the Cartesian product of all the readings of the sequences of sentences. 
 50  (This is not intended to scale beyond very short discourses!) The method L{readings(filter=True)} will only show 
 51  those threads which are consistent (taking into account any background assumptions). 
 52  """ 
 53   
 54   
 55   
 56 -class DiscourseTester(object): 
 57      """ 
 58      Check properties of an ongoing discourse. 
 59      """ 
 60 -    def __init__(self, input, gramfile=None, background=None):        
 61          """ 
 62          Initialize a C{DiscourseTester}. 
 63           
 64          @parameter input: the discourse sentences 
 65          @type input: C{list} of C{str} 
 66          @parameter gramfile: name of file where grammar can be loaded 
 67          @type gramfile: C{str} 
 68          @parameter background: Formulas which express background assumptions 
 69          @type background: C{list} of L{logic.Expression}. 
 70          """ 
 71          self._input = input 
 72          self._sentences = dict([('s%s' % i, sent) for i, sent in enumerate(input)]) 
 73          self._models = None 
 74          self._readings = {} 
 75          if gramfile is None: 
 76              self._gramfile = 'grammars/sem4.fcfg' 
 77          else: 
 78              self._gramfile = gramfile 
 79          self._threads = {} 
 80          self._filtered_threads = {} 
 81          self._parser = parse.load_earley(self._gramfile)  
 82          if background is not None: 
 83              for e in background: 
 84                  assert isinstance(e, Expression) 
 85              self._background = background 
 86          else: 
 87              self._background = [] 
 88   
 89      ############################### 
 90      # Sentences 
 91      ############################### 
 92       
 93 -    def sentences(self): 
 94          """ 
 95          Display the list of sentences in the current discourse. 
 96          """ 
 97          for id in sorted(self._sentences.keys()): 
 98              print "%s: %s" % (id, self._sentences[id])  
 99                 
100 -    def add_sentence(self, sentence, informchk=False, consistchk=False,): 
101          """ 
102          Add a sentence to the current discourse. 
103           
104          Updates C{self._input} and C{self._sentences}. 
105          @parameter sentence: An input sentence 
106          @type sentence: C{str} 
107          @parameter informchk: if C{True}, check that the result of adding the sentence is thread-informative. Updates C{self._readings}. 
108          @parameter consistchk: if C{True}, check that the result of adding the sentence is thread-consistent. Updates C{self._readings}. 
109           
110          """ 
111          # check whether the new sentence is informative (i.e. not entailed by the previous discourse)        
112          if informchk: 
113              self.readings(quiet=True) 
114              for tid in sorted(self._threads.keys()): 
115                  assumptions = [reading for (rid, reading) in self.expand_threads(tid)] 
116                  assumptions += self._background 
117                  for sent_reading in self._get_readings(sentence): 
118                      tp = get_prover(goal=sent_reading, assumptions=assumptions, prover_name='Prover9') 
119                      if tp.prove(): 
120                          print "Sentence '%s' under reading '%s':" % (sentence, str(sent_reading)) 
121                          print "Not informative relative to thread '%s'" % tid 
122              
123          self._input.append(sentence) 
124          self._sentences = dict([('s%s' % i, sent) for i, sent in enumerate(self._input)]) 
125          # check whether adding the new sentence to the discourse preserves consistency (i.e. a model can be found for the combined set of 
126          # of assumptions 
127          if consistchk: 
128              self.readings(quiet=True) 
129              self.models(show=False)  
130                          
131 -    def retract_sentence(self, sentence, quiet=False): 
132          """ 
133          Remove a sentence from the current discourse. 
134           
135          Updates C{self._input}, C{self._sentences} and C{self._readings}. 
136          @parameter sentence: An input sentence 
137          @type sentence: C{str} 
138          @parameter quiet: If C{False},  report on the updated list of sentences. 
139          """ 
140          try: 
141              self._input.remove(sentence) 
142          except ValueError: 
143              print "Retraction failed. The sentence '%s' is not part of the current discourse:" % sentence 
144              self.sentences() 
145              return None 
146          self._sentences = dict([('s%s' % i, sent) for i, sent in enumerate(self._input)]) 
147          self.readings(quiet=True) 
148          if not quiet: 
149              print "Current sentences are " 
150              self.sentences() 
151               
152 -    def grammar(self): 
153          """ 
154          Print out the grammar in use for parsing input sentences 
155          """ 
156          show_cfg(self._gramfile) 
157           
158      ############################### 
159      # Readings and Threads 
160      ###############################         
161   
162 -    def _get_readings(self, sentence): 
163          """ 
164          Build a list of semantic readings for a sentence. 
165           
166          @rtype: C{list} of  L{logic.Expression}. 
167          """ 
168          tokens = sentence.split() 
169          trees = self._parser.nbest_parse(tokens) 
170          return [root_semrep(tree) for tree in trees]     
171                            
172 -    def _construct_readings(self): 
173          """ 
174          Use C{self._sentences} to construct a value for C{self._readings}. 
175          """ 
176          # re-initialize self._readings in case we have retracted a sentence 
177          self._readings = {} 
178          for sid in self._sentences: 
179              readings = self._get_readings(self._sentences[sid]) 
180              self._readings[sid] = dict([("%s-r%s" % (sid, rid), reading) 
181                                                          for rid, reading in enumerate(readings)]) 
182                   
183 -    def _construct_threads(self): 
184          """ 
185          Use C{self._readings} to construct a value for C{self._threads} 
186          and use the model builder to construct a value for C{self._filtered_threads} 
187          """ 
188          thread_list = [[]] 
189          for sid in sorted(self._readings.keys()): 
190              thread_list = self.multiply(thread_list, sorted(self._readings[sid].keys()))       
191          self._threads = dict([("d%s" % tid, thread) for tid, thread in enumerate(thread_list)]) 
192          # re-initialize the filtered threads 
193          self._filtered_threads = {} 
194          # keep the same ids, but only include threads which get models 
195          for (tid, thread) in self._threads.items(): 
196              if (tid, True) in self._check_consistency(self._threads): 
197                  self._filtered_threads[tid] = thread 
198      
199    
200 -    def _show_readings(self, sentence=None): 
201          """ 
202          Print out the readings for  the discourse (or a single sentence). 
203          """ 
204          if sentence is not None: 
205              print "The sentence '%s' has these readings:" % sentence 
206              for r in [str(reading) for reading in (self._get_readings(sentence))]: 
207                  print "    %s" % r 
208          else: 
209              for sid in sorted(self._readings.keys()): 
210                  print 
211                  print '%s readings:' % sid 
212                  print '-' * 30 
213                  for rid in sorted(self._readings[sid]): 
214                      lf = self._readings[sid][rid] 
215                      #TODO lf = lf.normalize('[xyz]\d*', 'z%d') 
216                      print "%s: %s" % (rid, lf) 
217       
218 -    def _show_threads(self, filter=False): 
219          """ 
220          Print out the value of C{self._threads} or C{self._filtered_hreads}  
221          """ 
222          if filter: 
223              threads = self._filtered_threads 
224          else: 
225              threads = self._threads 
226          for tid in sorted(threads.keys()): 
227              print "%s:" % tid, self._threads[tid]  
228           
229           
230 -    def readings(self, sentence=None, threaded=False, quiet=False, filter=False): 
231          """ 
232          Construct and show the readings of the discourse (or of a single sentence). 
233           
234          @parameter sentence: test just this sentence 
235          @type sentence: C{str} 
236          @parameter threaded: if C{True}, print out each thread ID and the corresponding thread. 
237          @parameter filter: if C{True}, only print out consistent thread IDs and threads. 
238          """ 
239          self._construct_readings() 
240          self._construct_threads() 
241           
242          # if we are filtering, just show threads 
243          if filter: threaded=True 
244          if not quiet: 
245              if not threaded: 
246                  self._show_readings(sentence=sentence) 
247              else: 
248                  self._show_threads(filter=filter)                             
249       
250 -    def expand_threads(self, thread_id, threads=None): 
251          """ 
252          Given a thread ID, find the list of L{logic.Expression}s corresponding to the reading IDs in that thread. 
253           
254          @parameter thread_id: thread ID 
255          @type thread_id: C{str} 
256          @parameter threads: a mapping from thread IDs to lists of reading IDs 
257          @type threads: C{dict} 
258          @return: A list of pairs (C{rid}, I{reading}) where I{reading} is the L{logic.Expression} associated with a reading ID  
259          @rtype: C{list} of C{tuple} 
260          """ 
261          if threads is None: 
262              threads = self._threads 
263          return [(rid, self._readings[sid][rid]) for rid in threads[thread_id] for sid in rid.split('-')[:1]] 
264       
265              
266      ############################### 
267      # Models and Background 
268      ###############################        
269        
270 -    def _check_consistency(self, threads, show=False, quiet=True): 
271          results = [] 
272          for tid in sorted(threads.keys()): 
273              assumptions = [reading for (rid, reading) in self.expand_threads(tid, threads=threads)] 
274              assumptions += self._background 
275              # if Mace4 finds a model, it always seems to find it quickly 
276              mb = MaceCommand(None, assumptions, timeout=2) 
277              modelfound = mb.build_model() 
278              results.append((tid, modelfound)) 
279              if show: 
280                  spacer(80) 
281                  print "Model for Discourse Thread %s" % tid 
282                  spacer(80) 
283                  if not quiet: 
284                      for a in assumptions: 
285                          print a 
286                      spacer(80) 
287                  if modelfound: 
288                      mb.show_model(format='cooked') 
289                  else: 
290                      print "No model found!\n" 
291          return results 
292       
293 -    def models(self, thread_id=None, show=True, quiet=True): 
294          """ 
295          Call Mace4 to build a model for each current discourse thread. 
296           
297          @parameter thread_id: thread ID 
298          @type thread_id: C{str} 
299          @parameter show: If C{True}, display the model that has been found. 
300          """ 
301          self._construct_readings() 
302          self._construct_threads() 
303          if thread_id is None: 
304              threads = self._threads 
305          else: 
306              threads = {thread_id: self._threads[thread_id]} 
307           
308          for (tid, modelfound) in self._check_consistency(threads, show=show, quiet=quiet):             
309              idlist = [rid for rid in threads[tid]] 
310               
311              if not modelfound: 
312                  print "Inconsistent discourse %s %s:" % (tid, idlist) 
313                  for  rid, reading in [(rid, str(reading))  for (rid, reading) in self.expand_threads(tid)]: 
314                      print "    %s: %s" % (rid, reading) 
315                  print  
316              else: 
317                  print "Consistent discourse: %s %s:" % (tid, idlist) 
318                  for  rid, reading in [(rid, str(reading))  for (rid, reading) in self.expand_threads(tid)]: 
319                      print "    %s: %s" % (rid, reading) 
320                  print  
321           
322 -    def add_background(self, background, quiet=True): 
323          """ 
324          Add a list of background assumptions for reasoning about the discourse. 
325           
326          When called,  this method also updates the discourse model's set of readings and threads. 
327          @parameter background: Formulas which contain background information 
328          @type background: C{list} of L{logic.Expression}. 
329          """ 
330          for (count, e) in enumerate(background): 
331              assert isinstance(e, Expression) 
332              if not quiet: 
333                  print "Adding assumption %s to background" % count 
334              self._background.append(e)  
335               
336          #update the state 
337          self._construct_readings() 
338          self._construct_threads() 
339           
340 -    def background(self): 
341          """ 
342          Show the current background assumptions. 
343          """ 
344          for e in self._background: 
345              print str(e) 
346       
347     ############################### 
348      # Misc 
349      ###############################                               
350                   
351      @staticmethod 
352 -    def multiply(discourse, readings): 
353          """ 
354          Multiply every thread in C{discourse} by every reading in C{readings}. 
355           
356          Given discourse = [['A'], ['B']], readings = ['a', 'b', 'c'] , returns         
357          [['A', 'a'], ['A', 'b'], ['A', 'c'], ['B', 'a'], ['B', 'b'], ['B', 'c']] 
358           
359          @parameter discourse: the current list of readings 
360          @type discourse: C{list} of C{list}s 
361          @parameter readings: an additional list of readings 
362          @type readings: C{list} of C{logic.Expression}s 
363          @rtype: A C{list} of C{list}s 
364          """ 
365          result = [] 
366          for sublist in discourse: 
367              for r in readings: 
368                  new = [] 
369                  new += sublist 
370                  new.append(r) 
371                  result.append(new) 
372          return result 
373   
374  #multiply = DiscourseTester.multiply 
375  #L1 = [['A'], ['B']] 
376  #L2 = ['a', 'b', 'c']  
377  #print multiply(L1,L2) 
378   
def parse_fol(s):
    """
    Temporarily duplicated from L{nltk.sem.util}.
    Convert a  file of First Order Formulas into a list of C{Expression}s.

    Blank lines and lines starting with C{#} are skipped; every other
    line is parsed as one formula.

    @parameter s: the contents of the file
    @type s: C{str}
    @return: a list of parsed formulas.
    @rtype: C{list} of L{Expression}
    @raise ValueError: if a line cannot be parsed; the message gives the
        zero-based line number and the offending line.
    """
    from nltk.sem import LogicParser
    statements = []
    lp = LogicParser()
    for linenum, line in enumerate(s.splitlines()):
        line = line.strip()
        if line.startswith('#') or line == '':
            continue
        try:
            statements.append(lp.parse(line))
        except Exception:
            # The original handler named ``Error``, which is not defined or
            # imported in this module, so a parse failure surfaced as a
            # NameError instead of the ValueError below.
            # NOTE(review): narrow this to the parser's own exception class
            # once that class is imported here.
            raise ValueError('Unable to parse line %s: %s' % (linenum, line))
    return statements
400               
401  ############################### 
402  # Demo 
403  ###############################     
404   
def discourse_demo():
    """
    Walk through the public methods of C{DiscourseTester} on three small
    example discourses, printing the results.
    """
    # --- Discourse 1: sentences, readings, threads and models ---
    boxers = DiscourseTester(['A boxer walks', 'Every boxer chases a girl'])
    boxers.models()
    print("")
    # (grammar display is disabled: boxers.grammar())
    print("")
    boxers.sentences()
    print("")
    boxers.readings()
    print("")
    boxers.readings(threaded=True)
    print("")
    boxers.models('d1')
    boxers.add_sentence('John is a boxer')
    print("")
    boxers.sentences()
    print("")
    boxers.readings(threaded=True)
    print("")

    # --- Discourse 2: consistency and informativeness checks ---
    students = DiscourseTester(['A student dances', 'Every student is a person'])
    print("")
    students.add_sentence('No person dances', consistchk=True)
    print("")
    students.readings()
    print("")
    students.retract_sentence('No person dances', quiet=False)
    print("")
    students.models()
    print("")
    students.readings('A person dances')
    print("")
    students.add_sentence('A person dances', informchk=True)

    # --- Discourse 3: filtering threads with background knowledge ---
    pets = DiscourseTester(['Vincent is a boxer', 'Fido is a boxer', 'Vincent is married', 'Fido barks'])
    pets.readings(filter=True)
    import nltk.data
    assumptions = nltk.data.load('/grammars/world.fol')
    print("")
    pets.add_background(assumptions, quiet=True)
    pets.background()
    print("")
    pets.readings(filter=True)
    print("")
    pets.models()
451       
452       
453   
# Run the demonstration only when this module is executed as a script.
if "__main__" == __name__:
    discourse_demo()
456